diff options
Diffstat (limited to 'src/spdk/dpdk/lib/librte_eal')
246 files changed, 56870 insertions, 0 deletions
diff --git a/src/spdk/dpdk/lib/librte_eal/Makefile b/src/spdk/dpdk/lib/librte_eal/Makefile new file mode 100644 index 000000000..2fda40d23 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +DIRS-y += include +DIRS-$(CONFIG_RTE_EXEC_ENV_LINUX) += linux +DEPDIRS-linux := include +DIRS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += freebsd +DEPDIRS-freebsd := include + +include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/meson.build b/src/spdk/dpdk/lib/librte_eal/arm/include/meson.build new file mode 100644 index 000000000..73b750a18 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/meson.build @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation. + +arch_headers = files( + 'rte_atomic_32.h', + 'rte_atomic_64.h', + 'rte_atomic.h', + 'rte_byteorder.h', + 'rte_cpuflags_32.h', + 'rte_cpuflags_64.h', + 'rte_cpuflags.h', + 'rte_cycles_32.h', + 'rte_cycles_64.h', + 'rte_cycles.h', + 'rte_io_64.h', + 'rte_io.h', + 'rte_memcpy_32.h', + 'rte_memcpy_64.h', + 'rte_memcpy.h', + 'rte_pause_32.h', + 'rte_pause_64.h', + 'rte_pause.h', + 'rte_prefetch_32.h', + 'rte_prefetch_64.h', + 'rte_prefetch.h', + 'rte_rwlock.h', + 'rte_spinlock.h', + 'rte_vect.h', +) +install_headers(arch_headers, subdir: get_option('include_subdir_arch')) diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_atomic.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_atomic.h new file mode 100644 index 000000000..40e14e56f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_atomic.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_ATOMIC_ARM_H_ +#define _RTE_ATOMIC_ARM_H_ + +#ifdef RTE_ARCH_64 +#include <rte_atomic_64.h> +#else +#include <rte_atomic_32.h> +#endif + +#endif /* _RTE_ATOMIC_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_atomic_32.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_atomic_32.h new file mode 100644 index 000000000..7dc0d06d1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_atomic_32.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_ATOMIC_ARM32_H_ +#define _RTE_ATOMIC_ARM32_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_atomic.h" + +#define rte_mb() __sync_synchronize() + +#define rte_wmb() do { asm volatile ("dmb st" : : : "memory"); } while (0) + +#define rte_rmb() __sync_synchronize() + +#define rte_smp_mb() rte_mb() + +#define rte_smp_wmb() rte_wmb() + +#define rte_smp_rmb() rte_rmb() + +#define rte_io_mb() rte_mb() + +#define rte_io_wmb() rte_wmb() + +#define rte_io_rmb() rte_rmb() + +#define rte_cio_wmb() rte_wmb() + +#define rte_cio_rmb() rte_rmb() + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ATOMIC_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_atomic_64.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_atomic_64.h new file mode 100644 index 000000000..7b7099cdc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_atomic_64.h @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_ATOMIC_ARM64_H_ +#define _RTE_ATOMIC_ARM64_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_atomic.h" +#include <rte_branch_prediction.h> +#include <rte_compat.h> +#include <rte_debug.h> + +#define rte_mb() asm volatile("dsb sy" : : : "memory") + +#define rte_wmb() asm volatile("dsb st" : : : "memory") + +#define rte_rmb() asm volatile("dsb ld" : : : "memory") + +#define rte_smp_mb() asm volatile("dmb ish" : : : "memory") + +#define rte_smp_wmb() asm volatile("dmb ishst" : : : "memory") + +#define rte_smp_rmb() asm volatile("dmb ishld" : : : "memory") + +#define rte_io_mb() rte_mb() + +#define rte_io_wmb() rte_wmb() + +#define rte_io_rmb() rte_rmb() + +#define rte_cio_wmb() asm volatile("dmb oshst" : : : "memory") + +#define rte_cio_rmb() asm volatile("dmb oshld" : : : "memory") + +/*------------------------ 128 bit atomic operations -------------------------*/ + +#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS) +#define __ATOMIC128_CAS_OP(cas_op_name, op_string) \ +static __rte_noinline rte_int128_t \ +cas_op_name(rte_int128_t *dst, rte_int128_t old, rte_int128_t updated) \ +{ \ + /* caspX instructions register pair must start from even-numbered + * register at operand 1. + * So, specify registers for local variables here. + */ \ + register uint64_t x0 __asm("x0") = (uint64_t)old.val[0]; \ + register uint64_t x1 __asm("x1") = (uint64_t)old.val[1]; \ + register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0]; \ + register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1]; \ + asm volatile( \ + op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]" \ + : [old0] "+r" (x0), \ + [old1] "+r" (x1) \ + : [upd0] "r" (x2), \ + [upd1] "r" (x3), \ + [dst] "r" (dst) \ + : "memory"); \ + old.val[0] = x0; \ + old.val[1] = x1; \ + return old; \ +} + +__ATOMIC128_CAS_OP(__cas_128_relaxed, "casp") +__ATOMIC128_CAS_OP(__cas_128_acquire, "caspa") +__ATOMIC128_CAS_OP(__cas_128_release, "caspl") +__ATOMIC128_CAS_OP(__cas_128_acq_rel, "caspal") + +#undef __ATOMIC128_CAS_OP + +#endif + +__rte_experimental +static inline int +rte_atomic128_cmp_exchange(rte_int128_t *dst, rte_int128_t *exp, + const rte_int128_t *src, unsigned int weak, int success, + int failure) +{ + /* Always do strong CAS */ + RTE_SET_USED(weak); + /* Ignore memory ordering for failure, memory order for + * success must be stronger or equal + */ + RTE_SET_USED(failure); + /* Find invalid memory order */ + RTE_ASSERT(success == __ATOMIC_RELAXED || + success == __ATOMIC_ACQUIRE || + success == __ATOMIC_RELEASE || + success == __ATOMIC_ACQ_REL || + success == __ATOMIC_SEQ_CST); + + rte_int128_t expected = *exp; + rte_int128_t desired = *src; + rte_int128_t old; + +#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS) + if (success == __ATOMIC_RELAXED) + old = __cas_128_relaxed(dst, expected, desired); + else if (success == __ATOMIC_ACQUIRE) + old = __cas_128_acquire(dst, expected, desired); + else if (success == __ATOMIC_RELEASE) + old = __cas_128_release(dst, expected, desired); + else + old = __cas_128_acq_rel(dst, expected, desired); +#else +#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE) +#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \ + (mo) == __ATOMIC_SEQ_CST) + + int ldx_mo = __HAS_ACQ(success) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED; + int stx_mo = __HAS_RLS(success) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED; + +#undef __HAS_ACQ +#undef __HAS_RLS + + uint32_t ret = 1; + + /* ldx128 can not guarantee atomic, + * Must write back src or old to verify atomicity of ldx128; + */ + do { + +#define __LOAD_128(op_string, src, dst) { \ + asm volatile( \ + op_string " %0, %1, %2" \ + : "=&r" (dst.val[0]), \ + "=&r" (dst.val[1]) \ + : "Q" (src->val[0]) \ + : "memory"); } + + if (ldx_mo == __ATOMIC_RELAXED) + __LOAD_128("ldxp", dst, old) + else + __LOAD_128("ldaxp", dst, old) + +#undef __LOAD_128 + +#define __STORE_128(op_string, dst, src, ret) { \ + asm volatile( \ + op_string " %w0, %1, %2, %3" \ + : "=&r" (ret) \ + : "r" (src.val[0]), \ + "r" (src.val[1]), \ + "Q" (dst->val[0]) \ + : "memory"); } + + if (likely(old.int128 == expected.int128)) { + if (stx_mo == __ATOMIC_RELAXED) + __STORE_128("stxp", dst, desired, ret) + else + __STORE_128("stlxp", dst, desired, ret) + } else { + /* In the failure case (since 'weak' is ignored and only + * weak == 0 is implemented), expected should contain + * the atomically read value of dst. This means, 'old' + * needs to be stored back to ensure it was read + * atomically. + */ + if (stx_mo == __ATOMIC_RELAXED) + __STORE_128("stxp", dst, old, ret) + else + __STORE_128("stlxp", dst, old, ret) + } + +#undef __STORE_128 + + } while (unlikely(ret)); +#endif + + /* Unconditionally updating expected removes an 'if' statement. + * expected should already be in register if not in the cache. + */ + *exp = old; + + return (old.int128 == expected.int128); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ATOMIC_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_byteorder.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_byteorder.h new file mode 100644 index 000000000..9ec4a9750 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_byteorder.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_BYTEORDER_ARM_H_ +#define _RTE_BYTEORDER_ARM_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <rte_common.h> +#include "generic/rte_byteorder.h" + +/* fix missing __builtin_bswap16 for gcc older then 4.8 */ +#if !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) + +static inline uint16_t rte_arch_bswap16(uint16_t _x) +{ + uint16_t x = _x; + + asm volatile ("rev16 %w0,%w1" + : "=r" (x) + : "r" (x) + ); + return x; +} + +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) +#endif + +/* ARM architecture is bi-endian (both big and little). */ +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + +#define rte_cpu_to_le_16(x) (x) +#define rte_cpu_to_le_32(x) (x) +#define rte_cpu_to_le_64(x) (x) + +#define rte_cpu_to_be_16(x) rte_bswap16(x) +#define rte_cpu_to_be_32(x) rte_bswap32(x) +#define rte_cpu_to_be_64(x) rte_bswap64(x) + +#define rte_le_to_cpu_16(x) (x) +#define rte_le_to_cpu_32(x) (x) +#define rte_le_to_cpu_64(x) (x) + +#define rte_be_to_cpu_16(x) rte_bswap16(x) +#define rte_be_to_cpu_32(x) rte_bswap32(x) +#define rte_be_to_cpu_64(x) rte_bswap64(x) + +#else /* RTE_BIG_ENDIAN */ + +#define rte_cpu_to_le_16(x) rte_bswap16(x) +#define rte_cpu_to_le_32(x) rte_bswap32(x) +#define rte_cpu_to_le_64(x) rte_bswap64(x) + +#define rte_cpu_to_be_16(x) (x) +#define rte_cpu_to_be_32(x) (x) +#define rte_cpu_to_be_64(x) (x) + +#define rte_le_to_cpu_16(x) rte_bswap16(x) +#define rte_le_to_cpu_32(x) rte_bswap32(x) +#define rte_le_to_cpu_64(x) rte_bswap64(x) + +#define rte_be_to_cpu_16(x) (x) +#define rte_be_to_cpu_32(x) (x) +#define rte_be_to_cpu_64(x) (x) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BYTEORDER_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cpuflags.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cpuflags.h new file mode 100644 index 000000000..022e7da55 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cpuflags.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_CPUFLAGS_ARM_H_ +#define _RTE_CPUFLAGS_ARM_H_ + +#ifdef RTE_ARCH_64 +#include <rte_cpuflags_64.h> +#else +#include <rte_cpuflags_32.h> +#endif + +#endif /* _RTE_CPUFLAGS_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cpuflags_32.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cpuflags_32.h new file mode 100644 index 000000000..b5347be1e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cpuflags_32.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_CPUFLAGS_ARM32_H_ +#define _RTE_CPUFLAGS_ARM32_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Enumeration of all CPU features supported + */ +enum rte_cpu_flag_t { + RTE_CPUFLAG_SWP = 0, + RTE_CPUFLAG_HALF, + RTE_CPUFLAG_THUMB, + RTE_CPUFLAG_A26BIT, + RTE_CPUFLAG_FAST_MULT, + RTE_CPUFLAG_FPA, + RTE_CPUFLAG_VFP, + RTE_CPUFLAG_EDSP, + RTE_CPUFLAG_JAVA, + RTE_CPUFLAG_IWMMXT, + RTE_CPUFLAG_CRUNCH, + RTE_CPUFLAG_THUMBEE, + RTE_CPUFLAG_NEON, + RTE_CPUFLAG_VFPv3, + RTE_CPUFLAG_VFPv3D16, + RTE_CPUFLAG_TLS, + RTE_CPUFLAG_VFPv4, + RTE_CPUFLAG_IDIVA, + RTE_CPUFLAG_IDIVT, + RTE_CPUFLAG_VFPD32, + RTE_CPUFLAG_LPAE, + RTE_CPUFLAG_EVTSTRM, + RTE_CPUFLAG_AES, + RTE_CPUFLAG_PMULL, + RTE_CPUFLAG_SHA1, + RTE_CPUFLAG_SHA2, + RTE_CPUFLAG_CRC32, + RTE_CPUFLAG_V7L, + /* The last item */ + RTE_CPUFLAG_NUMFLAGS,/**< This should always be the last! */ +}; + +#include "generic/rte_cpuflags.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CPUFLAGS_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cpuflags_64.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cpuflags_64.h new file mode 100644 index 000000000..95cc01474 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cpuflags_64.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_CPUFLAGS_ARM64_H_ +#define _RTE_CPUFLAGS_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Enumeration of all CPU features supported + */ +enum rte_cpu_flag_t { + RTE_CPUFLAG_FP = 0, + RTE_CPUFLAG_NEON, + RTE_CPUFLAG_EVTSTRM, + RTE_CPUFLAG_AES, + RTE_CPUFLAG_PMULL, + RTE_CPUFLAG_SHA1, + RTE_CPUFLAG_SHA2, + RTE_CPUFLAG_CRC32, + RTE_CPUFLAG_ATOMICS, + RTE_CPUFLAG_AARCH64, + /* The last item */ + RTE_CPUFLAG_NUMFLAGS,/**< This should always be the last! */ +}; + +#include "generic/rte_cpuflags.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CPUFLAGS_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cycles.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cycles.h new file mode 100644 index 000000000..e8ffa894b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cycles.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_CYCLES_ARM_H_ +#define _RTE_CYCLES_ARM_H_ + +#ifdef RTE_ARCH_64 +#include <rte_cycles_64.h> +#else +#include <rte_cycles_32.h> +#endif + +#endif /* _RTE_CYCLES_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cycles_32.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cycles_32.h new file mode 100644 index 000000000..f79718ce8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cycles_32.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_CYCLES_ARM32_H_ +#define _RTE_CYCLES_ARM32_H_ + +/* ARM v7 does not have suitable source of clock signals. The only clock counter + available in the core is 32 bit wide. Therefore it is unsuitable as the + counter overlaps every few seconds and probably is not accessible by + userspace programs. Therefore we use clock_gettime(CLOCK_MONOTONIC_RAW) to + simulate counter running at 1GHz. +*/ + +#include <time.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_cycles.h" + +/** + * Read the time base register. + * + * @return + * The time base for this lcore. + */ +#ifndef RTE_ARM_EAL_RDTSC_USE_PMU + +/** + * This call is easily portable to any architecture, however, + * it may require a system call and inprecise for some tasks. + */ +static inline uint64_t +__rte_rdtsc_syscall(void) +{ + struct timespec val; + uint64_t v; + + while (clock_gettime(CLOCK_MONOTONIC_RAW, &val) != 0) + /* no body */; + + v = (uint64_t) val.tv_sec * 1000000000LL; + v += (uint64_t) val.tv_nsec; + return v; +} +#define rte_rdtsc __rte_rdtsc_syscall + +#else + +/** + * This function requires to configure the PMCCNTR and enable + * userspace access to it: + * + * asm volatile("mcr p15, 0, %0, c9, c14, 0" : : "r"(1)); + * asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(29)); + * asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(0x8000000f)); + * + * which is possible only from the privileged mode (kernel space). + */ +static inline uint64_t +__rte_rdtsc_pmccntr(void) +{ + unsigned tsc; + uint64_t final_tsc; + + /* Read PMCCNTR */ + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(tsc)); + /* 1 tick = 64 clocks */ + final_tsc = ((uint64_t)tsc) << 6; + + return (uint64_t)final_tsc; +} +#define rte_rdtsc __rte_rdtsc_pmccntr + +#endif /* RTE_ARM_EAL_RDTSC_USE_PMU */ + +static inline uint64_t +rte_rdtsc_precise(void) +{ + rte_mb(); + return rte_rdtsc(); +} + +static inline uint64_t +rte_get_tsc_cycles(void) { return rte_rdtsc(); } + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CYCLES_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cycles_64.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cycles_64.h new file mode 100644 index 000000000..da557b6a1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_cycles_64.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_CYCLES_ARM64_H_ +#define _RTE_CYCLES_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_cycles.h" + +/** + * Read the time base register. + * + * @return + * The time base for this lcore. + */ +#ifndef RTE_ARM_EAL_RDTSC_USE_PMU +/** + * This call is portable to any ARMv8 architecture, however, typically + * cntvct_el0 runs at <= 100MHz and it may be imprecise for some tasks. + */ +static inline uint64_t +rte_rdtsc(void) +{ + uint64_t tsc; + + asm volatile("mrs %0, cntvct_el0" : "=r" (tsc)); + return tsc; +} +#else +/** + * This is an alternative method to enable rte_rdtsc() with high resolution + * PMU cycles counter.The cycle counter runs at cpu frequency and this scheme + * uses ARMv8 PMU subsystem to get the cycle counter at userspace, However, + * access to PMU cycle counter from user space is not enabled by default in + * arm64 linux kernel. + * It is possible to enable cycle counter at user space access by configuring + * the PMU from the privileged mode (kernel space). + * + * asm volatile("msr pmintenset_el1, %0" : : "r" ((u64)(0 << 31))); + * asm volatile("msr pmcntenset_el0, %0" :: "r" BIT(31)); + * asm volatile("msr pmuserenr_el0, %0" : : "r"(BIT(0) | BIT(2))); + * asm volatile("mrs %0, pmcr_el0" : "=r" (val)); + * val |= (BIT(0) | BIT(2)); + * isb(); + * asm volatile("msr pmcr_el0, %0" : : "r" (val)); + * + */ +static inline uint64_t +rte_rdtsc(void) +{ + uint64_t tsc; + + asm volatile("mrs %0, pmccntr_el0" : "=r"(tsc)); + return tsc; +} +#endif + +static inline uint64_t +rte_rdtsc_precise(void) +{ + asm volatile("isb" : : : "memory"); + return rte_rdtsc(); +} + +static inline uint64_t +rte_get_tsc_cycles(void) { return rte_rdtsc(); } + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CYCLES_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_io.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_io.h new file mode 100644 index 000000000..f4e66e6ba --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_io.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_ARM_H_ +#define _RTE_IO_ARM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef RTE_ARCH_64 +#include "rte_io_64.h" +#else +#include "generic/rte_io.h" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IO_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_io_64.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_io_64.h new file mode 100644 index 000000000..e5346240e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_io_64.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_ARM64_H_ +#define _RTE_IO_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> + +#define RTE_OVERRIDE_IO_H + +#include "generic/rte_io.h" +#include "rte_atomic_64.h" + +static __rte_always_inline uint8_t +rte_read8_relaxed(const volatile void *addr) +{ + uint8_t val; + + asm volatile( + "ldrb %w[val], [%x[addr]]" + : [val] "=r" (val) + : [addr] "r" (addr)); + return val; +} + +static __rte_always_inline uint16_t +rte_read16_relaxed(const volatile void *addr) +{ + uint16_t val; + + asm volatile( + "ldrh %w[val], [%x[addr]]" + : [val] "=r" (val) + : [addr] "r" (addr)); + return val; +} + +static __rte_always_inline uint32_t +rte_read32_relaxed(const volatile void *addr) +{ + uint32_t val; + + asm volatile( + "ldr %w[val], [%x[addr]]" + : [val] "=r" (val) + : [addr] "r" (addr)); + return val; +} + +static __rte_always_inline uint64_t +rte_read64_relaxed(const volatile void *addr) +{ + uint64_t val; + + asm volatile( + "ldr %x[val], [%x[addr]]" + : [val] "=r" (val) + : [addr] "r" (addr)); + return val; +} + +static __rte_always_inline void +rte_write8_relaxed(uint8_t val, volatile void *addr) +{ + asm volatile( + "strb %w[val], [%x[addr]]" + : + : [val] "r" (val), [addr] "r" (addr)); +} + +static __rte_always_inline void +rte_write16_relaxed(uint16_t val, volatile void *addr) +{ + asm volatile( + "strh %w[val], [%x[addr]]" + : + : [val] "r" (val), [addr] "r" (addr)); +} + +static __rte_always_inline void +rte_write32_relaxed(uint32_t val, volatile void *addr) +{ + asm volatile( + "str %w[val], [%x[addr]]" + : + : [val] "r" (val), [addr] "r" (addr)); +} + +static __rte_always_inline void +rte_write64_relaxed(uint64_t val, volatile void *addr) +{ + asm volatile( + "str %x[val], [%x[addr]]" + : + : [val] "r" (val), [addr] "r" (addr)); +} + +static __rte_always_inline uint8_t +rte_read8(const volatile void *addr) +{ + uint8_t val; + val = rte_read8_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint16_t +rte_read16(const volatile void *addr) +{ + uint16_t val; + val = rte_read16_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint32_t +rte_read32(const volatile void *addr) +{ + uint32_t val; + val = rte_read32_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint64_t +rte_read64(const volatile void *addr) +{ + uint64_t val; + val = rte_read64_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline void +rte_write8(uint8_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write8_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write16(uint16_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write16_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write32(uint32_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write32_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write64(uint64_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write64_relaxed(value, addr); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IO_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_mcslock.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_mcslock.h new file mode 100644 index 000000000..dd1fe135b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_mcslock.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_MCSLOCK_ARM_H_ +#define _RTE_MCSLOCK_ARM_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_mcslock.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MCSLOCK_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_memcpy.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_memcpy.h new file mode 100644 index 000000000..47dea9a8c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_memcpy.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_MEMCPY_ARM_H_ +#define _RTE_MEMCPY_ARM_H_ + +#ifdef RTE_ARCH_64 +#include <rte_memcpy_64.h> +#else +#include <rte_memcpy_32.h> +#endif + +#endif /* _RTE_MEMCPY_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_memcpy_32.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_memcpy_32.h new file mode 100644 index 000000000..eb02c3b41 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_memcpy_32.h @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_MEMCPY_ARM32_H_ +#define _RTE_MEMCPY_ARM32_H_ + +#include <stdint.h> +#include <string.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_memcpy.h" + +#ifdef RTE_ARCH_ARM_NEON_MEMCPY + +#ifndef RTE_MACHINE_CPUFLAG_NEON +#error "Cannot optimize memcpy by NEON as the CPU seems to not support this" +#endif + +/* ARM NEON Intrinsics are used to copy data */ +#include <arm_neon.h> + +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + vst1q_u8(dst, vld1q_u8(src)); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + asm volatile ( + "vld1.8 {d0-d3}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]\n\t" + : "+r" (src), "+r" (dst) + : : "memory", "d0", "d1", "d2", "d3"); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + asm volatile ( + "vld1.8 {d0-d3}, [%0]!\n\t" + "vld1.8 {d4-d5}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]!\n\t" + "vst1.8 {d4-d5}, [%1]\n\t" + : "+r" (src), "+r" (dst) + : + : "memory", "d0", "d1", "d2", "d3", "d4", "d5"); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + asm volatile ( + "vld1.8 {d0-d3}, [%0]!\n\t" + "vld1.8 {d4-d7}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]!\n\t" + "vst1.8 {d4-d7}, [%1]\n\t" + : "+r" (src), "+r" (dst) + : + : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + asm volatile ("pld [%0, #64]" : : "r" (src)); + asm volatile ( + "vld1.8 {d0-d3}, [%0]!\n\t" + "vld1.8 {d4-d7}, [%0]!\n\t" + "vld1.8 {d8-d11}, [%0]!\n\t" + "vld1.8 {d12-d15}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]!\n\t" + "vst1.8 {d4-d7}, [%1]!\n\t" + "vst1.8 {d8-d11}, [%1]!\n\t" + "vst1.8 {d12-d15}, [%1]\n\t" + : "+r" (src), "+r" (dst) + : + : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + asm volatile ("pld [%0, #64]" : : "r" (src)); + asm volatile ("pld [%0, #128]" : : "r" (src)); + asm volatile ("pld [%0, #192]" : : "r" (src)); + asm volatile ("pld [%0, #256]" : : "r" (src)); + asm volatile ("pld [%0, #320]" : : "r" (src)); + asm volatile ("pld [%0, #384]" : : "r" (src)); + asm volatile ("pld [%0, #448]" : : "r" (src)); + asm volatile ( + "vld1.8 {d0-d3}, [%0]!\n\t" + "vld1.8 {d4-d7}, [%0]!\n\t" + "vld1.8 {d8-d11}, [%0]!\n\t" + "vld1.8 {d12-d15}, [%0]!\n\t" + "vld1.8 {d16-d19}, [%0]!\n\t" + "vld1.8 {d20-d23}, [%0]!\n\t" + "vld1.8 {d24-d27}, [%0]!\n\t" + "vld1.8 {d28-d31}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]!\n\t" + "vst1.8 {d4-d7}, [%1]!\n\t" + "vst1.8 {d8-d11}, [%1]!\n\t" + "vst1.8 {d12-d15}, [%1]!\n\t" + "vst1.8 {d16-d19}, [%1]!\n\t" + "vst1.8 {d20-d23}, [%1]!\n\t" + "vst1.8 {d24-d27}, [%1]!\n\t" + "vst1.8 {d28-d31}, [%1]!\n\t" + : "+r" (src), "+r" (dst) + : + : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"); +} + +#define rte_memcpy(dst, src, n) \ + __extension__ ({ \ + (__builtin_constant_p(n)) ? \ + memcpy((dst), (src), (n)) : \ + rte_memcpy_func((dst), (src), (n)); }) + +static inline void * +rte_memcpy_func(void *dst, const void *src, size_t n) +{ + void *ret = dst; + + /* We can't copy < 16 bytes using XMM registers so do it manually. */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dst = *(const uint8_t *)src; + dst = (uint8_t *)dst + 1; + src = (const uint8_t *)src + 1; + } + if (n & 0x02) { + *(uint16_t *)dst = *(const uint16_t *)src; + dst = (uint16_t *)dst + 1; + src = (const uint16_t *)src + 1; + } + if (n & 0x04) { + *(uint32_t *)dst = *(const uint32_t *)src; + dst = (uint32_t *)dst + 1; + src = (const uint32_t *)src + 1; + } + if (n & 0x08) { + /* ARMv7 can not handle unaligned access to long long + * (uint64_t). Therefore two uint32_t operations are + * used. + */ + *(uint32_t *)dst = *(const uint32_t *)src; + dst = (uint32_t *)dst + 1; + src = (const uint32_t *)src + 1; + *(uint32_t *)dst = *(const uint32_t *)src; + } + return ret; + } + + /* Special fast cases for <= 128 bytes */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + + if (n <= 128) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + return ret; + } + + /* + * For large copies > 128 bytes. This combination of 256, 64 and 16 byte + * copies was found to be faster than doing 128 and 32 byte copies as + * well. + */ + for ( ; n >= 256; n -= 256) { + rte_mov256((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 256; + src = (const uint8_t *)src + 256; + } + + /* + * We split the remaining bytes (which will be less than 256) into + * 64byte (2^6) chunks. + * Using incrementing integers in the case labels of a switch statement + * encourages the compiler to use a jump table. To get incrementing + * integers, we shift the 2 relevant bits to the LSB position to first + * get decrementing integers, and then subtract. + */ + switch (3 - (n >> 6)) { + case 0x00: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x01: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x02: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + default: + break; + } + + /* + * We split the remaining bytes (which will be less than 64) into + * 16byte (2^4) chunks, using the same switch structure as above. + */ + switch (3 - (n >> 4)) { + case 0x00: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x01: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x02: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + default: + break; + } + + /* Copy any remaining bytes, without going beyond end of buffers */ + if (n != 0) + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; +} + +#else + +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 16); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 32); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 48); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 64); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 128); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 256); +} + +static inline void * +rte_memcpy(void *dst, const void *src, size_t n) +{ + return memcpy(dst, src, n); +} + +#endif /* RTE_ARCH_ARM_NEON_MEMCPY */ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMCPY_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_memcpy_64.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_memcpy_64.h new file mode 100644 index 000000000..85ad587bd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_memcpy_64.h @@ -0,0 +1,372 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_MEMCPY_ARM64_H_ +#define _RTE_MEMCPY_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <string.h> + +#include "generic/rte_memcpy.h" + +#ifdef RTE_ARCH_ARM64_MEMCPY +#include <rte_common.h> +#include <rte_branch_prediction.h> + +/* + * The memory copy performance differs on different AArch64 micro-architectures. + * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy() + * performance compared to old glibc versions. It's always suggested to use a + * more recent glibc if possible, from which the entire system can get benefit. + * + * This implementation improves memory copy on some aarch64 micro-architectures, + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not + * always providing better performance than memcpy() so users need to run unit + * test "memcpy_perf_autotest" and customize parameters in customization section + * below for best performance. + * + * Compiler version will also impact the rte_memcpy() performance. It's observed + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can + * provide better performance than GCC 4.8.5 compiled binaries. + */ + +/************************************** + * Beginning of customization section + **************************************/ +#ifndef RTE_ARM64_MEMCPY_ALIGN_MASK +#define RTE_ARM64_MEMCPY_ALIGN_MASK ((RTE_CACHE_LINE_SIZE >> 3) - 1) +#endif + +#ifndef RTE_ARM64_MEMCPY_STRICT_ALIGN +/* Only src unalignment will be treated as unaligned copy */ +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \ + ((uintptr_t)(src) & RTE_ARM64_MEMCPY_ALIGN_MASK) +#else +/* Both dst and src unalignment will be treated as unaligned copy */ +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \ + (((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK) +#endif + + +/* + * If copy size is larger than threshold, memcpy() will be used. + * Run "memcpy_perf_autotest" to determine the proper threshold. + */ +#ifdef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD +#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \ +(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \ +n <= (size_t)RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) +#else +#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \ +(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src)) +#endif +#ifdef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD +#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \ +(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \ +n <= (size_t)RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD) +#else +#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \ +(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src)) +#endif +/* + * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform. + */ +#if defined(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \ +|| defined(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD) +#define USE_RTE_MEMCPY(dst, src, n) \ +(USE_ALIGNED_RTE_MEMCPY(dst, src, n) || USE_UNALIGNED_RTE_MEMCPY(dst, src, n)) +#else +#define USE_RTE_MEMCPY(dst, src, n) (1) +#endif +/************************************** + * End of customization section + **************************************/ + + +#if RTE_CC_IS_GNU && !defined RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK +#if (GCC_VERSION < 50400) +#warning "The GCC version is quite old, which may result in sub-optimal \ +performance of the compiled code. It is suggested that at least GCC 5.4.0 \ +be used." +#endif +#endif + +static __rte_always_inline +void rte_mov16(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + *dst128 = *src128; +} + +static __rte_always_inline +void rte_mov32(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + const __uint128_t x0 = src128[0], x1 = src128[1]; + dst128[0] = x0; + dst128[1] = x1; +} + +static __rte_always_inline +void rte_mov48(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2]; + dst128[0] = x0; + dst128[1] = x1; + dst128[2] = x2; +} + +static __rte_always_inline +void rte_mov64(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + const __uint128_t + x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3]; + dst128[0] = x0; + dst128[1] = x1; + dst128[2] = x2; + dst128[3] = x3; +} + +static __rte_always_inline +void rte_mov128(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + /* Keep below declaration & copy sequence for optimized instructions */ + const __uint128_t + x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3]; + dst128[0] = x0; + __uint128_t x4 = src128[4]; + dst128[1] = x1; + __uint128_t x5 = src128[5]; + dst128[2] = x2; + __uint128_t x6 = src128[6]; + dst128[3] = x3; + __uint128_t x7 = src128[7]; + dst128[4] = x4; + dst128[5] = x5; + dst128[6] = x6; + dst128[7] = x7; +} + +static __rte_always_inline +void rte_mov256(uint8_t *dst, const uint8_t *src) +{ + rte_mov128(dst, src); + rte_mov128(dst + 128, src + 128); +} + +static __rte_always_inline void +rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n) +{ + if (n & 0x08) { + /* copy 8 ~ 15 bytes */ + *(uint64_t *)dst = *(const uint64_t *)src; + *(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n); + } else if (n & 0x04) { + /* copy 4 ~ 7 bytes */ + *(uint32_t *)dst = *(const uint32_t *)src; + *(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n); + } else if (n & 0x02) { + /* copy 2 ~ 3 bytes */ + *(uint16_t *)dst = *(const uint16_t *)src; + *(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n); + } else if (n & 0x01) { + /* copy 1 byte */ + *dst = *src; + } +} + +static __rte_always_inline +void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n) +{ + if (n < 64) { + if (n == 16) { + rte_mov16(dst, src); + } else if (n <= 32) { + rte_mov16(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } else if (n <= 48) { + rte_mov32(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } else { + rte_mov48(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } + } else { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + if (n > 48 + 64) + rte_mov64(dst - 64 + n, src - 64 + n); + else if (n > 32 + 64) + rte_mov48(dst - 48 + n, src - 48 + n); + else if (n > 16 + 64) + rte_mov32(dst - 32 + n, src - 32 + n); + else if (n > 64) + rte_mov16(dst - 16 + n, src - 16 + n); + } +} + +static __rte_always_inline +void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n) +{ + do { + rte_mov128(dst, src); + src += 128; + dst += 128; + n -= 128; + } while (likely(n >= 128)); + + if (likely(n)) { + if (n <= 16) + rte_mov16(dst - 16 + n, src - 16 + n); + else if (n <= 32) + rte_mov32(dst - 32 + n, src - 32 + n); + else if (n <= 48) + rte_mov48(dst - 48 + n, src - 48 + n); + else if (n <= 64) + rte_mov64(dst - 64 + n, src - 64 + n); + else + rte_memcpy_ge16_lt128(dst, src, n); + } +} + +static __rte_always_inline +void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n) +{ + if (n == 16) { + rte_mov16(dst, src); + } else if (n <= 32) { + rte_mov16(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } else if (n <= 48) { + rte_mov32(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } else { + rte_mov48(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } +} + +static __rte_always_inline +void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n) +{ + do { + rte_mov64(dst, src); + src += 64; + dst += 64; + n -= 64; + } while (likely(n >= 64)); + + if (likely(n)) { + if (n <= 16) + rte_mov16(dst - 16 + n, src - 16 + n); + else if (n <= 32) + rte_mov32(dst - 32 + n, src - 32 + n); + else if (n <= 48) + rte_mov48(dst - 48 + n, src - 48 + n); + else + rte_mov64(dst - 64 + n, src - 64 + n); + } +} + +#if RTE_CACHE_LINE_SIZE >= 128 +static __rte_always_inline +void *rte_memcpy(void *dst, const void *src, size_t n) +{ + if (n < 16) { + rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } + if (n < 128) { + rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } + __builtin_prefetch(src, 0, 0); + __builtin_prefetch(dst, 1, 0); + if (likely(USE_RTE_MEMCPY(dst, src, n))) { + rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } else + return memcpy(dst, src, n); +} + +#else +static __rte_always_inline +void *rte_memcpy(void *dst, const void *src, size_t n) +{ + if (n < 16) { + rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } + if (n < 64) { + rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } + __builtin_prefetch(src, 0, 0); + __builtin_prefetch(dst, 1, 0); + if (likely(USE_RTE_MEMCPY(dst, src, n))) { + rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } else + return memcpy(dst, src, n); +} +#endif /* RTE_CACHE_LINE_SIZE >= 128 */ + +#else +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 16); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 32); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 48); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 64); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 128); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 256); +} + +#define rte_memcpy(d, s, n) memcpy((d), (s), (n)) + +#endif /* RTE_ARCH_ARM64_MEMCPY */ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMCPY_ARM_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_pause.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_pause.h new file mode 100644 index 000000000..6c7002ad9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_pause.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_ARM_H_ +#define _RTE_PAUSE_ARM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef RTE_ARCH_64 +#include <rte_pause_64.h> +#else +#include <rte_pause_32.h> +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_pause_32.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_pause_32.h new file mode 100644 index 000000000..d4768c7a9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_pause_32.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_ARM32_H_ +#define _RTE_PAUSE_ARM32_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include "generic/rte_pause.h" + +static inline void rte_pause(void) +{ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_pause_64.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_pause_64.h new file mode 100644 index 000000000..e87d10b8c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_pause_64.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_PAUSE_ARM64_H_ +#define _RTE_PAUSE_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> + +#ifdef RTE_ARM_USE_WFE +#define RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED +#endif + +#include "generic/rte_pause.h" + +static inline void rte_pause(void) +{ + asm volatile("yield" ::: "memory"); +} + +#ifdef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED + +/* Send an event to quit WFE. */ +#define __SEVL() { asm volatile("sevl" : : : "memory"); } + +/* Put processor into low power WFE(Wait For Event) state. */ +#define __WFE() { asm volatile("wfe" : : : "memory"); } + +static __rte_always_inline void +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected, + int memorder) +{ + uint16_t value; + + assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED); + + /* + * Atomic exclusive load from addr, it returns the 16-bit content of + * *addr while making it 'monitored',when it is written by someone + * else, the 'monitored' state is cleared and a event is generated + * implicitly to exit WFE. + */ +#define __LOAD_EXC_16(src, dst, memorder) { \ + if (memorder == __ATOMIC_RELAXED) { \ + asm volatile("ldxrh %w[tmp], [%x[addr]]" \ + : [tmp] "=&r" (dst) \ + : [addr] "r"(src) \ + : "memory"); \ + } else { \ + asm volatile("ldaxrh %w[tmp], [%x[addr]]" \ + : [tmp] "=&r" (dst) \ + : [addr] "r"(src) \ + : "memory"); \ + } } + + __LOAD_EXC_16(addr, value, memorder) + if (value != expected) { + __SEVL() + do { + __WFE() + __LOAD_EXC_16(addr, value, memorder) + } while (value != expected); + } +#undef __LOAD_EXC_16 +} + +static __rte_always_inline void +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected, + int memorder) +{ + uint32_t value; + + assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED); + + /* + * Atomic exclusive load from addr, it returns the 32-bit content of + * *addr while making it 'monitored',when it is written by someone + * else, the 'monitored' state is cleared and a event is generated + * implicitly to exit WFE. + */ +#define __LOAD_EXC_32(src, dst, memorder) { \ + if (memorder == __ATOMIC_RELAXED) { \ + asm volatile("ldxr %w[tmp], [%x[addr]]" \ + : [tmp] "=&r" (dst) \ + : [addr] "r"(src) \ + : "memory"); \ + } else { \ + asm volatile("ldaxr %w[tmp], [%x[addr]]" \ + : [tmp] "=&r" (dst) \ + : [addr] "r"(src) \ + : "memory"); \ + } } + + __LOAD_EXC_32(addr, value, memorder) + if (value != expected) { + __SEVL() + do { + __WFE() + __LOAD_EXC_32(addr, value, memorder) + } while (value != expected); + } +#undef __LOAD_EXC_32 +} + +static __rte_always_inline void +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected, + int memorder) +{ + uint64_t value; + + assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED); + + /* + * Atomic exclusive load from addr, it returns the 64-bit content of + * *addr while making it 'monitored',when it is written by someone + * else, the 'monitored' state is cleared and a event is generated + * implicitly to exit WFE. + */ +#define __LOAD_EXC_64(src, dst, memorder) { \ + if (memorder == __ATOMIC_RELAXED) { \ + asm volatile("ldxr %x[tmp], [%x[addr]]" \ + : [tmp] "=&r" (dst) \ + : [addr] "r"(src) \ + : "memory"); \ + } else { \ + asm volatile("ldaxr %x[tmp], [%x[addr]]" \ + : [tmp] "=&r" (dst) \ + : [addr] "r"(src) \ + : "memory"); \ + } } + + __LOAD_EXC_64(addr, value, memorder) + if (value != expected) { + __SEVL() + do { + __WFE() + __LOAD_EXC_64(addr, value, memorder) + } while (value != expected); + } +} +#undef __LOAD_EXC_64 + +#undef __SEVL +#undef __WFE + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_prefetch.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_prefetch.h new file mode 100644 index 000000000..27870c2a8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_prefetch.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_PREFETCH_ARM_H_ +#define _RTE_PREFETCH_ARM_H_ + +#ifdef RTE_ARCH_64 +#include <rte_prefetch_64.h> +#else +#include <rte_prefetch_32.h> +#endif + +#endif /* _RTE_PREFETCH_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_prefetch_32.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_prefetch_32.h new file mode 100644 index 000000000..e53420a0b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_prefetch_32.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_PREFETCH_ARM32_H_ +#define _RTE_PREFETCH_ARM32_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include "generic/rte_prefetch.h" + +static inline void rte_prefetch0(const volatile void *p) +{ + asm volatile ("pld [%0]" : : "r" (p)); +} + +static inline void rte_prefetch1(const volatile void *p) +{ + asm volatile ("pld [%0]" : : "r" (p)); +} + +static inline void rte_prefetch2(const volatile void *p) +{ + asm volatile ("pld [%0]" : : "r" (p)); +} + +static inline void rte_prefetch_non_temporal(const volatile void *p) +{ + /* non-temporal version not available, fallback to rte_prefetch0 */ + rte_prefetch0(p); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PREFETCH_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_prefetch_64.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_prefetch_64.h new file mode 100644 index 000000000..fc2b391aa --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_prefetch_64.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_PREFETCH_ARM_64_H_ +#define _RTE_PREFETCH_ARM_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include "generic/rte_prefetch.h" + +static inline void rte_prefetch0(const volatile void *p) +{ + asm volatile ("PRFM PLDL1KEEP, [%0]" : : "r" (p)); +} + +static inline void rte_prefetch1(const volatile void *p) +{ + asm volatile ("PRFM PLDL2KEEP, [%0]" : : "r" (p)); +} + +static inline void rte_prefetch2(const volatile void *p) +{ + asm volatile ("PRFM PLDL3KEEP, [%0]" : : "r" (p)); +} + +static inline void rte_prefetch_non_temporal(const volatile void *p) +{ + asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p)); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PREFETCH_ARM_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_rwlock.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_rwlock.h new file mode 100644 index 000000000..18bb37b03 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_rwlock.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause + */ +/* copied from ppc_64 */ + +#ifndef _RTE_RWLOCK_ARM_H_ +#define _RTE_RWLOCK_ARM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_rwlock.h" + +static inline void +rte_rwlock_read_lock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_read_lock(rwl); +} + +static inline void +rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_read_unlock(rwl); +} + +static inline void +rte_rwlock_write_lock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_write_lock(rwl); +} + +static inline void +rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_write_unlock(rwl); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RWLOCK_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_spinlock.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_spinlock.h new file mode 100644 index 000000000..1a6916b6e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_spinlock.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_SPINLOCK_ARM_H_ +#define _RTE_SPINLOCK_ARM_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include "generic/rte_spinlock.h" + +static inline int rte_tm_supported(void) +{ + return 0; +} + +static inline void +rte_spinlock_lock_tm(rte_spinlock_t *sl) +{ + rte_spinlock_lock(sl); /* fall-back */ +} + +static inline int +rte_spinlock_trylock_tm(rte_spinlock_t *sl) +{ + return rte_spinlock_trylock(sl); +} + +static inline void +rte_spinlock_unlock_tm(rte_spinlock_t *sl) +{ + rte_spinlock_unlock(sl); +} + +static inline void +rte_spinlock_recursive_lock_tm(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_recursive_lock(slr); /* fall-back */ +} + +static inline void +rte_spinlock_recursive_unlock_tm(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_recursive_unlock(slr); +} + +static inline int +rte_spinlock_recursive_trylock_tm(rte_spinlock_recursive_t *slr) +{ + return rte_spinlock_recursive_trylock(slr); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_SPINLOCK_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_ticketlock.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_ticketlock.h new file mode 100644 index 000000000..e09fbd6a6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_ticketlock.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_TICKETLOCK_ARM_H_ +#define _RTE_TICKETLOCK_ARM_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_ticketlock.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TICKETLOCK_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/arm/include/rte_vect.h b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_vect.h new file mode 100644 index 000000000..9287a1117 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/include/rte_vect.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_VECT_ARM_H_ +#define _RTE_VECT_ARM_H_ + +#include <stdint.h> +#include "generic/rte_vect.h" +#include "rte_debug.h" +#include "arm_neon.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int32x4_t xmm_t; + +#define XMM_SIZE (sizeof(xmm_t)) +#define XMM_MASK (XMM_SIZE - 1) + +typedef union rte_xmm { + xmm_t x; + uint8_t u8[XMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[XMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[XMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[XMM_SIZE / sizeof(uint64_t)]; + double pd[XMM_SIZE / sizeof(double)]; +} __rte_aligned(16) rte_xmm_t; + +#ifdef RTE_ARCH_ARM +/* NEON intrinsic vqtbl1q_u8() is not supported in ARMv7-A(AArch32) */ +static __inline uint8x16_t +vqtbl1q_u8(uint8x16_t a, uint8x16_t b) +{ + uint8_t i, pos; + rte_xmm_t rte_a, rte_b, rte_ret; + + vst1q_u8(rte_a.u8, a); + vst1q_u8(rte_b.u8, b); + + for (i = 0; i < 16; i++) { + pos = rte_b.u8[i]; + if (pos < 16) + rte_ret.u8[i] = rte_a.u8[pos]; + else + rte_ret.u8[i] = 0; + } + + return vld1q_u8(rte_ret.u8); +} + +static inline uint16_t +vaddvq_u16(uint16x8_t a) +{ + uint32x4_t m = vpaddlq_u16(a); + uint64x2_t n = vpaddlq_u32(m); + uint64x1_t o = vget_low_u64(n) + vget_high_u64(n); + + return vget_lane_u32((uint32x2_t)o, 0); +} + +#endif + +#if RTE_CC_IS_GNU && (GCC_VERSION < 70000) +static inline uint32x4_t +vcopyq_laneq_u32(uint32x4_t a, const int lane_a, + uint32x4_t b, const int lane_b) +{ + return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a); +} +#endif + +#if defined(RTE_ARCH_ARM64) +#if RTE_CC_IS_GNU && (GCC_VERSION < 70000) + +#if (GCC_VERSION < 40900) +typedef uint64_t poly64_t; +typedef uint64x2_t poly64x2_t; +typedef uint8_t poly128_t __attribute__((vector_size(16), aligned(16))); + +static inline uint32x4_t +vceqzq_u32(uint32x4_t a) +{ + return (a == 0); +} +#endif + +/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */ +static inline uint64x2_t +vreinterpretq_u64_p128(poly128_t x) +{ + return (uint64x2_t)x; +} + +/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */ +static inline poly64x2_t +vreinterpretq_p64_u64(uint64x2_t x) +{ + return (poly64x2_t)x; +} + +/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */ +static inline poly64_t +vgetq_lane_p64(poly64x2_t x, const int lane) +{ + RTE_ASSERT(lane >= 0 && lane <= 1); + + poly64_t *p = (poly64_t *)&x; + + return p[lane]; +} +#endif +#endif + +/* + * If (0 <= index <= 15), then call the ASIMD ext instruction on the + * 128 bit regs v0 and v1 with the appropriate index. + * + * Else returns a zero vector. + */ +static inline uint8x16_t +vextract(uint8x16_t v0, uint8x16_t v1, const int index) +{ + switch (index) { + case 0: return vextq_u8(v0, v1, 0); + case 1: return vextq_u8(v0, v1, 1); + case 2: return vextq_u8(v0, v1, 2); + case 3: return vextq_u8(v0, v1, 3); + case 4: return vextq_u8(v0, v1, 4); + case 5: return vextq_u8(v0, v1, 5); + case 6: return vextq_u8(v0, v1, 6); + case 7: return vextq_u8(v0, v1, 7); + case 8: return vextq_u8(v0, v1, 8); + case 9: return vextq_u8(v0, v1, 9); + case 10: return vextq_u8(v0, v1, 10); + case 11: return vextq_u8(v0, v1, 11); + case 12: return vextq_u8(v0, v1, 12); + case 13: return vextq_u8(v0, v1, 13); + case 14: return vextq_u8(v0, v1, 14); + case 15: return vextq_u8(v0, v1, 15); + } + return vdupq_n_u8(0); +} + +/** + * Shifts right 128 bit register by specified number of bytes + * + * Value of shift parameter must be in range 0 - 16 + */ +static inline uint64x2_t +vshift_bytes_right(uint64x2_t reg, const unsigned int shift) +{ + return vreinterpretq_u64_u8(vextract( + vreinterpretq_u8_u64(reg), + vdupq_n_u8(0), + shift)); +} + +/** + * Shifts left 128 bit register by specified number of bytes + * + * Value of shift parameter must be in range 0 - 16 + */ +static inline uint64x2_t +vshift_bytes_left(uint64x2_t reg, const unsigned int shift) +{ + return vreinterpretq_u64_u8(vextract( + vdupq_n_u8(0), + vreinterpretq_u8_u64(reg), + 16 - shift)); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/arm/meson.build b/src/spdk/dpdk/lib/librte_eal/arm/meson.build new file mode 100644 index 000000000..d62875eba --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/meson.build @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation. + +subdir('include') + +sources += files( + 'rte_cpuflags.c', + 'rte_cycles.c', + 'rte_hypervisor.c', +) diff --git a/src/spdk/dpdk/lib/librte_eal/arm/rte_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/arm/rte_cpuflags.c new file mode 100644 index 000000000..caf3dc83a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/rte_cpuflags.c @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) Cavium, Inc. 2015. + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#include "rte_cpuflags.h" + +#include <elf.h> +#include <fcntl.h> +#include <assert.h> +#include <unistd.h> +#include <string.h> + +#ifndef AT_HWCAP +#define AT_HWCAP 16 +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#ifndef AT_PLATFORM +#define AT_PLATFORM 15 +#endif + +enum cpu_register_t { + REG_NONE = 0, + REG_HWCAP, + REG_HWCAP2, + REG_PLATFORM, + REG_MAX +}; + +typedef uint32_t hwcap_registers_t[REG_MAX]; + +/** + * Struct to hold a processor feature entry + */ +struct feature_entry { + uint32_t reg; + uint32_t bit; +#define CPU_FLAG_NAME_MAX_LEN 64 + char name[CPU_FLAG_NAME_MAX_LEN]; +}; + +#define FEAT_DEF(name, reg, bit) \ + [RTE_CPUFLAG_##name] = {reg, bit, #name}, + +#ifdef RTE_ARCH_ARMv7 +#define PLATFORM_STR "v7l" +typedef Elf32_auxv_t _Elfx_auxv_t; + +const struct feature_entry rte_cpu_feature_table[] = { + FEAT_DEF(SWP, REG_HWCAP, 0) + FEAT_DEF(HALF, REG_HWCAP, 1) + FEAT_DEF(THUMB, REG_HWCAP, 2) + FEAT_DEF(A26BIT, REG_HWCAP, 3) + FEAT_DEF(FAST_MULT, REG_HWCAP, 4) + FEAT_DEF(FPA, REG_HWCAP, 5) + FEAT_DEF(VFP, REG_HWCAP, 6) + FEAT_DEF(EDSP, REG_HWCAP, 7) + FEAT_DEF(JAVA, REG_HWCAP, 8) + FEAT_DEF(IWMMXT, REG_HWCAP, 9) + FEAT_DEF(CRUNCH, REG_HWCAP, 10) + FEAT_DEF(THUMBEE, REG_HWCAP, 11) + FEAT_DEF(NEON, REG_HWCAP, 12) + FEAT_DEF(VFPv3, REG_HWCAP, 13) + FEAT_DEF(VFPv3D16, REG_HWCAP, 14) + FEAT_DEF(TLS, REG_HWCAP, 15) + FEAT_DEF(VFPv4, REG_HWCAP, 16) + FEAT_DEF(IDIVA, REG_HWCAP, 17) + FEAT_DEF(IDIVT, REG_HWCAP, 18) + FEAT_DEF(VFPD32, REG_HWCAP, 19) + FEAT_DEF(LPAE, REG_HWCAP, 20) + FEAT_DEF(EVTSTRM, REG_HWCAP, 21) + FEAT_DEF(AES, REG_HWCAP2, 0) + FEAT_DEF(PMULL, REG_HWCAP2, 1) + FEAT_DEF(SHA1, REG_HWCAP2, 2) + FEAT_DEF(SHA2, REG_HWCAP2, 3) + FEAT_DEF(CRC32, REG_HWCAP2, 4) + FEAT_DEF(V7L, REG_PLATFORM, 0) +}; + +#elif defined RTE_ARCH_ARM64 +#define PLATFORM_STR "aarch64" +typedef Elf64_auxv_t _Elfx_auxv_t; + +const struct feature_entry rte_cpu_feature_table[] = { + FEAT_DEF(FP, REG_HWCAP, 0) + FEAT_DEF(NEON, REG_HWCAP, 1) + FEAT_DEF(EVTSTRM, REG_HWCAP, 2) + FEAT_DEF(AES, REG_HWCAP, 3) + FEAT_DEF(PMULL, REG_HWCAP, 4) + FEAT_DEF(SHA1, REG_HWCAP, 5) + FEAT_DEF(SHA2, REG_HWCAP, 6) + FEAT_DEF(CRC32, REG_HWCAP, 7) + FEAT_DEF(ATOMICS, REG_HWCAP, 8) + FEAT_DEF(AARCH64, REG_PLATFORM, 1) +}; +#endif /* RTE_ARCH */ + +/* + * Read AUXV software register and get cpu features for ARM + */ +static void +rte_cpu_get_features(hwcap_registers_t out) +{ + out[REG_HWCAP] = rte_cpu_getauxval(AT_HWCAP); + out[REG_HWCAP2] = rte_cpu_getauxval(AT_HWCAP2); + if (!rte_cpu_strcmp_auxval(AT_PLATFORM, PLATFORM_STR)) + out[REG_PLATFORM] = 0x0001; +} + +/* + * Checks if a particular flag is available on current machine. + */ +int +rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature) +{ + const struct feature_entry *feat; + hwcap_registers_t regs = {0}; + + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return -ENOENT; + + feat = &rte_cpu_feature_table[feature]; + if (feat->reg == REG_NONE) + return -EFAULT; + + rte_cpu_get_features(regs); + return (regs[feat->reg] >> feat->bit) & 1; +} + +const char * +rte_cpu_get_flag_name(enum rte_cpu_flag_t feature) +{ + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return NULL; + return rte_cpu_feature_table[feature].name; +} diff --git a/src/spdk/dpdk/lib/librte_eal/arm/rte_cycles.c b/src/spdk/dpdk/lib/librte_eal/arm/rte_cycles.c new file mode 100644 index 000000000..3500d523e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/rte_cycles.c @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#include "eal_private.h" + +uint64_t +get_tsc_freq_arch(void) +{ +#if defined RTE_ARCH_ARM64 && !defined RTE_ARM_EAL_RDTSC_USE_PMU + uint64_t freq; + asm volatile("mrs %0, cntfrq_el0" : "=r" (freq)); + return freq; +#else + return 0; +#endif +} diff --git a/src/spdk/dpdk/lib/librte_eal/arm/rte_hypervisor.c b/src/spdk/dpdk/lib/librte_eal/arm/rte_hypervisor.c new file mode 100644 index 000000000..08a1c97d1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/arm/rte_hypervisor.c @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include "rte_hypervisor.h" + +enum rte_hypervisor +rte_hypervisor_get(void) +{ + return RTE_HYPERVISOR_UNKNOWN; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c new file mode 100644 index 000000000..baa5b532a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 NXP + */ + +#include <stdio.h> +#include <string.h> +#include <sys/queue.h> + +#include <rte_bus.h> +#include <rte_debug.h> +#include <rte_string_fns.h> +#include <rte_errno.h> + +#include "eal_private.h" + +static struct rte_bus_list rte_bus_list = + TAILQ_HEAD_INITIALIZER(rte_bus_list); + +void +rte_bus_register(struct rte_bus *bus) +{ + RTE_VERIFY(bus); + RTE_VERIFY(bus->name && strlen(bus->name)); + /* A bus should mandatorily have the scan implemented */ + RTE_VERIFY(bus->scan); + RTE_VERIFY(bus->probe); + RTE_VERIFY(bus->find_device); + /* Buses supporting driver plug also require unplug. */ + RTE_VERIFY(!bus->plug || bus->unplug); + + TAILQ_INSERT_TAIL(&rte_bus_list, bus, next); + RTE_LOG(DEBUG, EAL, "Registered [%s] bus.\n", bus->name); +} + +void +rte_bus_unregister(struct rte_bus *bus) +{ + TAILQ_REMOVE(&rte_bus_list, bus, next); + RTE_LOG(DEBUG, EAL, "Unregistered [%s] bus.\n", bus->name); +} + +/* Scan all the buses for registered devices */ +int +rte_bus_scan(void) +{ + int ret; + struct rte_bus *bus = NULL; + + TAILQ_FOREACH(bus, &rte_bus_list, next) { + ret = bus->scan(); + if (ret) + RTE_LOG(ERR, EAL, "Scan for (%s) bus failed.\n", + bus->name); + } + + return 0; +} + +/* Probe all devices of all buses */ +int +rte_bus_probe(void) +{ + int ret; + struct rte_bus *bus, *vbus = NULL; + + TAILQ_FOREACH(bus, &rte_bus_list, next) { + if (!strcmp(bus->name, "vdev")) { + vbus = bus; + continue; + } + + ret = bus->probe(); + if (ret) + RTE_LOG(ERR, EAL, "Bus (%s) probe failed.\n", + bus->name); + } + + if (vbus) { + ret = vbus->probe(); + if (ret) + RTE_LOG(ERR, EAL, "Bus (%s) probe failed.\n", + vbus->name); + } + + return 0; +} + +/* Dump information of a single bus */ +static int +bus_dump_one(FILE *f, struct rte_bus *bus) +{ + int ret; + + /* For now, dump only the bus name */ + ret = fprintf(f, " %s\n", bus->name); + + /* Error in case of inability in writing to stream */ + if (ret < 0) + return ret; + + return 0; +} + +void +rte_bus_dump(FILE *f) +{ + int ret; + struct rte_bus *bus; + + TAILQ_FOREACH(bus, &rte_bus_list, next) { + ret = bus_dump_one(f, bus); + if (ret) { + RTE_LOG(ERR, EAL, "Unable to write to stream (%d)\n", + ret); + break; + } + } +} + +struct rte_bus * +rte_bus_find(const struct rte_bus *start, rte_bus_cmp_t cmp, + const void *data) +{ + struct rte_bus *bus; + + if (start != NULL) + bus = TAILQ_NEXT(start, next); + else + bus = TAILQ_FIRST(&rte_bus_list); + while (bus != NULL) { + if (cmp(bus, data) == 0) + break; + bus = TAILQ_NEXT(bus, next); + } + return bus; +} + +static int +cmp_rte_device(const struct rte_device *dev1, const void *_dev2) +{ + const struct rte_device *dev2 = _dev2; + + return dev1 != dev2; +} + +static int +bus_find_device(const struct rte_bus *bus, const void *_dev) +{ + struct rte_device *dev; + + dev = bus->find_device(NULL, cmp_rte_device, _dev); + return dev == NULL; +} + +struct rte_bus * +rte_bus_find_by_device(const struct rte_device *dev) +{ + return rte_bus_find(NULL, bus_find_device, (const void *)dev); +} + +static int +cmp_bus_name(const struct rte_bus *bus, const void *_name) +{ + const char *name = _name; + + return strcmp(bus->name, name); +} + +struct rte_bus * +rte_bus_find_by_name(const char *busname) +{ + return rte_bus_find(NULL, cmp_bus_name, (const void *)busname); +} + +static int +bus_can_parse(const struct rte_bus *bus, const void *_name) +{ + const char *name = _name; + + return !(bus->parse && bus->parse(name, NULL) == 0); +} + +struct rte_bus * +rte_bus_find_by_device_name(const char *str) +{ + char name[RTE_DEV_NAME_MAX_LEN]; + char *c; + + strlcpy(name, str, sizeof(name)); + c = strchr(name, ','); + if (c != NULL) + c[0] = '\0'; + return rte_bus_find(NULL, bus_can_parse, name); +} + + +/* + * Get iommu class of devices on the bus. + */ +enum rte_iova_mode +rte_bus_get_iommu_class(void) +{ + enum rte_iova_mode mode = RTE_IOVA_DC; + bool buses_want_va = false; + bool buses_want_pa = false; + struct rte_bus *bus; + + TAILQ_FOREACH(bus, &rte_bus_list, next) { + enum rte_iova_mode bus_iova_mode; + + if (bus->get_iommu_class == NULL) + continue; + + bus_iova_mode = bus->get_iommu_class(); + RTE_LOG(DEBUG, EAL, "Bus %s wants IOVA as '%s'\n", + bus->name, + bus_iova_mode == RTE_IOVA_DC ? "DC" : + (bus_iova_mode == RTE_IOVA_PA ? "PA" : "VA")); + if (bus_iova_mode == RTE_IOVA_PA) + buses_want_pa = true; + else if (bus_iova_mode == RTE_IOVA_VA) + buses_want_va = true; + } + if (buses_want_va && !buses_want_pa) { + mode = RTE_IOVA_VA; + } else if (buses_want_pa && !buses_want_va) { + mode = RTE_IOVA_PA; + } else { + mode = RTE_IOVA_DC; + if (buses_want_va) { + RTE_LOG(WARNING, EAL, "Some buses want 'VA' but forcing 'DC' because other buses want 'PA'.\n"); + RTE_LOG(WARNING, EAL, "Depending on the final decision by the EAL, not all buses may be able to initialize.\n"); + } + } + + return mode; +} + +static int +bus_handle_sigbus(const struct rte_bus *bus, + const void *failure_addr) +{ + int ret; + + if (!bus->sigbus_handler) + return -1; + + ret = bus->sigbus_handler(failure_addr); + + /* find bus but handle failed, keep the errno be set. */ + if (ret < 0 && rte_errno == 0) + rte_errno = ENOTSUP; + + return ret > 0; +} + +int +rte_bus_sigbus_handler(const void *failure_addr) +{ + struct rte_bus *bus; + + int ret = 0; + int old_errno = rte_errno; + + rte_errno = 0; + + bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr); + /* can not find bus. */ + if (!bus) + return 1; + /* find bus but handle failed, pass on the new errno. */ + else if (rte_errno != 0) + return -1; + + /* restore the old errno. */ + rte_errno = old_errno; + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c new file mode 100644 index 000000000..0187076af --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Gaëtan Rivet + */ + +#include <stdio.h> +#include <string.h> +#include <sys/queue.h> + +#include <rte_class.h> +#include <rte_debug.h> + +static struct rte_class_list rte_class_list = + TAILQ_HEAD_INITIALIZER(rte_class_list); + +void +rte_class_register(struct rte_class *class) +{ + RTE_VERIFY(class); + RTE_VERIFY(class->name && strlen(class->name)); + + TAILQ_INSERT_TAIL(&rte_class_list, class, next); + RTE_LOG(DEBUG, EAL, "Registered [%s] device class.\n", class->name); +} + +void +rte_class_unregister(struct rte_class *class) +{ + TAILQ_REMOVE(&rte_class_list, class, next); + RTE_LOG(DEBUG, EAL, "Unregistered [%s] device class.\n", class->name); +} + +struct rte_class * +rte_class_find(const struct rte_class *start, rte_class_cmp_t cmp, + const void *data) +{ + struct rte_class *cls; + + if (start != NULL) + cls = TAILQ_NEXT(start, next); + else + cls = TAILQ_FIRST(&rte_class_list); + while (cls != NULL) { + if (cmp(cls, data) == 0) + break; + cls = TAILQ_NEXT(cls, next); + } + return cls; +} + +static int +cmp_class_name(const struct rte_class *class, const void *_name) +{ + const char *name = _name; + + return strcmp(class->name, name); +} + +struct rte_class * +rte_class_find_by_name(const char *name) +{ + return rte_class_find(NULL, cmp_class_name, (const void *)name); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c new file mode 100644 index 000000000..dc5f75d05 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdio.h> + +#include <rte_common.h> +#include <rte_cpuflags.h> + +int +rte_cpu_is_supported(void) +{ + /* This is generated at compile-time by the build system */ + static const enum rte_cpu_flag_t compile_time_flags[] = { + RTE_COMPILE_TIME_CPUFLAGS + }; + unsigned count = RTE_DIM(compile_time_flags), i; + int ret; + + for (i = 0; i < count; i++) { + ret = rte_cpu_get_flag_enabled(compile_time_flags[i]); + + if (ret < 0) { + fprintf(stderr, + "ERROR: CPU feature flag lookup failed with error %d\n", + ret); + return 0; + } + if (!ret) { + fprintf(stderr, + "ERROR: This system does not support \"%s\".\n" + "Please check that RTE_MACHINE is set correctly.\n", + rte_cpu_get_flag_name(compile_time_flags[i])); + return 0; + } + } + + return 1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c new file mode 100644 index 000000000..9e4f09d83 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c @@ -0,0 +1,793 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + */ + +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <sys/queue.h> + +#include <rte_compat.h> +#include <rte_bus.h> +#include <rte_class.h> +#include <rte_dev.h> +#include <rte_devargs.h> +#include <rte_debug.h> +#include <rte_errno.h> +#include <rte_kvargs.h> +#include <rte_log.h> +#include <rte_spinlock.h> +#include <rte_malloc.h> +#include <rte_string_fns.h> + +#include "eal_private.h" +#include "hotplug_mp.h" + +/** + * The device event callback description. + * + * It contains callback address to be registered by user application, + * the pointer to the parameters for callback, and the device name. + */ +struct dev_event_callback { + TAILQ_ENTRY(dev_event_callback) next; /**< Callbacks list */ + rte_dev_event_cb_fn cb_fn; /**< Callback address */ + void *cb_arg; /**< Callback parameter */ + char *dev_name; /**< Callback device name, NULL is for all device */ + uint32_t active; /**< Callback is executing */ +}; + +/** @internal Structure to keep track of registered callbacks */ +TAILQ_HEAD(dev_event_cb_list, dev_event_callback); + +/* The device event callback list for all registered callbacks. */ +static struct dev_event_cb_list dev_event_cbs; + +/* spinlock for device callbacks */ +static rte_spinlock_t dev_event_lock = RTE_SPINLOCK_INITIALIZER; + +struct dev_next_ctx { + struct rte_dev_iterator *it; + const char *bus_str; + const char *cls_str; +}; + +#define CTX(it, bus_str, cls_str) \ + (&(const struct dev_next_ctx){ \ + .it = it, \ + .bus_str = bus_str, \ + .cls_str = cls_str, \ + }) + +#define ITCTX(ptr) \ + (((struct dev_next_ctx *)(intptr_t)ptr)->it) + +#define BUSCTX(ptr) \ + (((struct dev_next_ctx *)(intptr_t)ptr)->bus_str) + +#define CLSCTX(ptr) \ + (((struct dev_next_ctx *)(intptr_t)ptr)->cls_str) + +static int cmp_dev_name(const struct rte_device *dev, const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + +int +rte_dev_is_probed(const struct rte_device *dev) +{ + /* The field driver should be set only when the probe is successful. */ + return dev->driver != NULL; +} + +/* helper function to build devargs, caller should free the memory */ +static int +build_devargs(const char *busname, const char *devname, + const char *drvargs, char **devargs) +{ + int length; + + length = snprintf(NULL, 0, "%s:%s,%s", busname, devname, drvargs); + if (length < 0) + return -EINVAL; + + *devargs = malloc(length + 1); + if (*devargs == NULL) + return -ENOMEM; + + length = snprintf(*devargs, length + 1, "%s:%s,%s", + busname, devname, drvargs); + if (length < 0) { + free(*devargs); + return -EINVAL; + } + + return 0; +} + +int +rte_eal_hotplug_add(const char *busname, const char *devname, + const char *drvargs) +{ + + char *devargs; + int ret; + + ret = build_devargs(busname, devname, drvargs, &devargs); + if (ret != 0) + return ret; + + ret = rte_dev_probe(devargs); + free(devargs); + + return ret; +} + +/* probe device at local process. */ +int +local_dev_probe(const char *devargs, struct rte_device **new_dev) +{ + struct rte_device *dev; + struct rte_devargs *da; + int ret; + + *new_dev = NULL; + da = calloc(1, sizeof(*da)); + if (da == NULL) + return -ENOMEM; + + ret = rte_devargs_parse(da, devargs); + if (ret) + goto err_devarg; + + if (da->bus->plug == NULL) { + RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n", + da->bus->name); + ret = -ENOTSUP; + goto err_devarg; + } + + ret = rte_devargs_insert(&da); + if (ret) + goto err_devarg; + + /* the rte_devargs will be referenced in the matching rte_device */ + ret = da->bus->scan(); + if (ret) + goto err_devarg; + + dev = da->bus->find_device(NULL, cmp_dev_name, da->name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find device (%s)\n", + da->name); + ret = -ENODEV; + goto err_devarg; + } + /* Since there is a matching device, it is now its responsibility + * to manage the devargs we've just inserted. From this point + * those devargs shouldn't be removed manually anymore. + */ + + ret = dev->bus->plug(dev); + if (ret > 0) + ret = -ENOTSUP; + + if (ret && !rte_dev_is_probed(dev)) { /* if hasn't ever succeeded */ + RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n", + dev->name); + return ret; + } + + *new_dev = dev; + return ret; + +err_devarg: + if (rte_devargs_remove(da) != 0) { + free(da->args); + free(da); + } + return ret; +} + +int +rte_dev_probe(const char *devargs) +{ + struct eal_dev_mp_req req; + struct rte_device *dev; + int ret; + + memset(&req, 0, sizeof(req)); + req.t = EAL_DEV_REQ_TYPE_ATTACH; + strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN); + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + /** + * If in secondary process, just send IPC request to + * primary process. + */ + ret = eal_dev_hotplug_request_to_primary(&req); + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug request to primary\n"); + return -ENOMSG; + } + if (req.result != 0) + RTE_LOG(ERR, EAL, + "Failed to hotplug add device\n"); + return req.result; + } + + /* attach a shared device from primary start from here: */ + + /* primary attach the new device itself. */ + ret = local_dev_probe(devargs, &dev); + + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to attach device on primary process\n"); + + /** + * it is possible that secondary process failed to attached a + * device that primary process have during initialization, + * so for -EEXIST case, we still need to sync with secondary + * process. + */ + if (ret != -EEXIST) + return ret; + } + + /* primary send attach sync request to secondary. */ + ret = eal_dev_hotplug_request_to_secondary(&req); + + /* if any communication error, we need to rollback. */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug add request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + /** + * if any secondary failed to attach, we need to consider if rollback + * is necessary. + */ + if (req.result != 0) { + RTE_LOG(ERR, EAL, + "Failed to attach device on secondary process\n"); + ret = req.result; + + /* for -EEXIST, we don't need to rollback. */ + if (ret == -EEXIST) + return ret; + goto rollback; + } + + return 0; + +rollback: + req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK; + + /* primary send rollback request to secondary. */ + if (eal_dev_hotplug_request_to_secondary(&req) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device attach on secondary." + "Devices in secondary may not sync with primary\n"); + + /* primary rollback itself. */ + if (local_dev_remove(dev) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device attach on primary." + "Devices in secondary may not sync with primary\n"); + + return ret; +} + +int +rte_eal_hotplug_remove(const char *busname, const char *devname) +{ + struct rte_device *dev; + struct rte_bus *bus; + + bus = rte_bus_find_by_name(busname); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname); + return -ENOENT; + } + + dev = bus->find_device(NULL, cmp_dev_name, devname); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", devname); + return -EINVAL; + } + + return rte_dev_remove(dev); +} + +/* remove device at local process. */ +int +local_dev_remove(struct rte_device *dev) +{ + int ret; + + if (dev->bus->unplug == NULL) { + RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n", + dev->bus->name); + return -ENOTSUP; + } + + ret = dev->bus->unplug(dev); + if (ret) { + RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", + dev->name); + return (ret < 0) ? ret : -ENOENT; + } + + return 0; +} + +int +rte_dev_remove(struct rte_device *dev) +{ + struct eal_dev_mp_req req; + char *devargs; + int ret; + + if (!rte_dev_is_probed(dev)) { + RTE_LOG(ERR, EAL, "Device is not probed\n"); + return -ENOENT; + } + + ret = build_devargs(dev->bus->name, dev->name, "", &devargs); + if (ret != 0) + return ret; + + memset(&req, 0, sizeof(req)); + req.t = EAL_DEV_REQ_TYPE_DETACH; + strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN); + free(devargs); + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + /** + * If in secondary process, just send IPC request to + * primary process. + */ + ret = eal_dev_hotplug_request_to_primary(&req); + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug request to primary\n"); + return -ENOMSG; + } + if (req.result != 0) + RTE_LOG(ERR, EAL, + "Failed to hotplug remove device\n"); + return req.result; + } + + /* detach a device from primary start from here: */ + + /* primary send detach sync request to secondary */ + ret = eal_dev_hotplug_request_to_secondary(&req); + + /** + * if communication error, we need to rollback, because it is possible + * part of the secondary processes still detached it successfully. + */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send device detach request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + /** + * if any secondary failed to detach, we need to consider if rollback + * is necessary. + */ + if (req.result != 0) { + RTE_LOG(ERR, EAL, + "Failed to detach device on secondary process\n"); + ret = req.result; + /** + * if -ENOENT, we don't need to rollback, since devices is + * already detached on secondary process. + */ + if (ret != -ENOENT) + goto rollback; + } + + /* primary detach the device itself. */ + ret = local_dev_remove(dev); + + /* if primary failed, still need to consider if rollback is necessary */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to detach device on primary process\n"); + /* if -ENOENT, we don't need to rollback */ + if (ret == -ENOENT) + return ret; + goto rollback; + } + + return 0; + +rollback: + req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK; + + /* primary send rollback request to secondary. */ + if (eal_dev_hotplug_request_to_secondary(&req) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device detach on secondary." + "Devices in secondary may not sync with primary\n"); + + return ret; +} + +int +rte_dev_event_callback_register(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg) +{ + struct dev_event_callback *event_cb; + int ret; + + if (!cb_fn) + return -EINVAL; + + rte_spinlock_lock(&dev_event_lock); + + if (TAILQ_EMPTY(&dev_event_cbs)) + TAILQ_INIT(&dev_event_cbs); + + TAILQ_FOREACH(event_cb, &dev_event_cbs, next) { + if (event_cb->cb_fn == cb_fn && event_cb->cb_arg == cb_arg) { + if (device_name == NULL && event_cb->dev_name == NULL) + break; + if (device_name == NULL || event_cb->dev_name == NULL) + continue; + if (!strcmp(event_cb->dev_name, device_name)) + break; + } + } + + /* create a new callback. */ + if (event_cb == NULL) { + event_cb = malloc(sizeof(struct dev_event_callback)); + if (event_cb != NULL) { + event_cb->cb_fn = cb_fn; + event_cb->cb_arg = cb_arg; + event_cb->active = 0; + if (!device_name) { + event_cb->dev_name = NULL; + } else { + event_cb->dev_name = strdup(device_name); + if (event_cb->dev_name == NULL) { + ret = -ENOMEM; + goto error; + } + } + TAILQ_INSERT_TAIL(&dev_event_cbs, event_cb, next); + } else { + RTE_LOG(ERR, EAL, + "Failed to allocate memory for device " + "event callback."); + ret = -ENOMEM; + goto error; + } + } else { + RTE_LOG(ERR, EAL, + "The callback is already exist, no need " + "to register again.\n"); + ret = -EEXIST; + } + + rte_spinlock_unlock(&dev_event_lock); + return 0; +error: + free(event_cb); + rte_spinlock_unlock(&dev_event_lock); + return ret; +} + +int +rte_dev_event_callback_unregister(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg) +{ + int ret = 0; + struct dev_event_callback *event_cb, *next; + + if (!cb_fn) + return -EINVAL; + + rte_spinlock_lock(&dev_event_lock); + /*walk through the callbacks and remove all that match. */ + for (event_cb = TAILQ_FIRST(&dev_event_cbs); event_cb != NULL; + event_cb = next) { + + next = TAILQ_NEXT(event_cb, next); + + if (device_name != NULL && event_cb->dev_name != NULL) { + if (!strcmp(event_cb->dev_name, device_name)) { + if (event_cb->cb_fn != cb_fn || + (cb_arg != (void *)-1 && + event_cb->cb_arg != cb_arg)) + continue; + } + } else if (device_name != NULL) { + continue; + } + + /* + * if this callback is not executing right now, + * then remove it. + */ + if (event_cb->active == 0) { + TAILQ_REMOVE(&dev_event_cbs, event_cb, next); + free(event_cb); + ret++; + } else { + continue; + } + } + rte_spinlock_unlock(&dev_event_lock); + return ret; +} + +void +rte_dev_event_callback_process(const char *device_name, + enum rte_dev_event_type event) +{ + struct dev_event_callback *cb_lst; + + if (device_name == NULL) + return; + + rte_spinlock_lock(&dev_event_lock); + + TAILQ_FOREACH(cb_lst, &dev_event_cbs, next) { + if (cb_lst->dev_name) { + if (strcmp(cb_lst->dev_name, device_name)) + continue; + } + cb_lst->active = 1; + rte_spinlock_unlock(&dev_event_lock); + cb_lst->cb_fn(device_name, event, + cb_lst->cb_arg); + rte_spinlock_lock(&dev_event_lock); + cb_lst->active = 0; + } + rte_spinlock_unlock(&dev_event_lock); +} + +int +rte_dev_iterator_init(struct rte_dev_iterator *it, + const char *dev_str) +{ + struct rte_devargs devargs; + struct rte_class *cls = NULL; + struct rte_bus *bus = NULL; + + /* Having both bus_str and cls_str NULL is illegal, + * marking this iterator as invalid unless + * everything goes well. + */ + it->bus_str = NULL; + it->cls_str = NULL; + + devargs.data = dev_str; + if (rte_devargs_layers_parse(&devargs, dev_str)) + goto get_out; + + bus = devargs.bus; + cls = devargs.cls; + /* The string should have at least + * one layer specified. + */ + if (bus == NULL && cls == NULL) { + RTE_LOG(ERR, EAL, + "Either bus or class must be specified.\n"); + rte_errno = EINVAL; + goto get_out; + } + if (bus != NULL && bus->dev_iterate == NULL) { + RTE_LOG(ERR, EAL, "Bus %s not supported\n", bus->name); + rte_errno = ENOTSUP; + goto get_out; + } + if (cls != NULL && cls->dev_iterate == NULL) { + RTE_LOG(ERR, EAL, "Class %s not supported\n", cls->name); + rte_errno = ENOTSUP; + goto get_out; + } + it->bus_str = devargs.bus_str; + it->cls_str = devargs.cls_str; + it->dev_str = dev_str; + it->bus = bus; + it->cls = cls; + it->device = NULL; + it->class_device = NULL; +get_out: + return -rte_errno; +} + +static char * +dev_str_sane_copy(const char *str) +{ + size_t end; + char *copy; + + end = strcspn(str, ",/"); + if (str[end] == ',') { + copy = strdup(&str[end + 1]); + } else { + /* '/' or '\0' */ + copy = strdup(""); + } + if (copy == NULL) { + rte_errno = ENOMEM; + } else { + char *slash; + + slash = strchr(copy, '/'); + if (slash != NULL) + slash[0] = '\0'; + } + return copy; +} + +static int +class_next_dev_cmp(const struct rte_class *cls, + const void *ctx) +{ + struct rte_dev_iterator *it; + const char *cls_str = NULL; + void *dev; + + if (cls->dev_iterate == NULL) + return 1; + it = ITCTX(ctx); + cls_str = CLSCTX(ctx); + dev = it->class_device; + /* it->cls_str != NULL means a class + * was specified in the devstr. + */ + if (it->cls_str != NULL && cls != it->cls) + return 1; + /* If an error occurred previously, + * no need to test further. + */ + if (rte_errno != 0) + return -1; + dev = cls->dev_iterate(dev, cls_str, it); + it->class_device = dev; + return dev == NULL; +} + +static int +bus_next_dev_cmp(const struct rte_bus *bus, + const void *ctx) +{ + struct rte_device *dev = NULL; + struct rte_class *cls = NULL; + struct rte_dev_iterator *it; + const char *bus_str = NULL; + + if (bus->dev_iterate == NULL) + return 1; + it = ITCTX(ctx); + bus_str = BUSCTX(ctx); + dev = it->device; + /* it->bus_str != NULL means a bus + * was specified in the devstr. + */ + if (it->bus_str != NULL && bus != it->bus) + return 1; + /* If an error occurred previously, + * no need to test further. + */ + if (rte_errno != 0) + return -1; + if (it->cls_str == NULL) { + dev = bus->dev_iterate(dev, bus_str, it); + goto end; + } + /* cls_str != NULL */ + if (dev == NULL) { +next_dev_on_bus: + dev = bus->dev_iterate(dev, bus_str, it); + it->device = dev; + } + if (dev == NULL) + return 1; + if (it->cls != NULL) + cls = TAILQ_PREV(it->cls, rte_class_list, next); + cls = rte_class_find(cls, class_next_dev_cmp, ctx); + if (cls != NULL) { + it->cls = cls; + goto end; + } + goto next_dev_on_bus; +end: + it->device = dev; + return dev == NULL; +} +struct rte_device * +rte_dev_iterator_next(struct rte_dev_iterator *it) +{ + struct rte_bus *bus = NULL; + int old_errno = rte_errno; + char *bus_str = NULL; + char *cls_str = NULL; + + rte_errno = 0; + if (it->bus_str == NULL && it->cls_str == NULL) { + /* Invalid iterator. */ + rte_errno = EINVAL; + return NULL; + } + if (it->bus != NULL) + bus = TAILQ_PREV(it->bus, rte_bus_list, next); + if (it->bus_str != NULL) { + bus_str = dev_str_sane_copy(it->bus_str); + if (bus_str == NULL) + goto out; + } + if (it->cls_str != NULL) { + cls_str = dev_str_sane_copy(it->cls_str); + if (cls_str == NULL) + goto out; + } + while ((bus = rte_bus_find(bus, bus_next_dev_cmp, + CTX(it, bus_str, cls_str)))) { + if (it->device != NULL) { + it->bus = bus; + goto out; + } + if (it->bus_str != NULL || + rte_errno != 0) + break; + } + if (rte_errno == 0) + rte_errno = old_errno; +out: + free(bus_str); + free(cls_str); + return it->device; +} + +int +rte_dev_dma_map(struct rte_device *dev, void *addr, uint64_t iova, + size_t len) +{ + if (dev->bus->dma_map == NULL || len == 0) { + rte_errno = ENOTSUP; + return -1; + } + /* Memory must be registered through rte_extmem_* APIs */ + if (rte_mem_virt2memseg_list(addr) == NULL) { + rte_errno = EINVAL; + return -1; + } + + return dev->bus->dma_map(dev, addr, iova, len); +} + +int +rte_dev_dma_unmap(struct rte_device *dev, void *addr, uint64_t iova, + size_t len) +{ + if (dev->bus->dma_unmap == NULL || len == 0) { + rte_errno = ENOTSUP; + return -1; + } + /* Memory must be registered through rte_extmem_* APIs */ + if (rte_mem_virt2memseg_list(addr) == NULL) { + rte_errno = EINVAL; + return -1; + } + + return dev->bus->dma_unmap(dev, addr, iova, len); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c new file mode 100644 index 000000000..2123773ef --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c @@ -0,0 +1,403 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2014 6WIND S.A. + */ + +/* This file manages the list of devices and their arguments, as given + * by the user at startup + */ + +#include <stdio.h> +#include <string.h> +#include <stdarg.h> + +#include <rte_bus.h> +#include <rte_class.h> +#include <rte_compat.h> +#include <rte_dev.h> +#include <rte_devargs.h> +#include <rte_errno.h> +#include <rte_kvargs.h> +#include <rte_log.h> +#include <rte_tailq.h> +#include "eal_private.h" + +/** user device double-linked queue type definition */ +TAILQ_HEAD(rte_devargs_list, rte_devargs); + +/** Global list of user devices */ +static struct rte_devargs_list devargs_list = + TAILQ_HEAD_INITIALIZER(devargs_list); + +static size_t +devargs_layer_count(const char *s) +{ + size_t i = s ? 1 : 0; + + while (s != NULL && s[0] != '\0') { + i += s[0] == '/'; + s++; + } + return i; +} + +int +rte_devargs_layers_parse(struct rte_devargs *devargs, + const char *devstr) +{ + struct { + const char *key; + const char *str; + struct rte_kvargs *kvlist; + } layers[] = { + { "bus=", NULL, NULL, }, + { "class=", NULL, NULL, }, + { "driver=", NULL, NULL, }, + }; + struct rte_kvargs_pair *kv = NULL; + struct rte_class *cls = NULL; + struct rte_bus *bus = NULL; + const char *s = devstr; + size_t nblayer; + size_t i = 0; + int ret = 0; + + /* Split each sub-lists. */ + nblayer = devargs_layer_count(devstr); + if (nblayer > RTE_DIM(layers)) { + RTE_LOG(ERR, EAL, "Invalid format: too many layers (%zu)\n", + nblayer); + ret = -E2BIG; + goto get_out; + } + + /* If the devargs points the devstr + * as source data, then it should not allocate + * anything and keep referring only to it. + */ + if (devargs->data != devstr) { + devargs->data = strdup(devstr); + if (devargs->data == NULL) { + RTE_LOG(ERR, EAL, "OOM\n"); + ret = -ENOMEM; + goto get_out; + } + s = devargs->data; + } + + while (s != NULL) { + if (i >= RTE_DIM(layers)) { + RTE_LOG(ERR, EAL, "Unrecognized layer %s\n", s); + ret = -EINVAL; + goto get_out; + } + /* + * The last layer is free-form. + * The "driver" key is not required (but accepted). + */ + if (strncmp(layers[i].key, s, strlen(layers[i].key)) && + i != RTE_DIM(layers) - 1) + goto next_layer; + layers[i].str = s; + layers[i].kvlist = rte_kvargs_parse_delim(s, NULL, "/"); + if (layers[i].kvlist == NULL) { + RTE_LOG(ERR, EAL, "Could not parse %s\n", s); + ret = -EINVAL; + goto get_out; + } + s = strchr(s, '/'); + if (s != NULL) + s++; +next_layer: + i++; + } + + /* Parse each sub-list. */ + for (i = 0; i < RTE_DIM(layers); i++) { + if (layers[i].kvlist == NULL) + continue; + kv = &layers[i].kvlist->pairs[0]; + if (strcmp(kv->key, "bus") == 0) { + bus = rte_bus_find_by_name(kv->value); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Could not find bus \"%s\"\n", + kv->value); + ret = -EFAULT; + goto get_out; + } + } else if (strcmp(kv->key, "class") == 0) { + cls = rte_class_find_by_name(kv->value); + if (cls == NULL) { + RTE_LOG(ERR, EAL, "Could not find class \"%s\"\n", + kv->value); + ret = -EFAULT; + goto get_out; + } + } else if (strcmp(kv->key, "driver") == 0) { + /* Ignore */ + continue; + } + } + + /* Fill devargs fields. */ + devargs->bus_str = layers[0].str; + devargs->cls_str = layers[1].str; + devargs->drv_str = layers[2].str; + devargs->bus = bus; + devargs->cls = cls; + + /* If we own the data, clean up a bit + * the several layers string, to ease + * their parsing afterward. + */ + if (devargs->data != devstr) { + char *s = (void *)(intptr_t)(devargs->data); + + while ((s = strchr(s, '/'))) { + *s = '\0'; + s++; + } + } + +get_out: + for (i = 0; i < RTE_DIM(layers); i++) { + if (layers[i].kvlist) + rte_kvargs_free(layers[i].kvlist); + } + if (ret != 0) + rte_errno = -ret; + return ret; +} + +static int +bus_name_cmp(const struct rte_bus *bus, const void *name) +{ + return strncmp(bus->name, name, strlen(bus->name)); +} + +int +rte_devargs_parse(struct rte_devargs *da, const char *dev) +{ + struct rte_bus *bus = NULL; + const char *devname; + const size_t maxlen = sizeof(da->name); + size_t i; + + if (da == NULL) + return -EINVAL; + + /* Retrieve eventual bus info */ + do { + devname = dev; + bus = rte_bus_find(bus, bus_name_cmp, dev); + if (bus == NULL) + break; + devname = dev + strlen(bus->name) + 1; + if (rte_bus_find_by_device_name(devname) == bus) + break; + } while (1); + /* Store device name */ + i = 0; + while (devname[i] != '\0' && devname[i] != ',') { + da->name[i] = devname[i]; + i++; + if (i == maxlen) { + RTE_LOG(WARNING, EAL, "Parsing \"%s\": device name should be shorter than %zu\n", + dev, maxlen); + da->name[i - 1] = '\0'; + return -EINVAL; + } + } + da->name[i] = '\0'; + if (bus == NULL) { + bus = rte_bus_find_by_device_name(da->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "failed to parse device \"%s\"\n", + da->name); + return -EFAULT; + } + } + da->bus = bus; + /* Parse eventual device arguments */ + if (devname[i] == ',') + da->args = strdup(&devname[i + 1]); + else + da->args = strdup(""); + if (da->args == NULL) { + RTE_LOG(ERR, EAL, "not enough memory to parse arguments\n"); + return -ENOMEM; + } + return 0; +} + +int +rte_devargs_parsef(struct rte_devargs *da, const char *format, ...) +{ + va_list ap; + size_t len; + char *dev; + int ret; + + if (da == NULL) + return -EINVAL; + + va_start(ap, format); + len = vsnprintf(NULL, 0, format, ap); + va_end(ap); + + dev = calloc(1, len + 1); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "not enough memory to parse device\n"); + return -ENOMEM; + } + + va_start(ap, format); + vsnprintf(dev, len + 1, format, ap); + va_end(ap); + + ret = rte_devargs_parse(da, dev); + + free(dev); + return ret; +} + +int +rte_devargs_insert(struct rte_devargs **da) +{ + struct rte_devargs *listed_da; + void *tmp; + + if (*da == NULL || (*da)->bus == NULL) + return -1; + + TAILQ_FOREACH_SAFE(listed_da, &devargs_list, next, tmp) { + if (listed_da == *da) + /* devargs already in the list */ + return 0; + if (strcmp(listed_da->bus->name, (*da)->bus->name) == 0 && + strcmp(listed_da->name, (*da)->name) == 0) { + /* device already in devargs list, must be updated */ + listed_da->type = (*da)->type; + listed_da->policy = (*da)->policy; + free(listed_da->args); + listed_da->args = (*da)->args; + listed_da->bus = (*da)->bus; + listed_da->cls = (*da)->cls; + listed_da->bus_str = (*da)->bus_str; + listed_da->cls_str = (*da)->cls_str; + listed_da->data = (*da)->data; + /* replace provided devargs with found one */ + free(*da); + *da = listed_da; + return 0; + } + } + /* new device in the list */ + TAILQ_INSERT_TAIL(&devargs_list, *da, next); + return 0; +} + +/* store a whitelist parameter for later parsing */ +int +rte_devargs_add(enum rte_devtype devtype, const char *devargs_str) +{ + struct rte_devargs *devargs = NULL; + struct rte_bus *bus = NULL; + const char *dev = devargs_str; + + /* use calloc instead of rte_zmalloc as it's called early at init */ + devargs = calloc(1, sizeof(*devargs)); + if (devargs == NULL) + goto fail; + + if (rte_devargs_parse(devargs, dev)) + goto fail; + devargs->type = devtype; + bus = devargs->bus; + if (devargs->type == RTE_DEVTYPE_BLACKLISTED_PCI) + devargs->policy = RTE_DEV_BLACKLISTED; + if (bus->conf.scan_mode == RTE_BUS_SCAN_UNDEFINED) { + if (devargs->policy == RTE_DEV_WHITELISTED) + bus->conf.scan_mode = RTE_BUS_SCAN_WHITELIST; + else if (devargs->policy == RTE_DEV_BLACKLISTED) + bus->conf.scan_mode = RTE_BUS_SCAN_BLACKLIST; + } + TAILQ_INSERT_TAIL(&devargs_list, devargs, next); + return 0; + +fail: + if (devargs) { + free(devargs->args); + free(devargs); + } + + return -1; +} + +int +rte_devargs_remove(struct rte_devargs *devargs) +{ + struct rte_devargs *d; + void *tmp; + + if (devargs == NULL || devargs->bus == NULL) + return -1; + + TAILQ_FOREACH_SAFE(d, &devargs_list, next, tmp) { + if (strcmp(d->bus->name, devargs->bus->name) == 0 && + strcmp(d->name, devargs->name) == 0) { + TAILQ_REMOVE(&devargs_list, d, next); + free(d->args); + free(d); + return 0; + } + } + return 1; +} + +/* count the number of devices of a specified type */ +unsigned int +rte_devargs_type_count(enum rte_devtype devtype) +{ + struct rte_devargs *devargs; + unsigned int count = 0; + + TAILQ_FOREACH(devargs, &devargs_list, next) { + if (devargs->type != devtype) + continue; + count++; + } + return count; +} + +/* dump the user devices on the console */ +void +rte_devargs_dump(FILE *f) +{ + struct rte_devargs *devargs; + + fprintf(f, "User device list:\n"); + TAILQ_FOREACH(devargs, &devargs_list, next) { + fprintf(f, " [%s]: %s %s\n", + (devargs->bus ? devargs->bus->name : "??"), + devargs->name, devargs->args); + } +} + +/* bus-aware rte_devargs iterator. */ +struct rte_devargs * +rte_devargs_next(const char *busname, const struct rte_devargs *start) +{ + struct rte_devargs *da; + + if (start != NULL) + da = TAILQ_NEXT(start, next); + else + da = TAILQ_FIRST(&devargs_list); + while (da != NULL) { + if (busname == NULL || + (strcmp(busname, da->bus->name) == 0)) + return da; + da = TAILQ_NEXT(da, next); + } + return NULL; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c new file mode 100644 index 000000000..2a10fb823 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/* Use XSI-compliant portable version of strerror_r() */ +#undef _GNU_SOURCE + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <stdarg.h> +#include <errno.h> + +#include <rte_per_lcore.h> +#include <rte_errno.h> +#include <rte_string_fns.h> + +RTE_DEFINE_PER_LCORE(int, _rte_errno); + +const char * +rte_strerror(int errnum) +{ + /* BSD puts a colon in the "unknown error" messages, Linux doesn't */ +#ifdef RTE_EXEC_ENV_FREEBSD + static const char *sep = ":"; +#else + static const char *sep = ""; +#endif +#define RETVAL_SZ 256 + static RTE_DEFINE_PER_LCORE(char[RETVAL_SZ], retval); + char *ret = RTE_PER_LCORE(retval); + + /* since some implementations of strerror_r throw an error + * themselves if errnum is too big, we handle that case here */ + if (errnum >= RTE_MAX_ERRNO) + snprintf(ret, RETVAL_SZ, "Unknown error%s %d", sep, errnum); + else + switch (errnum){ + case E_RTE_SECONDARY: + return "Invalid call in secondary process"; + case E_RTE_NO_CONFIG: + return "Missing rte_config structure"; + default: + if (strerror_r(errnum, ret, RETVAL_SZ) != 0) + snprintf(ret, RETVAL_SZ, "Unknown error%s %d", + sep, errnum); + } + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c new file mode 100644 index 000000000..4f8f1af73 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c @@ -0,0 +1,1510 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include <fcntl.h> +#include <inttypes.h> +#include <limits.h> +#include <sys/mman.h> +#include <stdint.h> +#include <errno.h> +#include <sys/file.h> +#include <string.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_errno.h> +#include <rte_spinlock.h> +#include <rte_tailq.h> + +#include "eal_filesystem.h" +#include "eal_private.h" + +#include "rte_fbarray.h" + +#define MASK_SHIFT 6ULL +#define MASK_ALIGN (1ULL << MASK_SHIFT) +#define MASK_LEN_TO_IDX(x) ((x) >> MASK_SHIFT) +#define MASK_LEN_TO_MOD(x) ((x) - RTE_ALIGN_FLOOR(x, MASK_ALIGN)) +#define MASK_GET_IDX(idx, mod) ((idx << MASK_SHIFT) + mod) + +/* + * We use this to keep track of created/attached memory areas to prevent user + * errors in API usage. + */ +struct mem_area { + TAILQ_ENTRY(mem_area) next; + void *addr; + size_t len; + int fd; +}; +TAILQ_HEAD(mem_area_head, mem_area); +/* local per-process tailq */ +static struct mem_area_head mem_area_tailq = + TAILQ_HEAD_INITIALIZER(mem_area_tailq); +static rte_spinlock_t mem_area_lock = RTE_SPINLOCK_INITIALIZER; + +/* + * This is a mask that is always stored at the end of array, to provide fast + * way of finding free/used spots without looping through each element. + */ + +struct used_mask { + unsigned int n_masks; + uint64_t data[]; +}; + +static size_t +calc_mask_size(unsigned int len) +{ + /* mask must be multiple of MASK_ALIGN, even though length of array + * itself may not be aligned on that boundary. + */ + len = RTE_ALIGN_CEIL(len, MASK_ALIGN); + return sizeof(struct used_mask) + + sizeof(uint64_t) * MASK_LEN_TO_IDX(len); +} + +static size_t +calc_data_size(size_t page_sz, unsigned int elt_sz, unsigned int len) +{ + size_t data_sz = elt_sz * len; + size_t msk_sz = calc_mask_size(len); + return RTE_ALIGN_CEIL(data_sz + msk_sz, page_sz); +} + +static struct used_mask * +get_used_mask(void *data, unsigned int elt_sz, unsigned int len) +{ + return (struct used_mask *) RTE_PTR_ADD(data, elt_sz * len); +} + +static int +resize_and_map(int fd, void *addr, size_t len) +{ + char path[PATH_MAX]; + void *map_addr; + + if (ftruncate(fd, len)) { + RTE_LOG(ERR, EAL, "Cannot truncate %s\n", path); + /* pass errno up the chain */ + rte_errno = errno; + return -1; + } + + map_addr = mmap(addr, len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (map_addr != addr) { + RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno)); + /* pass errno up the chain */ + rte_errno = errno; + return -1; + } + return 0; +} + +static int +overlap(const struct mem_area *ma, const void *start, size_t len) +{ + const void *end = RTE_PTR_ADD(start, len); + const void *ma_start = ma->addr; + const void *ma_end = RTE_PTR_ADD(ma->addr, ma->len); + + /* start overlap? */ + if (start >= ma_start && start < ma_end) + return 1; + /* end overlap? */ + if (end >= ma_start && end < ma_end) + return 1; + return 0; +} + +static int +find_next_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int msk_idx, lookahead_idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk, ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + ignore_msk = ~((1ULL << first_mod) - 1); + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. + */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-1ULL << last_mod); + + for (msk_idx = first; msk_idx < msk->n_masks; msk_idx++) { + uint64_t cur_msk, lookahead_msk; + unsigned int run_start, clz, left; + bool found = false; + /* + * The process of getting n consecutive bits for arbitrary n is + * a bit involved, but here it is in a nutshell: + * + * 1. let n be the number of consecutive bits we're looking for + * 2. check if n can fit in one mask, and if so, do n-1 + * rshift-ands to see if there is an appropriate run inside + * our current mask + * 2a. if we found a run, bail out early + * 2b. if we didn't find a run, proceed + * 3. invert the mask and count leading zeroes (that is, count + * how many consecutive set bits we had starting from the + * end of current mask) as k + * 3a. if k is 0, continue to next mask + * 3b. if k is not 0, we have a potential run + * 4. to satisfy our requirements, next mask must have n-k + * consecutive set bits right at the start, so we will do + * (n-k-1) rshift-ands and check if first bit is set. + * + * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until + * we either run out of masks, lose the run, or find what we + * were looking for. + */ + cur_msk = msk->data[msk_idx]; + left = n; + + /* if we're looking for free spaces, invert the mask */ + if (!used) + cur_msk = ~cur_msk; + + /* combine current ignore mask with last index ignore mask */ + if (msk_idx == last) + ignore_msk |= last_msk; + + /* if we have an ignore mask, ignore once */ + if (ignore_msk) { + cur_msk &= ignore_msk; + ignore_msk = 0; + } + + /* if n can fit in within a single mask, do a search */ + if (n <= MASK_ALIGN) { + uint64_t tmp_msk = cur_msk; + unsigned int s_idx; + for (s_idx = 0; s_idx < n - 1; s_idx++) + tmp_msk &= tmp_msk >> 1ULL; + /* we found what we were looking for */ + if (tmp_msk != 0) { + run_start = __builtin_ctzll(tmp_msk); + return MASK_GET_IDX(msk_idx, run_start); + } + } + + /* + * we didn't find our run within the mask, or n > MASK_ALIGN, + * so we're going for plan B. + */ + + /* count leading zeroes on inverted mask */ + if (~cur_msk == 0) + clz = sizeof(cur_msk) * 8; + else + clz = __builtin_clzll(~cur_msk); + + /* if there aren't any runs at the end either, just continue */ + if (clz == 0) + continue; + + /* we have a partial run at the end, so try looking ahead */ + run_start = MASK_ALIGN - clz; + left -= clz; + + for (lookahead_idx = msk_idx + 1; lookahead_idx < msk->n_masks; + lookahead_idx++) { + unsigned int s_idx, need; + lookahead_msk = msk->data[lookahead_idx]; + + /* if we're looking for free space, invert the mask */ + if (!used) + lookahead_msk = ~lookahead_msk; + + /* figure out how many consecutive bits we need here */ + need = RTE_MIN(left, MASK_ALIGN); + + for (s_idx = 0; s_idx < need - 1; s_idx++) + lookahead_msk &= lookahead_msk >> 1ULL; + + /* if first bit is not set, we've lost the run */ + if ((lookahead_msk & 1) == 0) { + /* + * we've scanned this far, so we know there are + * no runs in the space we've lookahead-scanned + * as well, so skip that on next iteration. + */ + ignore_msk = ~((1ULL << need) - 1); + msk_idx = lookahead_idx; + break; + } + + left -= need; + + /* check if we've found what we were looking for */ + if (left == 0) { + found = true; + break; + } + } + + /* we didn't find anything, so continue */ + if (!found) + continue; + + return MASK_GET_IDX(msk_idx, run_start); + } + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_next(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk, ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + ignore_msk = ~((1ULL << first_mod) - 1ULL); + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. + */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-(1ULL) << last_mod); + + for (idx = first; idx < msk->n_masks; idx++) { + uint64_t cur = msk->data[idx]; + int found; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + if (idx == last) + cur &= last_msk; + + /* ignore everything before start on first iteration */ + if (idx == first) + cur &= ignore_msk; + + /* check if we have any entries */ + if (cur == 0) + continue; + + /* + * find first set bit - that will correspond to whatever it is + * that we're looking for. + */ + found = __builtin_ctzll(cur); + return MASK_GET_IDX(idx, found); + } + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_contig(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk; + unsigned int need_len, result = 0; + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. + */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-(1ULL) << last_mod); + + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + for (idx = first; idx < msk->n_masks; idx++, result += need_len) { + uint64_t cur = msk->data[idx]; + unsigned int run_len; + + need_len = MASK_ALIGN; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* if this is last mask, ignore everything after last bit */ + if (idx == last) + cur &= last_msk; + + /* ignore everything before start on first iteration */ + if (idx == first) { + cur >>= first_mod; + /* at the start, we don't need the full mask len */ + need_len -= first_mod; + } + + /* we will be looking for zeroes, so invert the mask */ + cur = ~cur; + + /* if mask is zero, we have a complete run */ + if (cur == 0) + continue; + + /* + * see if current run ends before mask end. + */ + run_len = __builtin_ctzll(cur); + + /* add however many zeroes we've had in the last run and quit */ + if (run_len < need_len) { + result += run_len; + break; + } + } + return result; +} + +static int +find_prev_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int msk_idx, lookbehind_idx, first, first_mod; + uint64_t ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + /* we're going backwards, so mask must start from the top */ + ignore_msk = first_mod == MASK_ALIGN - 1 ? + -1ULL : /* prevent overflow */ + ~(-1ULL << (first_mod + 1)); + + /* go backwards, include zero */ + msk_idx = first; + do { + uint64_t cur_msk, lookbehind_msk; + unsigned int run_start, run_end, ctz, left; + bool found = false; + /* + * The process of getting n consecutive bits from the top for + * arbitrary n is a bit involved, but here it is in a nutshell: + * + * 1. let n be the number of consecutive bits we're looking for + * 2. check if n can fit in one mask, and if so, do n-1 + * lshift-ands to see if there is an appropriate run inside + * our current mask + * 2a. if we found a run, bail out early + * 2b. if we didn't find a run, proceed + * 3. invert the mask and count trailing zeroes (that is, count + * how many consecutive set bits we had starting from the + * start of current mask) as k + * 3a. if k is 0, continue to next mask + * 3b. if k is not 0, we have a potential run + * 4. to satisfy our requirements, next mask must have n-k + * consecutive set bits at the end, so we will do (n-k-1) + * lshift-ands and check if last bit is set. + * + * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until + * we either run out of masks, lose the run, or find what we + * were looking for. + */ + cur_msk = msk->data[msk_idx]; + left = n; + + /* if we're looking for free spaces, invert the mask */ + if (!used) + cur_msk = ~cur_msk; + + /* if we have an ignore mask, ignore once */ + if (ignore_msk) { + cur_msk &= ignore_msk; + ignore_msk = 0; + } + + /* if n can fit in within a single mask, do a search */ + if (n <= MASK_ALIGN) { + uint64_t tmp_msk = cur_msk; + unsigned int s_idx; + for (s_idx = 0; s_idx < n - 1; s_idx++) + tmp_msk &= tmp_msk << 1ULL; + /* we found what we were looking for */ + if (tmp_msk != 0) { + /* clz will give us offset from end of mask, and + * we only get the end of our run, not start, + * so adjust result to point to where start + * would have been. + */ + run_start = MASK_ALIGN - + __builtin_clzll(tmp_msk) - n; + return MASK_GET_IDX(msk_idx, run_start); + } + } + + /* + * we didn't find our run within the mask, or n > MASK_ALIGN, + * so we're going for plan B. + */ + + /* count trailing zeroes on inverted mask */ + if (~cur_msk == 0) + ctz = sizeof(cur_msk) * 8; + else + ctz = __builtin_ctzll(~cur_msk); + + /* if there aren't any runs at the start either, just + * continue + */ + if (ctz == 0) + continue; + + /* we have a partial run at the start, so try looking behind */ + run_end = MASK_GET_IDX(msk_idx, ctz); + left -= ctz; + + /* go backwards, include zero */ + lookbehind_idx = msk_idx - 1; + + /* we can't lookbehind as we've run out of masks, so stop */ + if (msk_idx == 0) + break; + + do { + const uint64_t last_bit = 1ULL << (MASK_ALIGN - 1); + unsigned int s_idx, need; + + lookbehind_msk = msk->data[lookbehind_idx]; + + /* if we're looking for free space, invert the mask */ + if (!used) + lookbehind_msk = ~lookbehind_msk; + + /* figure out how many consecutive bits we need here */ + need = RTE_MIN(left, MASK_ALIGN); + + for (s_idx = 0; s_idx < need - 1; s_idx++) + lookbehind_msk &= lookbehind_msk << 1ULL; + + /* if last bit is not set, we've lost the run */ + if ((lookbehind_msk & last_bit) == 0) { + /* + * we've scanned this far, so we know there are + * no runs in the space we've lookbehind-scanned + * as well, so skip that on next iteration. + */ + ignore_msk = -1ULL << need; + msk_idx = lookbehind_idx; + break; + } + + left -= need; + + /* check if we've found what we were looking for */ + if (left == 0) { + found = true; + break; + } + } while ((lookbehind_idx--) != 0); /* decrement after check to + * include zero + */ + + /* we didn't find anything, so continue */ + if (!found) + continue; + + /* we've found what we were looking for, but we only know where + * the run ended, so calculate start position. + */ + return run_end - n; + } while (msk_idx-- != 0); /* decrement after check to include zero */ + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_prev(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + uint64_t ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing clz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + /* we're going backwards, so mask must start from the top */ + ignore_msk = first_mod == MASK_ALIGN - 1 ? + -1ULL : /* prevent overflow */ + ~(-1ULL << (first_mod + 1)); + + /* go backwards, include zero */ + idx = first; + do { + uint64_t cur = msk->data[idx]; + int found; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* ignore everything before start on first iteration */ + if (idx == first) + cur &= ignore_msk; + + /* check if we have any entries */ + if (cur == 0) + continue; + + /* + * find last set bit - that will correspond to whatever it is + * that we're looking for. we're counting trailing zeroes, thus + * the value we get is counted from end of mask, so calculate + * position from start of mask. + */ + found = MASK_ALIGN - __builtin_clzll(cur) - 1; + + return MASK_GET_IDX(idx, found); + } while (idx-- != 0); /* decrement after check to include zero*/ + + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_rev_contig(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int need_len, result = 0; + + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + + /* go backwards, include zero */ + idx = first; + do { + uint64_t cur = msk->data[idx]; + unsigned int run_len; + + need_len = MASK_ALIGN; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* ignore everything after start on first iteration */ + if (idx == first) { + unsigned int end_len = MASK_ALIGN - first_mod - 1; + cur <<= end_len; + /* at the start, we don't need the full mask len */ + need_len -= end_len; + } + + /* we will be looking for zeroes, so invert the mask */ + cur = ~cur; + + /* if mask is zero, we have a complete run */ + if (cur == 0) + goto endloop; + + /* + * see where run ends, starting from the end. + */ + run_len = __builtin_clzll(cur); + + /* add however many zeroes we've had in the last run and quit */ + if (run_len < need_len) { + result += run_len; + break; + } +endloop: + result += need_len; + } while (idx-- != 0); /* decrement after check to include zero */ + return result; +} + +static int +set_used(struct rte_fbarray *arr, unsigned int idx, bool used) +{ + struct used_mask *msk; + uint64_t msk_bit = 1ULL << MASK_LEN_TO_MOD(idx); + unsigned int msk_idx = MASK_LEN_TO_IDX(idx); + bool already_used; + int ret = -1; + + if (arr == NULL || idx >= arr->len) { + rte_errno = EINVAL; + return -1; + } + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + ret = 0; + + /* prevent array from changing under us */ + rte_rwlock_write_lock(&arr->rwlock); + + already_used = (msk->data[msk_idx] & msk_bit) != 0; + + /* nothing to be done */ + if (used == already_used) + goto out; + + if (used) { + msk->data[msk_idx] |= msk_bit; + arr->count++; + } else { + msk->data[msk_idx] &= ~msk_bit; + arr->count--; + } +out: + rte_rwlock_write_unlock(&arr->rwlock); + + return ret; +} + +static int +fully_validate(const char *name, unsigned int elt_sz, unsigned int len) +{ + if (name == NULL || elt_sz == 0 || len == 0 || len > INT_MAX) { + rte_errno = EINVAL; + return -1; + } + + if (strnlen(name, RTE_FBARRAY_NAME_LEN) == RTE_FBARRAY_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +int +rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len, + unsigned int elt_sz) +{ + size_t page_sz, mmap_len; + char path[PATH_MAX]; + struct used_mask *msk; + struct mem_area *ma = NULL; + void *data = NULL; + int fd = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + if (fully_validate(name, elt_sz, len)) + return -1; + + /* allocate mem area before doing anything */ + ma = malloc(sizeof(*ma)); + if (ma == NULL) { + rte_errno = ENOMEM; + return -1; + } + + page_sz = sysconf(_SC_PAGESIZE); + if (page_sz == (size_t)-1) { + free(ma); + return -1; + } + + /* calculate our memory limits */ + mmap_len = calc_data_size(page_sz, elt_sz, len); + + data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0); + if (data == NULL) { + free(ma); + return -1; + } + + rte_spinlock_lock(&mem_area_lock); + + fd = -1; + + if (internal_config.no_shconf) { + /* remap virtual area as writable */ + void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0); + if (new_data == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n", + __func__, strerror(errno)); + goto fail; + } + } else { + eal_get_fbarray_path(path, sizeof(path), name); + + /* + * Each fbarray is unique to process namespace, i.e. the + * filename depends on process prefix. Try to take out a lock + * and see if we succeed. If we don't, someone else is using it + * already. + */ + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n", + __func__, path, strerror(errno)); + rte_errno = errno; + goto fail; + } else if (flock(fd, LOCK_EX | LOCK_NB)) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n", + __func__, path, strerror(errno)); + rte_errno = EBUSY; + goto fail; + } + + /* take out a non-exclusive lock, so that other processes could + * still attach to it, but no other process could reinitialize + * it. + */ + if (flock(fd, LOCK_SH | LOCK_NB)) { + rte_errno = errno; + goto fail; + } + + if (resize_and_map(fd, data, mmap_len)) + goto fail; + } + ma->addr = data; + ma->len = mmap_len; + ma->fd = fd; + + /* do not close fd - keep it until detach/destroy */ + TAILQ_INSERT_TAIL(&mem_area_tailq, ma, next); + + /* initialize the data */ + memset(data, 0, mmap_len); + + /* populate data structure */ + strlcpy(arr->name, name, sizeof(arr->name)); + arr->data = data; + arr->len = len; + arr->elt_sz = elt_sz; + arr->count = 0; + + msk = get_used_mask(data, elt_sz, len); + msk->n_masks = MASK_LEN_TO_IDX(RTE_ALIGN_CEIL(len, MASK_ALIGN)); + + rte_rwlock_init(&arr->rwlock); + + rte_spinlock_unlock(&mem_area_lock); + + return 0; +fail: + if (data) + munmap(data, mmap_len); + if (fd >= 0) + close(fd); + free(ma); + + rte_spinlock_unlock(&mem_area_lock); + return -1; +} + +int +rte_fbarray_attach(struct rte_fbarray *arr) +{ + struct mem_area *ma = NULL, *tmp = NULL; + size_t page_sz, mmap_len; + char path[PATH_MAX]; + void *data = NULL; + int fd = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize attach as two values we need (element + * size and array length) are constant for the duration of life of + * the array, so the parts we care about will not race. + */ + + if (fully_validate(arr->name, arr->elt_sz, arr->len)) + return -1; + + ma = malloc(sizeof(*ma)); + if (ma == NULL) { + rte_errno = ENOMEM; + return -1; + } + + page_sz = sysconf(_SC_PAGESIZE); + if (page_sz == (size_t)-1) { + free(ma); + return -1; + } + + mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len); + + /* check the tailq - maybe user has already mapped this address space */ + rte_spinlock_lock(&mem_area_lock); + + TAILQ_FOREACH(tmp, &mem_area_tailq, next) { + if (overlap(tmp, arr->data, mmap_len)) { + rte_errno = EEXIST; + goto fail; + } + } + + /* we know this memory area is unique, so proceed */ + + data = eal_get_virtual_area(arr->data, &mmap_len, page_sz, 0, 0); + if (data == NULL) + goto fail; + + eal_get_fbarray_path(path, sizeof(path), arr->name); + + fd = open(path, O_RDWR); + if (fd < 0) { + rte_errno = errno; + goto fail; + } + + /* lock the file, to let others know we're using it */ + if (flock(fd, LOCK_SH | LOCK_NB)) { + rte_errno = errno; + goto fail; + } + + if (resize_and_map(fd, data, mmap_len)) + goto fail; + + /* store our new memory area */ + ma->addr = data; + ma->fd = fd; /* keep fd until detach/destroy */ + ma->len = mmap_len; + + TAILQ_INSERT_TAIL(&mem_area_tailq, ma, next); + + /* we're done */ + + rte_spinlock_unlock(&mem_area_lock); + return 0; +fail: + if (data) + munmap(data, mmap_len); + if (fd >= 0) + close(fd); + free(ma); + rte_spinlock_unlock(&mem_area_lock); + return -1; +} + +int +rte_fbarray_detach(struct rte_fbarray *arr) +{ + struct mem_area *tmp = NULL; + size_t mmap_len; + int ret = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize detach as two values we need (element + * size and total capacity) are constant for the duration of life of + * the array, so the parts we care about will not race. if the user is + * detaching while doing something else in the same process, we can't + * really do anything about it, things will blow up either way. + */ + + size_t page_sz = sysconf(_SC_PAGESIZE); + + if (page_sz == (size_t)-1) + return -1; + + mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len); + + /* does this area exist? */ + rte_spinlock_lock(&mem_area_lock); + + TAILQ_FOREACH(tmp, &mem_area_tailq, next) { + if (tmp->addr == arr->data && tmp->len == mmap_len) + break; + } + if (tmp == NULL) { + rte_errno = ENOENT; + ret = -1; + goto out; + } + + munmap(arr->data, mmap_len); + + /* area is unmapped, close fd and remove the tailq entry */ + if (tmp->fd >= 0) + close(tmp->fd); + TAILQ_REMOVE(&mem_area_tailq, tmp, next); + free(tmp); + + ret = 0; +out: + rte_spinlock_unlock(&mem_area_lock); + return ret; +} + +int +rte_fbarray_destroy(struct rte_fbarray *arr) +{ + struct mem_area *tmp = NULL; + size_t mmap_len; + int fd, ret; + char path[PATH_MAX]; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize detach as two values we need (element + * size and total capacity) are constant for the duration of life of + * the array, so the parts we care about will not race. if the user is + * detaching while doing something else in the same process, we can't + * really do anything about it, things will blow up either way. + */ + + size_t page_sz = sysconf(_SC_PAGESIZE); + + if (page_sz == (size_t)-1) + return -1; + + mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len); + + /* does this area exist? */ + rte_spinlock_lock(&mem_area_lock); + + TAILQ_FOREACH(tmp, &mem_area_tailq, next) { + if (tmp->addr == arr->data && tmp->len == mmap_len) + break; + } + if (tmp == NULL) { + rte_errno = ENOENT; + ret = -1; + goto out; + } + /* with no shconf, there were never any files to begin with */ + if (!internal_config.no_shconf) { + /* + * attempt to get an exclusive lock on the file, to ensure it + * has been detached by all other processes + */ + fd = tmp->fd; + if (flock(fd, LOCK_EX | LOCK_NB)) { + RTE_LOG(DEBUG, EAL, "Cannot destroy fbarray - another process is using it\n"); + rte_errno = EBUSY; + ret = -1; + goto out; + } + + /* we're OK to destroy the file */ + eal_get_fbarray_path(path, sizeof(path), arr->name); + if (unlink(path)) { + RTE_LOG(DEBUG, EAL, "Cannot unlink fbarray: %s\n", + strerror(errno)); + rte_errno = errno; + /* + * we're still holding an exclusive lock, so drop it to + * shared. + */ + flock(fd, LOCK_SH | LOCK_NB); + + ret = -1; + goto out; + } + close(fd); + } + munmap(arr->data, mmap_len); + + /* area is unmapped, remove the tailq entry */ + TAILQ_REMOVE(&mem_area_tailq, tmp, next); + free(tmp); + ret = 0; + + /* reset the fbarray structure */ + memset(arr, 0, sizeof(*arr)); +out: + rte_spinlock_unlock(&mem_area_lock); + return ret; +} + +void * +rte_fbarray_get(const struct rte_fbarray *arr, unsigned int idx) +{ + void *ret = NULL; + if (arr == NULL) { + rte_errno = EINVAL; + return NULL; + } + + if (idx >= arr->len) { + rte_errno = EINVAL; + return NULL; + } + + ret = RTE_PTR_ADD(arr->data, idx * arr->elt_sz); + + return ret; +} + +int +rte_fbarray_set_used(struct rte_fbarray *arr, unsigned int idx) +{ + return set_used(arr, idx, true); +} + +int +rte_fbarray_set_free(struct rte_fbarray *arr, unsigned int idx) +{ + return set_used(arr, idx, false); +} + +int +rte_fbarray_is_used(struct rte_fbarray *arr, unsigned int idx) +{ + struct used_mask *msk; + int msk_idx; + uint64_t msk_bit; + int ret = -1; + + if (arr == NULL || idx >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + msk_idx = MASK_LEN_TO_IDX(idx); + msk_bit = 1ULL << MASK_LEN_TO_MOD(idx); + + ret = (msk->data[msk_idx] & msk_bit) != 0; + + rte_rwlock_read_unlock(&arr->rwlock); + + return ret; +} + +static int +fbarray_find(struct rte_fbarray *arr, unsigned int start, bool next, bool used) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + /* cheap checks to prevent doing useless work */ + if (!used) { + if (arr->len == arr->count) { + rte_errno = ENOSPC; + goto out; + } + if (arr->count == 0) { + ret = start; + goto out; + } + } else { + if (arr->count == 0) { + rte_errno = ENOENT; + goto out; + } + if (arr->len == arr->count) { + ret = start; + goto out; + } + } + if (next) + ret = find_next(arr, start, used); + else + ret = find_prev(arr, start, used); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int +rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, true, false); +} + +int +rte_fbarray_find_next_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, true, true); +} + +int +rte_fbarray_find_prev_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, false, false); +} + +int +rte_fbarray_find_prev_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, false, true); +} + +static int +fbarray_find_n(struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool next, bool used) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len || n > arr->len || n == 0) { + rte_errno = EINVAL; + return -1; + } + if (next && (arr->len - start) < n) { + rte_errno = used ? ENOENT : ENOSPC; + return -1; + } + if (!next && start < (n - 1)) { + rte_errno = used ? ENOENT : ENOSPC; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + /* cheap checks to prevent doing useless work */ + if (!used) { + if (arr->len == arr->count || arr->len - arr->count < n) { + rte_errno = ENOSPC; + goto out; + } + if (arr->count == 0) { + ret = next ? start : start - n + 1; + goto out; + } + } else { + if (arr->count < n) { + rte_errno = ENOENT; + goto out; + } + if (arr->count == arr->len) { + ret = next ? start : start - n + 1; + goto out; + } + } + + if (next) + ret = find_next_n(arr, start, n, used); + else + ret = find_prev_n(arr, start, n, used); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int +rte_fbarray_find_next_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, true, false); +} + +int +rte_fbarray_find_next_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, true, true); +} + +int +rte_fbarray_find_prev_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, false, false); +} + +int +rte_fbarray_find_prev_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, false, true); +} + +static int +fbarray_find_contig(struct rte_fbarray *arr, unsigned int start, bool next, + bool used) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + /* cheap checks to prevent doing useless work */ + if (used) { + if (arr->count == 0) { + ret = 0; + goto out; + } + if (next && arr->count == arr->len) { + ret = arr->len - start; + goto out; + } + if (!next && arr->count == arr->len) { + ret = start + 1; + goto out; + } + } else { + if (arr->len == arr->count) { + ret = 0; + goto out; + } + if (next && arr->count == 0) { + ret = arr->len - start; + goto out; + } + if (!next && arr->count == 0) { + ret = start + 1; + goto out; + } + } + + if (next) + ret = find_contig(arr, start, used); + else + ret = find_rev_contig(arr, start, used); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +static int +fbarray_find_biggest(struct rte_fbarray *arr, unsigned int start, bool used, + bool rev) +{ + int cur_idx, next_idx, cur_len, biggest_idx, biggest_len; + /* don't stack if conditions, use function pointers instead */ + int (*find_func)(struct rte_fbarray *, unsigned int); + int (*find_contig_func)(struct rte_fbarray *, unsigned int); + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + /* the other API calls already do their fair share of cheap checks, so + * no need to do them here. + */ + + /* the API's called are thread-safe, but something may still happen + * between the API calls, so lock the fbarray. all other API's are + * read-locking the fbarray, so read lock here is OK. + */ + rte_rwlock_read_lock(&arr->rwlock); + + /* pick out appropriate functions */ + if (used) { + if (rev) { + find_func = rte_fbarray_find_prev_used; + find_contig_func = rte_fbarray_find_rev_contig_used; + } else { + find_func = rte_fbarray_find_next_used; + find_contig_func = rte_fbarray_find_contig_used; + } + } else { + if (rev) { + find_func = rte_fbarray_find_prev_free; + find_contig_func = rte_fbarray_find_rev_contig_free; + } else { + find_func = rte_fbarray_find_next_free; + find_contig_func = rte_fbarray_find_contig_free; + } + } + + cur_idx = start; + biggest_idx = -1; /* default is error */ + biggest_len = 0; + for (;;) { + cur_idx = find_func(arr, cur_idx); + + /* block found, check its length */ + if (cur_idx >= 0) { + cur_len = find_contig_func(arr, cur_idx); + /* decide where we go next */ + next_idx = rev ? cur_idx - cur_len : cur_idx + cur_len; + /* move current index to start of chunk */ + cur_idx = rev ? next_idx + 1 : cur_idx; + + if (cur_len > biggest_len) { + biggest_idx = cur_idx; + biggest_len = cur_len; + } + cur_idx = next_idx; + /* in reverse mode, next_idx may be -1 if chunk started + * at array beginning. this means there's no more work + * to do. + */ + if (cur_idx < 0) + break; + } else { + /* nothing more to find, stop. however, a failed API + * call has set rte_errno, which we want to ignore, as + * reaching the end of fbarray is not an error. + */ + rte_errno = 0; + break; + } + } + /* if we didn't find anything at all, set rte_errno */ + if (biggest_idx < 0) + rte_errno = used ? ENOENT : ENOSPC; + + rte_rwlock_read_unlock(&arr->rwlock); + return biggest_idx; +} + +int +rte_fbarray_find_biggest_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_biggest(arr, start, false, false); +} + +int +rte_fbarray_find_biggest_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_biggest(arr, start, true, false); +} + +int +rte_fbarray_find_rev_biggest_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_biggest(arr, start, false, true); +} + +int +rte_fbarray_find_rev_biggest_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_biggest(arr, start, true, true); +} + + +int +rte_fbarray_find_contig_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, true, false); +} + +int +rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, true, true); +} + +int +rte_fbarray_find_rev_contig_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, false, false); +} + +int +rte_fbarray_find_rev_contig_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, false, true); +} + +int +rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt) +{ + void *end; + int ret = -1; + + /* + * no need to synchronize as it doesn't matter if underlying data + * changes - we're doing pointer arithmetic here. + */ + + if (arr == NULL || elt == NULL) { + rte_errno = EINVAL; + return -1; + } + end = RTE_PTR_ADD(arr->data, arr->elt_sz * arr->len); + if (elt < arr->data || elt >= end) { + rte_errno = EINVAL; + return -1; + } + + ret = RTE_PTR_DIFF(elt, arr->data) / arr->elt_sz; + + return ret; +} + +void +rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f) +{ + struct used_mask *msk; + unsigned int i; + + if (arr == NULL || f == NULL) { + rte_errno = EINVAL; + return; + } + + if (fully_validate(arr->name, arr->elt_sz, arr->len)) { + fprintf(f, "Invalid file-backed array\n"); + goto out; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + fprintf(f, "File-backed array: %s\n", arr->name); + fprintf(f, "size: %i occupied: %i elt_sz: %i\n", + arr->len, arr->count, arr->elt_sz); + + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + + for (i = 0; i < msk->n_masks; i++) + fprintf(f, "msk idx %i: 0x%016" PRIx64 "\n", i, msk->data[i]); +out: + rte_rwlock_read_unlock(&arr->rwlock); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c new file mode 100644 index 000000000..2d2179d41 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <stdint.h> +#include <rte_hexdump.h> +#include <rte_string_fns.h> + +#define LINE_LEN 128 + +void +rte_hexdump(FILE *f, const char *title, const void *buf, unsigned int len) +{ + unsigned int i, out, ofs; + const unsigned char *data = buf; + char line[LINE_LEN]; /* space needed 8+16*3+3+16 == 75 */ + + fprintf(f, "%s at [%p], len=%u\n", + title ? : " Dump data", data, len); + ofs = 0; + while (ofs < len) { + /* format the line in the buffer */ + out = snprintf(line, LINE_LEN, "%08X:", ofs); + for (i = 0; i < 16; i++) { + if (ofs + i < len) + snprintf(line + out, LINE_LEN - out, + " %02X", (data[ofs + i] & 0xff)); + else + strcpy(line + out, " "); + out += 3; + } + + + for (; i <= 16; i++) + out += snprintf(line + out, LINE_LEN - out, " | "); + + for (i = 0; ofs < len && i < 16; i++, ofs++) { + unsigned char c = data[ofs]; + + if (c < ' ' || c > '~') + c = '.'; + out += snprintf(line + out, LINE_LEN - out, "%c", c); + } + fprintf(f, "%s\n", line); + } + fflush(f); +} + +void +rte_memdump(FILE *f, const char *title, const void *buf, unsigned int len) +{ + unsigned int i, out; + const unsigned char *data = buf; + char line[LINE_LEN]; + + if (title) + fprintf(f, "%s: ", title); + + line[0] = '\0'; + for (i = 0, out = 0; i < len; i++) { + /* Make sure we do not overrun the line buffer length. */ + if (out >= LINE_LEN - 4) { + fprintf(f, "%s", line); + out = 0; + line[out] = '\0'; + } + out += snprintf(line + out, LINE_LEN - out, "%02x%s", + (data[i] & 0xff), ((i + 1) < len) ? ":" : ""); + } + if (out > 0) + fprintf(f, "%s", line); + fprintf(f, "\n"); + + fflush(f); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c new file mode 100644 index 000000000..5388b81a5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include "rte_hypervisor.h" + +const char * +rte_hypervisor_get_name(enum rte_hypervisor id) +{ + switch (id) { + case RTE_HYPERVISOR_NONE: + return "none"; + case RTE_HYPERVISOR_KVM: + return "KVM"; + case RTE_HYPERVISOR_HYPERV: + return "Hyper-V"; + case RTE_HYPERVISOR_VMWARE: + return "VMware"; + default: + return "unknown"; + } +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c new file mode 100644 index 000000000..cf52d717f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <errno.h> +#include <stdint.h> +#include <stdio.h> +#include <sys/queue.h> + +#include <rte_launch.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_atomic.h> +#include <rte_pause.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> + +#include "eal_private.h" + +/* + * Wait until a lcore finished its job. + */ +int +rte_eal_wait_lcore(unsigned slave_id) +{ + if (lcore_config[slave_id].state == WAIT) + return 0; + + while (lcore_config[slave_id].state != WAIT && + lcore_config[slave_id].state != FINISHED) + rte_pause(); + + rte_rmb(); + + /* we are in finished state, go to wait state */ + lcore_config[slave_id].state = WAIT; + return lcore_config[slave_id].ret; +} + +/* + * Check that every SLAVE lcores are in WAIT state, then call + * rte_eal_remote_launch() for all of them. If call_master is true + * (set to CALL_MASTER), also call the function on the master lcore. + */ +int +rte_eal_mp_remote_launch(int (*f)(void *), void *arg, + enum rte_rmt_call_master_t call_master) +{ + int lcore_id; + int master = rte_get_master_lcore(); + + /* check state of lcores */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (lcore_config[lcore_id].state != WAIT) + return -EBUSY; + } + + /* send messages to cores */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rte_eal_remote_launch(f, arg, lcore_id); + } + + if (call_master == CALL_MASTER) { + lcore_config[master].ret = f(arg); + lcore_config[master].state = FINISHED; + } + + return 0; +} + +/* + * Return the state of the lcore identified by slave_id. + */ +enum rte_lcore_state_t +rte_eal_get_lcore_state(unsigned lcore_id) +{ + return lcore_config[lcore_id].state; +} + +/* + * Do a rte_eal_wait_lcore() for every lcore. The return values are + * ignored. + */ +void +rte_eal_mp_wait_lcore(void) +{ + unsigned lcore_id; + + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rte_eal_wait_lcore(lcore_id); + } +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c new file mode 100644 index 000000000..5404922a8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <unistd.h> +#include <limits.h> +#include <string.h> + +#include <rte_errno.h> +#include <rte_log.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_thread.h" + +unsigned int rte_get_master_lcore(void) +{ + return rte_eal_get_configuration()->master_lcore; +} + +unsigned int rte_lcore_count(void) +{ + return rte_eal_get_configuration()->lcore_count; +} + +int rte_lcore_index(int lcore_id) +{ + if (unlikely(lcore_id >= RTE_MAX_LCORE)) + return -1; + + if (lcore_id < 0) + lcore_id = (int)rte_lcore_id(); + + return lcore_config[lcore_id].core_index; +} + +int rte_lcore_to_cpu_id(int lcore_id) +{ + if (unlikely(lcore_id >= RTE_MAX_LCORE)) + return -1; + + if (lcore_id < 0) + lcore_id = (int)rte_lcore_id(); + + return lcore_config[lcore_id].core_id; +} + +rte_cpuset_t rte_lcore_cpuset(unsigned int lcore_id) +{ + return lcore_config[lcore_id].cpuset; +} + +enum rte_lcore_role_t +rte_eal_lcore_role(unsigned int lcore_id) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (lcore_id >= RTE_MAX_LCORE) + return ROLE_OFF; + return cfg->lcore_role[lcore_id]; +} + +int rte_lcore_is_enabled(unsigned int lcore_id) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (lcore_id >= RTE_MAX_LCORE) + return 0; + return cfg->lcore_role[lcore_id] == ROLE_RTE; +} + +unsigned int rte_get_next_lcore(unsigned int i, int skip_master, int wrap) +{ + i++; + if (wrap) + i %= RTE_MAX_LCORE; + + while (i < RTE_MAX_LCORE) { + if (!rte_lcore_is_enabled(i) || + (skip_master && (i == rte_get_master_lcore()))) { + i++; + if (wrap) + i %= RTE_MAX_LCORE; + continue; + } + break; + } + return i; +} + +unsigned int +rte_lcore_to_socket_id(unsigned int lcore_id) +{ + return lcore_config[lcore_id].socket_id; +} + +static int +socket_id_cmp(const void *a, const void *b) +{ + const int *lcore_id_a = a; + const int *lcore_id_b = b; + + if (*lcore_id_a < *lcore_id_b) + return -1; + if (*lcore_id_a > *lcore_id_b) + return 1; + return 0; +} + +/* + * Parse /sys/devices/system/cpu to get the number of physical and logical + * processors on the machine. The function will fill the cpu_info + * structure. + */ +int +rte_eal_cpu_init(void) +{ + /* pointer to global configuration */ + struct rte_config *config = rte_eal_get_configuration(); + unsigned lcore_id; + unsigned count = 0; + unsigned int socket_id, prev_socket_id; + int lcore_to_socket_id[RTE_MAX_LCORE]; + + /* + * Parse the maximum set of logical cores, detect the subset of running + * ones and enable them by default. + */ + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + lcore_config[lcore_id].core_index = count; + + /* init cpuset for per lcore config */ + CPU_ZERO(&lcore_config[lcore_id].cpuset); + + /* find socket first */ + socket_id = eal_cpu_socket_id(lcore_id); + lcore_to_socket_id[lcore_id] = socket_id; + + if (eal_cpu_detected(lcore_id) == 0) { + config->lcore_role[lcore_id] = ROLE_OFF; + lcore_config[lcore_id].core_index = -1; + continue; + } + + /* By default, lcore 1:1 map to cpu id */ + CPU_SET(lcore_id, &lcore_config[lcore_id].cpuset); + + /* By default, each detected core is enabled */ + config->lcore_role[lcore_id] = ROLE_RTE; + lcore_config[lcore_id].core_role = ROLE_RTE; + lcore_config[lcore_id].core_id = eal_cpu_core_id(lcore_id); + lcore_config[lcore_id].socket_id = socket_id; + RTE_LOG(DEBUG, EAL, "Detected lcore %u as " + "core %u on socket %u\n", + lcore_id, lcore_config[lcore_id].core_id, + lcore_config[lcore_id].socket_id); + count++; + } + for (; lcore_id < CPU_SETSIZE; lcore_id++) { + if (eal_cpu_detected(lcore_id) == 0) + continue; + RTE_LOG(DEBUG, EAL, "Skipped lcore %u as core %u on socket %u\n", + lcore_id, eal_cpu_core_id(lcore_id), + eal_cpu_socket_id(lcore_id)); + } + + /* Set the count of enabled logical cores of the EAL configuration */ + config->lcore_count = count; + RTE_LOG(DEBUG, EAL, + "Support maximum %u logical core(s) by configuration.\n", + RTE_MAX_LCORE); + RTE_LOG(INFO, EAL, "Detected %u lcore(s)\n", config->lcore_count); + + /* sort all socket id's in ascending order */ + qsort(lcore_to_socket_id, RTE_DIM(lcore_to_socket_id), + sizeof(lcore_to_socket_id[0]), socket_id_cmp); + + prev_socket_id = -1; + config->numa_node_count = 0; + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + socket_id = lcore_to_socket_id[lcore_id]; + if (socket_id != prev_socket_id) + config->numa_nodes[config->numa_node_count++] = + socket_id; + prev_socket_id = socket_id; + } + RTE_LOG(INFO, EAL, "Detected %u NUMA nodes\n", config->numa_node_count); + + return 0; +} + +unsigned int +rte_socket_count(void) +{ + const struct rte_config *config = rte_eal_get_configuration(); + return config->numa_node_count; +} + +int +rte_socket_id_by_idx(unsigned int idx) +{ + const struct rte_config *config = rte_eal_get_configuration(); + if (idx >= config->numa_node_count) { + rte_errno = EINVAL; + return -1; + } + return config->numa_nodes[idx]; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c new file mode 100644 index 000000000..8835c8fff --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c @@ -0,0 +1,481 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdio.h> +#include <stdint.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <regex.h> +#include <fnmatch.h> + +#include <rte_eal.h> +#include <rte_log.h> +#include <rte_per_lcore.h> + +#include "eal_private.h" + +/* global log structure */ +struct rte_logs rte_logs = { + .type = ~0, + .level = RTE_LOG_DEBUG, + .file = NULL, +}; + +struct rte_eal_opt_loglevel { + /** Next list entry */ + TAILQ_ENTRY(rte_eal_opt_loglevel) next; + /** Compiled regular expression obtained from the option */ + regex_t re_match; + /** Globbing pattern option */ + char *pattern; + /** Log level value obtained from the option */ + uint32_t level; +}; + +TAILQ_HEAD(rte_eal_opt_loglevel_list, rte_eal_opt_loglevel); + +/** List of valid EAL log level options */ +static struct rte_eal_opt_loglevel_list opt_loglevel_list = + TAILQ_HEAD_INITIALIZER(opt_loglevel_list); + +/* Stream to use for logging if rte_logs.file is NULL */ +static FILE *default_log_stream; + +/** + * This global structure stores some information about the message + * that is currently being processed by one lcore + */ +struct log_cur_msg { + uint32_t loglevel; /**< log level - see rte_log.h */ + uint32_t logtype; /**< log type - see rte_log.h */ +}; + +struct rte_log_dynamic_type { + const char *name; + uint32_t loglevel; +}; + + /* per core log */ +static RTE_DEFINE_PER_LCORE(struct log_cur_msg, log_cur_msg); + +/* default logs */ + +/* Change the stream that will be used by logging system */ +int +rte_openlog_stream(FILE *f) +{ + rte_logs.file = f; + return 0; +} + +FILE * +rte_log_get_stream(void) +{ + FILE *f = rte_logs.file; + + if (f == NULL) { + /* + * Grab the current value of stderr here, rather than + * just initializing default_log_stream to stderr. This + * ensures that we will always use the current value + * of stderr, even if the application closes and + * reopens it. + */ + return default_log_stream ? : stderr; + } + return f; +} + +/* Set global log level */ +void +rte_log_set_global_level(uint32_t level) +{ + rte_logs.level = (uint32_t)level; +} + +/* Get global log level */ +uint32_t +rte_log_get_global_level(void) +{ + return rte_logs.level; +} + +int +rte_log_get_level(uint32_t type) +{ + if (type >= rte_logs.dynamic_types_len) + return -1; + + return rte_logs.dynamic_types[type].loglevel; +} + +bool +rte_log_can_log(uint32_t logtype, uint32_t level) +{ + int log_level; + + if (level > rte_log_get_global_level()) + return false; + + log_level = rte_log_get_level(logtype); + if (log_level < 0) + return false; + + if (level > (uint32_t)log_level) + return false; + + return true; +} + +int +rte_log_set_level(uint32_t type, uint32_t level) +{ + if (type >= rte_logs.dynamic_types_len) + return -1; + if (level > RTE_LOG_DEBUG) + return -1; + + rte_logs.dynamic_types[type].loglevel = level; + + return 0; +} + +/* set log level by regular expression */ +int +rte_log_set_level_regexp(const char *regex, uint32_t level) +{ + regex_t r; + size_t i; + + if (level > RTE_LOG_DEBUG) + return -1; + + if (regcomp(&r, regex, 0) != 0) + return -1; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + if (regexec(&r, rte_logs.dynamic_types[i].name, 0, + NULL, 0) == 0) + rte_logs.dynamic_types[i].loglevel = level; + } + + regfree(&r); + + return 0; +} + +/* + * Save the type string and the loglevel for later dynamic + * logtypes which may register later. + */ +static int rte_log_save_level(int priority, + const char *regex, const char *pattern) +{ + struct rte_eal_opt_loglevel *opt_ll = NULL; + + opt_ll = malloc(sizeof(*opt_ll)); + if (opt_ll == NULL) + goto fail; + + opt_ll->level = priority; + + if (regex) { + opt_ll->pattern = NULL; + if (regcomp(&opt_ll->re_match, regex, 0) != 0) + goto fail; + } else if (pattern) { + opt_ll->pattern = strdup(pattern); + if (opt_ll->pattern == NULL) + goto fail; + } else + goto fail; + + TAILQ_INSERT_HEAD(&opt_loglevel_list, opt_ll, next); + return 0; +fail: + free(opt_ll); + return -1; +} + +int rte_log_save_regexp(const char *regex, int tmp) +{ + return rte_log_save_level(tmp, regex, NULL); +} + +/* set log level based on globbing pattern */ +int +rte_log_set_level_pattern(const char *pattern, uint32_t level) +{ + size_t i; + + if (level > RTE_LOG_DEBUG) + return -1; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + + if (fnmatch(pattern, rte_logs.dynamic_types[i].name, 0) == 0) + rte_logs.dynamic_types[i].loglevel = level; + } + + return 0; +} + +int rte_log_save_pattern(const char *pattern, int priority) +{ + return rte_log_save_level(priority, NULL, pattern); +} + +/* get the current loglevel for the message being processed */ +int rte_log_cur_msg_loglevel(void) +{ + return RTE_PER_LCORE(log_cur_msg).loglevel; +} + +/* get the current logtype for the message being processed */ +int rte_log_cur_msg_logtype(void) +{ + return RTE_PER_LCORE(log_cur_msg).logtype; +} + +static int +rte_log_lookup(const char *name) +{ + size_t i; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + if (strcmp(name, rte_logs.dynamic_types[i].name) == 0) + return i; + } + + return -1; +} + +/* register an extended log type, assuming table is large enough, and id + * is not yet registered. + */ +static int +__rte_log_register(const char *name, int id) +{ + char *dup_name = strdup(name); + + if (dup_name == NULL) + return -ENOMEM; + + rte_logs.dynamic_types[id].name = dup_name; + rte_logs.dynamic_types[id].loglevel = RTE_LOG_INFO; + + return id; +} + +/* register an extended log type */ +int +rte_log_register(const char *name) +{ + struct rte_log_dynamic_type *new_dynamic_types; + int id, ret; + + id = rte_log_lookup(name); + if (id >= 0) + return id; + + new_dynamic_types = realloc(rte_logs.dynamic_types, + sizeof(struct rte_log_dynamic_type) * + (rte_logs.dynamic_types_len + 1)); + if (new_dynamic_types == NULL) + return -ENOMEM; + rte_logs.dynamic_types = new_dynamic_types; + + ret = __rte_log_register(name, rte_logs.dynamic_types_len); + if (ret < 0) + return ret; + + rte_logs.dynamic_types_len++; + + return ret; +} + +/* Register an extended log type and try to pick its level from EAL options */ +int +rte_log_register_type_and_pick_level(const char *name, uint32_t level_def) +{ + struct rte_eal_opt_loglevel *opt_ll; + uint32_t level = level_def; + int type; + + type = rte_log_register(name); + if (type < 0) + return type; + + TAILQ_FOREACH(opt_ll, &opt_loglevel_list, next) { + if (opt_ll->level > RTE_LOG_DEBUG) + continue; + + if (opt_ll->pattern) { + if (fnmatch(opt_ll->pattern, name, 0) == 0) + level = opt_ll->level; + } else { + if (regexec(&opt_ll->re_match, name, 0, NULL, 0) == 0) + level = opt_ll->level; + } + } + + rte_logs.dynamic_types[type].loglevel = level; + + return type; +} + +struct logtype { + uint32_t log_id; + const char *logtype; +}; + +static const struct logtype logtype_strings[] = { + {RTE_LOGTYPE_EAL, "lib.eal"}, + {RTE_LOGTYPE_MALLOC, "lib.malloc"}, + {RTE_LOGTYPE_RING, "lib.ring"}, + {RTE_LOGTYPE_MEMPOOL, "lib.mempool"}, + {RTE_LOGTYPE_TIMER, "lib.timer"}, + {RTE_LOGTYPE_PMD, "pmd"}, + {RTE_LOGTYPE_HASH, "lib.hash"}, + {RTE_LOGTYPE_LPM, "lib.lpm"}, + {RTE_LOGTYPE_KNI, "lib.kni"}, + {RTE_LOGTYPE_ACL, "lib.acl"}, + {RTE_LOGTYPE_POWER, "lib.power"}, + {RTE_LOGTYPE_METER, "lib.meter"}, + {RTE_LOGTYPE_SCHED, "lib.sched"}, + {RTE_LOGTYPE_PORT, "lib.port"}, + {RTE_LOGTYPE_TABLE, "lib.table"}, + {RTE_LOGTYPE_PIPELINE, "lib.pipeline"}, + {RTE_LOGTYPE_MBUF, "lib.mbuf"}, + {RTE_LOGTYPE_CRYPTODEV, "lib.cryptodev"}, + {RTE_LOGTYPE_EFD, "lib.efd"}, + {RTE_LOGTYPE_EVENTDEV, "lib.eventdev"}, + {RTE_LOGTYPE_GSO, "lib.gso"}, + {RTE_LOGTYPE_USER1, "user1"}, + {RTE_LOGTYPE_USER2, "user2"}, + {RTE_LOGTYPE_USER3, "user3"}, + {RTE_LOGTYPE_USER4, "user4"}, + {RTE_LOGTYPE_USER5, "user5"}, + {RTE_LOGTYPE_USER6, "user6"}, + {RTE_LOGTYPE_USER7, "user7"}, + {RTE_LOGTYPE_USER8, "user8"} +}; + +/* Logging should be first initializer (before drivers and bus) */ +RTE_INIT_PRIO(rte_log_init, LOG) +{ + uint32_t i; + + rte_log_set_global_level(RTE_LOG_DEBUG); + + rte_logs.dynamic_types = calloc(RTE_LOGTYPE_FIRST_EXT_ID, + sizeof(struct rte_log_dynamic_type)); + if (rte_logs.dynamic_types == NULL) + return; + + /* register legacy log types */ + for (i = 0; i < RTE_DIM(logtype_strings); i++) + __rte_log_register(logtype_strings[i].logtype, + logtype_strings[i].log_id); + + rte_logs.dynamic_types_len = RTE_LOGTYPE_FIRST_EXT_ID; +} + +static const char * +loglevel_to_string(uint32_t level) +{ + switch (level) { + case 0: return "disabled"; + case RTE_LOG_EMERG: return "emerg"; + case RTE_LOG_ALERT: return "alert"; + case RTE_LOG_CRIT: return "critical"; + case RTE_LOG_ERR: return "error"; + case RTE_LOG_WARNING: return "warning"; + case RTE_LOG_NOTICE: return "notice"; + case RTE_LOG_INFO: return "info"; + case RTE_LOG_DEBUG: return "debug"; + default: return "unknown"; + } +} + +/* dump global level and registered log types */ +void +rte_log_dump(FILE *f) +{ + size_t i; + + fprintf(f, "global log level is %s\n", + loglevel_to_string(rte_log_get_global_level())); + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + fprintf(f, "id %zu: %s, level is %s\n", + i, rte_logs.dynamic_types[i].name, + loglevel_to_string(rte_logs.dynamic_types[i].loglevel)); + } +} + +/* + * Generates a log message The message will be sent in the stream + * defined by the previous call to rte_openlog_stream(). + */ +int +rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap) +{ + FILE *f = rte_log_get_stream(); + int ret; + + if (logtype >= rte_logs.dynamic_types_len) + return -1; + if (!rte_log_can_log(logtype, level)) + return 0; + + /* save loglevel and logtype in a global per-lcore variable */ + RTE_PER_LCORE(log_cur_msg).loglevel = level; + RTE_PER_LCORE(log_cur_msg).logtype = logtype; + + ret = vfprintf(f, format, ap); + fflush(f); + return ret; +} + +/* + * Generates a log message The message will be sent in the stream + * defined by the previous call to rte_openlog_stream(). + * No need to check level here, done by rte_vlog(). + */ +int +rte_log(uint32_t level, uint32_t logtype, const char *format, ...) +{ + va_list ap; + int ret; + + va_start(ap, format); + ret = rte_vlog(level, logtype, format, ap); + va_end(ap); + return ret; +} + +/* + * Called by environment-specific initialization functions. + */ +void +eal_log_set_default(FILE *default_log) +{ + default_log_stream = default_log; + +#if RTE_LOG_DP_LEVEL >= RTE_LOG_DEBUG + RTE_LOG(NOTICE, EAL, + "Debug dataplane logs available - lower performance\n"); +#endif +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_mcfg.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_mcfg.c new file mode 100644 index 000000000..49d3ed0ce --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_mcfg.c @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include <rte_eal_memconfig.h> +#include <rte_version.h> + +#include "eal_internal_cfg.h" +#include "eal_memcfg.h" +#include "eal_private.h" + +void +eal_mcfg_complete(void) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + struct rte_mem_config *mcfg = cfg->mem_config; + + /* ALL shared mem_config related INIT DONE */ + if (cfg->process_type == RTE_PROC_PRIMARY) + mcfg->magic = RTE_MAGIC; + + internal_config.init_complete = 1; +} + +void +eal_mcfg_wait_complete(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* wait until shared mem_config finish initialising */ + while (mcfg->magic != RTE_MAGIC) + rte_pause(); +} + +int +eal_mcfg_check_version(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* check if version from memconfig matches compiled in macro */ + if (mcfg->version != RTE_VERSION) + return -1; + + return 0; +} + +void +eal_mcfg_update_internal(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + internal_config.legacy_mem = mcfg->legacy_mem; + internal_config.single_file_segments = mcfg->single_file_segments; +} + +void +eal_mcfg_update_from_internal(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + mcfg->legacy_mem = internal_config.legacy_mem; + mcfg->single_file_segments = internal_config.single_file_segments; + /* record current DPDK version */ + mcfg->version = RTE_VERSION; +} + +void +rte_mcfg_mem_read_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); +} + +void +rte_mcfg_mem_read_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); +} + +void +rte_mcfg_mem_write_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); +} + +void +rte_mcfg_mem_write_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); +} + +void +rte_mcfg_tailq_read_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_lock(&mcfg->qlock); +} + +void +rte_mcfg_tailq_read_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_unlock(&mcfg->qlock); +} + +void +rte_mcfg_tailq_write_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_lock(&mcfg->qlock); +} + +void +rte_mcfg_tailq_write_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_unlock(&mcfg->qlock); +} + +void +rte_mcfg_mempool_read_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_lock(&mcfg->mplock); +} + +void +rte_mcfg_mempool_read_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_unlock(&mcfg->mplock); +} + +void +rte_mcfg_mempool_write_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_lock(&mcfg->mplock); +} + +void +rte_mcfg_mempool_write_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_unlock(&mcfg->mplock); +} + +void +rte_mcfg_timer_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_spinlock_lock(&mcfg->tlock); +} + +void +rte_mcfg_timer_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_spinlock_unlock(&mcfg->tlock); +} + +bool +rte_mcfg_get_single_file_segments(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + return (bool)mcfg->single_file_segments; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c new file mode 100644 index 000000000..55189d072 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c @@ -0,0 +1,363 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include <string.h> + +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_fbarray.h> +#include <rte_memzone.h> +#include <rte_memory.h> +#include <rte_string_fns.h> +#include <rte_rwlock.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" + +struct mem_event_callback_entry { + TAILQ_ENTRY(mem_event_callback_entry) next; + char name[RTE_MEM_EVENT_CALLBACK_NAME_LEN]; + rte_mem_event_callback_t clb; + void *arg; +}; + +struct mem_alloc_validator_entry { + TAILQ_ENTRY(mem_alloc_validator_entry) next; + char name[RTE_MEM_ALLOC_VALIDATOR_NAME_LEN]; + rte_mem_alloc_validator_t clb; + int socket_id; + size_t limit; +}; + +/** Double linked list of actions. */ +TAILQ_HEAD(mem_event_callback_entry_list, mem_event_callback_entry); +TAILQ_HEAD(mem_alloc_validator_entry_list, mem_alloc_validator_entry); + +static struct mem_event_callback_entry_list mem_event_callback_list = + TAILQ_HEAD_INITIALIZER(mem_event_callback_list); +static rte_rwlock_t mem_event_rwlock = RTE_RWLOCK_INITIALIZER; + +static struct mem_alloc_validator_entry_list mem_alloc_validator_list = + TAILQ_HEAD_INITIALIZER(mem_alloc_validator_list); +static rte_rwlock_t mem_alloc_validator_rwlock = RTE_RWLOCK_INITIALIZER; + +static struct mem_event_callback_entry * +find_mem_event_callback(const char *name, void *arg) +{ + struct mem_event_callback_entry *r; + + TAILQ_FOREACH(r, &mem_event_callback_list, next) { + if (!strcmp(r->name, name) && r->arg == arg) + break; + } + return r; +} + +static struct mem_alloc_validator_entry * +find_mem_alloc_validator(const char *name, int socket_id) +{ + struct mem_alloc_validator_entry *r; + + TAILQ_FOREACH(r, &mem_alloc_validator_list, next) { + if (!strcmp(r->name, name) && r->socket_id == socket_id) + break; + } + return r; +} + +bool +eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start, + size_t len) +{ + void *end, *aligned_start, *aligned_end; + size_t pgsz = (size_t)msl->page_sz; + const struct rte_memseg *ms; + + /* for IOVA_VA, it's always contiguous */ + if (rte_eal_iova_mode() == RTE_IOVA_VA && !msl->external) + return true; + + /* for legacy memory, it's always contiguous */ + if (internal_config.legacy_mem) + return true; + + end = RTE_PTR_ADD(start, len); + + /* for nohuge, we check pagemap, otherwise check memseg */ + if (!rte_eal_has_hugepages()) { + rte_iova_t cur, expected; + + aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz); + aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz); + + /* if start and end are on the same page, bail out early */ + if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz) + return true; + + /* skip first iteration */ + cur = rte_mem_virt2iova(aligned_start); + expected = cur + pgsz; + aligned_start = RTE_PTR_ADD(aligned_start, pgsz); + + while (aligned_start < aligned_end) { + cur = rte_mem_virt2iova(aligned_start); + if (cur != expected) + return false; + aligned_start = RTE_PTR_ADD(aligned_start, pgsz); + expected += pgsz; + } + } else { + int start_seg, end_seg, cur_seg; + rte_iova_t cur, expected; + + aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz); + aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz); + + start_seg = RTE_PTR_DIFF(aligned_start, msl->base_va) / + pgsz; + end_seg = RTE_PTR_DIFF(aligned_end, msl->base_va) / + pgsz; + + /* if start and end are on the same page, bail out early */ + if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz) + return true; + + /* skip first iteration */ + ms = rte_fbarray_get(&msl->memseg_arr, start_seg); + cur = ms->iova; + expected = cur + pgsz; + + /* if we can't access IOVA addresses, assume non-contiguous */ + if (cur == RTE_BAD_IOVA) + return false; + + for (cur_seg = start_seg + 1; cur_seg < end_seg; + cur_seg++, expected += pgsz) { + ms = rte_fbarray_get(&msl->memseg_arr, cur_seg); + + if (ms->iova != expected) + return false; + } + } + return true; +} + +int +eal_memalloc_mem_event_callback_register(const char *name, + rte_mem_event_callback_t clb, void *arg) +{ + struct mem_event_callback_entry *entry; + int ret, len; + if (name == NULL || clb == NULL) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_event_rwlock); + + entry = find_mem_event_callback(name, arg); + if (entry != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + ret = -1; + goto unlock; + } + + /* callback successfully created and is valid, add it to the list */ + entry->clb = clb; + entry->arg = arg; + strlcpy(entry->name, name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + TAILQ_INSERT_TAIL(&mem_event_callback_list, entry, next); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem event callback '%s:%p' registered\n", + name, arg); + +unlock: + rte_rwlock_write_unlock(&mem_event_rwlock); + return ret; +} + +int +eal_memalloc_mem_event_callback_unregister(const char *name, void *arg) +{ + struct mem_event_callback_entry *entry; + int ret, len; + + if (name == NULL) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_event_rwlock); + + entry = find_mem_event_callback(name, arg); + if (entry == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + TAILQ_REMOVE(&mem_event_callback_list, entry, next); + free(entry); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem event callback '%s:%p' unregistered\n", + name, arg); + +unlock: + rte_rwlock_write_unlock(&mem_event_rwlock); + return ret; +} + +void +eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start, + size_t len) +{ + struct mem_event_callback_entry *entry; + + rte_rwlock_read_lock(&mem_event_rwlock); + + TAILQ_FOREACH(entry, &mem_event_callback_list, next) { + RTE_LOG(DEBUG, EAL, "Calling mem event callback '%s:%p'\n", + entry->name, entry->arg); + entry->clb(event, start, len, entry->arg); + } + + rte_rwlock_read_unlock(&mem_event_rwlock); +} + +int +eal_memalloc_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit) +{ + struct mem_alloc_validator_entry *entry; + int ret, len; + if (name == NULL || clb == NULL || socket_id < 0) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_alloc_validator_rwlock); + + entry = find_mem_alloc_validator(name, socket_id); + if (entry != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + ret = -1; + goto unlock; + } + + /* callback successfully created and is valid, add it to the list */ + entry->clb = clb; + entry->socket_id = socket_id; + entry->limit = limit; + strlcpy(entry->name, name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + TAILQ_INSERT_TAIL(&mem_alloc_validator_list, entry, next); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i with limit %zu registered\n", + name, socket_id, limit); + +unlock: + rte_rwlock_write_unlock(&mem_alloc_validator_rwlock); + return ret; +} + +int +eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id) +{ + struct mem_alloc_validator_entry *entry; + int ret, len; + + if (name == NULL || socket_id < 0) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_alloc_validator_rwlock); + + entry = find_mem_alloc_validator(name, socket_id); + if (entry == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + TAILQ_REMOVE(&mem_alloc_validator_list, entry, next); + free(entry); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i unregistered\n", + name, socket_id); + +unlock: + rte_rwlock_write_unlock(&mem_alloc_validator_rwlock); + return ret; +} + +int +eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len) +{ + struct mem_alloc_validator_entry *entry; + int ret = 0; + + rte_rwlock_read_lock(&mem_alloc_validator_rwlock); + + TAILQ_FOREACH(entry, &mem_alloc_validator_list, next) { + if (entry->socket_id != socket_id || entry->limit > new_len) + continue; + RTE_LOG(DEBUG, EAL, "Calling mem alloc validator '%s' on socket %i\n", + entry->name, entry->socket_id); + if (entry->clb(socket_id, entry->limit, new_len) < 0) + ret = -1; + } + + rte_rwlock_read_unlock(&mem_alloc_validator_rwlock); + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c new file mode 100644 index 000000000..4c897a13f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c @@ -0,0 +1,939 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <fcntl.h> +#include <errno.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <unistd.h> +#include <inttypes.h> +#include <sys/mman.h> +#include <sys/queue.h> + +#include <rte_fbarray.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_errno.h> +#include <rte_log.h> + +#include "eal_memalloc.h" +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_memcfg.h" +#include "malloc_heap.h" + +/* + * Try to mmap *size bytes in /dev/zero. If it is successful, return the + * pointer to the mmap'd area and keep *size unmodified. Else, retry + * with a smaller zone: decrease *size by hugepage_sz until it reaches + * 0. In this case, return NULL. Note: this function returns an address + * which is a multiple of hugepage size. + */ + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" + +static void *next_baseaddr; +static uint64_t system_page_sz; + +#ifdef RTE_EXEC_ENV_LINUX +#define RTE_DONTDUMP MADV_DONTDUMP +#elif defined RTE_EXEC_ENV_FREEBSD +#define RTE_DONTDUMP MADV_NOCORE +#else +#error "madvise doesn't support this OS" +#endif + +#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5 +void * +eal_get_virtual_area(void *requested_addr, size_t *size, + size_t page_sz, int flags, int mmap_flags) +{ + bool addr_is_hint, allow_shrink, unmap, no_align; + uint64_t map_sz; + void *mapped_addr, *aligned_addr; + uint8_t try = 0; + + if (system_page_sz == 0) + system_page_sz = sysconf(_SC_PAGESIZE); + + mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS; + + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); + + addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0; + allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0; + unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0; + + if (next_baseaddr == NULL && internal_config.base_virtaddr != 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) internal_config.base_virtaddr; + +#ifdef RTE_ARCH_64 + if (next_baseaddr == NULL && internal_config.base_virtaddr == 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) eal_get_baseaddr(); +#endif + if (requested_addr == NULL && next_baseaddr != NULL) { + requested_addr = next_baseaddr; + requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz); + addr_is_hint = true; + } + + /* we don't need alignment of resulting pointer in the following cases: + * + * 1. page size is equal to system size + * 2. we have a requested address, and it is page-aligned, and we will + * be discarding the address if we get a different one. + * + * for all other cases, alignment is potentially necessary. + */ + no_align = (requested_addr != NULL && + requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) && + !addr_is_hint) || + page_sz == system_page_sz; + + do { + map_sz = no_align ? *size : *size + page_sz; + if (map_sz > SIZE_MAX) { + RTE_LOG(ERR, EAL, "Map size too big\n"); + rte_errno = E2BIG; + return NULL; + } + + mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE, + mmap_flags, -1, 0); + if (mapped_addr == MAP_FAILED && allow_shrink) + *size -= page_sz; + + if (mapped_addr != MAP_FAILED && addr_is_hint && + mapped_addr != requested_addr) { + try++; + next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz); + if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) { + /* hint was not used. Try with another offset */ + munmap(mapped_addr, map_sz); + mapped_addr = MAP_FAILED; + requested_addr = next_baseaddr; + } + } + } while ((allow_shrink || addr_is_hint) && + mapped_addr == MAP_FAILED && *size > 0); + + /* align resulting address - if map failed, we will ignore the value + * anyway, so no need to add additional checks. + */ + aligned_addr = no_align ? mapped_addr : + RTE_PTR_ALIGN(mapped_addr, page_sz); + + if (*size == 0) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n", + strerror(errno)); + rte_errno = errno; + return NULL; + } else if (mapped_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n", + strerror(errno)); + /* pass errno up the call chain */ + rte_errno = errno; + return NULL; + } else if (requested_addr != NULL && !addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n", + requested_addr, aligned_addr); + munmap(mapped_addr, map_sz); + rte_errno = EADDRNOTAVAIL; + return NULL; + } else if (requested_addr != NULL && addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n", + requested_addr, aligned_addr); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n"); + } else if (next_baseaddr != NULL) { + next_baseaddr = RTE_PTR_ADD(aligned_addr, *size); + } + + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", + aligned_addr, *size); + + if (unmap) { + munmap(mapped_addr, map_sz); + } else if (!no_align) { + void *map_end, *aligned_end; + size_t before_len, after_len; + + /* when we reserve space with alignment, we add alignment to + * mapping size. On 32-bit, if 1GB alignment was requested, this + * would waste 1GB of address space, which is a luxury we cannot + * afford. so, if alignment was performed, check if any unneeded + * address space can be unmapped back. + */ + + map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz); + aligned_end = RTE_PTR_ADD(aligned_addr, *size); + + /* unmap space before aligned mmap address */ + before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr); + if (before_len > 0) + munmap(mapped_addr, before_len); + + /* unmap space after aligned end mmap address */ + after_len = RTE_PTR_DIFF(map_end, aligned_end); + if (after_len > 0) + munmap(aligned_end, after_len); + } + + if (!unmap) { + /* Exclude these pages from a core dump. */ + if (madvise(aligned_addr, *size, RTE_DONTDUMP) != 0) + RTE_LOG(DEBUG, EAL, "madvise failed: %s\n", + strerror(errno)); + } + + return aligned_addr; +} + +static struct rte_memseg * +virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + const struct rte_fbarray *arr; + void *start, *end; + int ms_idx; + + if (msl == NULL) + return NULL; + + /* a memseg list was specified, check if it's the right one */ + start = msl->base_va; + end = RTE_PTR_ADD(start, msl->len); + + if (addr < start || addr >= end) + return NULL; + + /* now, calculate index */ + arr = &msl->memseg_arr; + ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz; + return rte_fbarray_get(arr, ms_idx); +} + +static struct rte_memseg_list * +virt2memseg_list(const void *addr) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + int msl_idx; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + void *start, *end; + msl = &mcfg->memsegs[msl_idx]; + + start = msl->base_va; + end = RTE_PTR_ADD(start, msl->len); + if (addr >= start && addr < end) + break; + } + /* if we didn't find our memseg list */ + if (msl_idx == RTE_MAX_MEMSEG_LISTS) + return NULL; + return msl; +} + +struct rte_memseg_list * +rte_mem_virt2memseg_list(const void *addr) +{ + return virt2memseg_list(addr); +} + +struct virtiova { + rte_iova_t iova; + void *virt; +}; +static int +find_virt(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + struct virtiova *vi = arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; +} +static int +find_virt_legacy(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, size_t len, void *arg) +{ + struct virtiova *vi = arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; +} + +void * +rte_mem_iova2virt(rte_iova_t iova) +{ + struct virtiova vi; + + memset(&vi, 0, sizeof(vi)); + + vi.iova = iova; + /* for legacy mem, we can get away with scanning VA-contiguous segments, + * as we know they are PA-contiguous as well + */ + if (internal_config.legacy_mem) + rte_memseg_contig_walk(find_virt_legacy, &vi); + else + rte_memseg_walk(find_virt, &vi); + + return vi.virt; +} + +struct rte_memseg * +rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + return virt2memseg(addr, msl != NULL ? msl : + rte_mem_virt2memseg_list(addr)); +} + +static int +physmem_size(const struct rte_memseg_list *msl, void *arg) +{ + uint64_t *total_len = arg; + + if (msl->external) + return 0; + + *total_len += msl->memseg_arr.count * msl->page_sz; + + return 0; +} + +/* get the total size of memory */ +uint64_t +rte_eal_get_physmem_size(void) +{ + uint64_t total_len = 0; + + rte_memseg_list_walk(physmem_size, &total_len); + + return total_len; +} + +static int +dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx, ms_idx, fd; + FILE *f = arg; + + msl_idx = msl - mcfg->memsegs; + if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) + return -1; + + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) + return -1; + + fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx); + fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, " + "virt:%p, socket_id:%"PRId32", " + "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " + "nrank:%"PRIx32" fd:%i\n", + msl_idx, ms_idx, + ms->iova, + ms->len, + ms->addr, + ms->socket_id, + ms->hugepage_sz, + ms->nchannel, + ms->nrank, + fd); + + return 0; +} + +/* + * Defining here because declared in rte_memory.h, but the actual implementation + * is in eal_common_memalloc.c, like all other memalloc internals. + */ +int +rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb, + void *arg) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_register(name, clb, arg); +} + +int +rte_mem_event_callback_unregister(const char *name, void *arg) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_unregister(name, arg); +} + +int +rte_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id, + limit); +} + +int +rte_mem_alloc_validator_unregister(const char *name, int socket_id) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_alloc_validator_unregister(name, socket_id); +} + +/* Dump the physical memory layout on console */ +void +rte_dump_physmem_layout(FILE *f) +{ + rte_memseg_walk(dump_memseg, f); +} + +static int +check_iova(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + uint64_t *mask = arg; + rte_iova_t iova; + + /* higher address within segment */ + iova = (ms->iova + ms->len) - 1; + if (!(iova & *mask)) + return 0; + + RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n", + ms->iova, ms->len); + + RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask); + return 1; +} + +#define MAX_DMA_MASK_BITS 63 + +/* check memseg iovas are within the required range based on dma mask */ +static int +check_dma_mask(uint8_t maskbits, bool thread_unsafe) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint64_t mask; + int ret; + + /* Sanity check. We only check width can be managed with 64 bits + * variables. Indeed any higher value is likely wrong. */ + if (maskbits > MAX_DMA_MASK_BITS) { + RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n", + maskbits, MAX_DMA_MASK_BITS); + return -1; + } + + /* create dma mask */ + mask = ~((1ULL << maskbits) - 1); + + if (thread_unsafe) + ret = rte_memseg_walk_thread_unsafe(check_iova, &mask); + else + ret = rte_memseg_walk(check_iova, &mask); + + if (ret) + /* + * Dma mask precludes hugepage usage. + * This device can not be used and we do not need to keep + * the dma mask. + */ + return 1; + + /* + * we need to keep the more restricted maskbit for checking + * potential dynamic memory allocation in the future. + */ + mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits : + RTE_MIN(mcfg->dma_maskbits, maskbits); + + return 0; +} + +int +rte_mem_check_dma_mask(uint8_t maskbits) +{ + return check_dma_mask(maskbits, false); +} + +int +rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits) +{ + return check_dma_mask(maskbits, true); +} + +/* + * Set dma mask to use when memory initialization is done. + * + * This function should ONLY be used by code executed before the memory + * initialization. PMDs should use rte_mem_check_dma_mask if addressing + * limitations by the device. + */ +void +rte_mem_set_dma_mask(uint8_t maskbits) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits : + RTE_MIN(mcfg->dma_maskbits, maskbits); +} + +/* return the number of memory channels */ +unsigned rte_memory_get_nchannel(void) +{ + return rte_eal_get_configuration()->mem_config->nchannel; +} + +/* return the number of memory rank */ +unsigned rte_memory_get_nrank(void) +{ + return rte_eal_get_configuration()->mem_config->nrank; +} + +static int +rte_eal_memdevice_init(void) +{ + struct rte_config *config; + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + return 0; + + config = rte_eal_get_configuration(); + config->mem_config->nchannel = internal_config.force_nchannel; + config->mem_config->nrank = internal_config.force_nrank; + + return 0; +} + +/* Lock page in physical memory and prevent from swapping. */ +int +rte_mem_lock_page(const void *virt) +{ + unsigned long virtual = (unsigned long)virt; + int page_size = getpagesize(); + unsigned long aligned = (virtual & ~(page_size - 1)); + return mlock((void *)aligned, page_size); +} + +int +rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + int n_segs; + size_t len; + + ms = rte_fbarray_get(arr, ms_idx); + + /* find how many more segments there are, starting with + * this one. + */ + n_segs = rte_fbarray_find_contig_used(arr, ms_idx); + len = n_segs * msl->page_sz; + + ret = func(msl, ms, len, arg); + if (ret) + return ret; + ms_idx = rte_fbarray_find_next_used(arr, + ms_idx + n_segs); + } + } + return 0; +} + +int +rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) +{ + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_mcfg_mem_read_lock(); + ret = rte_memseg_contig_walk_thread_unsafe(func, arg); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + ms = rte_fbarray_get(arr, ms_idx); + ret = func(msl, ms, arg); + if (ret) + return ret; + ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1); + } + } + return 0; +} + +int +rte_memseg_walk(rte_memseg_walk_t func, void *arg) +{ + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_mcfg_mem_read_lock(); + ret = rte_memseg_walk_thread_unsafe(func, arg); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->base_va == NULL) + continue; + + ret = func(msl, arg); + if (ret) + return ret; + } + return 0; +} + +int +rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) +{ + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_mcfg_mem_read_lock(); + ret = rte_memseg_list_walk_thread_unsafe(func, arg); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + /* segment fd API is not supported for external segments */ + if (msl->external) { + rte_errno = ENOTSUP; + return -1; + } + + ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int +rte_memseg_get_fd(const struct rte_memseg *ms) +{ + int ret; + + rte_mcfg_mem_read_lock(); + ret = rte_memseg_get_fd_thread_unsafe(ms); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms, + size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL || offset == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + /* segment fd API is not supported for external segments */ + if (msl->external) { + rte_errno = ENOTSUP; + return -1; + } + + ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int +rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset) +{ + int ret; + + rte_mcfg_mem_read_lock(); + ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[], + unsigned int n_pages, size_t page_sz) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int socket_id, n; + int ret = 0; + + if (va_addr == NULL || page_sz == 0 || len == 0 || + !rte_is_power_of_2(page_sz) || + RTE_ALIGN(len, page_sz) != len || + ((len / page_sz) != n_pages && iova_addrs != NULL) || + !rte_is_aligned(va_addr, page_sz)) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* make sure the segment doesn't already exist */ + if (malloc_heap_find_external_seg(va_addr, len) != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + /* get next available socket ID */ + socket_id = mcfg->next_socket_id; + if (socket_id > INT32_MAX) { + RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n"); + rte_errno = ENOSPC; + ret = -1; + goto unlock; + } + + /* we can create a new memseg */ + n = len / page_sz; + if (malloc_heap_create_external_seg(va_addr, iova_addrs, n, + page_sz, "extmem", socket_id) == NULL) { + ret = -1; + goto unlock; + } + + /* memseg list successfully created - increment next socket ID */ + mcfg->next_socket_id++; +unlock: + rte_mcfg_mem_write_unlock(); + return ret; +} + +int +rte_extmem_unregister(void *va_addr, size_t len) +{ + struct rte_memseg_list *msl; + int ret = 0; + + if (va_addr == NULL || len == 0) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* find our segment */ + msl = malloc_heap_find_external_seg(va_addr, len); + if (msl == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + + ret = malloc_heap_destroy_external_seg(msl); +unlock: + rte_mcfg_mem_write_unlock(); + return ret; +} + +static int +sync_memory(void *va_addr, size_t len, bool attach) +{ + struct rte_memseg_list *msl; + int ret = 0; + + if (va_addr == NULL || len == 0) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* find our segment */ + msl = malloc_heap_find_external_seg(va_addr, len); + if (msl == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (attach) + ret = rte_fbarray_attach(&msl->memseg_arr); + else + ret = rte_fbarray_detach(&msl->memseg_arr); + +unlock: + rte_mcfg_mem_write_unlock(); + return ret; +} + +int +rte_extmem_attach(void *va_addr, size_t len) +{ + return sync_memory(va_addr, len, true); +} + +int +rte_extmem_detach(void *va_addr, size_t len) +{ + return sync_memory(va_addr, len, false); +} + +/* init memory subsystem */ +int +rte_eal_memory_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int retval; + RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n"); + + if (!mcfg) + return -1; + + /* lock mem hotplug here, to prevent races while we init */ + rte_mcfg_mem_read_lock(); + + if (rte_eal_memseg_init() < 0) + goto fail; + + if (eal_memalloc_init() < 0) + goto fail; + + retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? + rte_eal_hugepage_init() : + rte_eal_hugepage_attach(); + if (retval < 0) + goto fail; + + if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0) + goto fail; + + return 0; +fail: + rte_mcfg_mem_read_unlock(); + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c new file mode 100644 index 000000000..7c21aa921 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c @@ -0,0 +1,420 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <stdarg.h> +#include <inttypes.h> +#include <string.h> +#include <errno.h> +#include <sys/queue.h> + +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_errno.h> +#include <rte_string_fns.h> +#include <rte_common.h> +#include <rte_eal_trace.h> + +#include "malloc_heap.h" +#include "malloc_elem.h" +#include "eal_private.h" +#include "eal_memcfg.h" + +static inline const struct rte_memzone * +memzone_lookup_thread_unsafe(const char *name) +{ + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + const struct rte_memzone *mz; + int i = 0; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + /* + * the algorithm is not optimal (linear), but there are few + * zones and this function should be called at init only + */ + i = rte_fbarray_find_next_used(arr, 0); + while (i >= 0) { + mz = rte_fbarray_get(arr, i); + if (mz->addr != NULL && + !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE)) + return mz; + i = rte_fbarray_find_next_used(arr, i + 1); + } + return NULL; +} + +static const struct rte_memzone * +memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, + int socket_id, unsigned int flags, unsigned int align, + unsigned int bound) +{ + struct rte_memzone *mz; + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + void *mz_addr; + size_t requested_len; + int mz_idx; + bool contig; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + /* no more room in config */ + if (arr->count >= arr->len) { + RTE_LOG(ERR, EAL, + "%s(): Number of requested memzone segments exceeds RTE_MAX_MEMZONE\n", + __func__); + rte_errno = ENOSPC; + return NULL; + } + + if (strlen(name) > sizeof(mz->name) - 1) { + RTE_LOG(DEBUG, EAL, "%s(): memzone <%s>: name too long\n", + __func__, name); + rte_errno = ENAMETOOLONG; + return NULL; + } + + /* zone already exist */ + if ((memzone_lookup_thread_unsafe(name)) != NULL) { + RTE_LOG(DEBUG, EAL, "%s(): memzone <%s> already exists\n", + __func__, name); + rte_errno = EEXIST; + return NULL; + } + + /* if alignment is not a power of two */ + if (align && !rte_is_power_of_2(align)) { + RTE_LOG(ERR, EAL, "%s(): Invalid alignment: %u\n", __func__, + align); + rte_errno = EINVAL; + return NULL; + } + + /* alignment less than cache size is not allowed */ + if (align < RTE_CACHE_LINE_SIZE) + align = RTE_CACHE_LINE_SIZE; + + /* align length on cache boundary. Check for overflow before doing so */ + if (len > SIZE_MAX - RTE_CACHE_LINE_MASK) { + rte_errno = EINVAL; /* requested size too big */ + return NULL; + } + + len = RTE_ALIGN_CEIL(len, RTE_CACHE_LINE_SIZE); + + /* save minimal requested length */ + requested_len = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, len); + + /* check that boundary condition is valid */ + if (bound != 0 && (requested_len > bound || !rte_is_power_of_2(bound))) { + rte_errno = EINVAL; + return NULL; + } + + if ((socket_id != SOCKET_ID_ANY) && socket_id < 0) { + rte_errno = EINVAL; + return NULL; + } + + /* only set socket to SOCKET_ID_ANY if we aren't allocating for an + * external heap. + */ + if (!rte_eal_has_hugepages() && socket_id < RTE_MAX_NUMA_NODES) + socket_id = SOCKET_ID_ANY; + + contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0; + /* malloc only cares about size flags, remove contig flag from flags */ + flags &= ~RTE_MEMZONE_IOVA_CONTIG; + + if (len == 0 && bound == 0) { + /* no size constraints were placed, so use malloc elem len */ + requested_len = 0; + mz_addr = malloc_heap_alloc_biggest(NULL, socket_id, flags, + align, contig); + } else { + if (len == 0) + requested_len = bound; + /* allocate memory on heap */ + mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, + flags, align, bound, contig); + } + if (mz_addr == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + struct malloc_elem *elem = malloc_elem_from_data(mz_addr); + + /* fill the zone in config */ + mz_idx = rte_fbarray_find_next_free(arr, 0); + + if (mz_idx < 0) { + mz = NULL; + } else { + rte_fbarray_set_used(arr, mz_idx); + mz = rte_fbarray_get(arr, mz_idx); + } + + if (mz == NULL) { + RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone\n", __func__); + malloc_heap_free(elem); + rte_errno = ENOSPC; + return NULL; + } + + strlcpy(mz->name, name, sizeof(mz->name)); + mz->iova = rte_malloc_virt2iova(mz_addr); + mz->addr = mz_addr; + mz->len = requested_len == 0 ? + elem->size - elem->pad - MALLOC_ELEM_OVERHEAD : + requested_len; + mz->hugepage_sz = elem->msl->page_sz; + mz->socket_id = elem->msl->socket_id; + mz->flags = 0; + + return mz; +} + +static const struct rte_memzone * +rte_memzone_reserve_thread_safe(const char *name, size_t len, int socket_id, + unsigned int flags, unsigned int align, unsigned int bound) +{ + struct rte_mem_config *mcfg; + const struct rte_memzone *mz = NULL; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_write_lock(&mcfg->mlock); + + mz = memzone_reserve_aligned_thread_unsafe( + name, len, socket_id, flags, align, bound); + + rte_eal_trace_memzone_reserve(name, len, socket_id, flags, align, + bound, mz); + + rte_rwlock_write_unlock(&mcfg->mlock); + + return mz; +} + +/* + * Return a pointer to a correctly filled memzone descriptor (with a + * specified alignment and boundary). If the allocation cannot be done, + * return NULL. + */ +const struct rte_memzone * +rte_memzone_reserve_bounded(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align, unsigned bound) +{ + return rte_memzone_reserve_thread_safe(name, len, socket_id, flags, + align, bound); +} + +/* + * Return a pointer to a correctly filled memzone descriptor (with a + * specified alignment). If the allocation cannot be done, return NULL. + */ +const struct rte_memzone * +rte_memzone_reserve_aligned(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align) +{ + return rte_memzone_reserve_thread_safe(name, len, socket_id, flags, + align, 0); +} + +/* + * Return a pointer to a correctly filled memzone descriptor. If the + * allocation cannot be done, return NULL. + */ +const struct rte_memzone * +rte_memzone_reserve(const char *name, size_t len, int socket_id, + unsigned flags) +{ + return rte_memzone_reserve_thread_safe(name, len, socket_id, + flags, RTE_CACHE_LINE_SIZE, 0); +} + +int +rte_memzone_free(const struct rte_memzone *mz) +{ + char name[RTE_MEMZONE_NAMESIZE]; + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + struct rte_memzone *found_mz; + int ret = 0; + void *addr = NULL; + unsigned idx; + + if (mz == NULL) + return -EINVAL; + + rte_strlcpy(name, mz->name, RTE_MEMZONE_NAMESIZE); + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + rte_rwlock_write_lock(&mcfg->mlock); + + idx = rte_fbarray_find_idx(arr, mz); + found_mz = rte_fbarray_get(arr, idx); + + if (found_mz == NULL) { + ret = -EINVAL; + } else if (found_mz->addr == NULL) { + RTE_LOG(ERR, EAL, "Memzone is not allocated\n"); + ret = -EINVAL; + } else { + addr = found_mz->addr; + memset(found_mz, 0, sizeof(*found_mz)); + rte_fbarray_set_free(arr, idx); + } + + rte_rwlock_write_unlock(&mcfg->mlock); + + if (addr != NULL) + rte_free(addr); + + rte_eal_trace_memzone_free(name, addr, ret); + return ret; +} + +/* + * Lookup for the memzone identified by the given name + */ +const struct rte_memzone * +rte_memzone_lookup(const char *name) +{ + struct rte_mem_config *mcfg; + const struct rte_memzone *memzone = NULL; + + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_read_lock(&mcfg->mlock); + + memzone = memzone_lookup_thread_unsafe(name); + + rte_rwlock_read_unlock(&mcfg->mlock); + + rte_eal_trace_memzone_lookup(name, memzone); + return memzone; +} + +static void +dump_memzone(const struct rte_memzone *mz, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl = NULL; + void *cur_addr, *mz_end; + struct rte_memseg *ms; + int mz_idx, ms_idx; + size_t page_sz; + FILE *f = arg; + + mz_idx = rte_fbarray_find_idx(&mcfg->memzones, mz); + + fprintf(f, "Zone %u: name:<%s>, len:0x%zx, virt:%p, " + "socket_id:%"PRId32", flags:%"PRIx32"\n", + mz_idx, + mz->name, + mz->len, + mz->addr, + mz->socket_id, + mz->flags); + + /* go through each page occupied by this memzone */ + msl = rte_mem_virt2memseg_list(mz->addr); + if (!msl) { + RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n"); + return; + } + page_sz = (size_t)mz->hugepage_sz; + cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz); + mz_end = RTE_PTR_ADD(cur_addr, mz->len); + + fprintf(f, "physical segments used:\n"); + ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz; + ms = rte_fbarray_get(&msl->memseg_arr, ms_idx); + + do { + fprintf(f, " addr: %p iova: 0x%" PRIx64 " " + "len: 0x%zx " + "pagesz: 0x%zx\n", + cur_addr, ms->iova, ms->len, page_sz); + + /* advance VA to next page */ + cur_addr = RTE_PTR_ADD(cur_addr, page_sz); + + /* memzones occupy contiguous segments */ + ++ms; + } while (cur_addr < mz_end); +} + +/* Dump all reserved memory zones on console */ +void +rte_memzone_dump(FILE *f) +{ + rte_memzone_walk(dump_memzone, f); +} + +/* + * Init the memzone subsystem + */ +int +rte_eal_memzone_init(void) +{ + struct rte_mem_config *mcfg; + int ret = 0; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_write_lock(&mcfg->mlock); + + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + rte_fbarray_init(&mcfg->memzones, "memzone", + RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) { + RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n"); + ret = -1; + } else if (rte_eal_process_type() == RTE_PROC_SECONDARY && + rte_fbarray_attach(&mcfg->memzones)) { + RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n"); + ret = -1; + } + + rte_rwlock_write_unlock(&mcfg->mlock); + + return ret; +} + +/* Walk all reserved memory zones */ +void rte_memzone_walk(void (*func)(const struct rte_memzone *, void *), + void *arg) +{ + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + int i; + + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + rte_rwlock_read_lock(&mcfg->mlock); + i = rte_fbarray_find_next_used(arr, 0); + while (i >= 0) { + struct rte_memzone *mz = rte_fbarray_get(arr, i); + (*func)(mz, arg); + i = rte_fbarray_find_next_used(arr, i + 1); + } + rte_rwlock_read_unlock(&mcfg->mlock); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c new file mode 100644 index 000000000..8f2cbd1c6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c @@ -0,0 +1,1861 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + */ + +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#ifndef RTE_EXEC_ENV_WINDOWS +#include <syslog.h> +#endif +#include <ctype.h> +#include <limits.h> +#include <errno.h> +#include <getopt.h> +#ifndef RTE_EXEC_ENV_WINDOWS +#include <dlfcn.h> +#endif +#include <sys/types.h> +#include <sys/stat.h> +#include <dirent.h> + +#include <rte_string_fns.h> +#include <rte_eal.h> +#include <rte_log.h> +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_tailq.h> +#include <rte_version.h> +#include <rte_devargs.h> +#include <rte_memcpy.h> +#ifndef RTE_EXEC_ENV_WINDOWS +#include <rte_telemetry.h> +#endif + +#include "eal_internal_cfg.h" +#include "eal_options.h" +#include "eal_filesystem.h" +#include "eal_private.h" +#ifndef RTE_EXEC_ENV_WINDOWS +#include "eal_trace.h" +#endif + +#define BITS_PER_HEX 4 +#define LCORE_OPT_LST 1 +#define LCORE_OPT_MSK 2 +#define LCORE_OPT_MAP 3 + +const char +eal_short_options[] = + "b:" /* pci-blacklist */ + "c:" /* coremask */ + "s:" /* service coremask */ + "d:" /* driver */ + "h" /* help */ + "l:" /* corelist */ + "S:" /* service corelist */ + "m:" /* memory size */ + "n:" /* memory channels */ + "r:" /* memory ranks */ + "v" /* version */ + "w:" /* pci-whitelist */ + ; + +const struct option +eal_long_options[] = { + {OPT_BASE_VIRTADDR, 1, NULL, OPT_BASE_VIRTADDR_NUM }, + {OPT_CREATE_UIO_DEV, 0, NULL, OPT_CREATE_UIO_DEV_NUM }, + {OPT_FILE_PREFIX, 1, NULL, OPT_FILE_PREFIX_NUM }, + {OPT_HELP, 0, NULL, OPT_HELP_NUM }, + {OPT_HUGE_DIR, 1, NULL, OPT_HUGE_DIR_NUM }, + {OPT_HUGE_UNLINK, 0, NULL, OPT_HUGE_UNLINK_NUM }, + {OPT_IOVA_MODE, 1, NULL, OPT_IOVA_MODE_NUM }, + {OPT_LCORES, 1, NULL, OPT_LCORES_NUM }, + {OPT_LOG_LEVEL, 1, NULL, OPT_LOG_LEVEL_NUM }, + {OPT_TRACE, 1, NULL, OPT_TRACE_NUM }, + {OPT_TRACE_DIR, 1, NULL, OPT_TRACE_DIR_NUM }, + {OPT_TRACE_BUF_SIZE, 1, NULL, OPT_TRACE_BUF_SIZE_NUM }, + {OPT_TRACE_MODE, 1, NULL, OPT_TRACE_MODE_NUM }, + {OPT_MASTER_LCORE, 1, NULL, OPT_MASTER_LCORE_NUM }, + {OPT_MBUF_POOL_OPS_NAME, 1, NULL, OPT_MBUF_POOL_OPS_NAME_NUM}, + {OPT_NO_HPET, 0, NULL, OPT_NO_HPET_NUM }, + {OPT_NO_HUGE, 0, NULL, OPT_NO_HUGE_NUM }, + {OPT_NO_PCI, 0, NULL, OPT_NO_PCI_NUM }, + {OPT_NO_SHCONF, 0, NULL, OPT_NO_SHCONF_NUM }, + {OPT_IN_MEMORY, 0, NULL, OPT_IN_MEMORY_NUM }, + {OPT_PCI_BLACKLIST, 1, NULL, OPT_PCI_BLACKLIST_NUM }, + {OPT_PCI_WHITELIST, 1, NULL, OPT_PCI_WHITELIST_NUM }, + {OPT_PROC_TYPE, 1, NULL, OPT_PROC_TYPE_NUM }, + {OPT_SOCKET_MEM, 1, NULL, OPT_SOCKET_MEM_NUM }, + {OPT_SOCKET_LIMIT, 1, NULL, OPT_SOCKET_LIMIT_NUM }, + {OPT_SYSLOG, 1, NULL, OPT_SYSLOG_NUM }, + {OPT_VDEV, 1, NULL, OPT_VDEV_NUM }, + {OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM }, + {OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM }, + {OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM }, + {OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM}, + {OPT_MATCH_ALLOCATIONS, 0, NULL, OPT_MATCH_ALLOCATIONS_NUM}, + {OPT_TELEMETRY, 0, NULL, OPT_TELEMETRY_NUM }, + {OPT_NO_TELEMETRY, 0, NULL, OPT_NO_TELEMETRY_NUM }, + {0, 0, NULL, 0 } +}; + +TAILQ_HEAD(shared_driver_list, shared_driver); + +/* Definition for shared object drivers. */ +struct shared_driver { + TAILQ_ENTRY(shared_driver) next; + + char name[PATH_MAX]; + void* lib_handle; +}; + +/* List of external loadable drivers */ +static struct shared_driver_list solib_list = +TAILQ_HEAD_INITIALIZER(solib_list); + +/* Default path of external loadable drivers */ +static const char *default_solib_dir = RTE_EAL_PMD_PATH; + +/* + * Stringified version of solib path used by dpdk-pmdinfo.py + * Note: PLEASE DO NOT ALTER THIS without making a corresponding + * change to usertools/dpdk-pmdinfo.py + */ +static const char dpdk_solib_path[] __rte_used = +"DPDK_PLUGIN_PATH=" RTE_EAL_PMD_PATH; + +TAILQ_HEAD(device_option_list, device_option); + +struct device_option { + TAILQ_ENTRY(device_option) next; + + enum rte_devtype type; + char arg[]; +}; + +static struct device_option_list devopt_list = +TAILQ_HEAD_INITIALIZER(devopt_list); + +static int master_lcore_parsed; +static int mem_parsed; +static int core_parsed; + +#ifndef RTE_EXEC_ENV_WINDOWS +static char **eal_args; +static char **eal_app_args; + +#define EAL_PARAM_REQ "/eal/params" +#define EAL_APP_PARAM_REQ "/eal/app_params" + +/* callback handler for telemetry library to report out EAL flags */ +int +handle_eal_info_request(const char *cmd, const char *params __rte_unused, + struct rte_tel_data *d) +{ + char **args; + int used = 0; + int i = 0; + + if (strcmp(cmd, EAL_PARAM_REQ) == 0) + args = eal_args; + else + args = eal_app_args; + + rte_tel_data_start_array(d, RTE_TEL_STRING_VAL); + if (args == NULL || args[0] == NULL) + return 0; + + for ( ; args[i] != NULL; i++) + used = rte_tel_data_add_array_string(d, args[i]); + return used; +} + +int +eal_save_args(int argc, char **argv) +{ + int i, j; + + rte_telemetry_register_cmd(EAL_PARAM_REQ, handle_eal_info_request, + "Returns EAL commandline parameters used. Takes no parameters"); + rte_telemetry_register_cmd(EAL_APP_PARAM_REQ, handle_eal_info_request, + "Returns app commandline parameters used. Takes no parameters"); + + /* clone argv to report out later. We overprovision, but + * this does not waste huge amounts of memory + */ + eal_args = calloc(argc + 1, sizeof(*eal_args)); + if (eal_args == NULL) + return -1; + + for (i = 0; i < argc; i++) { + eal_args[i] = strdup(argv[i]); + if (strcmp(argv[i], "--") == 0) + break; + } + eal_args[i++] = NULL; /* always finish with NULL */ + + /* allow reporting of any app args we know about too */ + if (i >= argc) + return 0; + + eal_app_args = calloc(argc - i + 1, sizeof(*eal_args)); + if (eal_app_args == NULL) + return -1; + + for (j = 0; i < argc; j++, i++) + eal_app_args[j] = strdup(argv[i]); + eal_app_args[j] = NULL; + + return 0; +} +#endif + +static int +eal_option_device_add(enum rte_devtype type, const char *optarg) +{ + struct device_option *devopt; + size_t optlen; + int ret; + + optlen = strlen(optarg) + 1; + devopt = calloc(1, sizeof(*devopt) + optlen); + if (devopt == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate device option\n"); + return -ENOMEM; + } + + devopt->type = type; + ret = strlcpy(devopt->arg, optarg, optlen); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Unable to copy device option\n"); + free(devopt); + return -EINVAL; + } + TAILQ_INSERT_TAIL(&devopt_list, devopt, next); + return 0; +} + +int +eal_option_device_parse(void) +{ + struct device_option *devopt; + void *tmp; + int ret = 0; + + TAILQ_FOREACH_SAFE(devopt, &devopt_list, next, tmp) { + if (ret == 0) { + ret = rte_devargs_add(devopt->type, devopt->arg); + if (ret) + RTE_LOG(ERR, EAL, "Unable to parse device '%s'\n", + devopt->arg); + } + TAILQ_REMOVE(&devopt_list, devopt, next); + free(devopt); + } + return ret; +} + +const char * +eal_get_hugefile_prefix(void) +{ + if (internal_config.hugefile_prefix != NULL) + return internal_config.hugefile_prefix; + return HUGEFILE_PREFIX_DEFAULT; +} + +void +eal_reset_internal_config(struct internal_config *internal_cfg) +{ + int i; + + internal_cfg->memory = 0; + internal_cfg->force_nrank = 0; + internal_cfg->force_nchannel = 0; + internal_cfg->hugefile_prefix = NULL; + internal_cfg->hugepage_dir = NULL; + internal_cfg->force_sockets = 0; + /* zero out the NUMA config */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->socket_mem[i] = 0; + internal_cfg->force_socket_limits = 0; + /* zero out the NUMA limits config */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->socket_limit[i] = 0; + /* zero out hugedir descriptors */ + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) { + memset(&internal_cfg->hugepage_info[i], 0, + sizeof(internal_cfg->hugepage_info[0])); + internal_cfg->hugepage_info[i].lock_descriptor = -1; + } + internal_cfg->base_virtaddr = 0; + +#ifdef LOG_DAEMON + internal_cfg->syslog_facility = LOG_DAEMON; +#endif + + /* if set to NONE, interrupt mode is determined automatically */ + internal_cfg->vfio_intr_mode = RTE_INTR_MODE_NONE; + +#ifdef RTE_LIBEAL_USE_HPET + internal_cfg->no_hpet = 0; +#else + internal_cfg->no_hpet = 1; +#endif + internal_cfg->vmware_tsc_map = 0; + internal_cfg->create_uio_dev = 0; + internal_cfg->iova_mode = RTE_IOVA_DC; + internal_cfg->user_mbuf_pool_ops_name = NULL; + CPU_ZERO(&internal_cfg->ctrl_cpuset); + internal_cfg->init_complete = 0; +} + +static int +eal_plugin_add(const char *path) +{ + struct shared_driver *solib; + + solib = malloc(sizeof(*solib)); + if (solib == NULL) { + RTE_LOG(ERR, EAL, "malloc(solib) failed\n"); + return -1; + } + memset(solib, 0, sizeof(*solib)); + strlcpy(solib->name, path, PATH_MAX-1); + solib->name[PATH_MAX-1] = 0; + TAILQ_INSERT_TAIL(&solib_list, solib, next); + + return 0; +} + +static int +eal_plugindir_init(const char *path) +{ + DIR *d = NULL; + struct dirent *dent = NULL; + char sopath[PATH_MAX]; + + if (path == NULL || *path == '\0') + return 0; + + d = opendir(path); + if (d == NULL) { + RTE_LOG(ERR, EAL, "failed to open directory %s: %s\n", + path, strerror(errno)); + return -1; + } + + while ((dent = readdir(d)) != NULL) { + struct stat sb; + + snprintf(sopath, sizeof(sopath), "%s/%s", path, dent->d_name); + + if (!(stat(sopath, &sb) == 0 && S_ISREG(sb.st_mode))) + continue; + + if (eal_plugin_add(sopath) == -1) + break; + } + + closedir(d); + /* XXX this ignores failures from readdir() itself */ + return (dent == NULL) ? 0 : -1; +} + +int +eal_plugins_init(void) +{ +#ifndef RTE_EXEC_ENV_WINDOWS + struct shared_driver *solib = NULL; + struct stat sb; + + if (*default_solib_dir != '\0' && stat(default_solib_dir, &sb) == 0 && + S_ISDIR(sb.st_mode)) + eal_plugin_add(default_solib_dir); + + TAILQ_FOREACH(solib, &solib_list, next) { + + if (stat(solib->name, &sb) == 0 && S_ISDIR(sb.st_mode)) { + if (eal_plugindir_init(solib->name) == -1) { + RTE_LOG(ERR, EAL, + "Cannot init plugin directory %s\n", + solib->name); + return -1; + } + } else { + RTE_LOG(DEBUG, EAL, "open shared lib %s\n", + solib->name); + solib->lib_handle = dlopen(solib->name, RTLD_NOW); + if (solib->lib_handle == NULL) { + RTE_LOG(ERR, EAL, "%s\n", dlerror()); + return -1; + } + } + + } + return 0; +#endif +} + +/* + * Parse the coremask given as argument (hexadecimal string) and fill + * the global configuration (core role and core count) with the parsed + * value. + */ +static int xdigit2val(unsigned char c) +{ + int val; + + if (isdigit(c)) + val = c - '0'; + else if (isupper(c)) + val = c - 'A' + 10; + else + val = c - 'a' + 10; + return val; +} + +static int +eal_parse_service_coremask(const char *coremask) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + int i, j, idx = 0; + unsigned int count = 0; + char c; + int val; + uint32_t taken_lcore_count = 0; + + if (coremask == NULL) + return -1; + /* Remove all blank characters ahead and after . + * Remove 0x/0X if exists. + */ + while (isblank(*coremask)) + coremask++; + if (coremask[0] == '0' && ((coremask[1] == 'x') + || (coremask[1] == 'X'))) + coremask += 2; + i = strlen(coremask); + while ((i > 0) && isblank(coremask[i - 1])) + i--; + + if (i == 0) + return -1; + + for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE; i--) { + c = coremask[i]; + if (isxdigit(c) == 0) { + /* invalid characters */ + return -1; + } + val = xdigit2val(c); + for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE; + j++, idx++) { + if ((1 << j) & val) { + /* handle master lcore already parsed */ + uint32_t lcore = idx; + if (master_lcore_parsed && + cfg->master_lcore == lcore) { + RTE_LOG(ERR, EAL, + "lcore %u is master lcore, cannot use as service core\n", + idx); + return -1; + } + + if (eal_cpu_detected(idx) == 0) { + RTE_LOG(ERR, EAL, + "lcore %u unavailable\n", idx); + return -1; + } + + if (cfg->lcore_role[idx] == ROLE_RTE) + taken_lcore_count++; + + lcore_config[idx].core_role = ROLE_SERVICE; + count++; + } + } + } + + for (; i >= 0; i--) + if (coremask[i] != '0') + return -1; + + for (; idx < RTE_MAX_LCORE; idx++) + lcore_config[idx].core_index = -1; + + if (count == 0) + return -1; + + if (core_parsed && taken_lcore_count != count) { + RTE_LOG(WARNING, EAL, + "Not all service cores are in the coremask. " + "Please ensure -c or -l includes service cores\n"); + } + + cfg->service_lcore_count = count; + return 0; +} + +static int +eal_service_cores_parsed(void) +{ + int idx; + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + if (lcore_config[idx].core_role == ROLE_SERVICE) + return 1; + } + return 0; +} + +static int +update_lcore_config(int *cores) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + unsigned int count = 0; + unsigned int i; + int ret = 0; + + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (cores[i] != -1) { + if (eal_cpu_detected(i) == 0) { + RTE_LOG(ERR, EAL, "lcore %u unavailable\n", i); + ret = -1; + continue; + } + cfg->lcore_role[i] = ROLE_RTE; + count++; + } else { + cfg->lcore_role[i] = ROLE_OFF; + } + lcore_config[i].core_index = cores[i]; + } + if (!ret) + cfg->lcore_count = count; + return ret; +} + +static int +eal_parse_coremask(const char *coremask, int *cores) +{ + unsigned count = 0; + int i, j, idx; + int val; + char c; + + for (idx = 0; idx < RTE_MAX_LCORE; idx++) + cores[idx] = -1; + idx = 0; + + /* Remove all blank characters ahead and after . + * Remove 0x/0X if exists. + */ + while (isblank(*coremask)) + coremask++; + if (coremask[0] == '0' && ((coremask[1] == 'x') + || (coremask[1] == 'X'))) + coremask += 2; + i = strlen(coremask); + while ((i > 0) && isblank(coremask[i - 1])) + i--; + if (i == 0) + return -1; + + for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE; i--) { + c = coremask[i]; + if (isxdigit(c) == 0) { + /* invalid characters */ + return -1; + } + val = xdigit2val(c); + for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE; j++, idx++) + { + if ((1 << j) & val) { + cores[idx] = count; + count++; + } + } + } + for (; i >= 0; i--) + if (coremask[i] != '0') + return -1; + if (count == 0) + return -1; + return 0; +} + +static int +eal_parse_service_corelist(const char *corelist) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + int i, idx = 0; + unsigned count = 0; + char *end = NULL; + int min, max; + uint32_t taken_lcore_count = 0; + + if (corelist == NULL) + return -1; + + /* Remove all blank characters ahead and after */ + while (isblank(*corelist)) + corelist++; + i = strlen(corelist); + while ((i > 0) && isblank(corelist[i - 1])) + i--; + + /* Get list of cores */ + min = RTE_MAX_LCORE; + do { + while (isblank(*corelist)) + corelist++; + if (*corelist == '\0') + return -1; + errno = 0; + idx = strtoul(corelist, &end, 10); + if (errno || end == NULL) + return -1; + while (isblank(*end)) + end++; + if (*end == '-') { + min = idx; + } else if ((*end == ',') || (*end == '\0')) { + max = idx; + if (min == RTE_MAX_LCORE) + min = idx; + for (idx = min; idx <= max; idx++) { + if (cfg->lcore_role[idx] != ROLE_SERVICE) { + /* handle master lcore already parsed */ + uint32_t lcore = idx; + if (cfg->master_lcore == lcore && + master_lcore_parsed) { + RTE_LOG(ERR, EAL, + "Error: lcore %u is master lcore, cannot use as service core\n", + idx); + return -1; + } + if (cfg->lcore_role[idx] == ROLE_RTE) + taken_lcore_count++; + + lcore_config[idx].core_role = + ROLE_SERVICE; + count++; + } + } + min = RTE_MAX_LCORE; + } else + return -1; + corelist = end + 1; + } while (*end != '\0'); + + if (count == 0) + return -1; + + if (core_parsed && taken_lcore_count != count) { + RTE_LOG(WARNING, EAL, + "Not all service cores were in the coremask. " + "Please ensure -c or -l includes service cores\n"); + } + + return 0; +} + +static int +eal_parse_corelist(const char *corelist, int *cores) +{ + unsigned count = 0; + char *end = NULL; + int min, max; + int idx; + + for (idx = 0; idx < RTE_MAX_LCORE; idx++) + cores[idx] = -1; + + /* Remove all blank characters ahead */ + while (isblank(*corelist)) + corelist++; + + /* Get list of cores */ + min = RTE_MAX_LCORE; + do { + while (isblank(*corelist)) + corelist++; + if (*corelist == '\0') + return -1; + errno = 0; + idx = strtol(corelist, &end, 10); + if (errno || end == NULL) + return -1; + if (idx < 0 || idx >= RTE_MAX_LCORE) + return -1; + while (isblank(*end)) + end++; + if (*end == '-') { + min = idx; + } else if ((*end == ',') || (*end == '\0')) { + max = idx; + if (min == RTE_MAX_LCORE) + min = idx; + for (idx = min; idx <= max; idx++) { + if (cores[idx] == -1) { + cores[idx] = count; + count++; + } + } + min = RTE_MAX_LCORE; + } else + return -1; + corelist = end + 1; + } while (*end != '\0'); + + if (count == 0) + return -1; + return 0; +} + +/* Changes the lcore id of the master thread */ +static int +eal_parse_master_lcore(const char *arg) +{ + char *parsing_end; + struct rte_config *cfg = rte_eal_get_configuration(); + + errno = 0; + cfg->master_lcore = (uint32_t) strtol(arg, &parsing_end, 0); + if (errno || parsing_end[0] != 0) + return -1; + if (cfg->master_lcore >= RTE_MAX_LCORE) + return -1; + master_lcore_parsed = 1; + + /* ensure master core is not used as service core */ + if (lcore_config[cfg->master_lcore].core_role == ROLE_SERVICE) { + RTE_LOG(ERR, EAL, + "Error: Master lcore is used as a service core\n"); + return -1; + } + + return 0; +} + +/* + * Parse elem, the elem could be single number/range or '(' ')' group + * 1) A single number elem, it's just a simple digit. e.g. 9 + * 2) A single range elem, two digits with a '-' between. e.g. 2-6 + * 3) A group elem, combines multiple 1) or 2) with '( )'. e.g (0,2-4,6) + * Within group elem, '-' used for a range separator; + * ',' used for a single number. + */ +static int +eal_parse_set(const char *input, rte_cpuset_t *set) +{ + unsigned idx; + const char *str = input; + char *end = NULL; + unsigned min, max; + + CPU_ZERO(set); + + while (isblank(*str)) + str++; + + /* only digit or left bracket is qualify for start point */ + if ((!isdigit(*str) && *str != '(') || *str == '\0') + return -1; + + /* process single number or single range of number */ + if (*str != '(') { + errno = 0; + idx = strtoul(str, &end, 10); + if (errno || end == NULL || idx >= CPU_SETSIZE) + return -1; + else { + while (isblank(*end)) + end++; + + min = idx; + max = idx; + if (*end == '-') { + /* process single <number>-<number> */ + end++; + while (isblank(*end)) + end++; + if (!isdigit(*end)) + return -1; + + errno = 0; + idx = strtoul(end, &end, 10); + if (errno || end == NULL || idx >= CPU_SETSIZE) + return -1; + max = idx; + while (isblank(*end)) + end++; + if (*end != ',' && *end != '\0') + return -1; + } + + if (*end != ',' && *end != '\0' && + *end != '@') + return -1; + + for (idx = RTE_MIN(min, max); + idx <= RTE_MAX(min, max); idx++) + CPU_SET(idx, set); + + return end - input; + } + } + + /* process set within bracket */ + str++; + while (isblank(*str)) + str++; + if (*str == '\0') + return -1; + + min = RTE_MAX_LCORE; + do { + + /* go ahead to the first digit */ + while (isblank(*str)) + str++; + if (!isdigit(*str)) + return -1; + + /* get the digit value */ + errno = 0; + idx = strtoul(str, &end, 10); + if (errno || end == NULL || idx >= CPU_SETSIZE) + return -1; + + /* go ahead to separator '-',',' and ')' */ + while (isblank(*end)) + end++; + if (*end == '-') { + if (min == RTE_MAX_LCORE) + min = idx; + else /* avoid continuous '-' */ + return -1; + } else if ((*end == ',') || (*end == ')')) { + max = idx; + if (min == RTE_MAX_LCORE) + min = idx; + for (idx = RTE_MIN(min, max); + idx <= RTE_MAX(min, max); idx++) + CPU_SET(idx, set); + + min = RTE_MAX_LCORE; + } else + return -1; + + str = end + 1; + } while (*end != '\0' && *end != ')'); + + /* + * to avoid failure that tail blank makes end character check fail + * in eal_parse_lcores( ) + */ + while (isblank(*str)) + str++; + + return str - input; +} + +static int +check_cpuset(rte_cpuset_t *set) +{ + unsigned int idx; + + for (idx = 0; idx < CPU_SETSIZE; idx++) { + if (!CPU_ISSET(idx, set)) + continue; + + if (eal_cpu_detected(idx) == 0) { + RTE_LOG(ERR, EAL, "core %u " + "unavailable\n", idx); + return -1; + } + } + return 0; +} + +/* + * The format pattern: --lcores='<lcores[@cpus]>[<,lcores[@cpus]>...]' + * lcores, cpus could be a single digit/range or a group. + * '(' and ')' are necessary if it's a group. + * If not supply '@cpus', the value of cpus uses the same as lcores. + * e.g. '1,2@(5-7),(3-5)@(0,2),(0,6),7-8' means start 9 EAL thread as below + * lcore 0 runs on cpuset 0x41 (cpu 0,6) + * lcore 1 runs on cpuset 0x2 (cpu 1) + * lcore 2 runs on cpuset 0xe0 (cpu 5,6,7) + * lcore 3,4,5 runs on cpuset 0x5 (cpu 0,2) + * lcore 6 runs on cpuset 0x41 (cpu 0,6) + * lcore 7 runs on cpuset 0x80 (cpu 7) + * lcore 8 runs on cpuset 0x100 (cpu 8) + */ +static int +eal_parse_lcores(const char *lcores) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + rte_cpuset_t lcore_set; + unsigned int set_count; + unsigned idx = 0; + unsigned count = 0; + const char *lcore_start = NULL; + const char *end = NULL; + int offset; + rte_cpuset_t cpuset; + int lflags; + int ret = -1; + + if (lcores == NULL) + return -1; + + /* Remove all blank characters ahead and after */ + while (isblank(*lcores)) + lcores++; + + CPU_ZERO(&cpuset); + + /* Reset lcore config */ + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + cfg->lcore_role[idx] = ROLE_OFF; + lcore_config[idx].core_index = -1; + CPU_ZERO(&lcore_config[idx].cpuset); + } + + /* Get list of cores */ + do { + while (isblank(*lcores)) + lcores++; + if (*lcores == '\0') + goto err; + + lflags = 0; + + /* record lcore_set start point */ + lcore_start = lcores; + + /* go across a complete bracket */ + if (*lcore_start == '(') { + lcores += strcspn(lcores, ")"); + if (*lcores++ == '\0') + goto err; + } + + /* scan the separator '@', ','(next) or '\0'(finish) */ + lcores += strcspn(lcores, "@,"); + + if (*lcores == '@') { + /* explicit assign cpuset and update the end cursor */ + offset = eal_parse_set(lcores + 1, &cpuset); + if (offset < 0) + goto err; + end = lcores + 1 + offset; + } else { /* ',' or '\0' */ + /* haven't given cpuset, current loop done */ + end = lcores; + + /* go back to check <number>-<number> */ + offset = strcspn(lcore_start, "(-"); + if (offset < (end - lcore_start) && + *(lcore_start + offset) != '(') + lflags = 1; + } + + if (*end != ',' && *end != '\0') + goto err; + + /* parse lcore_set from start point */ + if (eal_parse_set(lcore_start, &lcore_set) < 0) + goto err; + + /* without '@', by default using lcore_set as cpuset */ + if (*lcores != '@') + rte_memcpy(&cpuset, &lcore_set, sizeof(cpuset)); + + set_count = CPU_COUNT(&lcore_set); + /* start to update lcore_set */ + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + if (!CPU_ISSET(idx, &lcore_set)) + continue; + set_count--; + + if (cfg->lcore_role[idx] != ROLE_RTE) { + lcore_config[idx].core_index = count; + cfg->lcore_role[idx] = ROLE_RTE; + count++; + } + + if (lflags) { + CPU_ZERO(&cpuset); + CPU_SET(idx, &cpuset); + } + + if (check_cpuset(&cpuset) < 0) + goto err; + rte_memcpy(&lcore_config[idx].cpuset, &cpuset, + sizeof(rte_cpuset_t)); + } + + /* some cores from the lcore_set can't be handled by EAL */ + if (set_count != 0) + goto err; + + lcores = end + 1; + } while (*end != '\0'); + + if (count == 0) + goto err; + + cfg->lcore_count = count; + ret = 0; + +err: + + return ret; +} + +#ifndef RTE_EXEC_ENV_WINDOWS +static int +eal_parse_syslog(const char *facility, struct internal_config *conf) +{ + int i; + static const struct { + const char *name; + int value; + } map[] = { + { "auth", LOG_AUTH }, + { "cron", LOG_CRON }, + { "daemon", LOG_DAEMON }, + { "ftp", LOG_FTP }, + { "kern", LOG_KERN }, + { "lpr", LOG_LPR }, + { "mail", LOG_MAIL }, + { "news", LOG_NEWS }, + { "syslog", LOG_SYSLOG }, + { "user", LOG_USER }, + { "uucp", LOG_UUCP }, + { "local0", LOG_LOCAL0 }, + { "local1", LOG_LOCAL1 }, + { "local2", LOG_LOCAL2 }, + { "local3", LOG_LOCAL3 }, + { "local4", LOG_LOCAL4 }, + { "local5", LOG_LOCAL5 }, + { "local6", LOG_LOCAL6 }, + { "local7", LOG_LOCAL7 }, + { NULL, 0 } + }; + + for (i = 0; map[i].name; i++) { + if (!strcmp(facility, map[i].name)) { + conf->syslog_facility = map[i].value; + return 0; + } + } + return -1; +} +#endif + +static int +eal_parse_log_priority(const char *level) +{ + static const char * const levels[] = { + [RTE_LOG_EMERG] = "emergency", + [RTE_LOG_ALERT] = "alert", + [RTE_LOG_CRIT] = "critical", + [RTE_LOG_ERR] = "error", + [RTE_LOG_WARNING] = "warning", + [RTE_LOG_NOTICE] = "notice", + [RTE_LOG_INFO] = "info", + [RTE_LOG_DEBUG] = "debug", + }; + size_t len = strlen(level); + unsigned long tmp; + char *end; + unsigned int i; + + if (len == 0) + return -1; + + /* look for named values, skip 0 which is not a valid level */ + for (i = 1; i < RTE_DIM(levels); i++) { + if (strncmp(levels[i], level, len) == 0) + return i; + } + + /* not a string, maybe it is numeric */ + errno = 0; + tmp = strtoul(level, &end, 0); + + /* check for errors */ + if (errno != 0 || end == NULL || *end != '\0' || + tmp >= UINT32_MAX) + return -1; + + return tmp; +} + +static int +eal_parse_log_level(const char *arg) +{ + const char *pattern = NULL; + const char *regex = NULL; + char *str, *level; + int priority; + + str = strdup(arg); + if (str == NULL) + return -1; + + if ((level = strchr(str, ','))) { + regex = str; + *level++ = '\0'; + } else if ((level = strchr(str, ':'))) { + pattern = str; + *level++ = '\0'; + } else { + level = str; + } + + priority = eal_parse_log_priority(level); + if (priority < 0) { + fprintf(stderr, "invalid log priority: %s\n", level); + goto fail; + } + + if (regex) { + if (rte_log_set_level_regexp(regex, priority) < 0) { + fprintf(stderr, "cannot set log level %s,%d\n", + regex, priority); + goto fail; + } + if (rte_log_save_regexp(regex, priority) < 0) + goto fail; + } else if (pattern) { + if (rte_log_set_level_pattern(pattern, priority) < 0) { + fprintf(stderr, "cannot set log level %s:%d\n", + pattern, priority); + goto fail; + } + if (rte_log_save_pattern(pattern, priority) < 0) + goto fail; + } else { + rte_log_set_global_level(priority); + } + + free(str); + return 0; + +fail: + free(str); + return -1; +} + +static enum rte_proc_type_t +eal_parse_proc_type(const char *arg) +{ + if (strncasecmp(arg, "primary", sizeof("primary")) == 0) + return RTE_PROC_PRIMARY; + if (strncasecmp(arg, "secondary", sizeof("secondary")) == 0) + return RTE_PROC_SECONDARY; + if (strncasecmp(arg, "auto", sizeof("auto")) == 0) + return RTE_PROC_AUTO; + + return RTE_PROC_INVALID; +} + +static int +eal_parse_iova_mode(const char *name) +{ + int mode; + + if (name == NULL) + return -1; + + if (!strcmp("pa", name)) + mode = RTE_IOVA_PA; + else if (!strcmp("va", name)) + mode = RTE_IOVA_VA; + else + return -1; + + internal_config.iova_mode = mode; + return 0; +} + +static int +eal_parse_base_virtaddr(const char *arg) +{ + char *end; + uint64_t addr; + + errno = 0; + addr = strtoull(arg, &end, 16); + + /* check for errors */ + if ((errno != 0) || (arg[0] == '\0') || end == NULL || (*end != '\0')) + return -1; + + /* make sure we don't exceed 32-bit boundary on 32-bit target */ +#ifndef RTE_ARCH_64 + if (addr >= UINTPTR_MAX) + return -1; +#endif + + /* align the addr on 16M boundary, 16MB is the minimum huge page + * size on IBM Power architecture. If the addr is aligned to 16MB, + * it can align to 2MB for x86. So this alignment can also be used + * on x86 and other architectures. + */ + internal_config.base_virtaddr = + RTE_PTR_ALIGN_CEIL((uintptr_t)addr, (size_t)RTE_PGSIZE_16M); + + return 0; +} + +/* caller is responsible for freeing the returned string */ +static char * +available_cores(void) +{ + char *str = NULL; + int previous; + int sequence; + char *tmp; + int idx; + + /* find the first available cpu */ + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + if (eal_cpu_detected(idx) == 0) + continue; + break; + } + if (idx >= RTE_MAX_LCORE) + return NULL; + + /* first sequence */ + if (asprintf(&str, "%d", idx) < 0) + return NULL; + previous = idx; + sequence = 0; + + for (idx++ ; idx < RTE_MAX_LCORE; idx++) { + if (eal_cpu_detected(idx) == 0) + continue; + + if (idx == previous + 1) { + previous = idx; + sequence = 1; + continue; + } + + /* finish current sequence */ + if (sequence) { + if (asprintf(&tmp, "%s-%d", str, previous) < 0) { + free(str); + return NULL; + } + free(str); + str = tmp; + } + + /* new sequence */ + if (asprintf(&tmp, "%s,%d", str, idx) < 0) { + free(str); + return NULL; + } + free(str); + str = tmp; + previous = idx; + sequence = 0; + } + + /* finish last sequence */ + if (sequence) { + if (asprintf(&tmp, "%s-%d", str, previous) < 0) { + free(str); + return NULL; + } + free(str); + str = tmp; + } + + return str; +} + +int +eal_parse_common_option(int opt, const char *optarg, + struct internal_config *conf) +{ + static int b_used; + static int w_used; + + switch (opt) { + /* blacklist */ + case 'b': + if (w_used) + goto bw_used; + if (eal_option_device_add(RTE_DEVTYPE_BLACKLISTED_PCI, + optarg) < 0) { + return -1; + } + b_used = 1; + break; + /* whitelist */ + case 'w': + if (b_used) + goto bw_used; + if (eal_option_device_add(RTE_DEVTYPE_WHITELISTED_PCI, + optarg) < 0) { + return -1; + } + w_used = 1; + break; + /* coremask */ + case 'c': { + int lcore_indexes[RTE_MAX_LCORE]; + + if (eal_service_cores_parsed()) + RTE_LOG(WARNING, EAL, + "Service cores parsed before dataplane cores. Please ensure -c is before -s or -S\n"); + if (eal_parse_coremask(optarg, lcore_indexes) < 0) { + RTE_LOG(ERR, EAL, "invalid coremask syntax\n"); + return -1; + } + if (update_lcore_config(lcore_indexes) < 0) { + char *available = available_cores(); + + RTE_LOG(ERR, EAL, + "invalid coremask, please check specified cores are part of %s\n", + available); + free(available); + return -1; + } + + if (core_parsed) { + RTE_LOG(ERR, EAL, "Option -c is ignored, because (%s) is set!\n", + (core_parsed == LCORE_OPT_LST) ? "-l" : + (core_parsed == LCORE_OPT_MAP) ? "--lcore" : + "-c"); + return -1; + } + + core_parsed = LCORE_OPT_MSK; + break; + } + /* corelist */ + case 'l': { + int lcore_indexes[RTE_MAX_LCORE]; + + if (eal_service_cores_parsed()) + RTE_LOG(WARNING, EAL, + "Service cores parsed before dataplane cores. Please ensure -l is before -s or -S\n"); + + if (eal_parse_corelist(optarg, lcore_indexes) < 0) { + RTE_LOG(ERR, EAL, "invalid core list syntax\n"); + return -1; + } + if (update_lcore_config(lcore_indexes) < 0) { + char *available = available_cores(); + + RTE_LOG(ERR, EAL, + "invalid core list, please check specified cores are part of %s\n", + available); + free(available); + return -1; + } + + if (core_parsed) { + RTE_LOG(ERR, EAL, "Option -l is ignored, because (%s) is set!\n", + (core_parsed == LCORE_OPT_MSK) ? "-c" : + (core_parsed == LCORE_OPT_MAP) ? "--lcore" : + "-l"); + return -1; + } + + core_parsed = LCORE_OPT_LST; + break; + } + /* service coremask */ + case 's': + if (eal_parse_service_coremask(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid service coremask\n"); + return -1; + } + break; + /* service corelist */ + case 'S': + if (eal_parse_service_corelist(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid service core list\n"); + return -1; + } + break; + /* size of memory */ + case 'm': + conf->memory = atoi(optarg); + conf->memory *= 1024ULL; + conf->memory *= 1024ULL; + mem_parsed = 1; + break; + /* force number of channels */ + case 'n': + conf->force_nchannel = atoi(optarg); + if (conf->force_nchannel == 0) { + RTE_LOG(ERR, EAL, "invalid channel number\n"); + return -1; + } + break; + /* force number of ranks */ + case 'r': + conf->force_nrank = atoi(optarg); + if (conf->force_nrank == 0 || + conf->force_nrank > 16) { + RTE_LOG(ERR, EAL, "invalid rank number\n"); + return -1; + } + break; + /* force loading of external driver */ + case 'd': + if (eal_plugin_add(optarg) == -1) + return -1; + break; + case 'v': + /* since message is explicitly requested by user, we + * write message at highest log level so it can always + * be seen + * even if info or warning messages are disabled */ + RTE_LOG(CRIT, EAL, "RTE Version: '%s'\n", rte_version()); + break; + + /* long options */ + case OPT_HUGE_UNLINK_NUM: + conf->hugepage_unlink = 1; + break; + + case OPT_NO_HUGE_NUM: + conf->no_hugetlbfs = 1; + /* no-huge is legacy mem */ + conf->legacy_mem = 1; + break; + + case OPT_NO_PCI_NUM: + conf->no_pci = 1; + break; + + case OPT_NO_HPET_NUM: + conf->no_hpet = 1; + break; + + case OPT_VMWARE_TSC_MAP_NUM: + conf->vmware_tsc_map = 1; + break; + + case OPT_NO_SHCONF_NUM: + conf->no_shconf = 1; + break; + + case OPT_IN_MEMORY_NUM: + conf->in_memory = 1; + /* in-memory is a superset of noshconf and huge-unlink */ + conf->no_shconf = 1; + conf->hugepage_unlink = 1; + break; + + case OPT_PROC_TYPE_NUM: + conf->process_type = eal_parse_proc_type(optarg); + break; + + case OPT_MASTER_LCORE_NUM: + if (eal_parse_master_lcore(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_MASTER_LCORE "\n"); + return -1; + } + break; + + case OPT_VDEV_NUM: + if (eal_option_device_add(RTE_DEVTYPE_VIRTUAL, + optarg) < 0) { + return -1; + } + break; + +#ifndef RTE_EXEC_ENV_WINDOWS + case OPT_SYSLOG_NUM: + if (eal_parse_syslog(optarg, conf) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SYSLOG "\n"); + return -1; + } + break; +#endif + + case OPT_LOG_LEVEL_NUM: { + if (eal_parse_log_level(optarg) < 0) { + RTE_LOG(ERR, EAL, + "invalid parameters for --" + OPT_LOG_LEVEL "\n"); + return -1; + } + break; + } + +#ifndef RTE_EXEC_ENV_WINDOWS + case OPT_TRACE_NUM: { + if (eal_trace_args_save(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_TRACE "\n"); + return -1; + } + break; + } + + case OPT_TRACE_DIR_NUM: { + if (eal_trace_dir_args_save(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_TRACE_DIR "\n"); + return -1; + } + break; + } + + case OPT_TRACE_BUF_SIZE_NUM: { + if (eal_trace_bufsz_args_save(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_TRACE_BUF_SIZE "\n"); + return -1; + } + break; + } + + case OPT_TRACE_MODE_NUM: { + if (eal_trace_mode_args_save(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_TRACE_MODE "\n"); + return -1; + } + break; + } +#endif /* !RTE_EXEC_ENV_WINDOWS */ + + case OPT_LCORES_NUM: + if (eal_parse_lcores(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_LCORES "\n"); + return -1; + } + + if (core_parsed) { + RTE_LOG(ERR, EAL, "Option --lcore is ignored, because (%s) is set!\n", + (core_parsed == LCORE_OPT_LST) ? "-l" : + (core_parsed == LCORE_OPT_MSK) ? "-c" : + "--lcore"); + return -1; + } + + core_parsed = LCORE_OPT_MAP; + break; + case OPT_LEGACY_MEM_NUM: + conf->legacy_mem = 1; + break; + case OPT_SINGLE_FILE_SEGMENTS_NUM: + conf->single_file_segments = 1; + break; + case OPT_IOVA_MODE_NUM: + if (eal_parse_iova_mode(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_IOVA_MODE "\n"); + return -1; + } + break; + case OPT_BASE_VIRTADDR_NUM: + if (eal_parse_base_virtaddr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_BASE_VIRTADDR "\n"); + return -1; + } + break; + case OPT_TELEMETRY_NUM: + break; + case OPT_NO_TELEMETRY_NUM: + conf->no_telemetry = 1; + break; + + /* don't know what to do, leave this to caller */ + default: + return 1; + + } + + return 0; +bw_used: + RTE_LOG(ERR, EAL, "Options blacklist (-b) and whitelist (-w) " + "cannot be used at the same time\n"); + return -1; +} + +static void +eal_auto_detect_cores(struct rte_config *cfg) +{ + unsigned int lcore_id; + unsigned int removed = 0; + rte_cpuset_t affinity_set; + + if (pthread_getaffinity_np(pthread_self(), sizeof(rte_cpuset_t), + &affinity_set)) + CPU_ZERO(&affinity_set); + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (cfg->lcore_role[lcore_id] == ROLE_RTE && + !CPU_ISSET(lcore_id, &affinity_set)) { + cfg->lcore_role[lcore_id] = ROLE_OFF; + removed++; + } + } + + cfg->lcore_count -= removed; +} + +static void +compute_ctrl_threads_cpuset(struct internal_config *internal_cfg) +{ + rte_cpuset_t *cpuset = &internal_cfg->ctrl_cpuset; + rte_cpuset_t default_set; + unsigned int lcore_id; + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (rte_lcore_has_role(lcore_id, ROLE_OFF)) + continue; + RTE_CPU_OR(cpuset, cpuset, &lcore_config[lcore_id].cpuset); + } + RTE_CPU_NOT(cpuset, cpuset); + + if (pthread_getaffinity_np(pthread_self(), sizeof(rte_cpuset_t), + &default_set)) + CPU_ZERO(&default_set); + + RTE_CPU_AND(cpuset, cpuset, &default_set); + + /* if no remaining cpu, use master lcore cpu affinity */ + if (!CPU_COUNT(cpuset)) { + memcpy(cpuset, &lcore_config[rte_get_master_lcore()].cpuset, + sizeof(*cpuset)); + } +} + +int +eal_cleanup_config(struct internal_config *internal_cfg) +{ + if (internal_cfg->hugefile_prefix != NULL) + free(internal_cfg->hugefile_prefix); + if (internal_cfg->hugepage_dir != NULL) + free(internal_cfg->hugepage_dir); + if (internal_cfg->user_mbuf_pool_ops_name != NULL) + free(internal_cfg->user_mbuf_pool_ops_name); + + return 0; +} + +int +eal_adjust_config(struct internal_config *internal_cfg) +{ + int i; + struct rte_config *cfg = rte_eal_get_configuration(); + + if (!core_parsed) + eal_auto_detect_cores(cfg); + + if (internal_config.process_type == RTE_PROC_AUTO) + internal_config.process_type = eal_proc_type_detect(); + + /* default master lcore is the first one */ + if (!master_lcore_parsed) { + cfg->master_lcore = rte_get_next_lcore(-1, 0, 0); + if (cfg->master_lcore >= RTE_MAX_LCORE) + return -1; + lcore_config[cfg->master_lcore].core_role = ROLE_RTE; + } + + compute_ctrl_threads_cpuset(internal_cfg); + + /* if no memory amounts were requested, this will result in 0 and + * will be overridden later, right after eal_hugepage_info_init() */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->memory += internal_cfg->socket_mem[i]; + + return 0; +} + +int +eal_check_common_options(struct internal_config *internal_cfg) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (cfg->lcore_role[cfg->master_lcore] != ROLE_RTE) { + RTE_LOG(ERR, EAL, "Master lcore is not enabled for DPDK\n"); + return -1; + } + + if (internal_cfg->process_type == RTE_PROC_INVALID) { + RTE_LOG(ERR, EAL, "Invalid process type specified\n"); + return -1; + } + if (internal_cfg->hugefile_prefix != NULL && + strlen(internal_cfg->hugefile_prefix) < 1) { + RTE_LOG(ERR, EAL, "Invalid length of --" OPT_FILE_PREFIX " option\n"); + return -1; + } + if (internal_cfg->hugepage_dir != NULL && + strlen(internal_cfg->hugepage_dir) < 1) { + RTE_LOG(ERR, EAL, "Invalid length of --" OPT_HUGE_DIR" option\n"); + return -1; + } + if (internal_cfg->user_mbuf_pool_ops_name != NULL && + strlen(internal_cfg->user_mbuf_pool_ops_name) < 1) { + RTE_LOG(ERR, EAL, "Invalid length of --" OPT_MBUF_POOL_OPS_NAME" option\n"); + return -1; + } + if (index(eal_get_hugefile_prefix(), '%') != NULL) { + RTE_LOG(ERR, EAL, "Invalid char, '%%', in --"OPT_FILE_PREFIX" " + "option\n"); + return -1; + } + if (mem_parsed && internal_cfg->force_sockets == 1) { + RTE_LOG(ERR, EAL, "Options -m and --"OPT_SOCKET_MEM" cannot " + "be specified at the same time\n"); + return -1; + } + if (internal_cfg->no_hugetlbfs && internal_cfg->force_sockets == 1) { + RTE_LOG(ERR, EAL, "Option --"OPT_SOCKET_MEM" cannot " + "be specified together with --"OPT_NO_HUGE"\n"); + return -1; + } + if (internal_cfg->no_hugetlbfs && internal_cfg->hugepage_unlink && + !internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot " + "be specified together with --"OPT_NO_HUGE"\n"); + return -1; + } + if (internal_config.force_socket_limits && internal_config.legacy_mem) { + RTE_LOG(ERR, EAL, "Option --"OPT_SOCKET_LIMIT + " is only supported in non-legacy memory mode\n"); + } + if (internal_cfg->single_file_segments && + internal_cfg->hugepage_unlink && + !internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is " + "not compatible with --"OPT_HUGE_UNLINK"\n"); + return -1; + } + if (internal_cfg->legacy_mem && + internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_LEGACY_MEM" is not compatible " + "with --"OPT_IN_MEMORY"\n"); + return -1; + } + if (internal_cfg->legacy_mem && internal_cfg->match_allocations) { + RTE_LOG(ERR, EAL, "Option --"OPT_LEGACY_MEM" is not compatible " + "with --"OPT_MATCH_ALLOCATIONS"\n"); + return -1; + } + if (internal_cfg->no_hugetlbfs && internal_cfg->match_allocations) { + RTE_LOG(ERR, EAL, "Option --"OPT_NO_HUGE" is not compatible " + "with --"OPT_MATCH_ALLOCATIONS"\n"); + return -1; + } + if (internal_cfg->legacy_mem && internal_cfg->memory == 0) { + RTE_LOG(NOTICE, EAL, "Static memory layout is selected, " + "amount of reserved memory can be adjusted with " + "-m or --"OPT_SOCKET_MEM"\n"); + } + + return 0; +} + +void +eal_common_usage(void) +{ + printf("[options]\n\n" + "EAL common options:\n" + " -c COREMASK Hexadecimal bitmask of cores to run on\n" + " -l CORELIST List of cores to run on\n" + " The argument format is <c1>[-c2][,c3[-c4],...]\n" + " where c1, c2, etc are core indexes between 0 and %d\n" + " --"OPT_LCORES" COREMAP Map lcore set to physical cpu set\n" + " The argument format is\n" + " '<lcores[@cpus]>[<,lcores[@cpus]>...]'\n" + " lcores and cpus list are grouped by '(' and ')'\n" + " Within the group, '-' is used for range separator,\n" + " ',' is used for single number separator.\n" + " '( )' can be omitted for single element group,\n" + " '@' can be omitted if cpus and lcores have the same value\n" + " -s SERVICE COREMASK Hexadecimal bitmask of cores to be used as service cores\n" + " --"OPT_MASTER_LCORE" ID Core ID that is used as master\n" + " --"OPT_MBUF_POOL_OPS_NAME" Pool ops name for mbuf to use\n" + " -n CHANNELS Number of memory channels\n" + " -m MB Memory to allocate (see also --"OPT_SOCKET_MEM")\n" + " -r RANKS Force number of memory ranks (don't detect)\n" + " -b, --"OPT_PCI_BLACKLIST" Add a PCI device in black list.\n" + " Prevent EAL from using this PCI device. The argument\n" + " format is <domain:bus:devid.func>.\n" + " -w, --"OPT_PCI_WHITELIST" Add a PCI device in white list.\n" + " Only use the specified PCI devices. The argument format\n" + " is <[domain:]bus:devid.func>. This option can be present\n" + " several times (once per device).\n" + " [NOTE: PCI whitelist cannot be used with -b option]\n" + " --"OPT_VDEV" Add a virtual device.\n" + " The argument format is <driver><id>[,key=val,...]\n" + " (ex: --vdev=net_pcap0,iface=eth2).\n" + " --"OPT_IOVA_MODE" Set IOVA mode. 'pa' for IOVA_PA\n" + " 'va' for IOVA_VA\n" + " -d LIB.so|DIR Add a driver or driver directory\n" + " (can be used multiple times)\n" + " --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n" + " --"OPT_PROC_TYPE" Type of this process (primary|secondary|auto)\n" +#ifndef RTE_EXEC_ENV_WINDOWS + " --"OPT_SYSLOG" Set syslog facility\n" +#endif + " --"OPT_LOG_LEVEL"=<int> Set global log level\n" + " --"OPT_LOG_LEVEL"=<type-match>:<int>\n" + " Set specific log level\n" +#ifndef RTE_EXEC_ENV_WINDOWS + " --"OPT_TRACE"=<regex-match>\n" + " Enable trace based on regular expression trace name.\n" + " By default, the trace is disabled.\n" + " User must specify this option to enable trace.\n" + " --"OPT_TRACE_DIR"=<directory path>\n" + " Specify trace directory for trace output.\n" + " By default, trace output will created at\n" + " $HOME directory and parameter must be\n" + " specified once only.\n" + " --"OPT_TRACE_BUF_SIZE"=<int>\n" + " Specify maximum size of allocated memory\n" + " for trace output for each thread. Valid\n" + " unit can be either 'B|K|M' for 'Bytes',\n" + " 'KBytes' and 'MBytes' respectively.\n" + " Default is 1MB and parameter must be\n" + " specified once only.\n" + " --"OPT_TRACE_MODE"=<o[verwrite] | d[iscard]>\n" + " Specify the mode of update of trace\n" + " output file. Either update on a file can\n" + " be wrapped or discarded when file size\n" + " reaches its maximum limit.\n" + " Default mode is 'overwrite' and parameter\n" + " must be specified once only.\n" +#endif /* !RTE_EXEC_ENV_WINDOWS */ + " -v Display version information on startup\n" + " -h, --help This help\n" + " --"OPT_IN_MEMORY" Operate entirely in memory. This will\n" + " disable secondary process support\n" + " --"OPT_BASE_VIRTADDR" Base virtual address\n" + " --"OPT_TELEMETRY" Enable telemetry support (on by default)\n" + " --"OPT_NO_TELEMETRY" Disable telemetry support\n" + "\nEAL options for DEBUG use only:\n" + " --"OPT_HUGE_UNLINK" Unlink hugepage files after init\n" + " --"OPT_NO_HUGE" Use malloc instead of hugetlbfs\n" + " --"OPT_NO_PCI" Disable PCI\n" + " --"OPT_NO_HPET" Disable HPET\n" + " --"OPT_NO_SHCONF" No shared config (mmap'd files)\n" + "\n", RTE_MAX_LCORE); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c new file mode 100644 index 000000000..935e8fefe --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c @@ -0,0 +1,1217 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2018 Intel Corporation + */ + +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <fnmatch.h> +#include <inttypes.h> +#include <libgen.h> +#include <limits.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/file.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <unistd.h> + +#include <rte_alarm.h> +#include <rte_common.h> +#include <rte_cycles.h> +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_log.h> +#include <rte_tailq.h> + +#include "eal_private.h" +#include "eal_filesystem.h" +#include "eal_internal_cfg.h" + +static int mp_fd = -1; +static char mp_filter[PATH_MAX]; /* Filter for secondary process sockets */ +static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */ +static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER; +static char peer_name[PATH_MAX]; + +struct action_entry { + TAILQ_ENTRY(action_entry) next; + char action_name[RTE_MP_MAX_NAME_LEN]; + rte_mp_t action; +}; + +/** Double linked list of actions. */ +TAILQ_HEAD(action_entry_list, action_entry); + +static struct action_entry_list action_entry_list = + TAILQ_HEAD_INITIALIZER(action_entry_list); + +enum mp_type { + MP_MSG, /* Share message with peers, will not block */ + MP_REQ, /* Request for information, Will block for a reply */ + MP_REP, /* Response to previously-received request */ + MP_IGN, /* Response telling requester to ignore this response */ +}; + +struct mp_msg_internal { + int type; + struct rte_mp_msg msg; +}; + +struct async_request_param { + rte_mp_async_reply_t clb; + struct rte_mp_reply user_reply; + struct timespec end; + int n_responses_processed; +}; + +struct pending_request { + TAILQ_ENTRY(pending_request) next; + enum { + REQUEST_TYPE_SYNC, + REQUEST_TYPE_ASYNC + } type; + char dst[PATH_MAX]; + struct rte_mp_msg *request; + struct rte_mp_msg *reply; + int reply_received; + RTE_STD_C11 + union { + struct { + struct async_request_param *param; + } async; + struct { + pthread_cond_t cond; + } sync; + }; +}; + +TAILQ_HEAD(pending_request_list, pending_request); + +static struct { + struct pending_request_list requests; + pthread_mutex_t lock; +} pending_requests = { + .requests = TAILQ_HEAD_INITIALIZER(pending_requests.requests), + .lock = PTHREAD_MUTEX_INITIALIZER, + /**< used in async requests only */ +}; + +/* forward declarations */ +static int +mp_send(struct rte_mp_msg *msg, const char *peer, int type); + +/* for use with alarm callback */ +static void +async_reply_handle(void *arg); + +/* for use with process_msg */ +static struct pending_request * +async_reply_handle_thread_unsafe(void *arg); + +static void +trigger_async_action(struct pending_request *req); + +static struct pending_request * +find_pending_request(const char *dst, const char *act_name) +{ + struct pending_request *r; + + TAILQ_FOREACH(r, &pending_requests.requests, next) { + if (!strcmp(r->dst, dst) && + !strcmp(r->request->name, act_name)) + break; + } + + return r; +} + +static void +create_socket_path(const char *name, char *buf, int len) +{ + const char *prefix = eal_mp_socket_path(); + + if (strlen(name) > 0) + snprintf(buf, len, "%s_%s", prefix, name); + else + strlcpy(buf, prefix, len); +} + +int +rte_eal_primary_proc_alive(const char *config_file_path) +{ + int config_fd; + + if (config_file_path) + config_fd = open(config_file_path, O_RDONLY); + else { + const char *path; + + path = eal_runtime_config_path(); + config_fd = open(path, O_RDONLY); + } + if (config_fd < 0) + return 0; + + int ret = lockf(config_fd, F_TEST, 0); + close(config_fd); + + return !!ret; +} + +static struct action_entry * +find_action_entry_by_name(const char *name) +{ + struct action_entry *entry; + + TAILQ_FOREACH(entry, &action_entry_list, next) { + if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0) + break; + } + + return entry; +} + +static int +validate_action_name(const char *name) +{ + if (name == NULL) { + RTE_LOG(ERR, EAL, "Action name cannot be NULL\n"); + rte_errno = EINVAL; + return -1; + } + if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) { + RTE_LOG(ERR, EAL, "Length of action name is zero\n"); + rte_errno = EINVAL; + return -1; + } + if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) { + rte_errno = E2BIG; + return -1; + } + return 0; +} + +int +rte_mp_action_register(const char *name, rte_mp_t action) +{ + struct action_entry *entry; + + if (validate_action_name(name) != 0) + return -1; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + rte_errno = ENOTSUP; + return -1; + } + + entry = malloc(sizeof(struct action_entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + return -1; + } + strlcpy(entry->action_name, name, sizeof(entry->action_name)); + entry->action = action; + + pthread_mutex_lock(&mp_mutex_action); + if (find_action_entry_by_name(name) != NULL) { + pthread_mutex_unlock(&mp_mutex_action); + rte_errno = EEXIST; + free(entry); + return -1; + } + TAILQ_INSERT_TAIL(&action_entry_list, entry, next); + pthread_mutex_unlock(&mp_mutex_action); + return 0; +} + +void +rte_mp_action_unregister(const char *name) +{ + struct action_entry *entry; + + if (validate_action_name(name) != 0) + return; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + return; + } + + pthread_mutex_lock(&mp_mutex_action); + entry = find_action_entry_by_name(name); + if (entry == NULL) { + pthread_mutex_unlock(&mp_mutex_action); + return; + } + TAILQ_REMOVE(&action_entry_list, entry, next); + pthread_mutex_unlock(&mp_mutex_action); + free(entry); +} + +static int +read_msg(struct mp_msg_internal *m, struct sockaddr_un *s) +{ + int msglen; + struct iovec iov; + struct msghdr msgh; + char control[CMSG_SPACE(sizeof(m->msg.fds))]; + struct cmsghdr *cmsg; + int buflen = sizeof(*m) - sizeof(m->msg.fds); + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = m; + iov.iov_len = buflen; + + msgh.msg_name = s; + msgh.msg_namelen = sizeof(*s); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + msglen = recvmsg(mp_fd, &msgh, 0); + if (msglen < 0) { + RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno)); + return -1; + } + + if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + RTE_LOG(ERR, EAL, "truncated msg\n"); + return -1; + } + + /* read auxiliary FDs if any */ + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds)); + break; + } + } + /* sanity-check the response */ + if (m->msg.num_fds < 0 || m->msg.num_fds > RTE_MP_MAX_FD_NUM) { + RTE_LOG(ERR, EAL, "invalid number of fd's received\n"); + return -1; + } + if (m->msg.len_param < 0 || m->msg.len_param > RTE_MP_MAX_PARAM_LEN) { + RTE_LOG(ERR, EAL, "invalid received data length\n"); + return -1; + } + return 0; +} + +static void +process_msg(struct mp_msg_internal *m, struct sockaddr_un *s) +{ + struct pending_request *pending_req; + struct action_entry *entry; + struct rte_mp_msg *msg = &m->msg; + rte_mp_t action = NULL; + + RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name); + + if (m->type == MP_REP || m->type == MP_IGN) { + struct pending_request *req = NULL; + + pthread_mutex_lock(&pending_requests.lock); + pending_req = find_pending_request(s->sun_path, msg->name); + if (pending_req) { + memcpy(pending_req->reply, msg, sizeof(*msg)); + /* -1 indicates that we've been asked to ignore */ + pending_req->reply_received = + m->type == MP_REP ? 1 : -1; + + if (pending_req->type == REQUEST_TYPE_SYNC) + pthread_cond_signal(&pending_req->sync.cond); + else if (pending_req->type == REQUEST_TYPE_ASYNC) + req = async_reply_handle_thread_unsafe( + pending_req); + } else + RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name); + pthread_mutex_unlock(&pending_requests.lock); + + if (req != NULL) + trigger_async_action(req); + return; + } + + pthread_mutex_lock(&mp_mutex_action); + entry = find_action_entry_by_name(msg->name); + if (entry != NULL) + action = entry->action; + pthread_mutex_unlock(&mp_mutex_action); + + if (!action) { + if (m->type == MP_REQ && !internal_config.init_complete) { + /* if this is a request, and init is not yet complete, + * and callback wasn't registered, we should tell the + * requester to ignore our existence because we're not + * yet ready to process this request. + */ + struct rte_mp_msg dummy; + + memset(&dummy, 0, sizeof(dummy)); + strlcpy(dummy.name, msg->name, sizeof(dummy.name)); + mp_send(&dummy, s->sun_path, MP_IGN); + } else { + RTE_LOG(ERR, EAL, "Cannot find action: %s\n", + msg->name); + } + } else if (action(msg, s->sun_path) < 0) { + RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name); + } +} + +static void * +mp_handle(void *arg __rte_unused) +{ + struct mp_msg_internal msg; + struct sockaddr_un sa; + + while (1) { + if (read_msg(&msg, &sa) == 0) + process_msg(&msg, &sa); + } + + return NULL; +} + +static int +timespec_cmp(const struct timespec *a, const struct timespec *b) +{ + if (a->tv_sec < b->tv_sec) + return -1; + if (a->tv_sec > b->tv_sec) + return 1; + if (a->tv_nsec < b->tv_nsec) + return -1; + if (a->tv_nsec > b->tv_nsec) + return 1; + return 0; +} + +enum async_action { + ACTION_FREE, /**< free the action entry, but don't trigger callback */ + ACTION_TRIGGER /**< trigger callback, then free action entry */ +}; + +static enum async_action +process_async_request(struct pending_request *sr, const struct timespec *now) +{ + struct async_request_param *param; + struct rte_mp_reply *reply; + bool timeout, last_msg; + + param = sr->async.param; + reply = ¶m->user_reply; + + /* did we timeout? */ + timeout = timespec_cmp(¶m->end, now) <= 0; + + /* if we received a response, adjust relevant data and copy mesasge. */ + if (sr->reply_received == 1 && sr->reply) { + struct rte_mp_msg *msg, *user_msgs, *tmp; + + msg = sr->reply; + user_msgs = reply->msgs; + + tmp = realloc(user_msgs, sizeof(*msg) * + (reply->nb_received + 1)); + if (!tmp) { + RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n", + sr->dst, sr->request->name); + /* this entry is going to be removed and its message + * dropped, but we don't want to leak memory, so + * continue. + */ + } else { + user_msgs = tmp; + reply->msgs = user_msgs; + memcpy(&user_msgs[reply->nb_received], + msg, sizeof(*msg)); + reply->nb_received++; + } + + /* mark this request as processed */ + param->n_responses_processed++; + } else if (sr->reply_received == -1) { + /* we were asked to ignore this process */ + reply->nb_sent--; + } else if (timeout) { + /* count it as processed response, but don't increment + * nb_received. + */ + param->n_responses_processed++; + } + + free(sr->reply); + + last_msg = param->n_responses_processed == reply->nb_sent; + + return last_msg ? ACTION_TRIGGER : ACTION_FREE; +} + +static void +trigger_async_action(struct pending_request *sr) +{ + struct async_request_param *param; + struct rte_mp_reply *reply; + + param = sr->async.param; + reply = ¶m->user_reply; + + param->clb(sr->request, reply); + + /* clean up */ + free(sr->async.param->user_reply.msgs); + free(sr->async.param); + free(sr->request); + free(sr); +} + +static struct pending_request * +async_reply_handle_thread_unsafe(void *arg) +{ + struct pending_request *req = (struct pending_request *)arg; + enum async_action action; + struct timespec ts_now; + struct timeval now; + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Cannot get current time\n"); + goto no_trigger; + } + ts_now.tv_nsec = now.tv_usec * 1000; + ts_now.tv_sec = now.tv_sec; + + action = process_async_request(req, &ts_now); + + TAILQ_REMOVE(&pending_requests.requests, req, next); + + if (rte_eal_alarm_cancel(async_reply_handle, req) < 0) { + /* if we failed to cancel the alarm because it's already in + * progress, don't proceed because otherwise we will end up + * handling the same message twice. + */ + if (rte_errno == EINPROGRESS) { + RTE_LOG(DEBUG, EAL, "Request handling is already in progress\n"); + goto no_trigger; + } + RTE_LOG(ERR, EAL, "Failed to cancel alarm\n"); + } + + if (action == ACTION_TRIGGER) + return req; +no_trigger: + free(req); + return NULL; +} + +static void +async_reply_handle(void *arg) +{ + struct pending_request *req; + + pthread_mutex_lock(&pending_requests.lock); + req = async_reply_handle_thread_unsafe(arg); + pthread_mutex_unlock(&pending_requests.lock); + + if (req != NULL) + trigger_async_action(req); +} + +static int +open_socket_fd(void) +{ + struct sockaddr_un un; + + peer_name[0] = '\0'; + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + snprintf(peer_name, sizeof(peer_name), + "%d_%"PRIx64, getpid(), rte_rdtsc()); + + mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0); + if (mp_fd < 0) { + RTE_LOG(ERR, EAL, "failed to create unix socket\n"); + return -1; + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + + create_socket_path(peer_name, un.sun_path, sizeof(un.sun_path)); + + unlink(un.sun_path); /* May still exist since last run */ + + if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + RTE_LOG(ERR, EAL, "failed to bind %s: %s\n", + un.sun_path, strerror(errno)); + close(mp_fd); + return -1; + } + + RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path); + return mp_fd; +} + +static void +close_socket_fd(void) +{ + char path[PATH_MAX]; + + if (mp_fd < 0) + return; + + close(mp_fd); + create_socket_path(peer_name, path, sizeof(path)); + unlink(path); +} + +int +rte_mp_channel_init(void) +{ + char path[PATH_MAX]; + int dir_fd; + pthread_t mp_handle_tid; + + /* in no shared files mode, we do not have secondary processes support, + * so no need to initialize IPC. + */ + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC will be disabled\n"); + rte_errno = ENOTSUP; + return -1; + } + + /* create filter path */ + create_socket_path("*", path, sizeof(path)); + strlcpy(mp_filter, basename(path), sizeof(mp_filter)); + + /* path may have been modified, so recreate it */ + create_socket_path("*", path, sizeof(path)); + strlcpy(mp_dir_path, dirname(path), sizeof(mp_dir_path)); + + /* lock the directory */ + dir_fd = open(mp_dir_path, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "failed to open %s: %s\n", + mp_dir_path, strerror(errno)); + return -1; + } + + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "failed to lock %s: %s\n", + mp_dir_path, strerror(errno)); + close(dir_fd); + return -1; + } + + if (open_socket_fd() < 0) { + close(dir_fd); + return -1; + } + + if (rte_ctrl_thread_create(&mp_handle_tid, "rte_mp_handle", + NULL, mp_handle, NULL) < 0) { + RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", + strerror(errno)); + close(mp_fd); + close(dir_fd); + mp_fd = -1; + return -1; + } + + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + close(dir_fd); + + return 0; +} + +void +rte_mp_channel_cleanup(void) +{ + close_socket_fd(); +} + +/** + * Return -1, as fail to send message and it's caused by the local side. + * Return 0, as fail to send message and it's caused by the remote side. + * Return 1, as succeed to send message. + * + */ +static int +send_msg(const char *dst_path, struct rte_mp_msg *msg, int type) +{ + int snd; + struct iovec iov; + struct msghdr msgh; + struct cmsghdr *cmsg; + struct sockaddr_un dst; + struct mp_msg_internal m; + int fd_size = msg->num_fds * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + + m.type = type; + memcpy(&m.msg, msg, sizeof(*msg)); + + memset(&dst, 0, sizeof(dst)); + dst.sun_family = AF_UNIX; + strlcpy(dst.sun_path, dst_path, sizeof(dst.sun_path)); + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = &m; + iov.iov_len = sizeof(m) - sizeof(msg->fds); + + msgh.msg_name = &dst; + msgh.msg_namelen = sizeof(dst); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), msg->fds, fd_size); + + do { + snd = sendmsg(mp_fd, &msgh, 0); + } while (snd < 0 && errno == EINTR); + + if (snd < 0) { + rte_errno = errno; + /* Check if it caused by peer process exits */ + if (errno == ECONNREFUSED && + rte_eal_process_type() == RTE_PROC_PRIMARY) { + unlink(dst_path); + return 0; + } + RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n", + dst_path, strerror(errno)); + return -1; + } + + return 1; +} + +static int +mp_send(struct rte_mp_msg *msg, const char *peer, int type) +{ + int dir_fd, ret = 0; + DIR *mp_dir; + struct dirent *ent; + + if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY)) + peer = eal_mp_socket_path(); + + if (peer) { + if (send_msg(peer, msg, type) < 0) + return -1; + else + return 0; + } + + /* broadcast to all secondary processes */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", + mp_dir_path); + rte_errno = errno; + return -1; + } + + dir_fd = dirfd(mp_dir); + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_SH)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + rte_errno = errno; + closedir(mp_dir); + return -1; + } + + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + if (send_msg(path, msg, type) < 0) + ret = -1; + } + /* unlock the dir */ + flock(dir_fd, LOCK_UN); + + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + return ret; +} + +static int +check_input(const struct rte_mp_msg *msg) +{ + if (msg == NULL) { + RTE_LOG(ERR, EAL, "Msg cannot be NULL\n"); + rte_errno = EINVAL; + return -1; + } + + if (validate_action_name(msg->name) != 0) + return -1; + + if (msg->len_param < 0) { + RTE_LOG(ERR, EAL, "Message data length is negative\n"); + rte_errno = EINVAL; + return -1; + } + + if (msg->num_fds < 0) { + RTE_LOG(ERR, EAL, "Number of fd's is negative\n"); + rte_errno = EINVAL; + return -1; + } + + if (msg->len_param > RTE_MP_MAX_PARAM_LEN) { + RTE_LOG(ERR, EAL, "Message data is too long\n"); + rte_errno = E2BIG; + return -1; + } + + if (msg->num_fds > RTE_MP_MAX_FD_NUM) { + RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", + RTE_MP_MAX_FD_NUM); + rte_errno = E2BIG; + return -1; + } + + return 0; +} + +int +rte_mp_sendmsg(struct rte_mp_msg *msg) +{ + if (check_input(msg) != 0) + return -1; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + rte_errno = ENOTSUP; + return -1; + } + + RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name); + return mp_send(msg, NULL, MP_MSG); +} + +static int +mp_request_async(const char *dst, struct rte_mp_msg *req, + struct async_request_param *param, const struct timespec *ts) +{ + struct rte_mp_msg *reply_msg; + struct pending_request *pending_req, *exist; + int ret = -1; + + pending_req = calloc(1, sizeof(*pending_req)); + reply_msg = calloc(1, sizeof(*reply_msg)); + if (pending_req == NULL || reply_msg == NULL) { + RTE_LOG(ERR, EAL, "Could not allocate space for sync request\n"); + rte_errno = ENOMEM; + ret = -1; + goto fail; + } + + pending_req->type = REQUEST_TYPE_ASYNC; + strlcpy(pending_req->dst, dst, sizeof(pending_req->dst)); + pending_req->request = req; + pending_req->reply = reply_msg; + pending_req->async.param = param; + + /* queue already locked by caller */ + + exist = find_pending_request(dst, req->name); + if (exist) { + RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name); + rte_errno = EEXIST; + ret = -1; + goto fail; + } + + ret = send_msg(dst, req, MP_REQ); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n", + dst, req->name); + ret = -1; + goto fail; + } else if (ret == 0) { + ret = 0; + goto fail; + } + param->user_reply.nb_sent++; + + /* if alarm set fails, we simply ignore the reply */ + if (rte_eal_alarm_set(ts->tv_sec * 1000000 + ts->tv_nsec / 1000, + async_reply_handle, pending_req) < 0) { + RTE_LOG(ERR, EAL, "Fail to set alarm for request %s:%s\n", + dst, req->name); + ret = -1; + goto fail; + } + TAILQ_INSERT_TAIL(&pending_requests.requests, pending_req, next); + + return 0; +fail: + free(pending_req); + free(reply_msg); + return ret; +} + +static int +mp_request_sync(const char *dst, struct rte_mp_msg *req, + struct rte_mp_reply *reply, const struct timespec *ts) +{ + int ret; + struct rte_mp_msg msg, *tmp; + struct pending_request pending_req, *exist; + + pending_req.type = REQUEST_TYPE_SYNC; + pending_req.reply_received = 0; + strlcpy(pending_req.dst, dst, sizeof(pending_req.dst)); + pending_req.request = req; + pending_req.reply = &msg; + pthread_cond_init(&pending_req.sync.cond, NULL); + + exist = find_pending_request(dst, req->name); + if (exist) { + RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name); + rte_errno = EEXIST; + return -1; + } + + ret = send_msg(dst, req, MP_REQ); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n", + dst, req->name); + return -1; + } else if (ret == 0) + return 0; + + TAILQ_INSERT_TAIL(&pending_requests.requests, &pending_req, next); + + reply->nb_sent++; + + do { + ret = pthread_cond_timedwait(&pending_req.sync.cond, + &pending_requests.lock, ts); + } while (ret != 0 && ret != ETIMEDOUT); + + TAILQ_REMOVE(&pending_requests.requests, &pending_req, next); + + if (pending_req.reply_received == 0) { + RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n", + dst, req->name); + rte_errno = ETIMEDOUT; + return -1; + } + if (pending_req.reply_received == -1) { + RTE_LOG(DEBUG, EAL, "Asked to ignore response\n"); + /* not receiving this message is not an error, so decrement + * number of sent messages + */ + reply->nb_sent--; + return 0; + } + + tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1)); + if (!tmp) { + RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n", + dst, req->name); + rte_errno = ENOMEM; + return -1; + } + memcpy(&tmp[reply->nb_received], &msg, sizeof(msg)); + reply->msgs = tmp; + reply->nb_received++; + return 0; +} + +int +rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, + const struct timespec *ts) +{ + int dir_fd, ret = -1; + DIR *mp_dir; + struct dirent *ent; + struct timeval now; + struct timespec end; + + RTE_LOG(DEBUG, EAL, "request: %s\n", req->name); + + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + + if (check_input(req) != 0) + goto end; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + rte_errno = ENOTSUP; + return -1; + } + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Failed to get current time\n"); + rte_errno = errno; + goto end; + } + + end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000; + end.tv_sec = now.tv_sec + ts->tv_sec + + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; + + /* for secondary process, send request to the primary process only */ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + pthread_mutex_lock(&pending_requests.lock); + ret = mp_request_sync(eal_mp_socket_path(), req, reply, &end); + pthread_mutex_unlock(&pending_requests.lock); + goto end; + } + + /* for primary process, broadcast request, and collect reply 1 by 1 */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path); + rte_errno = errno; + goto end; + } + + dir_fd = dirfd(mp_dir); + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_SH)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + rte_errno = errno; + goto close_end; + } + + pthread_mutex_lock(&pending_requests.lock); + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + + /* unlocks the mutex while waiting for response, + * locks on receive + */ + if (mp_request_sync(path, req, reply, &end)) + goto unlock_end; + } + ret = 0; + +unlock_end: + pthread_mutex_unlock(&pending_requests.lock); + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + +close_end: + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + +end: + if (ret) { + free(reply->msgs); + reply->nb_received = 0; + reply->msgs = NULL; + } + return ret; +} + +int +rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts, + rte_mp_async_reply_t clb) +{ + struct rte_mp_msg *copy; + struct pending_request *dummy; + struct async_request_param *param; + struct rte_mp_reply *reply; + int dir_fd, ret = 0; + DIR *mp_dir; + struct dirent *ent; + struct timeval now; + struct timespec *end; + bool dummy_used = false; + + RTE_LOG(DEBUG, EAL, "request: %s\n", req->name); + + if (check_input(req) != 0) + return -1; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + rte_errno = ENOTSUP; + return -1; + } + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Failed to get current time\n"); + rte_errno = errno; + return -1; + } + copy = calloc(1, sizeof(*copy)); + dummy = calloc(1, sizeof(*dummy)); + param = calloc(1, sizeof(*param)); + if (copy == NULL || dummy == NULL || param == NULL) { + RTE_LOG(ERR, EAL, "Failed to allocate memory for async reply\n"); + rte_errno = ENOMEM; + goto fail; + } + + /* copy message */ + memcpy(copy, req, sizeof(*copy)); + + param->n_responses_processed = 0; + param->clb = clb; + end = ¶m->end; + reply = ¶m->user_reply; + + end->tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000; + end->tv_sec = now.tv_sec + ts->tv_sec + + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + + /* we have to lock the request queue here, as we will be adding a bunch + * of requests to the queue at once, and some of the replies may arrive + * before we add all of the requests to the queue. + */ + pthread_mutex_lock(&pending_requests.lock); + + /* we have to ensure that callback gets triggered even if we don't send + * anything, therefore earlier we have allocated a dummy request. fill + * it, and put it on the queue if we don't send any requests. + */ + dummy->type = REQUEST_TYPE_ASYNC; + dummy->request = copy; + dummy->reply = NULL; + dummy->async.param = param; + dummy->reply_received = 1; /* short-circuit the timeout */ + + /* for secondary process, send request to the primary process only */ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + ret = mp_request_async(eal_mp_socket_path(), copy, param, ts); + + /* if we didn't send anything, put dummy request on the queue */ + if (ret == 0 && reply->nb_sent == 0) { + TAILQ_INSERT_TAIL(&pending_requests.requests, dummy, + next); + dummy_used = true; + } + + pthread_mutex_unlock(&pending_requests.lock); + + /* if we couldn't send anything, clean up */ + if (ret != 0) + goto fail; + return 0; + } + + /* for primary process, broadcast request */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path); + rte_errno = errno; + goto unlock_fail; + } + dir_fd = dirfd(mp_dir); + + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_SH)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + rte_errno = errno; + goto closedir_fail; + } + + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + + if (mp_request_async(path, copy, param, ts)) + ret = -1; + } + /* if we didn't send anything, put dummy request on the queue */ + if (ret == 0 && reply->nb_sent == 0) { + TAILQ_INSERT_HEAD(&pending_requests.requests, dummy, next); + dummy_used = true; + } + + /* finally, unlock the queue */ + pthread_mutex_unlock(&pending_requests.lock); + + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + + /* if dummy was unused, free it */ + if (!dummy_used) + free(dummy); + + return ret; +closedir_fail: + closedir(mp_dir); +unlock_fail: + pthread_mutex_unlock(&pending_requests.lock); +fail: + free(dummy); + free(param); + free(copy); + return -1; +} + +int +rte_mp_reply(struct rte_mp_msg *msg, const char *peer) +{ + RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name); + + if (check_input(msg) != 0) + return -1; + + if (peer == NULL) { + RTE_LOG(ERR, EAL, "peer is not specified\n"); + rte_errno = EINVAL; + return -1; + } + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + return 0; + } + + return mp_send(msg, peer, MP_REP); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c new file mode 100644 index 000000000..60c5dd66f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <string.h> +#include <stdio.h> +#include <stdarg.h> +#include <errno.h> + +#include <rte_string_fns.h> + +/* split string into tokens */ +int +rte_strsplit(char *string, int stringlen, + char **tokens, int maxtokens, char delim) +{ + int i, tok = 0; + int tokstart = 1; /* first token is right at start of string */ + + if (string == NULL || tokens == NULL) + goto einval_error; + + for (i = 0; i < stringlen; i++) { + if (string[i] == '\0' || tok >= maxtokens) + break; + if (tokstart) { + tokstart = 0; + tokens[tok++] = &string[i]; + } + if (string[i] == delim) { + string[i] = '\0'; + tokstart = 1; + } + } + return tok; + +einval_error: + errno = EINVAL; + return -1; +} + +/* Copy src string into dst. + * + * Return negative value and NUL-terminate if dst is too short, + * Otherwise return number of bytes copied. + */ +ssize_t +rte_strscpy(char *dst, const char *src, size_t dsize) +{ + size_t nleft = dsize; + size_t res = 0; + + /* Copy as many bytes as will fit. */ + while (nleft != 0) { + dst[res] = src[res]; + if (src[res] == '\0') + return res; + res++; + nleft--; + } + + /* Not enough room in dst, set NUL and return error. */ + if (res != 0) + dst[res - 1] = '\0'; + return -E2BIG; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c new file mode 100644 index 000000000..ead06897b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <sys/queue.h> +#include <stdint.h> +#include <errno.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <inttypes.h> + +#include <rte_memory.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_atomic.h> +#include <rte_branch_prediction.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_memcfg.h" + +TAILQ_HEAD(rte_tailq_elem_head, rte_tailq_elem); +/* local tailq list */ +static struct rte_tailq_elem_head rte_tailq_elem_head = + TAILQ_HEAD_INITIALIZER(rte_tailq_elem_head); + +/* number of tailqs registered, -1 before call to rte_eal_tailqs_init */ +static int rte_tailqs_count = -1; + +struct rte_tailq_head * +rte_eal_tailq_lookup(const char *name) +{ + unsigned i; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + if (name == NULL) + return NULL; + + for (i = 0; i < RTE_MAX_TAILQ; i++) { + if (!strncmp(name, mcfg->tailq_head[i].name, + RTE_TAILQ_NAMESIZE-1)) + return &mcfg->tailq_head[i]; + } + + return NULL; +} + +void +rte_dump_tailq(FILE *f) +{ + struct rte_mem_config *mcfg; + unsigned i = 0; + + mcfg = rte_eal_get_configuration()->mem_config; + + rte_mcfg_tailq_read_lock(); + for (i = 0; i < RTE_MAX_TAILQ; i++) { + const struct rte_tailq_head *tailq = &mcfg->tailq_head[i]; + const struct rte_tailq_entry_head *head = &tailq->tailq_head; + + fprintf(f, "Tailq %u: qname:<%s>, tqh_first:%p, tqh_last:%p\n", + i, tailq->name, head->tqh_first, head->tqh_last); + } + rte_mcfg_tailq_read_unlock(); +} + +static struct rte_tailq_head * +rte_eal_tailq_create(const char *name) +{ + struct rte_tailq_head *head = NULL; + + if (!rte_eal_tailq_lookup(name) && + (rte_tailqs_count + 1 < RTE_MAX_TAILQ)) { + struct rte_mem_config *mcfg; + + mcfg = rte_eal_get_configuration()->mem_config; + head = &mcfg->tailq_head[rte_tailqs_count]; + strlcpy(head->name, name, sizeof(head->name) - 1); + TAILQ_INIT(&head->tailq_head); + rte_tailqs_count++; + } + + return head; +} + +/* local register, used to store "early" tailqs before rte_eal_init() and to + * ensure secondary process only registers tailqs once. */ +static int +rte_eal_tailq_local_register(struct rte_tailq_elem *t) +{ + struct rte_tailq_elem *temp; + + TAILQ_FOREACH(temp, &rte_tailq_elem_head, next) { + if (!strncmp(t->name, temp->name, sizeof(temp->name))) + return -1; + } + + TAILQ_INSERT_TAIL(&rte_tailq_elem_head, t, next); + return 0; +} + +static void +rte_eal_tailq_update(struct rte_tailq_elem *t) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* primary process is the only one that creates */ + t->head = rte_eal_tailq_create(t->name); + } else { + t->head = rte_eal_tailq_lookup(t->name); + } +} + +int +rte_eal_tailq_register(struct rte_tailq_elem *t) +{ + if (rte_eal_tailq_local_register(t) < 0) { + RTE_LOG(ERR, EAL, + "%s tailq is already registered\n", t->name); + goto error; + } + + /* if a register happens after rte_eal_tailqs_init(), then we can update + * tailq head */ + if (rte_tailqs_count >= 0) { + rte_eal_tailq_update(t); + if (t->head == NULL) { + RTE_LOG(ERR, EAL, + "Cannot initialize tailq: %s\n", t->name); + TAILQ_REMOVE(&rte_tailq_elem_head, t, next); + goto error; + } + } + + return 0; + +error: + t->head = NULL; + return -1; +} + +int +rte_eal_tailqs_init(void) +{ + struct rte_tailq_elem *t; + + rte_tailqs_count = 0; + + TAILQ_FOREACH(t, &rte_tailq_elem_head, next) { + /* second part of register job for "early" tailqs, see + * rte_eal_tailq_register and EAL_REGISTER_TAILQ */ + rte_eal_tailq_update(t); + if (t->head == NULL) { + RTE_LOG(ERR, EAL, + "Cannot initialize tailq: %s\n", t->name); + /* TAILQ_REMOVE not needed, error is already fatal */ + goto fail; + } + } + + return 0; + +fail: + rte_dump_tailq(stderr); + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c new file mode 100644 index 000000000..f9f588c17 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <pthread.h> +#include <signal.h> +#include <sched.h> +#include <assert.h> +#include <string.h> + +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_log.h> +#ifndef RTE_EXEC_ENV_WINDOWS +#include <rte_trace_point.h> +#endif + +#include "eal_internal_cfg.h" +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DECLARE_PER_LCORE(unsigned , _socket_id); + +unsigned rte_socket_id(void) +{ + return RTE_PER_LCORE(_socket_id); +} + +int +rte_lcore_has_role(unsigned int lcore_id, enum rte_lcore_role_t role) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (lcore_id >= RTE_MAX_LCORE) + return -EINVAL; + + return cfg->lcore_role[lcore_id] == role; +} + +static int +eal_cpuset_socket_id(rte_cpuset_t *cpusetp) +{ + unsigned cpu = 0; + int socket_id = SOCKET_ID_ANY; + int sid; + + if (cpusetp == NULL) + return SOCKET_ID_ANY; + + do { + if (!CPU_ISSET(cpu, cpusetp)) + continue; + + if (socket_id == SOCKET_ID_ANY) + socket_id = eal_cpu_socket_id(cpu); + + sid = eal_cpu_socket_id(cpu); + if (socket_id != sid) { + socket_id = SOCKET_ID_ANY; + break; + } + + } while (++cpu < CPU_SETSIZE); + + return socket_id; +} + +int +rte_thread_set_affinity(rte_cpuset_t *cpusetp) +{ + int s; + unsigned lcore_id; + pthread_t tid; + + tid = pthread_self(); + + s = pthread_setaffinity_np(tid, sizeof(rte_cpuset_t), cpusetp); + if (s != 0) { + RTE_LOG(ERR, EAL, "pthread_setaffinity_np failed\n"); + return -1; + } + + /* store socket_id in TLS for quick access */ + RTE_PER_LCORE(_socket_id) = + eal_cpuset_socket_id(cpusetp); + + /* store cpuset in TLS for quick access */ + memmove(&RTE_PER_LCORE(_cpuset), cpusetp, + sizeof(rte_cpuset_t)); + + lcore_id = rte_lcore_id(); + if (lcore_id != (unsigned)LCORE_ID_ANY) { + /* EAL thread will update lcore_config */ + lcore_config[lcore_id].socket_id = RTE_PER_LCORE(_socket_id); + memmove(&lcore_config[lcore_id].cpuset, cpusetp, + sizeof(rte_cpuset_t)); + } + + return 0; +} + +void +rte_thread_get_affinity(rte_cpuset_t *cpusetp) +{ + assert(cpusetp); + memmove(cpusetp, &RTE_PER_LCORE(_cpuset), + sizeof(rte_cpuset_t)); +} + +int +eal_thread_dump_affinity(char *str, unsigned size) +{ + rte_cpuset_t cpuset; + unsigned cpu; + int ret; + unsigned int out = 0; + + rte_thread_get_affinity(&cpuset); + + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { + if (!CPU_ISSET(cpu, &cpuset)) + continue; + + ret = snprintf(str + out, + size - out, "%u,", cpu); + if (ret < 0 || (unsigned)ret >= size - out) { + /* string will be truncated */ + ret = -1; + goto exit; + } + + out += ret; + } + + ret = 0; +exit: + /* remove the last separator */ + if (out > 0) + str[out - 1] = '\0'; + + return ret; +} + + +struct rte_thread_ctrl_params { + void *(*start_routine)(void *); + void *arg; + pthread_barrier_t configured; +}; + +static void *rte_thread_init(void *arg) +{ + int ret; + rte_cpuset_t *cpuset = &internal_config.ctrl_cpuset; + struct rte_thread_ctrl_params *params = arg; + void *(*start_routine)(void *) = params->start_routine; + void *routine_arg = params->arg; + + /* Store cpuset in TLS for quick access */ + memmove(&RTE_PER_LCORE(_cpuset), cpuset, sizeof(rte_cpuset_t)); + + ret = pthread_barrier_wait(¶ms->configured); + if (ret == PTHREAD_BARRIER_SERIAL_THREAD) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + +#ifndef RTE_EXEC_ENV_WINDOWS + __rte_trace_mem_per_thread_alloc(); +#endif + return start_routine(routine_arg); +} + +int +rte_ctrl_thread_create(pthread_t *thread, const char *name, + const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) +{ + rte_cpuset_t *cpuset = &internal_config.ctrl_cpuset; + struct rte_thread_ctrl_params *params; + int ret; + + params = malloc(sizeof(*params)); + if (!params) + return -ENOMEM; + + params->start_routine = start_routine; + params->arg = arg; + + pthread_barrier_init(¶ms->configured, NULL, 2); + + ret = pthread_create(thread, attr, rte_thread_init, (void *)params); + if (ret != 0) { + free(params); + return -ret; + } + + if (name != NULL) { + ret = rte_thread_setname(*thread, name); + if (ret < 0) + RTE_LOG(DEBUG, EAL, + "Cannot set name for ctrl thread\n"); + } + + ret = pthread_setaffinity_np(*thread, sizeof(*cpuset), cpuset); + if (ret) + goto fail; + + ret = pthread_barrier_wait(¶ms->configured); + if (ret == PTHREAD_BARRIER_SERIAL_THREAD) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + + return 0; + +fail: + if (PTHREAD_BARRIER_SERIAL_THREAD == + pthread_barrier_wait(¶ms->configured)) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + pthread_cancel(*thread); + pthread_join(*thread, NULL); + return -ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c new file mode 100644 index 000000000..fa9ee1b22 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <string.h> +#include <stdio.h> +#include <unistd.h> +#include <inttypes.h> +#include <sys/types.h> +#include <time.h> +#include <errno.h> + +#include <rte_common.h> +#include <rte_compat.h> +#include <rte_log.h> +#include <rte_cycles.h> +#include <rte_pause.h> +#include <rte_eal.h> + +#include "eal_private.h" +#include "eal_memcfg.h" + +/* The frequency of the RDTSC timer resolution */ +static uint64_t eal_tsc_resolution_hz; + +/* Pointer to user delay function */ +void (*rte_delay_us)(unsigned int) = NULL; + +void +rte_delay_us_block(unsigned int us) +{ + const uint64_t start = rte_get_timer_cycles(); + const uint64_t ticks = (uint64_t)us * rte_get_timer_hz() / 1E6; + while ((rte_get_timer_cycles() - start) < ticks) + rte_pause(); +} + +void +rte_delay_us_sleep(unsigned int us) +{ + struct timespec wait[2]; + int ind = 0; + + wait[0].tv_sec = 0; + if (us >= US_PER_S) { + wait[0].tv_sec = us / US_PER_S; + us -= wait[0].tv_sec * US_PER_S; + } + wait[0].tv_nsec = 1000 * us; + + while (nanosleep(&wait[ind], &wait[1 - ind]) && errno == EINTR) { + /* + * Sleep was interrupted. Flip the index, so the 'remainder' + * will become the 'request' for a next call. + */ + ind = 1 - ind; + } +} + +uint64_t +rte_get_tsc_hz(void) +{ + return eal_tsc_resolution_hz; +} + +static uint64_t +estimate_tsc_freq(void) +{ +#define CYC_PER_10MHZ 1E7 + RTE_LOG(WARNING, EAL, "WARNING: TSC frequency estimated roughly" + " - clock timings may be less accurate.\n"); + /* assume that the sleep(1) will sleep for 1 second */ + uint64_t start = rte_rdtsc(); + sleep(1); + /* Round up to 10Mhz. 1E7 ~ 10Mhz */ + return RTE_ALIGN_MUL_NEAR(rte_rdtsc() - start, CYC_PER_10MHZ); +} + +void +set_tsc_freq(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint64_t freq; + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + /* + * Just use the primary process calculated TSC rate in any + * secondary process. It avoids any unnecessary overhead on + * systems where arch-specific frequency detection is not + * available. + */ + eal_tsc_resolution_hz = mcfg->tsc_hz; + return; + } + + freq = get_tsc_freq_arch(); + if (!freq) + freq = get_tsc_freq(); + if (!freq) + freq = estimate_tsc_freq(); + + RTE_LOG(DEBUG, EAL, "TSC frequency is ~%" PRIu64 " KHz\n", freq / 1000); + eal_tsc_resolution_hz = freq; + mcfg->tsc_hz = freq; +} + +void rte_delay_us_callback_register(void (*userfunc)(unsigned int)) +{ + rte_delay_us = userfunc; +} + +RTE_INIT(rte_timer_init) +{ + /* set rte_delay_us_block as a delay function */ + rte_delay_us_callback_register(rte_delay_us_block); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace.c new file mode 100644 index 000000000..875553d7e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace.c @@ -0,0 +1,498 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#include <fnmatch.h> +#include <inttypes.h> +#include <sys/queue.h> +#include <regex.h> + +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_string_fns.h> + +#include "eal_trace.h" + +RTE_DEFINE_PER_LCORE(volatile int, trace_point_sz); +RTE_DEFINE_PER_LCORE(void *, trace_mem); +static RTE_DEFINE_PER_LCORE(char, ctf_field[TRACE_CTF_FIELD_SIZE]); +static RTE_DEFINE_PER_LCORE(int, ctf_count); + +static struct trace_point_head tp_list = STAILQ_HEAD_INITIALIZER(tp_list); +static struct trace trace = { .args = STAILQ_HEAD_INITIALIZER(trace.args), }; + +struct trace * +trace_obj_get(void) +{ + return &trace; +} + +struct trace_point_head * +trace_list_head_get(void) +{ + return &tp_list; +} + +int +eal_trace_init(void) +{ + struct trace_arg *arg; + + /* Trace memory should start with 8B aligned for natural alignment */ + RTE_BUILD_BUG_ON((offsetof(struct __rte_trace_header, mem) % 8) != 0); + + /* One of the trace point registration failed */ + if (trace.register_errno) { + rte_errno = trace.register_errno; + goto fail; + } + + if (!STAILQ_EMPTY(&trace.args)) + trace.status = true; + + if (!rte_trace_is_enabled()) + return 0; + + rte_spinlock_init(&trace.lock); + + /* Is duplicate trace name registered */ + if (trace_has_duplicate_entry()) + goto fail; + + /* Generate UUID ver 4 with total size of events and number of + * events + */ + trace_uuid_generate(); + + /* Apply buffer size configuration for trace output */ + trace_bufsz_args_apply(); + + /* Generate CTF TDSL metadata */ + if (trace_metadata_create() < 0) + goto fail; + + /* Create trace directory */ + if (trace_mkdir()) + goto free_meta; + + /* Save current epoch timestamp for future use */ + if (trace_epoch_time_save() < 0) + goto fail; + + /* Apply global configurations */ + STAILQ_FOREACH(arg, &trace.args, next) + trace_args_apply(arg->val); + + rte_trace_mode_set(trace.mode); + + return 0; + +free_meta: + trace_metadata_destroy(); +fail: + trace_err("failed to initialize trace [%s]", rte_strerror(rte_errno)); + return -rte_errno; +} + +void +eal_trace_fini(void) +{ + if (!rte_trace_is_enabled()) + return; + trace_mem_per_thread_free(); + trace_metadata_destroy(); + eal_trace_args_free(); +} + +bool +rte_trace_is_enabled(void) +{ + return trace.status; +} + +static void +trace_mode_set(rte_trace_point_t *trace, enum rte_trace_mode mode) +{ + if (mode == RTE_TRACE_MODE_OVERWRITE) + __atomic_and_fetch(trace, ~__RTE_TRACE_FIELD_ENABLE_DISCARD, + __ATOMIC_RELEASE); + else + __atomic_or_fetch(trace, __RTE_TRACE_FIELD_ENABLE_DISCARD, + __ATOMIC_RELEASE); +} + +void +rte_trace_mode_set(enum rte_trace_mode mode) +{ + struct trace_point *tp; + + if (!rte_trace_is_enabled()) + return; + + STAILQ_FOREACH(tp, &tp_list, next) + trace_mode_set(tp->handle, mode); + + trace.mode = mode; +} + +enum +rte_trace_mode rte_trace_mode_get(void) +{ + return trace.mode; +} + +static bool +trace_point_is_invalid(rte_trace_point_t *t) +{ + return (t == NULL) || (trace_id_get(t) >= trace.nb_trace_points); +} + +bool +rte_trace_point_is_enabled(rte_trace_point_t *trace) +{ + uint64_t val; + + if (trace_point_is_invalid(trace)) + return false; + + val = __atomic_load_n(trace, __ATOMIC_ACQUIRE); + return (val & __RTE_TRACE_FIELD_ENABLE_MASK) != 0; +} + +int +rte_trace_point_enable(rte_trace_point_t *trace) +{ + if (trace_point_is_invalid(trace)) + return -ERANGE; + + __atomic_or_fetch(trace, __RTE_TRACE_FIELD_ENABLE_MASK, + __ATOMIC_RELEASE); + return 0; +} + +int +rte_trace_point_disable(rte_trace_point_t *trace) +{ + if (trace_point_is_invalid(trace)) + return -ERANGE; + + __atomic_and_fetch(trace, ~__RTE_TRACE_FIELD_ENABLE_MASK, + __ATOMIC_RELEASE); + return 0; +} + +int +rte_trace_pattern(const char *pattern, bool enable) +{ + struct trace_point *tp; + int rc = 0, found = 0; + + STAILQ_FOREACH(tp, &tp_list, next) { + if (fnmatch(pattern, tp->name, 0) == 0) { + if (enable) + rc = rte_trace_point_enable(tp->handle); + else + rc = rte_trace_point_disable(tp->handle); + found = 1; + } + if (rc < 0) + return rc; + } + + return rc | found; +} + +int +rte_trace_regexp(const char *regex, bool enable) +{ + struct trace_point *tp; + int rc = 0, found = 0; + regex_t r; + + if (regcomp(&r, regex, 0) != 0) + return -EINVAL; + + STAILQ_FOREACH(tp, &tp_list, next) { + if (regexec(&r, tp->name, 0, NULL, 0) == 0) { + if (enable) + rc = rte_trace_point_enable(tp->handle); + else + rc = rte_trace_point_disable(tp->handle); + found = 1; + } + if (rc < 0) + return rc; + } + regfree(&r); + + return rc | found; +} + +rte_trace_point_t * +rte_trace_point_lookup(const char *name) +{ + struct trace_point *tp; + + if (name == NULL) + return NULL; + + STAILQ_FOREACH(tp, &tp_list, next) + if (strncmp(tp->name, name, TRACE_POINT_NAME_SIZE) == 0) + return tp->handle; + + return NULL; +} + +static void +trace_point_dump(FILE *f, struct trace_point *tp) +{ + rte_trace_point_t *handle = tp->handle; + + fprintf(f, "\tid %d, %s, size is %d, %s\n", + trace_id_get(handle), tp->name, + (uint16_t)(*handle & __RTE_TRACE_FIELD_SIZE_MASK), + rte_trace_point_is_enabled(handle) ? "enabled" : "disabled"); +} + +static void +trace_lcore_mem_dump(FILE *f) +{ + struct trace *trace = trace_obj_get(); + struct __rte_trace_header *header; + uint32_t count; + + if (trace->nb_trace_mem_list == 0) + return; + + rte_spinlock_lock(&trace->lock); + fprintf(f, "nb_trace_mem_list = %d\n", trace->nb_trace_mem_list); + fprintf(f, "\nTrace mem info\n--------------\n"); + for (count = 0; count < trace->nb_trace_mem_list; count++) { + header = trace->lcore_meta[count].mem; + fprintf(f, "\tid %d, mem=%p, area=%s, lcore_id=%d, name=%s\n", + count, header, + trace_area_to_string(trace->lcore_meta[count].area), + header->stream_header.lcore_id, + header->stream_header.thread_name); + } + rte_spinlock_unlock(&trace->lock); +} + +void +rte_trace_dump(FILE *f) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace *trace = trace_obj_get(); + struct trace_point *tp; + + fprintf(f, "\nGlobal info\n-----------\n"); + fprintf(f, "status = %s\n", + rte_trace_is_enabled() ? "enabled" : "disabled"); + fprintf(f, "mode = %s\n", + trace_mode_to_string(rte_trace_mode_get())); + fprintf(f, "dir = %s\n", trace->dir); + fprintf(f, "buffer len = %d\n", trace->buff_len); + fprintf(f, "number of trace points = %d\n", trace->nb_trace_points); + + trace_lcore_mem_dump(f); + fprintf(f, "\nTrace point info\n----------------\n"); + STAILQ_FOREACH(tp, tp_list, next) + trace_point_dump(f, tp); +} + +void +__rte_trace_mem_per_thread_alloc(void) +{ + struct trace *trace = trace_obj_get(); + struct __rte_trace_header *header; + uint32_t count; + + if (!rte_trace_is_enabled()) + return; + + if (RTE_PER_LCORE(trace_mem)) + return; + + rte_spinlock_lock(&trace->lock); + + count = trace->nb_trace_mem_list; + + /* Allocate room for storing the thread trace mem meta */ + trace->lcore_meta = realloc(trace->lcore_meta, + sizeof(trace->lcore_meta[0]) * (count + 1)); + + /* Provide dummy space for fast path to consume */ + if (trace->lcore_meta == NULL) { + trace_crit("trace mem meta memory realloc failed"); + header = NULL; + goto fail; + } + + /* First attempt from huge page */ + header = eal_malloc_no_trace(NULL, trace_mem_sz(trace->buff_len), 8); + if (header) { + trace->lcore_meta[count].area = TRACE_AREA_HUGEPAGE; + goto found; + } + + /* Second attempt from heap */ + header = malloc(trace_mem_sz(trace->buff_len)); + if (header == NULL) { + trace_crit("trace mem malloc attempt failed"); + header = NULL; + goto fail; + + } + + /* Second attempt from heap is success */ + trace->lcore_meta[count].area = TRACE_AREA_HEAP; + + /* Initialize the trace header */ +found: + header->offset = 0; + header->len = trace->buff_len; + header->stream_header.magic = TRACE_CTF_MAGIC; + rte_uuid_copy(header->stream_header.uuid, trace->uuid); + header->stream_header.lcore_id = rte_lcore_id(); + + /* Store the thread name */ + char *name = header->stream_header.thread_name; + memset(name, 0, __RTE_TRACE_EMIT_STRING_LEN_MAX); + rte_thread_getname(pthread_self(), name, + __RTE_TRACE_EMIT_STRING_LEN_MAX); + + trace->lcore_meta[count].mem = header; + trace->nb_trace_mem_list++; +fail: + RTE_PER_LCORE(trace_mem) = header; + rte_spinlock_unlock(&trace->lock); +} + +void +trace_mem_per_thread_free(void) +{ + struct trace *trace = trace_obj_get(); + uint32_t count; + void *mem; + + if (!rte_trace_is_enabled()) + return; + + rte_spinlock_lock(&trace->lock); + for (count = 0; count < trace->nb_trace_mem_list; count++) { + mem = trace->lcore_meta[count].mem; + if (trace->lcore_meta[count].area == TRACE_AREA_HUGEPAGE) + eal_free_no_trace(mem); + else if (trace->lcore_meta[count].area == TRACE_AREA_HEAP) + free(mem); + } + rte_spinlock_unlock(&trace->lock); +} + +void +__rte_trace_point_emit_field(size_t sz, const char *in, const char *datatype) +{ + char *field = RTE_PER_LCORE(ctf_field); + int count = RTE_PER_LCORE(ctf_count); + size_t size; + int rc; + + size = RTE_MAX(0, TRACE_CTF_FIELD_SIZE - 1 - count); + RTE_PER_LCORE(trace_point_sz) += sz; + rc = snprintf(RTE_PTR_ADD(field, count), size, "%s %s;", datatype, in); + if (rc <= 0 || (size_t)rc >= size) { + RTE_PER_LCORE(trace_point_sz) = 0; + trace_crit("CTF field is too long"); + return; + } + RTE_PER_LCORE(ctf_count) += rc; +} + +int +__rte_trace_point_register(rte_trace_point_t *handle, const char *name, + void (*register_fn)(void)) +{ + char *field = RTE_PER_LCORE(ctf_field); + struct trace_point *tp; + uint16_t sz; + + /* Sanity checks of arguments */ + if (name == NULL || register_fn == NULL || handle == NULL) { + trace_err("invalid arguments"); + rte_errno = EINVAL; + goto fail; + } + + /* Check the size of the trace point object */ + RTE_PER_LCORE(trace_point_sz) = 0; + RTE_PER_LCORE(ctf_count) = 0; + register_fn(); + if (RTE_PER_LCORE(trace_point_sz) == 0) { + trace_err("missing rte_trace_emit_header() in register fn"); + rte_errno = EBADF; + goto fail; + } + + /* Is size overflowed */ + if (RTE_PER_LCORE(trace_point_sz) > UINT16_MAX) { + trace_err("trace point size overflowed"); + rte_errno = ENOSPC; + goto fail; + } + + /* Are we running out of space to store trace points? */ + if (trace.nb_trace_points > UINT16_MAX) { + trace_err("trace point exceeds the max count"); + rte_errno = ENOSPC; + goto fail; + } + + /* Get the size of the trace point */ + sz = RTE_PER_LCORE(trace_point_sz); + tp = calloc(1, sizeof(struct trace_point)); + if (tp == NULL) { + trace_err("fail to allocate trace point memory"); + rte_errno = ENOMEM; + goto fail; + } + + /* Initialize the trace point */ + if (rte_strscpy(tp->name, name, TRACE_POINT_NAME_SIZE) < 0) { + trace_err("name is too long"); + rte_errno = E2BIG; + goto free; + } + + /* Copy the field data for future use */ + if (rte_strscpy(tp->ctf_field, field, TRACE_CTF_FIELD_SIZE) < 0) { + trace_err("CTF field size is too long"); + rte_errno = E2BIG; + goto free; + } + + /* Clear field memory for the next event */ + memset(field, 0, TRACE_CTF_FIELD_SIZE); + + /* Form the trace handle */ + *handle = sz; + *handle |= trace.nb_trace_points << __RTE_TRACE_FIELD_ID_SHIFT; + + trace.nb_trace_points++; + tp->handle = handle; + + /* Add the trace point at tail */ + STAILQ_INSERT_TAIL(&tp_list, tp, next); + __atomic_thread_fence(__ATOMIC_RELEASE); + + /* All Good !!! */ + return 0; +free: + free(tp); +fail: + if (trace.register_errno == 0) + trace.register_errno = rte_errno; + + return -rte_errno; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_ctf.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_ctf.c new file mode 100644 index 000000000..302e2bb74 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_ctf.c @@ -0,0 +1,488 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#include <inttypes.h> +#include <time.h> + +#include <rte_byteorder.h> +#include <rte_common.h> +#include <rte_time.h> +#include <rte_trace.h> +#include <rte_version.h> + +#include "eal_trace.h" + +__rte_format_printf(2, 0) +static int +metadata_printf(char **str, const char *fmt, ...) +{ + va_list ap; + int rc; + + *str = NULL; + va_start(ap, fmt); + rc = vasprintf(str, fmt, ap); + va_end(ap); + + return rc; +} + +static int +meta_copy(char **meta, int *offset, char *str, int rc) +{ + int count = *offset; + char *ptr = *meta; + + if (rc < 0) + return rc; + + ptr = realloc(ptr, count + rc); + if (ptr == NULL) + goto free_str; + + memcpy(RTE_PTR_ADD(ptr, count), str, rc); + count += rc; + free(str); + + *meta = ptr; + *offset = count; + + return rc; + +free_str: + if (str) + free(str); + return -ENOMEM; +} + +static int +meta_data_type_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "/* CTF 1.8 */\n" + "typealias integer {size = 8; base = x;}:= uint8_t;\n" + "typealias integer {size = 16; base = x;} := uint16_t;\n" + "typealias integer {size = 32; base = x;} := uint32_t;\n" + "typealias integer {size = 64; base = x;} := uint64_t;\n" + "typealias integer {size = 8; signed = true;} := int8_t;\n" + "typealias integer {size = 16; signed = true;} := int16_t;\n" + "typealias integer {size = 32; signed = true;} := int32_t;\n" + "typealias integer {size = 64; signed = true;} := int64_t;\n" +#ifdef RTE_ARCH_64 + "typealias integer {size = 64; base = x;} := uintptr_t;\n" +#else + "typealias integer {size = 32; base = x;} := uintptr_t;\n" +#endif +#ifdef RTE_ARCH_64 + "typealias integer {size = 64; base = x;} := long;\n" +#else + "typealias integer {size = 32; base = x;} := long;\n" +#endif + "typealias integer {size = 8; signed = false; encoding = ASCII; } := string_bounded_t;\n\n" + "typealias floating_point {\n" + " exp_dig = 8;\n" + " mant_dig = 24;\n" + "} := float;\n\n" + "typealias floating_point {\n" + " exp_dig = 11;\n" + " mant_dig = 53;\n" + "} := double;\n\n"); + + return meta_copy(meta, offset, str, rc); +} + +static int +is_be(void) +{ +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + return 1; +#else + return 0; +#endif +} + +static int +meta_header_emit(char **meta, int *offset) +{ + struct trace *trace = trace_obj_get(); + char uustr[RTE_UUID_STRLEN]; + char *str = NULL; + int rc; + + rte_uuid_unparse(trace->uuid, uustr, RTE_UUID_STRLEN); + rc = metadata_printf(&str, + "trace {\n" + " major = 1;\n" + " minor = 8;\n" + " uuid = \"%s\";\n" + " byte_order = %s;\n" + " packet.header := struct {\n" + " uint32_t magic;\n" + " uint8_t uuid[16];\n" + " };\n" + "};\n\n", uustr, is_be() ? "be" : "le"); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_env_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "env {\n" + " dpdk_version = \"%s\";\n" + " tracer_name = \"dpdk\";\n" + "};\n\n", rte_version()); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_clock_pass1_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "clock {\n" + " name = \"dpdk\";\n" + " freq = "); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_clock_pass2_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "%20"PRIu64";\n" + " offset_s =", 0); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_clock_pass3_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "%20"PRIu64";\n" + " offset =", 0); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_clock_pass4_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "%20"PRIu64";\n};\n\n" + "typealias integer {\n" + " size = 48; align = 1; signed = false;\n" + " map = clock.dpdk.value;\n" + "} := uint48_clock_dpdk_t;\n\n", 0); + + return meta_copy(meta, offset, str, rc); +} + +static int +meta_stream_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "stream {\n" + " packet.context := struct {\n" + " uint32_t cpu_id;\n" + " string_bounded_t name[32];\n" + " };\n" + " event.header := struct {\n" + " uint48_clock_dpdk_t timestamp;\n" + " uint16_t id;\n" + " } align(64);\n" + "};\n\n"); + return meta_copy(meta, offset, str, rc); +} + +static void +string_fixed_replace(char *input, const char *search, const char *replace) +{ + char *found; + size_t len; + + found = strstr(input, search); + if (found == NULL) + return; + + if (strlen(found) != strlen(search)) + return; + + len = strlen(replace); + memcpy(found, replace, len); + found[len] = '\0'; +} + +static void +ctf_fixup_align(char *str) +{ + string_fixed_replace(str, "align", "_align"); +} + +static void +ctf_fixup_arrow_deref(char *str) +{ + const char *replace = "_"; + const char *search = "->"; + char *found; + size_t len; + + found = strstr(str, search); + if (found == NULL) + return; + + do { + memcpy(found, replace, strlen(replace)); + len = strlen(found + 2); + memcpy(found + 1, found + 2, len); + found[len + 1] = '\0'; + found = strstr(str, search); + } while (found != NULL); +} + +static void +ctf_fixup_dot_deref(char *str) +{ + const char *replace = "_"; + const char *search = "."; + char *found; + size_t len; + + found = strstr(str, search); + if (found == NULL) + return; + + len = strlen(replace); + do { + memcpy(found, replace, len); + found = strstr(str, search); + } while (found != NULL); +} + +static void +ctf_fixup_event(char *str) +{ + string_fixed_replace(str, "event", "_event"); +} + +static int +ctf_fixup_keyword(char *str) +{ + char dup_str[TRACE_CTF_FIELD_SIZE]; + char input[TRACE_CTF_FIELD_SIZE]; + const char *delim = ";"; + char *from; + int len; + + if (str == NULL) + return 0; + + len = strlen(str); + if (len >= TRACE_CTF_FIELD_SIZE) { + trace_err("ctf_field reached its maximum limit"); + return -EMSGSIZE; + } + + /* Create duplicate string */ + strcpy(dup_str, str); + + len = 0; + from = strtok(dup_str, delim); + while (from != NULL) { + strcpy(input, from); + ctf_fixup_align(input); + ctf_fixup_dot_deref(input); + ctf_fixup_arrow_deref(input); + ctf_fixup_event(input); + + strcpy(&input[strlen(input)], delim); + if ((len + strlen(input)) >= TRACE_CTF_FIELD_SIZE) { + trace_err("ctf_field reached its maximum limit"); + return -EMSGSIZE; + } + + strcpy(str + len, input); + len += strlen(input); + from = strtok(NULL, delim); + } + + return 0; +} + +static int +meta_event_emit(char **meta, int *offset, struct trace_point *tp) +{ + char *str = NULL; + int rc; + + /* Fixup ctf field string in case it using reserved ctf keywords */ + rc = ctf_fixup_keyword(tp->ctf_field); + if (rc) + return rc; + + rc = metadata_printf(&str, + "event {\n" + " id = %d;\n" + " name = \"%s\";\n" + " fields := struct {\n" + " %s\n" + " };\n" + "};\n\n", trace_id_get(tp->handle), tp->name, tp->ctf_field); + return meta_copy(meta, offset, str, rc); +} + +int +trace_metadata_create(void) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace *trace = trace_obj_get(); + struct trace_point *tp; + int rc, offset = 0; + char *meta = NULL; + + rc = meta_data_type_emit(&meta, &offset); + if (rc < 0) + goto fail; + + rc = meta_header_emit(&meta, &offset); + if (rc < 0) + goto fail; + + rc = meta_env_emit(&meta, &offset); + if (rc < 0) + goto fail; + + rc = meta_clock_pass1_emit(&meta, &offset); + if (rc < 0) + goto fail; + trace->ctf_meta_offset_freq = offset; + + rc = meta_clock_pass2_emit(&meta, &offset); + if (rc < 0) + goto fail; + trace->ctf_meta_offset_freq_off_s = offset; + + rc = meta_clock_pass3_emit(&meta, &offset); + if (rc < 0) + goto fail; + trace->ctf_meta_offset_freq_off = offset; + + rc = meta_clock_pass4_emit(&meta, &offset); + if (rc < 0) + goto fail; + + rc = meta_stream_emit(&meta, &offset); + if (rc < 0) + goto fail; + + STAILQ_FOREACH(tp, tp_list, next) + if (meta_event_emit(&meta, &offset, tp) < 0) + goto fail; + + trace->ctf_meta = meta; + return 0; + +fail: + if (meta) + free(meta); + return -EBADF; +} + +void +trace_metadata_destroy(void) +{ + struct trace *trace = trace_obj_get(); + + if (trace->ctf_meta) { + free(trace->ctf_meta); + trace->ctf_meta = NULL; + } +} + +static void +meta_fix_freq(struct trace *trace, char *meta) +{ + char *str; + int rc; + + str = RTE_PTR_ADD(meta, trace->ctf_meta_offset_freq); + rc = sprintf(str, "%20"PRIu64"", rte_get_timer_hz()); + str[rc] = ';'; +} + +static void +meta_fix_freq_offset(struct trace *trace, char *meta) +{ + uint64_t uptime_tickes_floor, uptime_ticks, freq, uptime_sec; + uint64_t offset, offset_s; + char *str; + int rc; + + uptime_ticks = trace->uptime_ticks & + ((1ULL << __RTE_TRACE_EVENT_HEADER_ID_SHIFT) - 1); + freq = rte_get_tsc_hz(); + uptime_tickes_floor = RTE_ALIGN_MUL_FLOOR(uptime_ticks, freq); + + uptime_sec = uptime_tickes_floor / freq; + offset_s = trace->epoch_sec - uptime_sec; + + offset = uptime_ticks - uptime_tickes_floor; + offset += trace->epoch_nsec * (freq / NSEC_PER_SEC); + + str = RTE_PTR_ADD(meta, trace->ctf_meta_offset_freq_off_s); + rc = sprintf(str, "%20"PRIu64"", offset_s); + str[rc] = ';'; + str = RTE_PTR_ADD(meta, trace->ctf_meta_offset_freq_off); + rc = sprintf(str, "%20"PRIu64"", offset); + str[rc] = ';'; +} + +static void +meta_fixup(struct trace *trace, char *meta) +{ + meta_fix_freq(trace, meta); + meta_fix_freq_offset(trace, meta); +} + +int +rte_trace_metadata_dump(FILE *f) +{ + struct trace *trace = trace_obj_get(); + char *ctf_meta = trace->ctf_meta; + int rc; + + if (!rte_trace_is_enabled()) + return 0; + + if (ctf_meta == NULL) + return -EINVAL; + + if (!__atomic_load_n(&trace->ctf_fixup_done, __ATOMIC_SEQ_CST) && + rte_get_timer_hz()) { + meta_fixup(trace, ctf_meta); + __atomic_store_n(&trace->ctf_fixup_done, 1, __ATOMIC_SEQ_CST); + } + + rc = fprintf(f, "%s", ctf_meta); + return rc < 0 ? rc : 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_points.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_points.c new file mode 100644 index 000000000..4a8ce9088 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_points.c @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#include <rte_trace_point_register.h> + +#include <rte_eal_trace.h> + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_void); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_u64); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_u32); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_u16); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_u8); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_i64); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_i32); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_i16); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_i8); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_int); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_long); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_float); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_double); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_ptr); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_str); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_func); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_alarm_set); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_alarm_cancel); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_mem_zmalloc); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_mem_malloc); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_mem_realloc); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_mem_free); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_memzone_reserve); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_memzone_lookup); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_memzone_free); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_thread_remote_launch); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_thread_lcore_ready); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_intr_callback_register); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_intr_callback_unregister); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_intr_enable); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_intr_disable); + +RTE_INIT(eal_trace_init) +{ + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_void, + lib.eal.generic.void); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_u64, + lib.eal.generic.u64); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_u32, + lib.eal.generic.u32); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_u16, + lib.eal.generic.u16); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_u8, + lib.eal.generic.u8); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_i64, + lib.eal.generic.i64); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_i32, + lib.eal.generic.i32); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_i16, + lib.eal.generic.i16); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_i8, + lib.eal.generic.i8); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_int, + lib.eal.generic.int); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_long, + lib.eal.generic.long); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_float, + lib.eal.generic.float); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_double, + lib.eal.generic.double); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_ptr, + lib.eal.generic.ptr); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_str, + lib.eal.generic.string); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_func, + lib.eal.generic.func); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_alarm_set, + lib.eal.alarm.set); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_alarm_cancel, + lib.eal.alarm.cancel); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_mem_zmalloc, + lib.eal.mem.zmalloc); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_mem_malloc, + lib.eal.mem.malloc); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_mem_realloc, + lib.eal.mem.realloc); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_mem_free, + lib.eal.mem.free); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_memzone_reserve, + lib.eal.memzone.reserve); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_memzone_lookup, + lib.eal.memzone.lookup); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_memzone_free, + lib.eal.memzone.free); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_thread_remote_launch, + lib.eal.thread.remote.launch); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_thread_lcore_ready, + lib.eal.thread.lcore.ready); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_intr_callback_register, + lib.eal.intr.register); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_intr_callback_unregister, + lib.eal.intr.unregister); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_intr_enable, + lib.eal.intr.enable); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_intr_disable, + lib.eal.intr.disable); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_utils.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_utils.c new file mode 100644 index 000000000..64f58fb66 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_utils.c @@ -0,0 +1,448 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#include <fnmatch.h> +#include <pwd.h> +#include <sys/stat.h> +#include <time.h> + +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_string_fns.h> + +#include "eal_filesystem.h" +#include "eal_trace.h" + +const char * +trace_mode_to_string(enum rte_trace_mode mode) +{ + switch (mode) { + case RTE_TRACE_MODE_OVERWRITE: return "overwrite"; + case RTE_TRACE_MODE_DISCARD: return "discard"; + default: return "unknown"; + } +} + +const char * +trace_area_to_string(enum trace_area_e area) +{ + switch (area) { + case TRACE_AREA_HEAP: return "heap"; + case TRACE_AREA_HUGEPAGE: return "hugepage"; + default: return "unknown"; + } +} + +static bool +trace_entry_compare(const char *name) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace_point *tp; + int count = 0; + + STAILQ_FOREACH(tp, tp_list, next) { + if (strncmp(tp->name, name, TRACE_POINT_NAME_SIZE) == 0) + count++; + if (count > 1) { + trace_err("found duplicate entry %s", name); + rte_errno = EEXIST; + return true; + } + } + return false; +} + +bool +trace_has_duplicate_entry(void) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace_point *tp; + + /* Is duplicate trace name registered */ + STAILQ_FOREACH(tp, tp_list, next) + if (trace_entry_compare(tp->name)) + return true; + + return false; +} + +void +trace_uuid_generate(void) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace *trace = trace_obj_get(); + struct trace_point *tp; + uint64_t sz_total = 0; + + /* Go over the registered trace points to get total size of events */ + STAILQ_FOREACH(tp, tp_list, next) { + const uint16_t sz = *tp->handle & __RTE_TRACE_FIELD_SIZE_MASK; + sz_total += sz; + } + + rte_uuid_t uuid = RTE_UUID_INIT(sz_total, trace->nb_trace_points, + 0x4370, 0x8f50, 0x222ddd514176ULL); + rte_uuid_copy(trace->uuid, uuid); +} + +static int +trace_session_name_generate(char *trace_dir) +{ + struct tm *tm_result; + time_t tm; + int rc; + + tm = time(NULL); + if ((int)tm == -1) + goto fail; + + tm_result = localtime(&tm); + if (tm_result == NULL) + goto fail; + + rc = rte_strscpy(trace_dir, eal_get_hugefile_prefix(), + TRACE_PREFIX_LEN); + if (rc == -E2BIG) + rc = TRACE_PREFIX_LEN; + trace_dir[rc++] = '-'; + + rc = strftime(trace_dir + rc, TRACE_DIR_STR_LEN - rc, + "%Y-%m-%d-%p-%I-%M-%S", tm_result); + if (rc == 0) + goto fail; + + return rc; +fail: + rte_errno = errno; + return -rte_errno; +} + +static int +trace_dir_update(const char *str) +{ + struct trace *trace = trace_obj_get(); + int rc, remaining; + + remaining = sizeof(trace->dir) - trace->dir_offset; + rc = rte_strscpy(&trace->dir[0] + trace->dir_offset, str, remaining); + if (rc < 0) + goto fail; + + trace->dir_offset += rc; +fail: + return rc; +} + +int +eal_trace_args_save(const char *val) +{ + struct trace *trace = trace_obj_get(); + struct trace_arg *arg = malloc(sizeof(*arg)); + + if (arg == NULL) { + trace_err("failed to allocate memory for %s", val); + return -ENOMEM; + } + + arg->val = strdup(val); + if (arg->val == NULL) { + trace_err("failed to allocate memory for %s", val); + free(arg); + return -ENOMEM; + } + + STAILQ_INSERT_TAIL(&trace->args, arg, next); + return 0; +} + +void +eal_trace_args_free(void) +{ + struct trace *trace = trace_obj_get(); + struct trace_arg *arg; + + while (!STAILQ_EMPTY(&trace->args)) { + arg = STAILQ_FIRST(&trace->args); + STAILQ_REMOVE_HEAD(&trace->args, next); + free(arg->val); + free(arg); + } +} + +int +trace_args_apply(const char *arg) +{ + if (rte_trace_regexp(arg, true) < 0) { + trace_err("cannot enable trace for %s", arg); + return -1; + } + + return 0; +} + +int +eal_trace_bufsz_args_save(char const *val) +{ + struct trace *trace = trace_obj_get(); + uint64_t bufsz; + + bufsz = rte_str_to_size(val); + if (bufsz == 0) { + trace_err("buffer size cannot be zero"); + return -EINVAL; + } + + trace->buff_len = bufsz; + return 0; +} + +void +trace_bufsz_args_apply(void) +{ + struct trace *trace = trace_obj_get(); + + if (trace->buff_len == 0) + trace->buff_len = 1024 * 1024; /* 1MB */ +} + +int +eal_trace_mode_args_save(const char *val) +{ + struct trace *trace = trace_obj_get(); + size_t len = strlen(val); + unsigned long tmp; + char *pattern; + + if (len == 0) { + trace_err("value is not provided with option"); + return -EINVAL; + } + + pattern = (char *)calloc(1, len + 2); + if (pattern == NULL) { + trace_err("fail to allocate memory"); + return -ENOMEM; + } + + sprintf(pattern, "%s*", val); + + if (fnmatch(pattern, "overwrite", 0) == 0) + tmp = RTE_TRACE_MODE_OVERWRITE; + else if (fnmatch(pattern, "discard", 0) == 0) + tmp = RTE_TRACE_MODE_DISCARD; + else { + free(pattern); + return -EINVAL; + } + + trace->mode = tmp; + free(pattern); + return 0; +} + +int +eal_trace_dir_args_save(char const *val) +{ + struct trace *trace = trace_obj_get(); + char *dir_path; + int rc; + + if (strlen(val) >= sizeof(trace->dir) - 1) { + trace_err("input string is too big"); + return -ENAMETOOLONG; + } + + if (asprintf(&dir_path, "%s/", val) == -1) { + trace_err("failed to copy directory: %s", strerror(errno)); + return -ENOMEM; + } + + rc = trace_dir_update(dir_path); + + free(dir_path); + return rc; +} + +int +trace_epoch_time_save(void) +{ + struct trace *trace = trace_obj_get(); + struct timespec epoch = { 0, 0 }; + uint64_t avg, start, end; + + start = rte_get_tsc_cycles(); + if (clock_gettime(CLOCK_REALTIME, &epoch) < 0) { + trace_err("failed to get the epoch time"); + return -1; + } + end = rte_get_tsc_cycles(); + avg = (start + end) >> 1; + + trace->epoch_sec = (uint64_t) epoch.tv_sec; + trace->epoch_nsec = (uint64_t) epoch.tv_nsec; + trace->uptime_ticks = avg; + + return 0; +} + +static int +trace_dir_default_path_get(char *dir_path) +{ + struct trace *trace = trace_obj_get(); + uint32_t size = sizeof(trace->dir); + struct passwd *pwd; + char *home_dir; + + /* First check for shell environment variable */ + home_dir = getenv("HOME"); + if (home_dir == NULL) { + /* Fallback to password file entry */ + pwd = getpwuid(getuid()); + if (pwd == NULL) + return -EINVAL; + + home_dir = pwd->pw_dir; + } + + /* Append dpdk-traces to directory */ + if (snprintf(dir_path, size, "%s/dpdk-traces/", home_dir) < 0) + return -ENAMETOOLONG; + + return 0; +} + +int +trace_mkdir(void) +{ + struct trace *trace = trace_obj_get(); + char session[TRACE_DIR_STR_LEN]; + char *dir_path; + int rc; + + if (!trace->dir_offset) { + dir_path = calloc(1, sizeof(trace->dir)); + if (dir_path == NULL) { + trace_err("fail to allocate memory"); + return -ENOMEM; + } + + rc = trace_dir_default_path_get(dir_path); + if (rc < 0) { + trace_err("fail to get default path"); + free(dir_path); + return rc; + } + + rc = trace_dir_update(dir_path); + free(dir_path); + if (rc < 0) + return rc; + } + + /* Create the path if it t exist, no "mkdir -p" available here */ + rc = mkdir(trace->dir, 0700); + if (rc < 0 && errno != EEXIST) { + trace_err("mkdir %s failed [%s]", trace->dir, strerror(errno)); + rte_errno = errno; + return -rte_errno; + } + + rc = trace_session_name_generate(session); + if (rc < 0) + return rc; + rc = trace_dir_update(session); + if (rc < 0) + return rc; + + rc = mkdir(trace->dir, 0700); + if (rc < 0) { + trace_err("mkdir %s failed [%s]", trace->dir, strerror(errno)); + rte_errno = errno; + return -rte_errno; + } + + RTE_LOG(INFO, EAL, "Trace dir: %s\n", trace->dir); + return 0; +} + +static int +trace_meta_save(struct trace *trace) +{ + char file_name[PATH_MAX]; + FILE *f; + int rc; + + rc = snprintf(file_name, PATH_MAX, "%s/metadata", trace->dir); + if (rc < 0) + return rc; + + f = fopen(file_name, "w"); + if (f == NULL) + return -errno; + + rc = rte_trace_metadata_dump(f); + + if (fclose(f)) + rc = -errno; + + return rc; +} + + +static inline int +trace_file_sz(struct __rte_trace_header *hdr) +{ + return sizeof(struct __rte_trace_stream_header) + hdr->offset; +} + +static int +trace_mem_save(struct trace *trace, struct __rte_trace_header *hdr, + uint32_t cnt) +{ + char file_name[PATH_MAX]; + FILE *f; + int rc; + + rc = snprintf(file_name, PATH_MAX, "%s/channel0_%d", trace->dir, cnt); + if (rc < 0) + return rc; + + f = fopen(file_name, "w"); + if (f == NULL) + return -errno; + + rc = fwrite(&hdr->stream_header, trace_file_sz(hdr), 1, f); + rc = (rc == 1) ? 0 : -EACCES; + + if (fclose(f)) + rc = -errno; + + return rc; +} + +int +rte_trace_save(void) +{ + struct trace *trace = trace_obj_get(); + struct __rte_trace_header *header; + uint32_t count; + int rc = 0; + + if (trace->nb_trace_mem_list == 0) + return rc; + + rc = trace_meta_save(trace); + if (rc) + return rc; + + rte_spinlock_lock(&trace->lock); + for (count = 0; count < trace->nb_trace_mem_list; count++) { + header = trace->lcore_meta[count].mem; + rc = trace_mem_save(trace, header, count); + if (rc) + break; + } + rte_spinlock_unlock(&trace->lock); + return rc; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c new file mode 100644 index 000000000..0a80bfbb3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) 1996, 1997 Theodore Ts'o. + */ + +#include <stdio.h> +#include <string.h> +#include <stdint.h> +#include <stdlib.h> +#include <ctype.h> + +#include <rte_uuid.h> + +/* UUID packed form */ +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint16_t clock_seq; + uint8_t node[6]; +}; + +static void uuid_pack(const struct uuid *uu, rte_uuid_t ptr) +{ + uint32_t tmp; + uint8_t *out = ptr; + + tmp = uu->time_low; + out[3] = (uint8_t) tmp; + tmp >>= 8; + out[2] = (uint8_t) tmp; + tmp >>= 8; + out[1] = (uint8_t) tmp; + tmp >>= 8; + out[0] = (uint8_t) tmp; + + tmp = uu->time_mid; + out[5] = (uint8_t) tmp; + tmp >>= 8; + out[4] = (uint8_t) tmp; + + tmp = uu->time_hi_and_version; + out[7] = (uint8_t) tmp; + tmp >>= 8; + out[6] = (uint8_t) tmp; + + tmp = uu->clock_seq; + out[9] = (uint8_t) tmp; + tmp >>= 8; + out[8] = (uint8_t) tmp; + + memcpy(out+10, uu->node, 6); +} + +static void uuid_unpack(const rte_uuid_t in, struct uuid *uu) +{ + const uint8_t *ptr = in; + uint32_t tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_low = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_mid = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_hi_and_version = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->clock_seq = tmp; + + memcpy(uu->node, ptr, 6); +} + +bool rte_uuid_is_null(const rte_uuid_t uu) +{ + const uint8_t *cp = uu; + int i; + + for (i = 0; i < 16; i++) + if (*cp++) + return false; + return true; +} + +/* + * rte_uuid_compare() - compare two UUIDs. + */ +int rte_uuid_compare(const rte_uuid_t uu1, const rte_uuid_t uu2) +{ + struct uuid uuid1, uuid2; + + uuid_unpack(uu1, &uuid1); + uuid_unpack(uu2, &uuid2); + +#define UUCMP(u1, u2) \ + do { if (u1 != u2) return (u1 < u2) ? -1 : 1; } while (0) + + UUCMP(uuid1.time_low, uuid2.time_low); + UUCMP(uuid1.time_mid, uuid2.time_mid); + UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version); + UUCMP(uuid1.clock_seq, uuid2.clock_seq); +#undef UUCMP + + return memcmp(uuid1.node, uuid2.node, 6); +} + +int rte_uuid_parse(const char *in, rte_uuid_t uu) +{ + struct uuid uuid; + int i; + const char *cp; + char buf[3]; + + if (strlen(in) != 36) + return -1; + + for (i = 0, cp = in; i <= 36; i++, cp++) { + if ((i == 8) || (i == 13) || (i == 18) || + (i == 23)) { + if (*cp == '-') + continue; + else + return -1; + } + if (i == 36) + if (*cp == 0) + continue; + if (!isxdigit(*cp)) + return -1; + } + + uuid.time_low = strtoul(in, NULL, 16); + uuid.time_mid = strtoul(in+9, NULL, 16); + uuid.time_hi_and_version = strtoul(in+14, NULL, 16); + uuid.clock_seq = strtoul(in+19, NULL, 16); + cp = in+24; + buf[2] = 0; + + for (i = 0; i < 6; i++) { + buf[0] = *cp++; + buf[1] = *cp++; + uuid.node[i] = strtoul(buf, NULL, 16); + } + + uuid_pack(&uuid, uu); + return 0; +} + +void rte_uuid_unparse(const rte_uuid_t uu, char *out, size_t len) +{ + struct uuid uuid; + + uuid_unpack(uu, &uuid); + + snprintf(out, len, + "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", + uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, + uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, + uuid.node[0], uuid.node[1], uuid.node[2], + uuid.node[3], uuid.node[4], uuid.node[5]); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h b/src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h new file mode 100644 index 000000000..5d21f07c2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +/** + * @file + * Stores functions and path defines for files and directories + * on the filesystem for Linux, that are used by the Linux EAL. + */ + +#ifndef EAL_FILESYSTEM_H +#define EAL_FILESYSTEM_H + +/** Path of rte config file. */ + +#include <stdint.h> +#include <limits.h> +#include <unistd.h> +#include <stdlib.h> + +#include <rte_string_fns.h> +#include "eal_internal_cfg.h" + +/* sets up platform-specific runtime data dir */ +int +eal_create_runtime_dir(void); + +int +eal_clean_runtime_dir(void); + +/** Function to return hugefile prefix that's currently set up */ +const char * +eal_get_hugefile_prefix(void); + +#define RUNTIME_CONFIG_FNAME "config" +static inline const char * +eal_runtime_config_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer), "%s/%s", rte_eal_get_runtime_dir(), + RUNTIME_CONFIG_FNAME); + return buffer; +} + +/** Path of primary/secondary communication unix socket file. */ +#define MP_SOCKET_FNAME "mp_socket" +static inline const char * +eal_mp_socket_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer), "%s/%s", rte_eal_get_runtime_dir(), + MP_SOCKET_FNAME); + return buffer; +} + +#define FBARRAY_NAME_FMT "%s/fbarray_%s" +static inline const char * +eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) { + snprintf(buffer, buflen, FBARRAY_NAME_FMT, rte_eal_get_runtime_dir(), + name); + return buffer; +} + +/** Path of hugepage info file. */ +#define HUGEPAGE_INFO_FNAME "hugepage_info" +static inline const char * +eal_hugepage_info_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer), "%s/%s", rte_eal_get_runtime_dir(), + HUGEPAGE_INFO_FNAME); + return buffer; +} + +/** Path of hugepage data file. */ +#define HUGEPAGE_DATA_FNAME "hugepage_data" +static inline const char * +eal_hugepage_data_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer), "%s/%s", rte_eal_get_runtime_dir(), + HUGEPAGE_DATA_FNAME); + return buffer; +} + +/** String format for hugepage map files. */ +#define HUGEFILE_FMT "%s/%smap_%d" +static inline const char * +eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id) +{ + snprintf(buffer, buflen, HUGEFILE_FMT, hugedir, + eal_get_hugefile_prefix(), f_id); + return buffer; +} + +/** define the default filename prefix for the %s values above */ +#define HUGEFILE_PREFIX_DEFAULT "rte" + +/** Function to read a single numeric value from a file on the filesystem. + * Used to read information from files on /sys */ +int eal_parse_sysfs_value(const char *filename, unsigned long *val); + +#endif /* EAL_FILESYSTEM_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h b/src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h new file mode 100644 index 000000000..1b560d337 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_HUGEPAGES_H +#define EAL_HUGEPAGES_H + +#include <stddef.h> +#include <stdint.h> +#include <limits.h> + +#define MAX_HUGEPAGE_PATH PATH_MAX + +/** + * Structure used to store information about hugepages that we mapped + * through the files in hugetlbfs. + */ +struct hugepage_file { + void *orig_va; /**< virtual addr of first mmap() */ + void *final_va; /**< virtual addr of 2nd mmap() */ + uint64_t physaddr; /**< physical addr */ + size_t size; /**< the page size */ + int socket_id; /**< NUMA socket ID */ + int file_id; /**< the '%d' in HUGEFILE_FMT */ + char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */ +}; + +/** + * Read the information on what hugepages are available for the EAL to use, + * clearing out any unused ones. + */ +int eal_hugepage_info_init(void); + +/** + * Read whatever information primary process has shared about hugepages into + * secondary process. + */ +int eal_hugepage_info_read(void); + +#endif /* EAL_HUGEPAGES_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h b/src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h new file mode 100644 index 000000000..c650bc081 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Holds the structures for the eal internal configuration + */ + +#ifndef EAL_INTERNAL_CFG_H +#define EAL_INTERNAL_CFG_H + +#include <rte_eal.h> +#include <rte_pci_dev_feature_defs.h> + +#include "eal_thread.h" + +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) +#define MAX_HUGEPAGE_SIZES 4 /**< support up to 4 page sizes */ +#else +#define MAX_HUGEPAGE_SIZES 3 /**< support up to 3 page sizes */ +#endif + +/* + * internal configuration structure for the number, size and + * mount points of hugepages + */ +struct hugepage_info { + uint64_t hugepage_sz; /**< size of a huge page */ + char hugedir[PATH_MAX]; /**< dir where hugetlbfs is mounted */ + uint32_t num_pages[RTE_MAX_NUMA_NODES]; + /**< number of hugepages of that size on each socket */ + int lock_descriptor; /**< file descriptor for hugepage dir */ +}; + +/** + * internal configuration + */ +struct internal_config { + volatile size_t memory; /**< amount of asked memory */ + volatile unsigned force_nchannel; /**< force number of channels */ + volatile unsigned force_nrank; /**< force number of ranks */ + volatile unsigned no_hugetlbfs; /**< true to disable hugetlbfs */ + unsigned hugepage_unlink; /**< true to unlink backing files */ + volatile unsigned no_pci; /**< true to disable PCI */ + volatile unsigned no_hpet; /**< true to disable HPET */ + volatile unsigned vmware_tsc_map; /**< true to use VMware TSC mapping + * instead of native TSC */ + volatile unsigned no_shconf; /**< true if there is no shared config */ + volatile unsigned in_memory; + /**< true if DPDK should operate entirely in-memory and not create any + * shared files or runtime data. + */ + volatile unsigned create_uio_dev; /**< true to create /dev/uioX devices */ + volatile enum rte_proc_type_t process_type; /**< multi-process proc type */ + /** true to try allocating memory on specific sockets */ + volatile unsigned force_sockets; + volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */ + volatile unsigned force_socket_limits; + volatile uint64_t socket_limit[RTE_MAX_NUMA_NODES]; /**< limit amount of memory per socket */ + uintptr_t base_virtaddr; /**< base address to try and reserve memory from */ + volatile unsigned legacy_mem; + /**< true to enable legacy memory behavior (no dynamic allocation, + * IOVA-contiguous segments). + */ + volatile unsigned match_allocations; + /**< true to free hugepages exactly as allocated */ + volatile unsigned single_file_segments; + /**< true if storing all pages within single files (per-page-size, + * per-node) non-legacy mode only. + */ + volatile int syslog_facility; /**< facility passed to openlog() */ + /** default interrupt mode for VFIO */ + volatile enum rte_intr_mode vfio_intr_mode; + char *hugefile_prefix; /**< the base filename of hugetlbfs files */ + char *hugepage_dir; /**< specific hugetlbfs directory to use */ + char *user_mbuf_pool_ops_name; + /**< user defined mbuf pool ops name */ + unsigned num_hugepage_sizes; /**< how many sizes on this system */ + struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES]; + enum rte_iova_mode iova_mode ; /**< Set IOVA mode on this system */ + rte_cpuset_t ctrl_cpuset; /**< cpuset for ctrl threads */ + volatile unsigned int init_complete; + /**< indicates whether EAL has completed initialization */ + unsigned int no_telemetry; /**< true to disable Telemetry */ +}; +extern struct internal_config internal_config; /**< Global EAL configuration. */ + +void eal_reset_internal_config(struct internal_config *internal_cfg); + +#endif /* EAL_INTERNAL_CFG_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h b/src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h new file mode 100644 index 000000000..e953cd84e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef EAL_MEMALLOC_H +#define EAL_MEMALLOC_H + +#include <stdbool.h> + +#include <rte_memory.h> + +/* + * Allocate segment of specified page size. + */ +struct rte_memseg * +eal_memalloc_alloc_seg(size_t page_sz, int socket); + +/* + * Allocate `n_segs` segments. + * + * Note: `ms` can be NULL. + * + * Note: it is possible to request best-effort allocation by setting `exact` to + * `false`, in which case allocator will return however many pages it managed to + * allocate successfully. + */ +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, + int socket, bool exact); + +/* + * Deallocate segment + */ +int +eal_memalloc_free_seg(struct rte_memseg *ms); + +/* + * Deallocate `n_segs` segments. Returns 0 on successful deallocation of all + * segments, returns -1 on error. Any segments that could have been deallocated, + * will be deallocated even in case of error. + */ +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs); + +/* + * Check if memory pointed to by `start` and of `length` that resides in + * memseg list `msl` is IOVA-contiguous. + */ +bool +eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start, + size_t len); + +/* synchronize local memory map to primary process */ +int +eal_memalloc_sync_with_primary(void); + +int +eal_memalloc_mem_event_callback_register(const char *name, + rte_mem_event_callback_t clb, void *arg); + +int +eal_memalloc_mem_event_callback_unregister(const char *name, void *arg); + +void +eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start, + size_t len); + +int +eal_memalloc_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit); + +int +eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id); + +int +eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len); + +/* returns fd or -errno */ +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx); + +/* returns 0 or -errno */ +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd); + +/* returns 0 or -errno */ +int +eal_memalloc_set_seg_list_fd(int list_idx, int fd); + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset); + +int +eal_memalloc_init(void); + +#endif /* EAL_MEMALLOC_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_memcfg.h b/src/spdk/dpdk/lib/librte_eal/common/eal_memcfg.h new file mode 100644 index 000000000..583fcb595 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_memcfg.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef EAL_MEMCFG_H +#define EAL_MEMCFG_H + +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_pause.h> +#include <rte_spinlock.h> +#include <rte_rwlock.h> +#include <rte_tailq.h> + +#include "malloc_heap.h" + +/** + * Memory configuration shared across multiple processes. + */ +struct rte_mem_config { + volatile uint32_t magic; /**< Magic number - sanity check. */ + uint32_t version; + /**< Prevent secondary processes using different DPDK versions. */ + + /* memory topology */ + uint32_t nchannel; /**< Number of channels (0 if unknown). */ + uint32_t nrank; /**< Number of ranks (0 if unknown). */ + + /** + * current lock nest order + * - qlock->mlock (ring/hash/lpm) + * - mplock->qlock->mlock (mempool) + * Notice: + * *ALWAYS* obtain qlock first if having to obtain both qlock and mlock + */ + rte_rwlock_t mlock; /**< used by memzones for thread safety. */ + rte_rwlock_t qlock; /**< used by tailqs for thread safety. */ + rte_rwlock_t mplock; /**< used by mempool library for thread safety. */ + rte_spinlock_t tlock; /**< used by timer library for thread safety. */ + + rte_rwlock_t memory_hotplug_lock; + /**< Indicates whether memory hotplug request is in progress. */ + + /* memory segments and zones */ + struct rte_fbarray memzones; /**< Memzone descriptors. */ + + struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS]; + /**< List of dynamic arrays holding memsegs */ + + struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; + /**< Tailqs for objects */ + + struct malloc_heap malloc_heaps[RTE_MAX_HEAPS]; + /**< DPDK malloc heaps */ + + int next_socket_id; /**< Next socket ID for external malloc heap */ + + /* rte_mem_config has to be mapped at the exact same address in all + * processes, so we need to store it. + */ + uint64_t mem_cfg_addr; /**< Address of this structure in memory. */ + + /* Primary and secondary processes cannot run with different legacy or + * single file segments options, so to avoid having to specify these + * options to all processes, store them in shared config and update the + * internal config at init time. + */ + uint32_t legacy_mem; /**< stored legacy mem parameter. */ + uint32_t single_file_segments; + /**< stored single file segments parameter. */ + + uint64_t tsc_hz; + /**< TSC rate */ + + uint8_t dma_maskbits; /**< Keeps the more restricted dma mask. */ +}; + +/* update internal config from shared mem config */ +void +eal_mcfg_update_internal(void); + +/* update shared mem config from internal config */ +void +eal_mcfg_update_from_internal(void); + +/* wait until primary process initialization is complete */ +void +eal_mcfg_wait_complete(void); + +/* check if DPDK version of current process matches one stored in the config */ +int +eal_mcfg_check_version(void); + +/* set mem config as complete */ +void +eal_mcfg_complete(void); + +#endif /* EAL_MEMCFG_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_options.h b/src/spdk/dpdk/lib/librte_eal/common/eal_options.h new file mode 100644 index 000000000..18e6da9ab --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_options.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2014 6WIND S.A. + */ + +#ifndef EAL_OPTIONS_H +#define EAL_OPTIONS_H + +#include "getopt.h" + +struct rte_tel_data; + +enum { + /* long options mapped to a short option */ +#define OPT_HELP "help" + OPT_HELP_NUM = 'h', +#define OPT_PCI_BLACKLIST "pci-blacklist" + OPT_PCI_BLACKLIST_NUM = 'b', +#define OPT_PCI_WHITELIST "pci-whitelist" + OPT_PCI_WHITELIST_NUM = 'w', + + /* first long only option value must be >= 256, so that we won't + * conflict with short options */ + OPT_LONG_MIN_NUM = 256, +#define OPT_BASE_VIRTADDR "base-virtaddr" + OPT_BASE_VIRTADDR_NUM, +#define OPT_CREATE_UIO_DEV "create-uio-dev" + OPT_CREATE_UIO_DEV_NUM, +#define OPT_FILE_PREFIX "file-prefix" + OPT_FILE_PREFIX_NUM, +#define OPT_HUGE_DIR "huge-dir" + OPT_HUGE_DIR_NUM, +#define OPT_HUGE_UNLINK "huge-unlink" + OPT_HUGE_UNLINK_NUM, +#define OPT_LCORES "lcores" + OPT_LCORES_NUM, +#define OPT_LOG_LEVEL "log-level" + OPT_LOG_LEVEL_NUM, +#define OPT_TRACE "trace" + OPT_TRACE_NUM, +#define OPT_TRACE_DIR "trace-dir" + OPT_TRACE_DIR_NUM, +#define OPT_TRACE_BUF_SIZE "trace-bufsz" + OPT_TRACE_BUF_SIZE_NUM, +#define OPT_TRACE_MODE "trace-mode" + OPT_TRACE_MODE_NUM, +#define OPT_MASTER_LCORE "master-lcore" + OPT_MASTER_LCORE_NUM, +#define OPT_MBUF_POOL_OPS_NAME "mbuf-pool-ops-name" + OPT_MBUF_POOL_OPS_NAME_NUM, +#define OPT_PROC_TYPE "proc-type" + OPT_PROC_TYPE_NUM, +#define OPT_NO_HPET "no-hpet" + OPT_NO_HPET_NUM, +#define OPT_NO_HUGE "no-huge" + OPT_NO_HUGE_NUM, +#define OPT_NO_PCI "no-pci" + OPT_NO_PCI_NUM, +#define OPT_NO_SHCONF "no-shconf" + OPT_NO_SHCONF_NUM, +#define OPT_IN_MEMORY "in-memory" + OPT_IN_MEMORY_NUM, +#define OPT_SOCKET_MEM "socket-mem" + OPT_SOCKET_MEM_NUM, +#define OPT_SOCKET_LIMIT "socket-limit" + OPT_SOCKET_LIMIT_NUM, +#define OPT_SYSLOG "syslog" + OPT_SYSLOG_NUM, +#define OPT_VDEV "vdev" + OPT_VDEV_NUM, +#define OPT_VFIO_INTR "vfio-intr" + OPT_VFIO_INTR_NUM, +#define OPT_VMWARE_TSC_MAP "vmware-tsc-map" + OPT_VMWARE_TSC_MAP_NUM, +#define OPT_LEGACY_MEM "legacy-mem" + OPT_LEGACY_MEM_NUM, +#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments" + OPT_SINGLE_FILE_SEGMENTS_NUM, +#define OPT_IOVA_MODE "iova-mode" + OPT_IOVA_MODE_NUM, +#define OPT_MATCH_ALLOCATIONS "match-allocations" + OPT_MATCH_ALLOCATIONS_NUM, +#define OPT_TELEMETRY "telemetry" + OPT_TELEMETRY_NUM, +#define OPT_NO_TELEMETRY "no-telemetry" + OPT_NO_TELEMETRY_NUM, + OPT_LONG_MAX_NUM +}; + +extern const char eal_short_options[]; +extern const struct option eal_long_options[]; + +int eal_parse_common_option(int opt, const char *argv, + struct internal_config *conf); +int eal_option_device_parse(void); +int eal_adjust_config(struct internal_config *internal_cfg); +int eal_cleanup_config(struct internal_config *internal_cfg); +int eal_check_common_options(struct internal_config *internal_cfg); +void eal_common_usage(void); +enum rte_proc_type_t eal_proc_type_detect(void); +int eal_plugins_init(void); +int eal_save_args(int argc, char **argv); +int handle_eal_info_request(const char *cmd, const char *params __rte_unused, + struct rte_tel_data *d); + +#endif /* EAL_OPTIONS_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_private.h b/src/spdk/dpdk/lib/librte_eal/common/eal_private.h new file mode 100644 index 000000000..869ce183a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_private.h @@ -0,0 +1,423 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef _EAL_PRIVATE_H_ +#define _EAL_PRIVATE_H_ + +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> + +#include <rte_dev.h> +#include <rte_lcore.h> + +/** + * Structure storing internal configuration (per-lcore) + */ +struct lcore_config { + pthread_t thread_id; /**< pthread identifier */ + int pipe_master2slave[2]; /**< communication pipe with master */ + int pipe_slave2master[2]; /**< communication pipe with master */ + + lcore_function_t * volatile f; /**< function to call */ + void * volatile arg; /**< argument of function */ + volatile int ret; /**< return value of function */ + + volatile enum rte_lcore_state_t state; /**< lcore state */ + unsigned int socket_id; /**< physical socket id for this lcore */ + unsigned int core_id; /**< core number on socket for this lcore */ + int core_index; /**< relative index, starting from 0 */ + uint8_t core_role; /**< role of core eg: OFF, RTE, SERVICE */ + + rte_cpuset_t cpuset; /**< cpu set which the lcore affinity to */ +}; + +extern struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/** + * The global RTE configuration structure. + */ +struct rte_config { + uint32_t master_lcore; /**< Id of the master lcore */ + uint32_t lcore_count; /**< Number of available logical cores. */ + uint32_t numa_node_count; /**< Number of detected NUMA nodes. */ + uint32_t numa_nodes[RTE_MAX_NUMA_NODES]; /**< List of detected NUMA nodes. */ + uint32_t service_lcore_count;/**< Number of available service cores. */ + enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; /**< State of cores. */ + + /** Primary or secondary configuration */ + enum rte_proc_type_t process_type; + + /** PA or VA mapping mode */ + enum rte_iova_mode iova_mode; + + /** + * Pointer to memory configuration, which may be shared across multiple + * DPDK instances + */ + struct rte_mem_config *mem_config; +} __rte_packed; + +/** + * Get the global configuration structure. + * + * @return + * A pointer to the global configuration structure. + */ +struct rte_config *rte_eal_get_configuration(void); + +/** + * Initialize the memzone subsystem (private to eal). + * + * @return + * - 0 on success + * - Negative on error + */ +int rte_eal_memzone_init(void); + +/** + * Common log initialization function (private to eal). Determines + * where log data is written when no call to rte_openlog_stream is + * in effect. + * + * @param default_log + * The default log stream to be used. + * @return + * - 0 on success + * - Negative on error + */ +void eal_log_set_default(FILE *default_log); + +/** + * Fill configuration with number of physical and logical processors + * + * This function is private to EAL. + * + * Parse /proc/cpuinfo to get the number of physical and logical + * processors on the machine. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_cpu_init(void); + +/** + * Create memseg lists + * + * This function is private to EAL. + * + * Preallocate virtual memory. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_memseg_init(void); + +/** + * Map memory + * + * This function is private to EAL. + * + * Fill configuration structure with these infos, and return 0 on success. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_memory_init(void); + +/** + * Configure timers + * + * This function is private to EAL. + * + * Mmap memory areas used by HPET (high precision event timer) that will + * provide our time reference, and configure the TSC frequency also for it + * to be used as a reference. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_timer_init(void); + +/** + * Init the default log stream + * + * This function is private to EAL. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_log_init(const char *id, int facility); + +/** + * Save the log regexp for later + */ +int rte_log_save_regexp(const char *type, int priority); +int rte_log_save_pattern(const char *pattern, int priority); + +/** + * Init tail queues for non-EAL library structures. This is to allow + * the rings, mempools, etc. lists to be shared among multiple processes + * + * This function is private to EAL + * + * @return + * 0 on success, negative on error + */ +int rte_eal_tailqs_init(void); + +/** + * Init interrupt handling. + * + * This function is private to EAL. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_intr_init(void); + +/** + * Init alarm mechanism. This is to allow a callback be called after + * specific time. + * + * This function is private to EAL. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_alarm_init(void); + +/** + * Function is to check if the kernel module(like, vfio, vfio_iommu_type1, + * etc.) loaded. + * + * @param module_name + * The module's name which need to be checked + * + * @return + * -1 means some error happens(NULL pointer or open failure) + * 0 means the module not loaded + * 1 means the module loaded + */ +int rte_eal_check_module(const char *module_name); + +/** + * Get virtual area of specified size from the OS. + * + * This function is private to the EAL. + * + * @param requested_addr + * Address where to request address space. + * @param size + * Size of requested area. + * @param page_sz + * Page size on which to align requested virtual area. + * @param flags + * EAL_VIRTUAL_AREA_* flags. + * @param mmap_flags + * Extra flags passed directly to mmap(). + * + * @return + * Virtual area address if successful. + * NULL if unsuccessful. + */ + +#define EAL_VIRTUAL_AREA_ADDR_IS_HINT (1 << 0) +/**< don't fail if cannot get exact requested address. */ +#define EAL_VIRTUAL_AREA_ALLOW_SHRINK (1 << 1) +/**< try getting smaller sized (decrement by page size) virtual areas if cannot + * get area of requested size. + */ +#define EAL_VIRTUAL_AREA_UNMAP (1 << 2) +/**< immediately unmap reserved virtual area. */ +void * +eal_get_virtual_area(void *requested_addr, size_t *size, + size_t page_sz, int flags, int mmap_flags); + +/** + * Get cpu core_id. + * + * This function is private to the EAL. + */ +unsigned eal_cpu_core_id(unsigned lcore_id); + +/** + * Check if cpu is present. + * + * This function is private to the EAL. + */ +int eal_cpu_detected(unsigned lcore_id); + +/** + * Set TSC frequency from precise value or estimation + * + * This function is private to the EAL. + */ +void set_tsc_freq(void); + +/** + * Get precise TSC frequency from system + * + * This function is private to the EAL. + */ +uint64_t get_tsc_freq(void); + +/** + * Get TSC frequency if the architecture supports. + * + * This function is private to the EAL. + * + * @return + * The number of TSC cycles in one second. + * Returns zero if the architecture support is not available. + */ +uint64_t get_tsc_freq_arch(void); + +/** + * Prepare physical memory mapping + * i.e. hugepages on Linux and + * contigmem on BSD. + * + * This function is private to the EAL. + */ +int rte_eal_hugepage_init(void); + +/** + * Creates memory mapping in secondary process + * i.e. hugepages on Linux and + * contigmem on BSD. + * + * This function is private to the EAL. + */ +int rte_eal_hugepage_attach(void); + +/** + * Find a bus capable of identifying a device. + * + * @param str + * A device identifier (PCI address, virtual PMD name, ...). + * + * @return + * A valid bus handle if found. + * NULL if no bus is able to parse this device. + */ +struct rte_bus *rte_bus_find_by_device_name(const char *str); + +/** + * Create the unix channel for primary/secondary communication. + * + * @return + * 0 on success; + * (<0) on failure. + */ +int rte_mp_channel_init(void); + +/** + * Primary/secondary communication cleanup. + */ +void rte_mp_channel_cleanup(void); + +/** + * @internal + * Parse a device string and store its information in an + * rte_devargs structure. + * + * A device description is split by layers of abstraction of the device: + * bus, class and driver. Each layer will offer a set of properties that + * can be applied either to configure or recognize a device. + * + * This function will parse those properties and prepare the rte_devargs + * to be given to each layers for processing. + * + * Note: if the "data" field of the devargs points to devstr, + * then no dynamic allocation is performed and the rte_devargs + * can be safely discarded. + * + * Otherwise ``data`` will hold a workable copy of devstr, that will be + * used by layers descriptors within rte_devargs. In this case, + * any rte_devargs should be cleaned-up before being freed. + * + * @param da + * rte_devargs structure to fill. + * + * @param devstr + * Device string. + * + * @return + * 0 on success. + * Negative errno values on error (rte_errno is set). + */ +int +rte_devargs_layers_parse(struct rte_devargs *devargs, + const char *devstr); + +/* + * probe a device at local process. + * + * @param devargs + * Device arguments including bus, class and driver properties. + * @param new_dev + * new device be probed as output. + * @return + * 0 on success, negative on error. + */ +int local_dev_probe(const char *devargs, struct rte_device **new_dev); + +/** + * Hotplug remove a given device from a specific bus at local process. + * + * @param dev + * Data structure of the device to remove. + * @return + * 0 on success, negative on error. + */ +int local_dev_remove(struct rte_device *dev); + +/** + * Iterate over all buses to find the corresponding bus to handle the sigbus + * error. + * @param failure_addr + * Pointer of the fault address of the sigbus error. + * + * @return + * 0 success to handle the sigbus. + * -1 failed to handle the sigbus + * 1 no bus can handler the sigbus + */ +int rte_bus_sigbus_handler(const void *failure_addr); + +/** + * @internal + * Register the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +dev_sigbus_handler_register(void); + +/** + * @internal + * Unregister the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +dev_sigbus_handler_unregister(void); + +/** + * Get OS-specific EAL mapping base address. + */ +uint64_t +eal_get_baseaddr(void); + +void * +eal_malloc_no_trace(const char *type, size_t size, unsigned int align); + +void eal_free_no_trace(void *addr); + +#endif /* _EAL_PRIVATE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_thread.h b/src/spdk/dpdk/lib/librte_eal/common/eal_thread.h new file mode 100644 index 000000000..b40ed249e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_thread.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_THREAD_H +#define EAL_THREAD_H + +#include <rte_lcore.h> + +/** + * basic loop of thread, called for each thread by eal_init(). + * + * @param arg + * opaque pointer + */ +__rte_noreturn void *eal_thread_loop(void *arg); + +/** + * Init per-lcore info for master thread + * + * @param lcore_id + * identifier of master lcore + */ +void eal_thread_init_master(unsigned lcore_id); + +/** + * Get the NUMA socket id from cpu id. + * This function is private to EAL. + * + * @param cpu_id + * The logical process id. + * @return + * socket_id or SOCKET_ID_ANY + */ +unsigned eal_cpu_socket_id(unsigned cpu_id); + +/** + * Default buffer size to use with eal_thread_dump_affinity() + */ +#define RTE_CPU_AFFINITY_STR_LEN 256 + +/** + * Dump the current pthread cpuset. + * This function is private to EAL. + * + * Note: + * If the dump size is greater than the size of given buffer, + * the string will be truncated and with '\0' at the end. + * + * @param str + * The string buffer the cpuset will dump to. + * @param size + * The string buffer size. + * @return + * 0 for success, -1 if truncation happens. + */ +int +eal_thread_dump_affinity(char *str, unsigned size); + +#endif /* EAL_THREAD_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_trace.h b/src/spdk/dpdk/lib/librte_eal/common/eal_trace.h new file mode 100644 index 000000000..8f6061615 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_trace.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#ifndef __EAL_TRACE_H +#define __EAL_TRACE_H + +#include <rte_cycles.h> +#include <rte_log.h> +#include <rte_malloc.h> +#include <rte_spinlock.h> +#include <rte_trace.h> +#include <rte_trace_point.h> +#include <rte_uuid.h> + +#include "eal_private.h" +#include "eal_thread.h" + +#define trace_err(fmt, args...) \ + RTE_LOG(ERR, EAL, "%s():%u " fmt "\n", __func__, __LINE__, ## args) + +#define trace_crit(fmt, args...) \ + RTE_LOG(CRIT, EAL, "%s():%u " fmt "\n", __func__, __LINE__, ## args) + +#define TRACE_PREFIX_LEN 12 +#define TRACE_DIR_STR_LEN (sizeof("YYYY-mm-dd-AM-HH-MM-SS") + TRACE_PREFIX_LEN) +#define TRACE_CTF_FIELD_SIZE 384 +#define TRACE_POINT_NAME_SIZE 64 +#define TRACE_CTF_MAGIC 0xC1FC1FC1 +#define TRACE_MAX_ARGS 32 + +struct trace_point { + STAILQ_ENTRY(trace_point) next; + rte_trace_point_t *handle; + char name[TRACE_POINT_NAME_SIZE]; + char ctf_field[TRACE_CTF_FIELD_SIZE]; +}; + +enum trace_area_e { + TRACE_AREA_HEAP, + TRACE_AREA_HUGEPAGE, +}; + +struct thread_mem_meta { + void *mem; + enum trace_area_e area; +}; + +struct trace_arg { + STAILQ_ENTRY(trace_arg) next; + char *val; +}; + +struct trace { + char dir[PATH_MAX]; + int dir_offset; + int register_errno; + bool status; + enum rte_trace_mode mode; + rte_uuid_t uuid; + uint32_t buff_len; + STAILQ_HEAD(, trace_arg) args; + uint32_t nb_trace_points; + uint32_t nb_trace_mem_list; + struct thread_mem_meta *lcore_meta; + uint64_t epoch_sec; + uint64_t epoch_nsec; + uint64_t uptime_ticks; + char *ctf_meta; + uint32_t ctf_meta_offset_freq; + uint32_t ctf_meta_offset_freq_off_s; + uint32_t ctf_meta_offset_freq_off; + uint16_t ctf_fixup_done; + rte_spinlock_t lock; +}; + +/* Helper functions */ +static inline uint16_t +trace_id_get(rte_trace_point_t *trace) +{ + return (*trace & __RTE_TRACE_FIELD_ID_MASK) >> + __RTE_TRACE_FIELD_ID_SHIFT; +} + +static inline size_t +trace_mem_sz(uint32_t len) +{ + return len + sizeof(struct __rte_trace_header); +} + +/* Trace object functions */ +struct trace *trace_obj_get(void); + +/* Trace point list functions */ +STAILQ_HEAD(trace_point_head, trace_point); +struct trace_point_head *trace_list_head_get(void); + +/* Util functions */ +const char *trace_mode_to_string(enum rte_trace_mode mode); +const char *trace_area_to_string(enum trace_area_e area); +int trace_args_apply(const char *arg); +void trace_bufsz_args_apply(void); +bool trace_has_duplicate_entry(void); +void trace_uuid_generate(void); +int trace_metadata_create(void); +void trace_metadata_destroy(void); +int trace_mkdir(void); +int trace_epoch_time_save(void); +void trace_mem_per_thread_free(void); + +/* EAL interface */ +int eal_trace_init(void); +void eal_trace_fini(void); +int eal_trace_args_save(const char *val); +void eal_trace_args_free(void); +int eal_trace_dir_args_save(const char *val); +int eal_trace_mode_args_save(const char *val); +int eal_trace_bufsz_args_save(const char *val); + +#endif /* __EAL_TRACE_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.c b/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.c new file mode 100644 index 000000000..ee791903b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.c @@ -0,0 +1,465 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ +#include <string.h> + +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_alarm.h> +#include <rte_string_fns.h> +#include <rte_devargs.h> + +#include "hotplug_mp.h" +#include "eal_private.h" + +#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */ + +struct mp_reply_bundle { + struct rte_mp_msg msg; + void *peer; +}; + +static int cmp_dev_name(const struct rte_device *dev, const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + +/** + * Secondary to primary request. + * start from function eal_dev_hotplug_request_to_primary. + * + * device attach on secondary: + * a) secondary send sync request to the primary. + * b) primary receive the request and attach the new device if + * failed goto i). + * c) primary forward attach sync request to all secondary. + * d) secondary receive the request and attach the device and send a reply. + * e) primary check the reply if all success goes to j). + * f) primary send attach rollback sync request to all secondary. + * g) secondary receive the request and detach the device and send a reply. + * h) primary receive the reply and detach device as rollback action. + * i) send attach fail to secondary as a reply of step a), goto k). + * j) send attach success to secondary as a reply of step a). + * k) secondary receive reply and return. + * + * device detach on secondary: + * a) secondary send sync request to the primary. + * b) primary send detach sync request to all secondary. + * c) secondary detach the device and send a reply. + * d) primary check the reply if all success goes to g). + * e) primary send detach rollback sync request to all secondary. + * f) secondary receive the request and attach back device. goto h). + * g) primary detach the device if success goto i), else goto e). + * h) primary send detach fail to secondary as a reply of step a), goto j). + * i) primary send detach success to secondary as a reply of step a). + * j) secondary receive reply and return. + */ + +static int +send_response_to_secondary(const struct eal_dev_mp_req *req, + int result, + const void *peer) +{ + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + int ret; + + memset(&mp_resp, 0, sizeof(mp_resp)); + mp_resp.len_param = sizeof(*resp); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + memcpy(resp, req, sizeof(*req)); + resp->result = result; + + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) + RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + return ret; +} + +static void +__handle_secondary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + const struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req tmp_req; + struct rte_devargs da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + tmp_req = *req; + + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + ret = local_dev_probe(req->devargs, &dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug add device on primary\n"); + if (ret != -EEXIST) + goto finish; + } + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + if (tmp_req.result != 0) { + ret = tmp_req.result; + RTE_LOG(ERR, EAL, "Failed to hotplug add device on secondary\n"); + if (ret != -EEXIST) + goto rollback; + } + } else if (req->t == EAL_DEV_REQ_TYPE_DETACH) { + ret = rte_devargs_parse(&da, req->devargs); + if (ret != 0) + goto finish; + free(da.args); /* we don't need those */ + da.args = NULL; + + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + bus = rte_bus_find_by_name(da.bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da.bus->name); + ret = -ENOENT; + goto finish; + } + + dev = bus->find_device(NULL, cmp_dev_name, da.name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da.name); + ret = -ENOENT; + goto finish; + } + + if (tmp_req.result != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on secondary\n"); + ret = tmp_req.result; + if (ret != -ENOENT) + goto rollback; + } + + ret = local_dev_remove(dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on primary\n"); + if (ret != -ENOENT) + goto rollback; + } + } else { + RTE_LOG(ERR, EAL, "unsupported secondary to primary request\n"); + ret = -ENOTSUP; + } + goto finish; + +rollback: + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + tmp_req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + local_dev_remove(dev); + } else { + tmp_req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + } + +finish: + ret = send_response_to_secondary(&tmp_req, ret, bundle->peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_secondary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct mp_reply_bundle *bundle; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + int ret = 0; + + bundle = malloc(sizeof(*bundle)); + if (bundle == NULL) { + RTE_LOG(ERR, EAL, "not enough memory\n"); + return send_response_to_secondary(req, -ENOMEM, peer); + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = strdup(peer); + if (bundle->peer == NULL) { + free(bundle); + RTE_LOG(ERR, EAL, "not enough memory\n"); + return send_response_to_secondary(req, -ENOMEM, peer); + } + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. + */ + ret = rte_eal_alarm_set(1, __handle_secondary_request, bundle); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to add mp task\n"); + free(bundle->peer); + free(bundle); + return send_response_to_secondary(req, ret, peer); + } + return 0; +} + +static void __handle_primary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct rte_devargs *da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + + switch (req->t) { + case EAL_DEV_REQ_TYPE_ATTACH: + case EAL_DEV_REQ_TYPE_DETACH_ROLLBACK: + ret = local_dev_probe(req->devargs, &dev); + break; + case EAL_DEV_REQ_TYPE_DETACH: + case EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK: + da = calloc(1, sizeof(*da)); + if (da == NULL) { + ret = -ENOMEM; + break; + } + + ret = rte_devargs_parse(da, req->devargs); + if (ret != 0) + goto quit; + + bus = rte_bus_find_by_name(da->bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name); + ret = -ENOENT; + goto quit; + } + + dev = bus->find_device(NULL, cmp_dev_name, da->name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name); + ret = -ENOENT; + goto quit; + } + + if (!rte_dev_is_probed(dev)) { + if (req->t == EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK) { + /** + * Don't fail the rollback just because there's + * nothing to do. + */ + ret = 0; + } else + ret = -ENODEV; + + goto quit; + } + + ret = local_dev_remove(dev); +quit: + free(da->args); + free(da); + break; + default: + ret = -EINVAL; + } + + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + resp->result = ret; + if (rte_mp_reply(&mp_resp, bundle->peer) < 0) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_primary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct rte_mp_msg mp_resp; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct mp_reply_bundle *bundle; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + + bundle = calloc(1, sizeof(*bundle)); + if (bundle == NULL) { + RTE_LOG(ERR, EAL, "not enough memory\n"); + resp->result = -ENOMEM; + ret = rte_mp_reply(&mp_resp, peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = (void *)strdup(peer); + if (bundle->peer == NULL) { + RTE_LOG(ERR, EAL, "not enough memory\n"); + free(bundle); + resp->result = -ENOMEM; + ret = rte_mp_reply(&mp_resp, peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. + */ + ret = rte_eal_alarm_set(1, __handle_primary_request, bundle); + if (ret != 0) { + free(bundle->peer); + free(bundle); + resp->result = ret; + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + } + return 0; +} + +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + struct eal_dev_mp_req *resp; + int ret; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret || mp_reply.nb_received != 1) { + RTE_LOG(ERR, EAL, "Cannot send request to primary\n"); + if (!ret) + return -1; + return ret; + } + + resp = (struct eal_dev_mp_req *)mp_reply.msgs[0].param; + req->result = resp->result; + + free(mp_reply.msgs); + return ret; +} + +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + int ret; + int i; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret != 0) { + /* if IPC is not supported, behave as if the call succeeded */ + if (rte_errno != ENOTSUP) + RTE_LOG(ERR, EAL, "rte_mp_request_sync failed\n"); + else + ret = 0; + return ret; + } + + if (mp_reply.nb_sent != mp_reply.nb_received) { + RTE_LOG(ERR, EAL, "not all secondary reply\n"); + free(mp_reply.msgs); + return -1; + } + + req->result = 0; + for (i = 0; i < mp_reply.nb_received; i++) { + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_reply.msgs[i].param; + if (resp->result != 0) { + if (req->t == EAL_DEV_REQ_TYPE_ATTACH && + resp->result == -EEXIST) + continue; + if (req->t == EAL_DEV_REQ_TYPE_DETACH && + resp->result == -ENOENT) + continue; + req->result = resp->result; + } + } + + free(mp_reply.msgs); + return 0; +} + +int eal_mp_dev_hotplug_init(void) +{ + int ret; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_secondary_request); + /* primary is allowed to not support IPC */ + if (ret != 0 && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } else { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_primary_request); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.h b/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.h new file mode 100644 index 000000000..8fcf9b52e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _HOTPLUG_MP_H_ +#define _HOTPLUG_MP_H_ + +#include "rte_dev.h" +#include "rte_bus.h" + +#define EAL_DEV_MP_ACTION_REQUEST "eal_dev_mp_request" +#define EAL_DEV_MP_ACTION_RESPONSE "eal_dev_mp_response" + +#define EAL_DEV_MP_DEV_NAME_MAX_LEN RTE_DEV_NAME_MAX_LEN +#define EAL_DEV_MP_BUS_NAME_MAX_LEN 32 +#define EAL_DEV_MP_DEV_ARGS_MAX_LEN 128 + +enum eal_dev_req_type { + EAL_DEV_REQ_TYPE_ATTACH, + EAL_DEV_REQ_TYPE_DETACH, + EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK, + EAL_DEV_REQ_TYPE_DETACH_ROLLBACK, +}; + +struct eal_dev_mp_req { + enum eal_dev_req_type t; + char devargs[EAL_DEV_MP_DEV_ARGS_MAX_LEN]; + int result; +}; + +/** + * Register all mp action callbacks for hotplug. + * + * @return + * 0 on success, negative on error. + */ +int +eal_mp_dev_hotplug_init(void); + +/** + * This is a synchronous wrapper for secondary process send + * request to primary process, this is invoked when an attach + * or detach request is issued from primary process. + */ +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req); + +/** + * this is a synchronous wrapper for primary process send + * request to secondary process, this is invoked when an attach + * or detach request issued from secondary process. + */ +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req); + + +#endif /* _HOTPLUG_MP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c new file mode 100644 index 000000000..51cdfc5d5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c @@ -0,0 +1,682 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <inttypes.h> +#include <stdint.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/queue.h> + +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_debug.h> +#include <rte_common.h> +#include <rte_spinlock.h> + +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "malloc_elem.h" +#include "malloc_heap.h" + +/* + * If debugging is enabled, freed memory is set to poison value + * to catch buggy programs. Otherwise, freed memory is set to zero + * to avoid having to zero in zmalloc + */ +#ifdef RTE_MALLOC_DEBUG +#define MALLOC_POISON 0x6b +#else +#define MALLOC_POISON 0 +#endif + +size_t +malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align) +{ + void *cur_page, *contig_seg_start, *page_end, *cur_seg_end; + void *data_start, *data_end; + rte_iova_t expected_iova; + struct rte_memseg *ms; + size_t page_sz, cur, max; + + page_sz = (size_t)elem->msl->page_sz; + data_start = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN); + data_end = RTE_PTR_ADD(elem, elem->size - MALLOC_ELEM_TRAILER_LEN); + /* segment must start after header and with specified alignment */ + contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align); + + /* return if aligned address is already out of malloc element */ + if (contig_seg_start > data_end) + return 0; + + /* if we're in IOVA as VA mode, or if we're in legacy mode with + * hugepages, all elements are IOVA-contiguous. however, we can only + * make these assumptions about internal memory - externally allocated + * segments have to be checked. + */ + if (!elem->msl->external && + (rte_eal_iova_mode() == RTE_IOVA_VA || + (internal_config.legacy_mem && + rte_eal_has_hugepages()))) + return RTE_PTR_DIFF(data_end, contig_seg_start); + + cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz); + ms = rte_mem_virt2memseg(cur_page, elem->msl); + + /* do first iteration outside the loop */ + page_end = RTE_PTR_ADD(cur_page, page_sz); + cur_seg_end = RTE_MIN(page_end, data_end); + cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start) - + MALLOC_ELEM_TRAILER_LEN; + max = cur; + expected_iova = ms->iova + page_sz; + /* memsegs are contiguous in memory */ + ms++; + + cur_page = RTE_PTR_ADD(cur_page, page_sz); + + while (cur_page < data_end) { + page_end = RTE_PTR_ADD(cur_page, page_sz); + cur_seg_end = RTE_MIN(page_end, data_end); + + /* reset start of contiguous segment if unexpected iova */ + if (ms->iova != expected_iova) { + /* next contiguous segment must start at specified + * alignment. + */ + contig_seg_start = RTE_PTR_ALIGN(cur_page, align); + /* new segment start may be on a different page, so find + * the page and skip to next iteration to make sure + * we're not blowing past data end. + */ + ms = rte_mem_virt2memseg(contig_seg_start, elem->msl); + cur_page = ms->addr; + /* don't trigger another recalculation */ + expected_iova = ms->iova; + continue; + } + /* cur_seg_end ends on a page boundary or on data end. if we're + * looking at data end, then malloc trailer is already included + * in the calculations. if we're looking at page end, then we + * know there's more data past this page and thus there's space + * for malloc element trailer, so don't count it here. + */ + cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start); + /* update max if cur value is bigger */ + if (cur > max) + max = cur; + + /* move to next page */ + cur_page = page_end; + expected_iova = ms->iova + page_sz; + /* memsegs are contiguous in memory */ + ms++; + } + + return max; +} + +/* + * Initialize a general malloc_elem header structure + */ +void +malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap, + struct rte_memseg_list *msl, size_t size, + struct malloc_elem *orig_elem, size_t orig_size) +{ + elem->heap = heap; + elem->msl = msl; + elem->prev = NULL; + elem->next = NULL; + memset(&elem->free_list, 0, sizeof(elem->free_list)); + elem->state = ELEM_FREE; + elem->size = size; + elem->pad = 0; + elem->orig_elem = orig_elem; + elem->orig_size = orig_size; + set_header(elem); + set_trailer(elem); +} + +void +malloc_elem_insert(struct malloc_elem *elem) +{ + struct malloc_elem *prev_elem, *next_elem; + struct malloc_heap *heap = elem->heap; + + /* first and last elements must be both NULL or both non-NULL */ + if ((heap->first == NULL) != (heap->last == NULL)) { + RTE_LOG(ERR, EAL, "Heap is probably corrupt\n"); + return; + } + + if (heap->first == NULL && heap->last == NULL) { + /* if empty heap */ + heap->first = elem; + heap->last = elem; + prev_elem = NULL; + next_elem = NULL; + } else if (elem < heap->first) { + /* if lower than start */ + prev_elem = NULL; + next_elem = heap->first; + heap->first = elem; + } else if (elem > heap->last) { + /* if higher than end */ + prev_elem = heap->last; + next_elem = NULL; + heap->last = elem; + } else { + /* the new memory is somewhere between start and end */ + uint64_t dist_from_start, dist_from_end; + + dist_from_end = RTE_PTR_DIFF(heap->last, elem); + dist_from_start = RTE_PTR_DIFF(elem, heap->first); + + /* check which is closer, and find closest list entries */ + if (dist_from_start < dist_from_end) { + prev_elem = heap->first; + while (prev_elem->next < elem) + prev_elem = prev_elem->next; + next_elem = prev_elem->next; + } else { + next_elem = heap->last; + while (next_elem->prev > elem) + next_elem = next_elem->prev; + prev_elem = next_elem->prev; + } + } + + /* insert new element */ + elem->prev = prev_elem; + elem->next = next_elem; + if (prev_elem) + prev_elem->next = elem; + if (next_elem) + next_elem->prev = elem; +} + +/* + * Attempt to find enough physically contiguous memory in this block to store + * our data. Assume that element has at least enough space to fit in the data, + * so we just check the page addresses. + */ +static bool +elem_check_phys_contig(const struct rte_memseg_list *msl, + void *start, size_t size) +{ + return eal_memalloc_is_contig(msl, start, size); +} + +/* + * calculate the starting point of where data of the requested size + * and alignment would fit in the current element. If the data doesn't + * fit, return NULL. + */ +static void * +elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align, + size_t bound, bool contig) +{ + size_t elem_size = elem->size; + + /* + * we're allocating from the end, so adjust the size of element by + * alignment size. + */ + while (elem_size >= size) { + const size_t bmask = ~(bound - 1); + uintptr_t end_pt = (uintptr_t)elem + + elem_size - MALLOC_ELEM_TRAILER_LEN; + uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), + align); + uintptr_t new_elem_start; + + /* check boundary */ + if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) { + end_pt = RTE_ALIGN_FLOOR(end_pt, bound); + new_data_start = RTE_ALIGN_FLOOR((end_pt - size), + align); + end_pt = new_data_start + size; + + if (((end_pt - 1) & bmask) != (new_data_start & bmask)) + return NULL; + } + + new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN; + + /* if the new start point is before the exist start, + * it won't fit + */ + if (new_elem_start < (uintptr_t)elem) + return NULL; + + if (contig) { + size_t new_data_size = end_pt - new_data_start; + + /* + * if physical contiguousness was requested and we + * couldn't fit all data into one physically contiguous + * block, try again with lower addresses. + */ + if (!elem_check_phys_contig(elem->msl, + (void *)new_data_start, + new_data_size)) { + elem_size -= align; + continue; + } + } + return (void *)new_elem_start; + } + return NULL; +} + +/* + * use elem_start_pt to determine if we get meet the size and + * alignment request from the current element + */ +int +malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align, + size_t bound, bool contig) +{ + return elem_start_pt(elem, size, align, bound, contig) != NULL; +} + +/* + * split an existing element into two smaller elements at the given + * split_pt parameter. + */ +static void +split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt) +{ + struct malloc_elem *next_elem = elem->next; + const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem; + const size_t new_elem_size = elem->size - old_elem_size; + + malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size, + elem->orig_elem, elem->orig_size); + split_pt->prev = elem; + split_pt->next = next_elem; + if (next_elem) + next_elem->prev = split_pt; + else + elem->heap->last = split_pt; + elem->next = split_pt; + elem->size = old_elem_size; + set_trailer(elem); + if (elem->pad) { + /* Update inner padding inner element size. */ + elem = RTE_PTR_ADD(elem, elem->pad); + elem->size = old_elem_size - elem->pad; + } +} + +/* + * our malloc heap is a doubly linked list, so doubly remove our element. + */ +static void __rte_unused +remove_elem(struct malloc_elem *elem) +{ + struct malloc_elem *next, *prev; + next = elem->next; + prev = elem->prev; + + if (next) + next->prev = prev; + else + elem->heap->last = prev; + if (prev) + prev->next = next; + else + elem->heap->first = next; + + elem->prev = NULL; + elem->next = NULL; +} + +static int +next_elem_is_adjacent(struct malloc_elem *elem) +{ + return elem->next == RTE_PTR_ADD(elem, elem->size) && + elem->next->msl == elem->msl && + (!internal_config.match_allocations || + elem->orig_elem == elem->next->orig_elem); +} + +static int +prev_elem_is_adjacent(struct malloc_elem *elem) +{ + return elem == RTE_PTR_ADD(elem->prev, elem->prev->size) && + elem->prev->msl == elem->msl && + (!internal_config.match_allocations || + elem->orig_elem == elem->prev->orig_elem); +} + +/* + * Given an element size, compute its freelist index. + * We free an element into the freelist containing similarly-sized elements. + * We try to allocate elements starting with the freelist containing + * similarly-sized elements, and if necessary, we search freelists + * containing larger elements. + * + * Example element size ranges for a heap with five free lists: + * heap->free_head[0] - (0 , 2^8] + * heap->free_head[1] - (2^8 , 2^10] + * heap->free_head[2] - (2^10 ,2^12] + * heap->free_head[3] - (2^12, 2^14] + * heap->free_head[4] - (2^14, MAX_SIZE] + */ +size_t +malloc_elem_free_list_index(size_t size) +{ +#define MALLOC_MINSIZE_LOG2 8 +#define MALLOC_LOG2_INCREMENT 2 + + size_t log2; + size_t index; + + if (size <= (1UL << MALLOC_MINSIZE_LOG2)) + return 0; + + /* Find next power of 2 >= size. */ + log2 = sizeof(size) * 8 - __builtin_clzl(size-1); + + /* Compute freelist index, based on log2(size). */ + index = (log2 - MALLOC_MINSIZE_LOG2 + MALLOC_LOG2_INCREMENT - 1) / + MALLOC_LOG2_INCREMENT; + + return index <= RTE_HEAP_NUM_FREELISTS-1? + index: RTE_HEAP_NUM_FREELISTS-1; +} + +/* + * Add the specified element to its heap's free list. + */ +void +malloc_elem_free_list_insert(struct malloc_elem *elem) +{ + size_t idx; + + idx = malloc_elem_free_list_index(elem->size - MALLOC_ELEM_HEADER_LEN); + elem->state = ELEM_FREE; + LIST_INSERT_HEAD(&elem->heap->free_head[idx], elem, free_list); +} + +/* + * Remove the specified element from its heap's free list. + */ +void +malloc_elem_free_list_remove(struct malloc_elem *elem) +{ + LIST_REMOVE(elem, free_list); +} + +/* + * reserve a block of data in an existing malloc_elem. If the malloc_elem + * is much larger than the data block requested, we split the element in two. + * This function is only called from malloc_heap_alloc so parameter checking + * is not done here, as it's done there previously. + */ +struct malloc_elem * +malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align, + size_t bound, bool contig) +{ + struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound, + contig); + const size_t old_elem_size = (uintptr_t)new_elem - (uintptr_t)elem; + const size_t trailer_size = elem->size - old_elem_size - size - + MALLOC_ELEM_OVERHEAD; + + malloc_elem_free_list_remove(elem); + + if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split it, too much free space after elem */ + struct malloc_elem *new_free_elem = + RTE_PTR_ADD(new_elem, size + MALLOC_ELEM_OVERHEAD); + + split_elem(elem, new_free_elem); + malloc_elem_free_list_insert(new_free_elem); + + if (elem == elem->heap->last) + elem->heap->last = new_free_elem; + } + + if (old_elem_size < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* don't split it, pad the element instead */ + elem->state = ELEM_BUSY; + elem->pad = old_elem_size; + + /* put a dummy header in padding, to point to real element header */ + if (elem->pad > 0) { /* pad will be at least 64-bytes, as everything + * is cache-line aligned */ + new_elem->pad = elem->pad; + new_elem->state = ELEM_PAD; + new_elem->size = elem->size - elem->pad; + set_header(new_elem); + } + + return new_elem; + } + + /* we are going to split the element in two. The original element + * remains free, and the new element is the one allocated. + * Re-insert original element, in case its new size makes it + * belong on a different list. + */ + split_elem(elem, new_elem); + new_elem->state = ELEM_BUSY; + malloc_elem_free_list_insert(elem); + + return new_elem; +} + +/* + * join two struct malloc_elem together. elem1 and elem2 must + * be contiguous in memory. + */ +static inline void +join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2) +{ + struct malloc_elem *next = elem2->next; + elem1->size += elem2->size; + if (next) + next->prev = elem1; + else + elem1->heap->last = elem1; + elem1->next = next; + if (elem1->pad) { + struct malloc_elem *inner = RTE_PTR_ADD(elem1, elem1->pad); + inner->size = elem1->size - elem1->pad; + } +} + +struct malloc_elem * +malloc_elem_join_adjacent_free(struct malloc_elem *elem) +{ + /* + * check if next element exists, is adjacent and is free, if so join + * with it, need to remove from free list. + */ + if (elem->next != NULL && elem->next->state == ELEM_FREE && + next_elem_is_adjacent(elem)) { + void *erase; + size_t erase_len; + + /* we will want to erase the trailer and header */ + erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN); + erase_len = MALLOC_ELEM_OVERHEAD + elem->next->pad; + + /* remove from free list, join to this one */ + malloc_elem_free_list_remove(elem->next); + join_elem(elem, elem->next); + + /* erase header, trailer and pad */ + memset(erase, MALLOC_POISON, erase_len); + } + + /* + * check if prev element exists, is adjacent and is free, if so join + * with it, need to remove from free list. + */ + if (elem->prev != NULL && elem->prev->state == ELEM_FREE && + prev_elem_is_adjacent(elem)) { + struct malloc_elem *new_elem; + void *erase; + size_t erase_len; + + /* we will want to erase trailer and header */ + erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN); + erase_len = MALLOC_ELEM_OVERHEAD + elem->pad; + + /* remove from free list, join to this one */ + malloc_elem_free_list_remove(elem->prev); + + new_elem = elem->prev; + join_elem(new_elem, elem); + + /* erase header, trailer and pad */ + memset(erase, MALLOC_POISON, erase_len); + + elem = new_elem; + } + + return elem; +} + +/* + * free a malloc_elem block by adding it to the free list. If the + * blocks either immediately before or immediately after newly freed block + * are also free, the blocks are merged together. + */ +struct malloc_elem * +malloc_elem_free(struct malloc_elem *elem) +{ + void *ptr; + size_t data_len; + + ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN); + data_len = elem->size - MALLOC_ELEM_OVERHEAD; + + elem = malloc_elem_join_adjacent_free(elem); + + malloc_elem_free_list_insert(elem); + + elem->pad = 0; + + /* decrease heap's count of allocated elements */ + elem->heap->alloc_count--; + + /* poison memory */ + memset(ptr, MALLOC_POISON, data_len); + + return elem; +} + +/* assume all checks were already done */ +void +malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len) +{ + struct malloc_elem *hide_start, *hide_end, *prev, *next; + size_t len_before, len_after; + + hide_start = start; + hide_end = RTE_PTR_ADD(start, len); + + prev = elem->prev; + next = elem->next; + + /* we cannot do anything with non-adjacent elements */ + if (next && next_elem_is_adjacent(elem)) { + len_after = RTE_PTR_DIFF(next, hide_end); + if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split after */ + split_elem(elem, hide_end); + + malloc_elem_free_list_insert(hide_end); + } else if (len_after > 0) { + RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n"); + return; + } + } + + /* we cannot do anything with non-adjacent elements */ + if (prev && prev_elem_is_adjacent(elem)) { + len_before = RTE_PTR_DIFF(hide_start, elem); + if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split before */ + split_elem(elem, hide_start); + + prev = elem; + elem = hide_start; + + malloc_elem_free_list_insert(prev); + } else if (len_before > 0) { + RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n"); + return; + } + } + + remove_elem(elem); +} + +/* + * attempt to resize a malloc_elem by expanding into any free space + * immediately after it in memory. + */ +int +malloc_elem_resize(struct malloc_elem *elem, size_t size) +{ + const size_t new_size = size + elem->pad + MALLOC_ELEM_OVERHEAD; + + /* if we request a smaller size, then always return ok */ + if (elem->size >= new_size) + return 0; + + /* check if there is a next element, it's free and adjacent */ + if (!elem->next || elem->next->state != ELEM_FREE || + !next_elem_is_adjacent(elem)) + return -1; + if (elem->size + elem->next->size < new_size) + return -1; + + /* we now know the element fits, so remove from free list, + * join the two + */ + malloc_elem_free_list_remove(elem->next); + join_elem(elem, elem->next); + + if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) { + /* now we have a big block together. Lets cut it down a bit, by splitting */ + struct malloc_elem *split_pt = RTE_PTR_ADD(elem, new_size); + split_pt = RTE_PTR_ALIGN_CEIL(split_pt, RTE_CACHE_LINE_SIZE); + split_elem(elem, split_pt); + malloc_elem_free_list_insert(split_pt); + } + return 0; +} + +static inline const char * +elem_state_to_str(enum elem_state state) +{ + switch (state) { + case ELEM_PAD: + return "PAD"; + case ELEM_BUSY: + return "BUSY"; + case ELEM_FREE: + return "FREE"; + } + return "ERROR"; +} + +void +malloc_elem_dump(const struct malloc_elem *elem, FILE *f) +{ + fprintf(f, "Malloc element at %p (%s)\n", elem, + elem_state_to_str(elem->state)); + fprintf(f, " len: 0x%zx pad: 0x%" PRIx32 "\n", elem->size, elem->pad); + fprintf(f, " prev: %p next: %p\n", elem->prev, elem->next); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h new file mode 100644 index 000000000..a1e5f7f02 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef MALLOC_ELEM_H_ +#define MALLOC_ELEM_H_ + +#include <stdbool.h> + +#define MIN_DATA_SIZE (RTE_CACHE_LINE_SIZE) + +/* dummy definition of struct so we can use pointers to it in malloc_elem struct */ +struct malloc_heap; + +enum elem_state { + ELEM_FREE = 0, + ELEM_BUSY, + ELEM_PAD /* element is a padding-only header */ +}; + +struct malloc_elem { + struct malloc_heap *heap; + struct malloc_elem *volatile prev; + /**< points to prev elem in memseg */ + struct malloc_elem *volatile next; + /**< points to next elem in memseg */ + LIST_ENTRY(malloc_elem) free_list; + /**< list of free elements in heap */ + struct rte_memseg_list *msl; + volatile enum elem_state state; + uint32_t pad; + size_t size; + struct malloc_elem *orig_elem; + size_t orig_size; +#ifdef RTE_MALLOC_DEBUG + uint64_t header_cookie; /* Cookie marking start of data */ + /* trailer cookie at start + size */ +#endif +} __rte_cache_aligned; + +#ifndef RTE_MALLOC_DEBUG +static const unsigned MALLOC_ELEM_TRAILER_LEN = 0; + +/* dummy function - just check if pointer is non-null */ +static inline int +malloc_elem_cookies_ok(const struct malloc_elem *elem){ return elem != NULL; } + +/* dummy function - no header if malloc_debug is not enabled */ +static inline void +set_header(struct malloc_elem *elem __rte_unused){ } + +/* dummy function - no trailer if malloc_debug is not enabled */ +static inline void +set_trailer(struct malloc_elem *elem __rte_unused){ } + + +#else +static const unsigned MALLOC_ELEM_TRAILER_LEN = RTE_CACHE_LINE_SIZE; + +#define MALLOC_HEADER_COOKIE 0xbadbadbadadd2e55ULL /**< Header cookie. */ +#define MALLOC_TRAILER_COOKIE 0xadd2e55badbadbadULL /**< Trailer cookie.*/ + +/* define macros to make referencing the header and trailer cookies easier */ +#define MALLOC_ELEM_TRAILER(elem) (*((uint64_t*)RTE_PTR_ADD(elem, \ + elem->size - MALLOC_ELEM_TRAILER_LEN))) +#define MALLOC_ELEM_HEADER(elem) (elem->header_cookie) + +static inline void +set_header(struct malloc_elem *elem) +{ + if (elem != NULL) + MALLOC_ELEM_HEADER(elem) = MALLOC_HEADER_COOKIE; +} + +static inline void +set_trailer(struct malloc_elem *elem) +{ + if (elem != NULL) + MALLOC_ELEM_TRAILER(elem) = MALLOC_TRAILER_COOKIE; +} + +/* check that the header and trailer cookies are set correctly */ +static inline int +malloc_elem_cookies_ok(const struct malloc_elem *elem) +{ + return elem != NULL && + MALLOC_ELEM_HEADER(elem) == MALLOC_HEADER_COOKIE && + MALLOC_ELEM_TRAILER(elem) == MALLOC_TRAILER_COOKIE; +} + +#endif + +static const unsigned MALLOC_ELEM_HEADER_LEN = sizeof(struct malloc_elem); +#define MALLOC_ELEM_OVERHEAD (MALLOC_ELEM_HEADER_LEN + MALLOC_ELEM_TRAILER_LEN) + +/* + * Given a pointer to the start of a memory block returned by malloc, get + * the actual malloc_elem header for that block. + */ +static inline struct malloc_elem * +malloc_elem_from_data(const void *data) +{ + if (data == NULL) + return NULL; + + struct malloc_elem *elem = RTE_PTR_SUB(data, MALLOC_ELEM_HEADER_LEN); + if (!malloc_elem_cookies_ok(elem)) + return NULL; + return elem->state != ELEM_PAD ? elem: RTE_PTR_SUB(elem, elem->pad); +} + +/* + * initialise a malloc_elem header + */ +void +malloc_elem_init(struct malloc_elem *elem, + struct malloc_heap *heap, + struct rte_memseg_list *msl, + size_t size, + struct malloc_elem *orig_elem, + size_t orig_size); + +void +malloc_elem_insert(struct malloc_elem *elem); + +/* + * return true if the current malloc_elem can hold a block of data + * of the requested size and with the requested alignment + */ +int +malloc_elem_can_hold(struct malloc_elem *elem, size_t size, + unsigned int align, size_t bound, bool contig); + +/* + * reserve a block of data in an existing malloc_elem. If the malloc_elem + * is much larger than the data block requested, we split the element in two. + */ +struct malloc_elem * +malloc_elem_alloc(struct malloc_elem *elem, size_t size, + unsigned int align, size_t bound, bool contig); + +/* + * free a malloc_elem block by adding it to the free list. If the + * blocks either immediately before or immediately after newly freed block + * are also free, the blocks are merged together. + */ +struct malloc_elem * +malloc_elem_free(struct malloc_elem *elem); + +struct malloc_elem * +malloc_elem_join_adjacent_free(struct malloc_elem *elem); + +/* + * attempt to resize a malloc_elem by expanding into any free space + * immediately after it in memory. + */ +int +malloc_elem_resize(struct malloc_elem *elem, size_t size); + +void +malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len); + +void +malloc_elem_free_list_remove(struct malloc_elem *elem); + +/* + * dump contents of malloc elem to a file. + */ +void +malloc_elem_dump(const struct malloc_elem *elem, FILE *f); + +/* + * Given an element size, compute its freelist index. + */ +size_t +malloc_elem_free_list_index(size_t size); + +/* + * Add element to its heap's free list. + */ +void +malloc_elem_free_list_insert(struct malloc_elem *elem); + +/* + * Find biggest IOVA-contiguous zone within an element with specified alignment. + */ +size_t +malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align); + +#endif /* MALLOC_ELEM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c new file mode 100644 index 000000000..bd5065698 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c @@ -0,0 +1,1367 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <errno.h> +#include <sys/queue.h> + +#include <rte_memory.h> +#include <rte_errno.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> +#include <rte_spinlock.h> +#include <rte_memcpy.h> +#include <rte_memzone.h> +#include <rte_atomic.h> +#include <rte_fbarray.h> + +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "eal_memcfg.h" +#include "eal_private.h" +#include "malloc_elem.h" +#include "malloc_heap.h" +#include "malloc_mp.h" + +/* start external socket ID's at a very high number */ +#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */ +#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES)) + +static unsigned +check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) +{ + unsigned check_flag = 0; + + if (!(flags & ~RTE_MEMZONE_SIZE_HINT_ONLY)) + return 1; + + switch (hugepage_sz) { + case RTE_PGSIZE_256K: + check_flag = RTE_MEMZONE_256KB; + break; + case RTE_PGSIZE_2M: + check_flag = RTE_MEMZONE_2MB; + break; + case RTE_PGSIZE_16M: + check_flag = RTE_MEMZONE_16MB; + break; + case RTE_PGSIZE_256M: + check_flag = RTE_MEMZONE_256MB; + break; + case RTE_PGSIZE_512M: + check_flag = RTE_MEMZONE_512MB; + break; + case RTE_PGSIZE_1G: + check_flag = RTE_MEMZONE_1GB; + break; + case RTE_PGSIZE_4G: + check_flag = RTE_MEMZONE_4GB; + break; + case RTE_PGSIZE_16G: + check_flag = RTE_MEMZONE_16GB; + } + + return check_flag & flags; +} + +int +malloc_socket_to_heap_id(unsigned int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (heap->socket_id == socket_id) + return i; + } + return -1; +} + +/* + * Expand the heap with a memory area. + */ +static struct malloc_elem * +malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl, + void *start, size_t len) +{ + struct malloc_elem *elem = start; + + malloc_elem_init(elem, heap, msl, len, elem, len); + + malloc_elem_insert(elem); + + elem = malloc_elem_join_adjacent_free(elem); + + malloc_elem_free_list_insert(elem); + + return elem; +} + +static int +malloc_add_seg(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *found_msl; + struct malloc_heap *heap; + int msl_idx, heap_idx; + + if (msl->external) + return 0; + + heap_idx = malloc_socket_to_heap_id(msl->socket_id); + if (heap_idx < 0) { + RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n"); + return -1; + } + heap = &mcfg->malloc_heaps[heap_idx]; + + /* msl is const, so find it */ + msl_idx = msl - mcfg->memsegs; + + if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) + return -1; + + found_msl = &mcfg->memsegs[msl_idx]; + + malloc_heap_add_memory(heap, found_msl, ms->addr, len); + + heap->total_size += len; + + RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20, + msl->socket_id); + return 0; +} + +/* + * Iterates through the freelist for a heap to find a free element + * which can store data of the required size and with the requested alignment. + * If size is 0, find the biggest available elem. + * Returns null on failure, or pointer to element on success. + */ +static struct malloc_elem * +find_suitable_element(struct malloc_heap *heap, size_t size, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + size_t idx; + struct malloc_elem *elem, *alt_elem = NULL; + + for (idx = malloc_elem_free_list_index(size); + idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) { + if (malloc_elem_can_hold(elem, size, align, bound, + contig)) { + if (check_hugepage_sz(flags, + elem->msl->page_sz)) + return elem; + if (alt_elem == NULL) + alt_elem = elem; + } + } + } + + if ((alt_elem != NULL) && (flags & RTE_MEMZONE_SIZE_HINT_ONLY)) + return alt_elem; + + return NULL; +} + +/* + * Iterates through the freelist for a heap to find a free element with the + * biggest size and requested alignment. Will also set size to whatever element + * size that was found. + * Returns null on failure, or pointer to element on success. + */ +static struct malloc_elem * +find_biggest_element(struct malloc_heap *heap, size_t *size, + unsigned int flags, size_t align, bool contig) +{ + struct malloc_elem *elem, *max_elem = NULL; + size_t idx, max_size = 0; + + for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) { + size_t cur_size; + if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 && + !check_hugepage_sz(flags, + elem->msl->page_sz)) + continue; + if (contig) { + cur_size = + malloc_elem_find_max_iova_contig(elem, + align); + } else { + void *data_start = RTE_PTR_ADD(elem, + MALLOC_ELEM_HEADER_LEN); + void *data_end = RTE_PTR_ADD(elem, elem->size - + MALLOC_ELEM_TRAILER_LEN); + void *aligned = RTE_PTR_ALIGN_CEIL(data_start, + align); + /* check if aligned data start is beyond end */ + if (aligned >= data_end) + continue; + cur_size = RTE_PTR_DIFF(data_end, aligned); + } + if (cur_size > max_size) { + max_size = cur_size; + max_elem = elem; + } + } + } + + *size = max_size; + return max_elem; +} + +/* + * Main function to allocate a block of memory from the heap. + * It locks the free list, scans it, and adds a new memseg if the + * scan fails. Once the new memseg is added, it re-scans and should return + * the new element after releasing the lock. + */ +static void * +heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + struct malloc_elem *elem; + + size = RTE_CACHE_LINE_ROUNDUP(size); + align = RTE_CACHE_LINE_ROUNDUP(align); + + /* roundup might cause an overflow */ + if (size == 0) + return NULL; + elem = find_suitable_element(heap, size, flags, align, bound, contig); + if (elem != NULL) { + elem = malloc_elem_alloc(elem, size, align, bound, contig); + + /* increase heap's count of allocated elements */ + heap->alloc_count++; + } + + return elem == NULL ? NULL : (void *)(&elem[1]); +} + +static void * +heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused, + unsigned int flags, size_t align, bool contig) +{ + struct malloc_elem *elem; + size_t size; + + align = RTE_CACHE_LINE_ROUNDUP(align); + + elem = find_biggest_element(heap, &size, flags, align, contig); + if (elem != NULL) { + elem = malloc_elem_alloc(elem, size, align, 0, contig); + + /* increase heap's count of allocated elements */ + heap->alloc_count++; + } + + return elem == NULL ? NULL : (void *)(&elem[1]); +} + +/* this function is exposed in malloc_mp.h */ +void +rollback_expand_heap(struct rte_memseg **ms, int n_segs, + struct malloc_elem *elem, void *map_addr, size_t map_len) +{ + if (elem != NULL) { + malloc_elem_free_list_remove(elem); + malloc_elem_hide_region(elem, map_addr, map_len); + } + + eal_memalloc_free_seg_bulk(ms, n_segs); +} + +/* this function is exposed in malloc_mp.h */ +struct malloc_elem * +alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig, struct rte_memseg **ms, int n_segs) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct malloc_elem *elem = NULL; + size_t alloc_sz; + int allocd_pages; + void *ret, *map_addr; + + alloc_sz = (size_t)pg_sz * n_segs; + + /* first, check if we're allowed to allocate this memory */ + if (eal_memalloc_mem_alloc_validate(socket, + heap->total_size + alloc_sz) < 0) { + RTE_LOG(DEBUG, EAL, "User has disallowed allocation\n"); + return NULL; + } + + allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz, + socket, true); + + /* make sure we've allocated our pages... */ + if (allocd_pages < 0) + return NULL; + + map_addr = ms[0]->addr; + msl = rte_mem_virt2memseg_list(map_addr); + + /* check if we wanted contiguous memory but didn't get it */ + if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n", + __func__); + goto fail; + } + + /* + * Once we have all the memseg lists configured, if there is a dma mask + * set, check iova addresses are not out of range. Otherwise the device + * setting the dma mask could have problems with the mapped memory. + * + * There are two situations when this can happen: + * 1) memory initialization + * 2) dynamic memory allocation + * + * For 1), an error when checking dma mask implies app can not be + * executed. For 2) implies the new memory can not be added. + */ + if (mcfg->dma_maskbits && + rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { + /* + * Currently this can only happen if IOMMU is enabled + * and the address width supported by the IOMMU hw is + * not enough for using the memory mapped IOVAs. + * + * If IOVA is VA, advice to try with '--iova-mode pa' + * which could solve some situations when IOVA VA is not + * really needed. + */ + RTE_LOG(ERR, EAL, + "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask\n", + __func__); + + /* + * If IOVA is VA and it is possible to run with IOVA PA, + * because user is root, give and advice for solving the + * problem. + */ + if ((rte_eal_iova_mode() == RTE_IOVA_VA) && + rte_eal_using_phys_addrs()) + RTE_LOG(ERR, EAL, + "%s(): Please try initializing EAL with --iova-mode=pa parameter\n", + __func__); + goto fail; + } + + /* add newly minted memsegs to malloc heap */ + elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz); + + /* try once more, as now we have allocated new memory */ + ret = find_suitable_element(heap, elt_size, flags, align, bound, + contig); + + if (ret == NULL) + goto fail; + + return elem; + +fail: + rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz); + return NULL; +} + +static int +try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz, + size_t elt_size, int socket, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct malloc_elem *elem; + struct rte_memseg **ms; + void *map_addr; + size_t alloc_sz; + int n_segs; + bool callback_triggered = false; + + alloc_sz = RTE_ALIGN_CEIL(align + elt_size + + MALLOC_ELEM_TRAILER_LEN, pg_sz); + n_segs = alloc_sz / pg_sz; + + /* we can't know in advance how many pages we'll need, so we malloc */ + ms = malloc(sizeof(*ms) * n_segs); + if (ms == NULL) + return -1; + memset(ms, 0, sizeof(*ms) * n_segs); + + elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align, + bound, contig, ms, n_segs); + + if (elem == NULL) + goto free_ms; + + map_addr = ms[0]->addr; + + /* notify user about changes in memory map */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz); + + /* notify other processes that this has happened */ + if (request_sync()) { + /* we couldn't ensure all processes have mapped memory, + * so free it back and notify everyone that it's been + * freed back. + * + * technically, we could've avoided adding memory addresses to + * the map, but that would've led to inconsistent behavior + * between primary and secondary processes, as those get + * callbacks during sync. therefore, force primary process to + * do alloc-and-rollback syncs as well. + */ + callback_triggered = true; + goto free_elem; + } + heap->total_size += alloc_sz; + + RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n", + socket, alloc_sz >> 20ULL); + + free(ms); + + return 0; + +free_elem: + if (callback_triggered) + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + map_addr, alloc_sz); + + rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz); + + request_sync(); +free_ms: + free(ms); + + return -1; +} + +static int +try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz, + size_t elt_size, int socket, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct malloc_mp_req req; + int req_result; + + memset(&req, 0, sizeof(req)); + + req.t = REQ_TYPE_ALLOC; + req.alloc_req.align = align; + req.alloc_req.bound = bound; + req.alloc_req.contig = contig; + req.alloc_req.flags = flags; + req.alloc_req.elt_size = elt_size; + req.alloc_req.page_sz = pg_sz; + req.alloc_req.socket = socket; + req.alloc_req.heap = heap; /* it's in shared memory */ + + req_result = request_to_primary(&req); + + if (req_result != 0) + return -1; + + if (req.result != REQ_RESULT_SUCCESS) + return -1; + + return 0; +} + +static int +try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig) +{ + int ret; + + rte_mcfg_mem_write_lock(); + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket, + flags, align, bound, contig); + } else { + ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket, + flags, align, bound, contig); + } + + rte_mcfg_mem_write_unlock(); + return ret; +} + +static int +compare_pagesz(const void *a, const void *b) +{ + const struct rte_memseg_list * const*mpa = a; + const struct rte_memseg_list * const*mpb = b; + const struct rte_memseg_list *msla = *mpa; + const struct rte_memseg_list *mslb = *mpb; + uint64_t pg_sz_a = msla->page_sz; + uint64_t pg_sz_b = mslb->page_sz; + + if (pg_sz_a < pg_sz_b) + return -1; + if (pg_sz_a > pg_sz_b) + return 1; + return 0; +} + +static int +alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS]; + struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS]; + uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS]; + uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS]; + uint64_t prev_pg_sz; + int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz; + bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0; + unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + void *ret; + + memset(requested_msls, 0, sizeof(requested_msls)); + memset(other_msls, 0, sizeof(other_msls)); + memset(requested_pg_sz, 0, sizeof(requested_pg_sz)); + memset(other_pg_sz, 0, sizeof(other_pg_sz)); + + /* + * go through memseg list and take note of all the page sizes available, + * and if any of them were specifically requested by the user. + */ + n_requested_msls = 0; + n_other_msls = 0; + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->socket_id != socket) + continue; + + if (msl->base_va == NULL) + continue; + + /* if pages of specific size were requested */ + if (size_flags != 0 && check_hugepage_sz(size_flags, + msl->page_sz)) + requested_msls[n_requested_msls++] = msl; + else if (size_flags == 0 || size_hint) + other_msls[n_other_msls++] = msl; + } + + /* sort the lists, smallest first */ + qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]), + compare_pagesz); + qsort(other_msls, n_other_msls, sizeof(other_msls[0]), + compare_pagesz); + + /* now, extract page sizes we are supposed to try */ + prev_pg_sz = 0; + n_requested_pg_sz = 0; + for (i = 0; i < n_requested_msls; i++) { + uint64_t pg_sz = requested_msls[i]->page_sz; + + if (prev_pg_sz != pg_sz) { + requested_pg_sz[n_requested_pg_sz++] = pg_sz; + prev_pg_sz = pg_sz; + } + } + prev_pg_sz = 0; + n_other_pg_sz = 0; + for (i = 0; i < n_other_msls; i++) { + uint64_t pg_sz = other_msls[i]->page_sz; + + if (prev_pg_sz != pg_sz) { + other_pg_sz[n_other_pg_sz++] = pg_sz; + prev_pg_sz = pg_sz; + } + } + + /* finally, try allocating memory of specified page sizes, starting from + * the smallest sizes + */ + for (i = 0; i < n_requested_pg_sz; i++) { + uint64_t pg_sz = requested_pg_sz[i]; + + /* + * do not pass the size hint here, as user expects other page + * sizes first, before resorting to best effort allocation. + */ + if (!try_expand_heap(heap, pg_sz, size, socket, size_flags, + align, bound, contig)) + return 0; + } + if (n_other_pg_sz == 0) + return -1; + + /* now, check if we can reserve anything with size hint */ + ret = find_suitable_element(heap, size, flags, align, bound, contig); + if (ret != NULL) + return 0; + + /* + * we still couldn't reserve memory, so try expanding heap with other + * page sizes, if there are any + */ + for (i = 0; i < n_other_pg_sz; i++) { + uint64_t pg_sz = other_pg_sz[i]; + + if (!try_expand_heap(heap, pg_sz, size, socket, flags, + align, bound, contig)) + return 0; + } + return -1; +} + +/* this will try lower page sizes first */ +static void * +malloc_heap_alloc_on_heap_id(const char *type, size_t size, + unsigned int heap_id, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; + unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + int socket_id; + void *ret; + + rte_spinlock_lock(&(heap->lock)); + + align = align == 0 ? 1 : align; + + /* for legacy mode, try once and with all flags */ + if (internal_config.legacy_mem) { + ret = heap_alloc(heap, type, size, flags, align, bound, contig); + goto alloc_unlock; + } + + /* + * we do not pass the size hint here, because even if allocation fails, + * we may still be able to allocate memory from appropriate page sizes, + * we just need to request more memory first. + */ + + socket_id = rte_socket_id_by_idx(heap_id); + /* + * if socket ID is negative, we cannot find a socket ID for this heap - + * which means it's an external heap. those can have unexpected page + * sizes, so if the user asked to allocate from there - assume user + * knows what they're doing, and allow allocating from there with any + * page size flags. + */ + if (socket_id < 0) + size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY; + + ret = heap_alloc(heap, type, size, size_flags, align, bound, contig); + if (ret != NULL) + goto alloc_unlock; + + /* if socket ID is invalid, this is an external heap */ + if (socket_id < 0) + goto alloc_unlock; + + if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align, + bound, contig)) { + ret = heap_alloc(heap, type, size, flags, align, bound, contig); + + /* this should have succeeded */ + if (ret == NULL) + RTE_LOG(ERR, EAL, "Error allocating from heap\n"); + } +alloc_unlock: + rte_spinlock_unlock(&(heap->lock)); + return ret; +} + +void * +malloc_heap_alloc(const char *type, size_t size, int socket_arg, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + int socket, heap_id, i; + void *ret; + + /* return NULL if size is 0 or alignment is not power-of-2 */ + if (size == 0 || (align && !rte_is_power_of_2(align))) + return NULL; + + if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES) + socket_arg = SOCKET_ID_ANY; + + if (socket_arg == SOCKET_ID_ANY) + socket = malloc_get_numa_socket(); + else + socket = socket_arg; + + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) + return NULL; + + ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align, + bound, contig); + if (ret != NULL || socket_arg != SOCKET_ID_ANY) + return ret; + + /* try other heaps. we are only iterating through native DPDK sockets, + * so external heaps won't be included. + */ + for (i = 0; i < (int) rte_socket_count(); i++) { + if (i == heap_id) + continue; + ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align, + bound, contig); + if (ret != NULL) + return ret; + } + return NULL; +} + +static void * +heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id, + unsigned int flags, size_t align, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; + void *ret; + + rte_spinlock_lock(&(heap->lock)); + + align = align == 0 ? 1 : align; + + ret = heap_alloc_biggest(heap, type, flags, align, contig); + + rte_spinlock_unlock(&(heap->lock)); + + return ret; +} + +void * +malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, + size_t align, bool contig) +{ + int socket, i, cur_socket, heap_id; + void *ret; + + /* return NULL if align is not power-of-2 */ + if ((align && !rte_is_power_of_2(align))) + return NULL; + + if (!rte_eal_has_hugepages()) + socket_arg = SOCKET_ID_ANY; + + if (socket_arg == SOCKET_ID_ANY) + socket = malloc_get_numa_socket(); + else + socket = socket_arg; + + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) + return NULL; + + ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align, + contig); + if (ret != NULL || socket_arg != SOCKET_ID_ANY) + return ret; + + /* try other heaps */ + for (i = 0; i < (int) rte_socket_count(); i++) { + cur_socket = rte_socket_id_by_idx(i); + if (cur_socket == socket) + continue; + ret = heap_alloc_biggest_on_heap_id(type, i, flags, align, + contig); + if (ret != NULL) + return ret; + } + return NULL; +} + +/* this function is exposed in malloc_mp.h */ +int +malloc_heap_free_pages(void *aligned_start, size_t aligned_len) +{ + int n_segs, seg_idx, max_seg_idx; + struct rte_memseg_list *msl; + size_t page_sz; + + msl = rte_mem_virt2memseg_list(aligned_start); + if (msl == NULL) + return -1; + + page_sz = (size_t)msl->page_sz; + n_segs = aligned_len / page_sz; + seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz; + max_seg_idx = seg_idx + n_segs; + + for (; seg_idx < max_seg_idx; seg_idx++) { + struct rte_memseg *ms; + + ms = rte_fbarray_get(&msl->memseg_arr, seg_idx); + eal_memalloc_free_seg(ms); + } + return 0; +} + +int +malloc_heap_free(struct malloc_elem *elem) +{ + struct malloc_heap *heap; + void *start, *aligned_start, *end, *aligned_end; + size_t len, aligned_len, page_sz; + struct rte_memseg_list *msl; + unsigned int i, n_segs, before_space, after_space; + int ret; + + if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY) + return -1; + + /* elem may be merged with previous element, so keep heap address */ + heap = elem->heap; + msl = elem->msl; + page_sz = (size_t)msl->page_sz; + + rte_spinlock_lock(&(heap->lock)); + + /* mark element as free */ + elem->state = ELEM_FREE; + + elem = malloc_elem_free(elem); + + /* anything after this is a bonus */ + ret = 0; + + /* ...of which we can't avail if we are in legacy mode, or if this is an + * externally allocated segment. + */ + if (internal_config.legacy_mem || (msl->external > 0)) + goto free_unlock; + + /* check if we can free any memory back to the system */ + if (elem->size < page_sz) + goto free_unlock; + + /* if user requested to match allocations, the sizes must match - if not, + * we will defer freeing these hugepages until the entire original allocation + * can be freed + */ + if (internal_config.match_allocations && elem->size != elem->orig_size) + goto free_unlock; + + /* probably, but let's make sure, as we may not be using up full page */ + start = elem; + len = elem->size; + aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz); + end = RTE_PTR_ADD(elem, len); + aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz); + + aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); + + /* can't free anything */ + if (aligned_len < page_sz) + goto free_unlock; + + /* we can free something. however, some of these pages may be marked as + * unfreeable, so also check that as well + */ + n_segs = aligned_len / page_sz; + for (i = 0; i < n_segs; i++) { + const struct rte_memseg *tmp = + rte_mem_virt2memseg(aligned_start, msl); + + if (tmp->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + /* this is an unfreeable segment, so move start */ + aligned_start = RTE_PTR_ADD(tmp->addr, tmp->len); + } + } + + /* recalculate length and number of segments */ + aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); + n_segs = aligned_len / page_sz; + + /* check if we can still free some pages */ + if (n_segs == 0) + goto free_unlock; + + /* We're not done yet. We also have to check if by freeing space we will + * be leaving free elements that are too small to store new elements. + * Check if we have enough space in the beginning and at the end, or if + * start/end are exactly page aligned. + */ + before_space = RTE_PTR_DIFF(aligned_start, elem); + after_space = RTE_PTR_DIFF(end, aligned_end); + if (before_space != 0 && + before_space < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* There is not enough space before start, but we may be able to + * move the start forward by one page. + */ + if (n_segs == 1) + goto free_unlock; + + /* move start */ + aligned_start = RTE_PTR_ADD(aligned_start, page_sz); + aligned_len -= page_sz; + n_segs--; + } + if (after_space != 0 && after_space < + MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* There is not enough space after end, but we may be able to + * move the end backwards by one page. + */ + if (n_segs == 1) + goto free_unlock; + + /* move end */ + aligned_end = RTE_PTR_SUB(aligned_end, page_sz); + aligned_len -= page_sz; + n_segs--; + } + + /* now we can finally free us some pages */ + + rte_mcfg_mem_write_lock(); + + /* + * we allow secondary processes to clear the heap of this allocated + * memory because it is safe to do so, as even if notifications about + * unmapped pages don't make it to other processes, heap is shared + * across all processes, and will become empty of this memory anyway, + * and nothing can allocate it back unless primary process will be able + * to deliver allocation message to every single running process. + */ + + malloc_elem_free_list_remove(elem); + + malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len); + + heap->total_size -= aligned_len; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* notify user about changes in memory map */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + aligned_start, aligned_len); + + /* don't care if any of this fails */ + malloc_heap_free_pages(aligned_start, aligned_len); + + request_sync(); + } else { + struct malloc_mp_req req; + + memset(&req, 0, sizeof(req)); + + req.t = REQ_TYPE_FREE; + req.free_req.addr = aligned_start; + req.free_req.len = aligned_len; + + /* + * we request primary to deallocate pages, but we don't do it + * in this thread. instead, we notify primary that we would like + * to deallocate pages, and this process will receive another + * request (in parallel) that will do it for us on another + * thread. + * + * we also don't really care if this succeeds - the data is + * already removed from the heap, so it is, for all intents and + * purposes, hidden from the rest of DPDK even if some other + * process (including this one) may have these pages mapped. + * + * notifications about deallocated memory happen during sync. + */ + request_to_primary(&req); + } + + RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n", + msl->socket_id, aligned_len >> 20ULL); + + rte_mcfg_mem_write_unlock(); +free_unlock: + rte_spinlock_unlock(&(heap->lock)); + return ret; +} + +int +malloc_heap_resize(struct malloc_elem *elem, size_t size) +{ + int ret; + + if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY) + return -1; + + rte_spinlock_lock(&(elem->heap->lock)); + + ret = malloc_elem_resize(elem, size); + + rte_spinlock_unlock(&(elem->heap->lock)); + + return ret; +} + +/* + * Function to retrieve data for a given heap + */ +int +malloc_heap_get_stats(struct malloc_heap *heap, + struct rte_malloc_socket_stats *socket_stats) +{ + size_t idx; + struct malloc_elem *elem; + + rte_spinlock_lock(&heap->lock); + + /* Initialise variables for heap */ + socket_stats->free_count = 0; + socket_stats->heap_freesz_bytes = 0; + socket_stats->greatest_free_size = 0; + + /* Iterate through free list */ + for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) + { + socket_stats->free_count++; + socket_stats->heap_freesz_bytes += elem->size; + if (elem->size > socket_stats->greatest_free_size) + socket_stats->greatest_free_size = elem->size; + } + } + /* Get stats on overall heap and allocated memory on this heap */ + socket_stats->heap_totalsz_bytes = heap->total_size; + socket_stats->heap_allocsz_bytes = (socket_stats->heap_totalsz_bytes - + socket_stats->heap_freesz_bytes); + socket_stats->alloc_count = heap->alloc_count; + + rte_spinlock_unlock(&heap->lock); + return 0; +} + +/* + * Function to retrieve data for a given heap + */ +void +malloc_heap_dump(struct malloc_heap *heap, FILE *f) +{ + struct malloc_elem *elem; + + rte_spinlock_lock(&heap->lock); + + fprintf(f, "Heap size: 0x%zx\n", heap->total_size); + fprintf(f, "Heap alloc count: %u\n", heap->alloc_count); + + elem = heap->first; + while (elem) { + malloc_elem_dump(elem, f); + elem = elem->next; + } + + rte_spinlock_unlock(&heap->lock); +} + +static int +destroy_elem(struct malloc_elem *elem, size_t len) +{ + struct malloc_heap *heap = elem->heap; + + /* notify all subscribers that a memory area is going to be removed */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len); + + /* this element can be removed */ + malloc_elem_free_list_remove(elem); + malloc_elem_hide_region(elem, elem, len); + + heap->total_size -= len; + + memset(elem, 0, sizeof(*elem)); + + return 0; +} + +struct rte_memseg_list * +malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[], + unsigned int n_pages, size_t page_sz, const char *seg_name, + unsigned int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + char fbarray_name[RTE_FBARRAY_NAME_LEN]; + struct rte_memseg_list *msl = NULL; + struct rte_fbarray *arr; + size_t seg_len = n_pages * page_sz; + unsigned int i; + + /* first, find a free memseg list */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *tmp = &mcfg->memsegs[i]; + if (tmp->base_va == NULL) { + msl = tmp; + break; + } + } + if (msl == NULL) { + RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n"); + rte_errno = ENOSPC; + return NULL; + } + + snprintf(fbarray_name, sizeof(fbarray_name), "%s_%p", + seg_name, va_addr); + + /* create the backing fbarray */ + if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages, + sizeof(struct rte_memseg)) < 0) { + RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n"); + return NULL; + } + arr = &msl->memseg_arr; + + /* fbarray created, fill it up */ + for (i = 0; i < n_pages; i++) { + struct rte_memseg *ms; + + rte_fbarray_set_used(arr, i); + ms = rte_fbarray_get(arr, i); + ms->addr = RTE_PTR_ADD(va_addr, i * page_sz); + ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i]; + ms->hugepage_sz = page_sz; + ms->len = page_sz; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + ms->socket_id = socket_id; + } + + /* set up the memseg list */ + msl->base_va = va_addr; + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->len = seg_len; + msl->version = 0; + msl->external = 1; + + return msl; +} + +struct extseg_walk_arg { + void *va_addr; + size_t len; + struct rte_memseg_list *msl; +}; + +static int +extseg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct extseg_walk_arg *wa = arg; + + if (msl->base_va == wa->va_addr && msl->len == wa->len) { + unsigned int found_idx; + + /* msl is const */ + found_idx = msl - mcfg->memsegs; + wa->msl = &mcfg->memsegs[found_idx]; + return 1; + } + return 0; +} + +struct rte_memseg_list * +malloc_heap_find_external_seg(void *va_addr, size_t len) +{ + struct extseg_walk_arg wa; + int res; + + wa.va_addr = va_addr; + wa.len = len; + + res = rte_memseg_list_walk_thread_unsafe(extseg_walk, &wa); + + if (res != 1) { + /* 0 means nothing was found, -1 shouldn't happen */ + if (res == 0) + rte_errno = ENOENT; + return NULL; + } + return wa.msl; +} + +int +malloc_heap_destroy_external_seg(struct rte_memseg_list *msl) +{ + /* destroy the fbarray backing this memory */ + if (rte_fbarray_destroy(&msl->memseg_arr) < 0) + return -1; + + /* reset the memseg list */ + memset(msl, 0, sizeof(*msl)); + + return 0; +} + +int +malloc_heap_add_external_memory(struct malloc_heap *heap, + struct rte_memseg_list *msl) +{ + /* erase contents of new memory */ + memset(msl->base_va, 0, msl->len); + + /* now, add newly minted memory to the malloc heap */ + malloc_heap_add_memory(heap, msl, msl->base_va, msl->len); + + heap->total_size += msl->len; + + /* all done! */ + RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n", + heap->name, msl->base_va); + + /* notify all subscribers that a new memory area has been added */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + msl->base_va, msl->len); + + return 0; +} + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len) +{ + struct malloc_elem *elem = heap->first; + + /* find element with specified va address */ + while (elem != NULL && elem != va_addr) { + elem = elem->next; + /* stop if we've blown past our VA */ + if (elem > (struct malloc_elem *)va_addr) { + rte_errno = ENOENT; + return -1; + } + } + /* check if element was found */ + if (elem == NULL || elem->msl->len != len) { + rte_errno = ENOENT; + return -1; + } + /* if element's size is not equal to segment len, segment is busy */ + if (elem->state == ELEM_BUSY || elem->size != len) { + rte_errno = EBUSY; + return -1; + } + return destroy_elem(elem, len); +} + +int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint32_t next_socket_id = mcfg->next_socket_id; + + /* prevent overflow. did you really create 2 billion heaps??? */ + if (next_socket_id > INT32_MAX) { + RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n"); + rte_errno = ENOSPC; + return -1; + } + + /* initialize empty heap */ + heap->alloc_count = 0; + heap->first = NULL; + heap->last = NULL; + LIST_INIT(heap->free_head); + rte_spinlock_init(&heap->lock); + heap->total_size = 0; + heap->socket_id = next_socket_id; + + /* we hold a global mem hotplug writelock, so it's safe to increment */ + mcfg->next_socket_id++; + + /* set up name */ + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + return 0; +} + +int +malloc_heap_destroy(struct malloc_heap *heap) +{ + if (heap->alloc_count != 0) { + RTE_LOG(ERR, EAL, "Heap is still in use\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->first != NULL || heap->last != NULL) { + RTE_LOG(ERR, EAL, "Heap still contains memory segments\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->total_size != 0) + RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n"); + + /* after this, the lock will be dropped */ + memset(heap, 0, sizeof(*heap)); + + return 0; +} + +int +rte_eal_malloc_heap_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + if (internal_config.match_allocations) { + RTE_LOG(DEBUG, EAL, "Hugepages will be freed exactly as allocated.\n"); + } + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* assign min socket ID to external heaps */ + mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID; + + /* assign names to default DPDK heaps */ + for (i = 0; i < rte_socket_count(); i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + char heap_name[RTE_HEAP_NAME_MAX_LEN]; + int socket_id = rte_socket_id_by_idx(i); + + snprintf(heap_name, sizeof(heap_name), + "socket_%i", socket_id); + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + heap->socket_id = socket_id; + } + } + + + if (register_mp_requests()) { + RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n"); + rte_mcfg_mem_read_unlock(); + return -1; + } + + /* unlock mem hotplug here. it's safe for primary as no requests can + * even come before primary itself is fully initialized, and secondaries + * do not need to initialize the heap. + */ + rte_mcfg_mem_read_unlock(); + + /* secondary process does not need to initialize anything */ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + /* add all IOVA-contiguous areas to the heap */ + return rte_memseg_contig_walk(malloc_add_seg, NULL); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h new file mode 100644 index 000000000..772736b53 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef MALLOC_HEAP_H_ +#define MALLOC_HEAP_H_ + +#include <stdbool.h> +#include <sys/queue.h> + +#include <rte_malloc.h> +#include <rte_spinlock.h> + +/* Number of free lists per heap, grouped by size. */ +#define RTE_HEAP_NUM_FREELISTS 13 +#define RTE_HEAP_NAME_MAX_LEN 32 + +/* dummy definition, for pointers */ +struct malloc_elem; + +/** + * Structure to hold malloc heap + */ +struct malloc_heap { + rte_spinlock_t lock; + LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS]; + struct malloc_elem *volatile first; + struct malloc_elem *volatile last; + + unsigned int alloc_count; + unsigned int socket_id; + size_t total_size; + char name[RTE_HEAP_NAME_MAX_LEN]; +} __rte_cache_aligned; + +#ifdef __cplusplus +extern "C" { +#endif + +static inline unsigned +malloc_get_numa_socket(void) +{ + unsigned socket_id = rte_socket_id(); + + if (socket_id == (unsigned)SOCKET_ID_ANY) + return 0; + + return socket_id; +} + +void * +malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags, + size_t align, size_t bound, bool contig); + +void * +malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags, + size_t align, bool contig); + +int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name); + +int +malloc_heap_destroy(struct malloc_heap *heap); + +struct rte_memseg_list * +malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[], + unsigned int n_pages, size_t page_sz, const char *seg_name, + unsigned int socket_id); + +struct rte_memseg_list * +malloc_heap_find_external_seg(void *va_addr, size_t len); + +int +malloc_heap_destroy_external_seg(struct rte_memseg_list *msl); + +int +malloc_heap_add_external_memory(struct malloc_heap *heap, + struct rte_memseg_list *msl); + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len); + +int +malloc_heap_free(struct malloc_elem *elem); + +int +malloc_heap_resize(struct malloc_elem *elem, size_t size); + +int +malloc_heap_get_stats(struct malloc_heap *heap, + struct rte_malloc_socket_stats *socket_stats); + +void +malloc_heap_dump(struct malloc_heap *heap, FILE *f); + +int +malloc_socket_to_heap_id(unsigned int socket_id); + +int +rte_eal_malloc_heap_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* MALLOC_HEAP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c new file mode 100644 index 000000000..1f212f834 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c @@ -0,0 +1,751 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include <string.h> +#include <sys/time.h> + +#include <rte_alarm.h> +#include <rte_errno.h> +#include <rte_string_fns.h> + +#include "eal_memalloc.h" +#include "eal_memcfg.h" + +#include "malloc_elem.h" +#include "malloc_mp.h" + +#define MP_ACTION_SYNC "mp_malloc_sync" +/**< request sent by primary process to notify of changes in memory map */ +#define MP_ACTION_ROLLBACK "mp_malloc_rollback" +/**< request sent by primary process to notify of changes in memory map. this is + * essentially a regular sync request, but we cannot send sync requests while + * another one is in progress, and we might have to - therefore, we do this as + * a separate callback. + */ +#define MP_ACTION_REQUEST "mp_malloc_request" +/**< request sent by secondary process to ask for allocation/deallocation */ +#define MP_ACTION_RESPONSE "mp_malloc_response" +/**< response sent to secondary process to indicate result of request */ + +/* forward declarations */ +static int +handle_sync_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); +static int +handle_rollback_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); + +#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */ + +/* when we're allocating, we need to store some state to ensure that we can + * roll back later + */ +struct primary_alloc_req_state { + struct malloc_heap *heap; + struct rte_memseg **ms; + int ms_len; + struct malloc_elem *elem; + void *map_addr; + size_t map_len; +}; + +enum req_state { + REQ_STATE_INACTIVE = 0, + REQ_STATE_ACTIVE, + REQ_STATE_COMPLETE +}; + +struct mp_request { + TAILQ_ENTRY(mp_request) next; + struct malloc_mp_req user_req; /**< contents of request */ + pthread_cond_t cond; /**< variable we use to time out on this request */ + enum req_state state; /**< indicate status of this request */ + struct primary_alloc_req_state alloc_state; +}; + +/* + * We could've used just a single request, but it may be possible for + * secondaries to timeout earlier than the primary, and send a new request while + * primary is still expecting replies to the old one. Therefore, each new + * request will get assigned a new ID, which is how we will distinguish between + * expected and unexpected messages. + */ +TAILQ_HEAD(mp_request_list, mp_request); +static struct { + struct mp_request_list list; + pthread_mutex_t lock; +} mp_request_list = { + .list = TAILQ_HEAD_INITIALIZER(mp_request_list.list), + .lock = PTHREAD_MUTEX_INITIALIZER +}; + +/** + * General workflow is the following: + * + * Allocation: + * S: send request to primary + * P: attempt to allocate memory + * if failed, sendmsg failure + * if success, send sync request + * S: if received msg of failure, quit + * if received sync request, synchronize memory map and reply with result + * P: if received sync request result + * if success, sendmsg success + * if failure, roll back allocation and send a rollback request + * S: if received msg of success, quit + * if received rollback request, synchronize memory map and reply with result + * P: if received sync request result + * sendmsg sync request result + * S: if received msg, quit + * + * Aside from timeouts, there are three points where we can quit: + * - if allocation failed straight away + * - if allocation and sync request succeeded + * - if allocation succeeded, sync request failed, allocation rolled back and + * rollback request received (irrespective of whether it succeeded or failed) + * + * Deallocation: + * S: send request to primary + * P: attempt to deallocate memory + * if failed, sendmsg failure + * if success, send sync request + * S: if received msg of failure, quit + * if received sync request, synchronize memory map and reply with result + * P: if received sync request result + * sendmsg sync request result + * S: if received msg, quit + * + * There is no "rollback" from deallocation, as it's safe to have some memory + * mapped in some processes - it's absent from the heap, so it won't get used. + */ + +static struct mp_request * +find_request_by_id(uint64_t id) +{ + struct mp_request *req; + TAILQ_FOREACH(req, &mp_request_list.list, next) { + if (req->user_req.id == id) + break; + } + return req; +} + +/* this ID is, like, totally guaranteed to be absolutely unique. pinky swear. */ +static uint64_t +get_unique_id(void) +{ + uint64_t id; + do { + id = rte_rand(); + } while (find_request_by_id(id) != NULL); + return id; +} + +/* secondary will respond to sync requests thusly */ +static int +handle_sync(const struct rte_mp_msg *msg, const void *peer) +{ + struct rte_mp_msg reply; + const struct malloc_mp_req *req = + (const struct malloc_mp_req *)msg->param; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply.param; + int ret; + + if (req->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected request from primary\n"); + return -1; + } + + memset(&reply, 0, sizeof(reply)); + + reply.num_fds = 0; + strlcpy(reply.name, msg->name, sizeof(reply.name)); + reply.len_param = sizeof(*resp); + + ret = eal_memalloc_sync_with_primary(); + + resp->t = REQ_TYPE_SYNC; + resp->id = req->id; + resp->result = ret == 0 ? REQ_RESULT_SUCCESS : REQ_RESULT_FAIL; + + rte_mp_reply(&reply, peer); + + return 0; +} + +static int +handle_alloc_request(const struct malloc_mp_req *m, + struct mp_request *req) +{ + const struct malloc_req_alloc *ar = &m->alloc_req; + struct malloc_heap *heap; + struct malloc_elem *elem; + struct rte_memseg **ms; + size_t alloc_sz; + int n_segs; + void *map_addr; + + alloc_sz = RTE_ALIGN_CEIL(ar->align + ar->elt_size + + MALLOC_ELEM_TRAILER_LEN, ar->page_sz); + n_segs = alloc_sz / ar->page_sz; + + heap = ar->heap; + + /* we can't know in advance how many pages we'll need, so we malloc */ + ms = malloc(sizeof(*ms) * n_segs); + if (ms == NULL) { + RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n"); + goto fail; + } + memset(ms, 0, sizeof(*ms) * n_segs); + + elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket, + ar->flags, ar->align, ar->bound, ar->contig, ms, + n_segs); + + if (elem == NULL) + goto fail; + + map_addr = ms[0]->addr; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz); + + /* we have succeeded in allocating memory, but we still need to sync + * with other processes. however, since DPDK IPC is single-threaded, we + * send an asynchronous request and exit this callback. + */ + + req->alloc_state.ms = ms; + req->alloc_state.ms_len = n_segs; + req->alloc_state.map_addr = map_addr; + req->alloc_state.map_len = alloc_sz; + req->alloc_state.elem = elem; + req->alloc_state.heap = heap; + + return 0; +fail: + free(ms); + return -1; +} + +/* first stage of primary handling requests from secondary */ +static int +handle_request(const struct rte_mp_msg *msg, const void *peer __rte_unused) +{ + const struct malloc_mp_req *m = + (const struct malloc_mp_req *)msg->param; + struct mp_request *entry; + int ret; + + /* lock access to request */ + pthread_mutex_lock(&mp_request_list.lock); + + /* make sure it's not a dupe */ + entry = find_request_by_id(m->id); + if (entry != NULL) { + RTE_LOG(ERR, EAL, "Duplicate request id\n"); + goto fail; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate memory for request\n"); + goto fail; + } + + /* erase all data */ + memset(entry, 0, sizeof(*entry)); + + if (m->t == REQ_TYPE_ALLOC) { + ret = handle_alloc_request(m, entry); + } else if (m->t == REQ_TYPE_FREE) { + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + m->free_req.addr, m->free_req.len); + + ret = malloc_heap_free_pages(m->free_req.addr, + m->free_req.len); + } else { + RTE_LOG(ERR, EAL, "Unexpected request from secondary\n"); + goto fail; + } + + if (ret != 0) { + struct rte_mp_msg resp_msg; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)resp_msg.param; + + /* send failure message straight away */ + resp_msg.num_fds = 0; + resp_msg.len_param = sizeof(*resp); + strlcpy(resp_msg.name, MP_ACTION_RESPONSE, + sizeof(resp_msg.name)); + + resp->t = m->t; + resp->result = REQ_RESULT_FAIL; + resp->id = m->id; + + if (rte_mp_sendmsg(&resp_msg)) { + RTE_LOG(ERR, EAL, "Couldn't send response\n"); + goto fail; + } + /* we did not modify the request */ + free(entry); + } else { + struct rte_mp_msg sr_msg; + struct malloc_mp_req *sr = + (struct malloc_mp_req *)sr_msg.param; + struct timespec ts; + + memset(&sr_msg, 0, sizeof(sr_msg)); + + /* we can do something, so send sync request asynchronously */ + sr_msg.num_fds = 0; + sr_msg.len_param = sizeof(*sr); + strlcpy(sr_msg.name, MP_ACTION_SYNC, sizeof(sr_msg.name)); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* sync requests carry no data */ + sr->t = REQ_TYPE_SYNC; + sr->id = m->id; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_async(&sr_msg, &ts, + handle_sync_response); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't send sync request\n"); + if (m->t == REQ_TYPE_ALLOC) + free(entry->alloc_state.ms); + goto fail; + } + + /* mark request as in progress */ + memcpy(&entry->user_req, m, sizeof(*m)); + entry->state = REQ_STATE_ACTIVE; + + TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next); + } + pthread_mutex_unlock(&mp_request_list.lock); + return 0; +fail: + pthread_mutex_unlock(&mp_request_list.lock); + free(entry); + return -1; +} + +/* callback for asynchronous sync requests for primary. this will either do a + * sendmsg with results, or trigger rollback request. + */ +static int +handle_sync_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply) +{ + enum malloc_req_result result; + struct mp_request *entry; + const struct malloc_mp_req *mpreq = + (const struct malloc_mp_req *)request->param; + int i; + + /* lock the request */ + pthread_mutex_lock(&mp_request_list.lock); + + entry = find_request_by_id(mpreq->id); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Wrong request ID\n"); + goto fail; + } + + result = REQ_RESULT_SUCCESS; + + if (reply->nb_received != reply->nb_sent) + result = REQ_RESULT_FAIL; + + for (i = 0; i < reply->nb_received; i++) { + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply->msgs[i].param; + + if (resp->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected response to sync request\n"); + result = REQ_RESULT_FAIL; + break; + } + if (resp->id != entry->user_req.id) { + RTE_LOG(ERR, EAL, "Response to wrong sync request\n"); + result = REQ_RESULT_FAIL; + break; + } + if (resp->result == REQ_RESULT_FAIL) { + result = REQ_RESULT_FAIL; + break; + } + } + + if (entry->user_req.t == REQ_TYPE_FREE) { + struct rte_mp_msg msg; + struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param; + + memset(&msg, 0, sizeof(msg)); + + /* this is a free request, just sendmsg result */ + resp->t = REQ_TYPE_FREE; + resp->result = result; + resp->id = entry->user_req.id; + msg.num_fds = 0; + msg.len_param = sizeof(*resp); + strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name)); + + if (rte_mp_sendmsg(&msg)) + RTE_LOG(ERR, EAL, "Could not send message to secondary process\n"); + + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry); + } else if (entry->user_req.t == REQ_TYPE_ALLOC && + result == REQ_RESULT_SUCCESS) { + struct malloc_heap *heap = entry->alloc_state.heap; + struct rte_mp_msg msg; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)msg.param; + + memset(&msg, 0, sizeof(msg)); + + heap->total_size += entry->alloc_state.map_len; + + /* result is success, so just notify secondary about this */ + resp->t = REQ_TYPE_ALLOC; + resp->result = result; + resp->id = entry->user_req.id; + msg.num_fds = 0; + msg.len_param = sizeof(*resp); + strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name)); + + if (rte_mp_sendmsg(&msg)) + RTE_LOG(ERR, EAL, "Could not send message to secondary process\n"); + + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry->alloc_state.ms); + free(entry); + } else if (entry->user_req.t == REQ_TYPE_ALLOC && + result == REQ_RESULT_FAIL) { + struct rte_mp_msg rb_msg; + struct malloc_mp_req *rb = + (struct malloc_mp_req *)rb_msg.param; + struct timespec ts; + struct primary_alloc_req_state *state = + &entry->alloc_state; + int ret; + + memset(&rb_msg, 0, sizeof(rb_msg)); + + /* we've failed to sync, so do a rollback */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + state->map_addr, state->map_len); + + rollback_expand_heap(state->ms, state->ms_len, state->elem, + state->map_addr, state->map_len); + + /* send rollback request */ + rb_msg.num_fds = 0; + rb_msg.len_param = sizeof(*rb); + strlcpy(rb_msg.name, MP_ACTION_ROLLBACK, sizeof(rb_msg.name)); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* sync requests carry no data */ + rb->t = REQ_TYPE_SYNC; + rb->id = entry->user_req.id; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_async(&rb_msg, &ts, + handle_rollback_response); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Could not send rollback request to secondary process\n"); + + /* we couldn't send rollback request, but that's OK - + * secondary will time out, and memory has been removed + * from heap anyway. + */ + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(state->ms); + free(entry); + goto fail; + } + } else { + RTE_LOG(ERR, EAL, " to sync request of unknown type\n"); + goto fail; + } + + pthread_mutex_unlock(&mp_request_list.lock); + return 0; +fail: + pthread_mutex_unlock(&mp_request_list.lock); + return -1; +} + +static int +handle_rollback_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply __rte_unused) +{ + struct rte_mp_msg msg; + struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param; + const struct malloc_mp_req *mpreq = + (const struct malloc_mp_req *)request->param; + struct mp_request *entry; + + /* lock the request */ + pthread_mutex_lock(&mp_request_list.lock); + + memset(&msg, 0, sizeof(msg)); + + entry = find_request_by_id(mpreq->id); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Wrong request ID\n"); + goto fail; + } + + if (entry->user_req.t != REQ_TYPE_ALLOC) { + RTE_LOG(ERR, EAL, "Unexpected active request\n"); + goto fail; + } + + /* we don't care if rollback succeeded, request still failed */ + resp->t = REQ_TYPE_ALLOC; + resp->result = REQ_RESULT_FAIL; + resp->id = mpreq->id; + msg.num_fds = 0; + msg.len_param = sizeof(*resp); + strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name)); + + if (rte_mp_sendmsg(&msg)) + RTE_LOG(ERR, EAL, "Could not send message to secondary process\n"); + + /* clean up */ + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry->alloc_state.ms); + free(entry); + + pthread_mutex_unlock(&mp_request_list.lock); + return 0; +fail: + pthread_mutex_unlock(&mp_request_list.lock); + return -1; +} + +/* final stage of the request from secondary */ +static int +handle_response(const struct rte_mp_msg *msg, const void *peer __rte_unused) +{ + const struct malloc_mp_req *m = + (const struct malloc_mp_req *)msg->param; + struct mp_request *entry; + + pthread_mutex_lock(&mp_request_list.lock); + + entry = find_request_by_id(m->id); + if (entry != NULL) { + /* update request status */ + entry->user_req.result = m->result; + + entry->state = REQ_STATE_COMPLETE; + + /* trigger thread wakeup */ + pthread_cond_signal(&entry->cond); + } + + pthread_mutex_unlock(&mp_request_list.lock); + + return 0; +} + +/* synchronously request memory map sync, this is only called whenever primary + * process initiates the allocation. + */ +int +request_sync(void) +{ + struct rte_mp_msg msg; + struct rte_mp_reply reply; + struct malloc_mp_req *req = (struct malloc_mp_req *)msg.param; + struct timespec ts; + int i, ret = -1; + + memset(&msg, 0, sizeof(msg)); + memset(&reply, 0, sizeof(reply)); + + /* no need to create tailq entries as this is entirely synchronous */ + + msg.num_fds = 0; + msg.len_param = sizeof(*req); + strlcpy(msg.name, MP_ACTION_SYNC, sizeof(msg.name)); + + /* sync request carries no data */ + req->t = REQ_TYPE_SYNC; + req->id = get_unique_id(); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_sync(&msg, &reply, &ts); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + /* if IPC is unsupported, behave as if the call succeeded */ + if (rte_errno != ENOTSUP) + RTE_LOG(ERR, EAL, "Could not send sync request to secondary process\n"); + else + ret = 0; + goto out; + } + + if (reply.nb_received != reply.nb_sent) { + RTE_LOG(ERR, EAL, "Not all secondaries have responded\n"); + goto out; + } + + for (i = 0; i < reply.nb_received; i++) { + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply.msgs[i].param; + if (resp->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected response from secondary\n"); + goto out; + } + if (resp->id != req->id) { + RTE_LOG(ERR, EAL, "Wrong request ID\n"); + goto out; + } + if (resp->result != REQ_RESULT_SUCCESS) { + RTE_LOG(ERR, EAL, "Secondary process failed to synchronize\n"); + goto out; + } + } + + ret = 0; +out: + free(reply.msgs); + return ret; +} + +/* this is a synchronous wrapper around a bunch of asynchronous requests to + * primary process. this will initiate a request and wait until responses come. + */ +int +request_to_primary(struct malloc_mp_req *user_req) +{ + struct rte_mp_msg msg; + struct malloc_mp_req *msg_req = (struct malloc_mp_req *)msg.param; + struct mp_request *entry; + struct timespec ts; + struct timeval now; + int ret; + + memset(&msg, 0, sizeof(msg)); + memset(&ts, 0, sizeof(ts)); + + pthread_mutex_lock(&mp_request_list.lock); + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate memory for request\n"); + goto fail; + } + + memset(entry, 0, sizeof(*entry)); + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Cannot get current time\n"); + goto fail; + } + + ts.tv_nsec = (now.tv_usec * 1000) % 1000000000; + ts.tv_sec = now.tv_sec + MP_TIMEOUT_S + + (now.tv_usec * 1000) / 1000000000; + + /* initialize the request */ + pthread_cond_init(&entry->cond, NULL); + + msg.num_fds = 0; + msg.len_param = sizeof(*msg_req); + strlcpy(msg.name, MP_ACTION_REQUEST, sizeof(msg.name)); + + /* (attempt to) get a unique id */ + user_req->id = get_unique_id(); + + /* copy contents of user request into the message */ + memcpy(msg_req, user_req, sizeof(*msg_req)); + + if (rte_mp_sendmsg(&msg)) { + RTE_LOG(ERR, EAL, "Cannot send message to primary\n"); + goto fail; + } + + /* copy contents of user request into active request */ + memcpy(&entry->user_req, user_req, sizeof(*user_req)); + + /* mark request as in progress */ + entry->state = REQ_STATE_ACTIVE; + + TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next); + + /* finally, wait on timeout */ + do { + ret = pthread_cond_timedwait(&entry->cond, + &mp_request_list.lock, &ts); + } while (ret != 0 && ret != ETIMEDOUT); + + if (entry->state != REQ_STATE_COMPLETE) { + RTE_LOG(ERR, EAL, "Request timed out\n"); + ret = -1; + } else { + ret = 0; + user_req->result = entry->user_req.result; + } + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry); + + pthread_mutex_unlock(&mp_request_list.lock); + return ret; +fail: + pthread_mutex_unlock(&mp_request_list.lock); + free(entry); + return -1; +} + +int +register_mp_requests(void) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* it's OK for primary to not support IPC */ + if (rte_mp_action_register(MP_ACTION_REQUEST, handle_request) && + rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_REQUEST); + return -1; + } + } else { + if (rte_mp_action_register(MP_ACTION_SYNC, handle_sync)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_SYNC); + return -1; + } + if (rte_mp_action_register(MP_ACTION_ROLLBACK, handle_sync)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_SYNC); + return -1; + } + if (rte_mp_action_register(MP_ACTION_RESPONSE, + handle_response)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_RESPONSE); + return -1; + } + } + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h new file mode 100644 index 000000000..2b86b76f6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef MALLOC_MP_H +#define MALLOC_MP_H + +#include <stdbool.h> +#include <stdint.h> + +#include <rte_common.h> +#include <rte_random.h> +#include <rte_spinlock.h> +#include <rte_tailq.h> + +/* forward declarations */ +struct malloc_heap; +struct rte_memseg; + +/* multiprocess synchronization structures for malloc */ +enum malloc_req_type { + REQ_TYPE_ALLOC, /**< ask primary to allocate */ + REQ_TYPE_FREE, /**< ask primary to free */ + REQ_TYPE_SYNC /**< ask secondary to synchronize its memory map */ +}; + +enum malloc_req_result { + REQ_RESULT_SUCCESS, + REQ_RESULT_FAIL +}; + +struct malloc_req_alloc { + struct malloc_heap *heap; + uint64_t page_sz; + size_t elt_size; + int socket; + unsigned int flags; + size_t align; + size_t bound; + bool contig; +}; + +struct malloc_req_free { + RTE_STD_C11 + union { + void *addr; + uint64_t addr_64; + }; + uint64_t len; +}; + +struct malloc_mp_req { + enum malloc_req_type t; + RTE_STD_C11 + union { + struct malloc_req_alloc alloc_req; + struct malloc_req_free free_req; + }; + uint64_t id; /**< not to be populated by caller */ + enum malloc_req_result result; +}; + +int +register_mp_requests(void); + +int +request_to_primary(struct malloc_mp_req *req); + +/* synchronous memory map sync request */ +int +request_sync(void); + +/* functions from malloc_heap exposed here */ +int +malloc_heap_free_pages(void *aligned_start, size_t aligned_len); + +struct malloc_elem * +alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig, struct rte_memseg **ms, int n_segs); + +void +rollback_expand_heap(struct rte_memseg **ms, int n_segs, + struct malloc_elem *elem, void *map_addr, size_t map_len); + +#endif /* MALLOC_MP_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/meson.build b/src/spdk/dpdk/lib/librte_eal/common/meson.build new file mode 100644 index 000000000..55aaeb18e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/meson.build @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +includes += include_directories('.') + +if is_windows + sources += files( + 'eal_common_bus.c', + 'eal_common_class.c', + 'eal_common_devargs.c', + 'eal_common_errno.c', + 'eal_common_launch.c', + 'eal_common_lcore.c', + 'eal_common_log.c', + 'eal_common_options.c', + 'eal_common_thread.c', + ) + subdir_done() +endif + +sources += files( + 'eal_common_bus.c', + 'eal_common_cpuflags.c', + 'eal_common_class.c', + 'eal_common_devargs.c', + 'eal_common_dev.c', + 'eal_common_errno.c', + 'eal_common_fbarray.c', + 'eal_common_hexdump.c', + 'eal_common_hypervisor.c', + 'eal_common_launch.c', + 'eal_common_lcore.c', + 'eal_common_log.c', + 'eal_common_mcfg.c', + 'eal_common_memalloc.c', + 'eal_common_memory.c', + 'eal_common_memzone.c', + 'eal_common_options.c', + 'eal_common_proc.c', + 'eal_common_string_fns.c', + 'eal_common_tailqs.c', + 'eal_common_thread.c', + 'eal_common_timer.c', + 'eal_common_trace.c', + 'eal_common_trace_ctf.c', + 'eal_common_trace_points.c', + 'eal_common_trace_utils.c', + 'eal_common_uuid.c', + 'hotplug_mp.c', + 'malloc_elem.c', + 'malloc_heap.c', + 'malloc_mp.c', + 'rte_keepalive.c', + 'rte_malloc.c', + 'rte_random.c', + 'rte_reciprocal.c', + 'rte_service.c', +) diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c b/src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c new file mode 100644 index 000000000..e0494b201 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015-2016 Intel Corporation + */ + +#include <inttypes.h> + +#include <rte_common.h> +#include <rte_cycles.h> +#include <rte_lcore.h> +#include <rte_log.h> +#include <rte_keepalive.h> +#include <rte_malloc.h> + +struct rte_keepalive { + /** Core Liveness. */ + struct { + /* + * Each element must be cache aligned to prevent false sharing. + */ + enum rte_keepalive_state core_state __rte_cache_aligned; + } live_data[RTE_KEEPALIVE_MAXCORES]; + + /** Last-seen-alive timestamps */ + uint64_t last_alive[RTE_KEEPALIVE_MAXCORES]; + + /** + * Cores to check. + * Indexed by core id, non-zero if the core should be checked. + */ + uint8_t active_cores[RTE_KEEPALIVE_MAXCORES]; + + /** Dead core handler. */ + rte_keepalive_failure_callback_t callback; + + /** + * Dead core handler app data. + * Pointer is passed to dead core handler. + */ + void *callback_data; + uint64_t tsc_initial; + uint64_t tsc_mhz; + + /** Core state relay handler. */ + rte_keepalive_relay_callback_t relay_callback; + + /** + * Core state relay handler app data. + * Pointer is passed to live core handler. + */ + void *relay_callback_data; +}; + +static void +print_trace(const char *msg, struct rte_keepalive *keepcfg, int idx_core) +{ + RTE_LOG(INFO, EAL, "%sLast seen %" PRId64 "ms ago.\n", + msg, + ((rte_rdtsc() - keepcfg->last_alive[idx_core])*1000) + / rte_get_tsc_hz() + ); +} + +void +rte_keepalive_dispatch_pings(__rte_unused void *ptr_timer, + void *ptr_data) +{ + struct rte_keepalive *keepcfg = ptr_data; + int idx_core; + + for (idx_core = 0; idx_core < RTE_KEEPALIVE_MAXCORES; idx_core++) { + if (keepcfg->active_cores[idx_core] == 0) + continue; + + switch (keepcfg->live_data[idx_core].core_state) { + case RTE_KA_STATE_UNUSED: + break; + case RTE_KA_STATE_ALIVE: /* Alive */ + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_MISSING; + keepcfg->last_alive[idx_core] = rte_rdtsc(); + break; + case RTE_KA_STATE_MISSING: /* MIA */ + print_trace("Core MIA. ", keepcfg, idx_core); + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_DEAD; + break; + case RTE_KA_STATE_DEAD: /* Dead */ + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_GONE; + print_trace("Core died. ", keepcfg, idx_core); + if (keepcfg->callback) + keepcfg->callback( + keepcfg->callback_data, + idx_core + ); + break; + case RTE_KA_STATE_GONE: /* Buried */ + break; + case RTE_KA_STATE_DOZING: /* Core going idle */ + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_SLEEP; + keepcfg->last_alive[idx_core] = rte_rdtsc(); + break; + case RTE_KA_STATE_SLEEP: /* Idled core */ + break; + } + if (keepcfg->relay_callback) + keepcfg->relay_callback( + keepcfg->relay_callback_data, + idx_core, + keepcfg->live_data[idx_core].core_state, + keepcfg->last_alive[idx_core] + ); + } +} + +struct rte_keepalive * +rte_keepalive_create(rte_keepalive_failure_callback_t callback, + void *data) +{ + struct rte_keepalive *keepcfg; + + keepcfg = rte_zmalloc("RTE_EAL_KEEPALIVE", + sizeof(struct rte_keepalive), + RTE_CACHE_LINE_SIZE); + if (keepcfg != NULL) { + keepcfg->callback = callback; + keepcfg->callback_data = data; + keepcfg->tsc_initial = rte_rdtsc(); + keepcfg->tsc_mhz = rte_get_tsc_hz() / 1000; + } + return keepcfg; +} + +void rte_keepalive_register_relay_callback(struct rte_keepalive *keepcfg, + rte_keepalive_relay_callback_t callback, + void *data) +{ + keepcfg->relay_callback = callback; + keepcfg->relay_callback_data = data; +} + +void +rte_keepalive_register_core(struct rte_keepalive *keepcfg, const int id_core) +{ + if (id_core < RTE_KEEPALIVE_MAXCORES) { + keepcfg->active_cores[id_core] = RTE_KA_STATE_ALIVE; + keepcfg->last_alive[id_core] = rte_rdtsc(); + } +} + +void +rte_keepalive_mark_alive(struct rte_keepalive *keepcfg) +{ + keepcfg->live_data[rte_lcore_id()].core_state = RTE_KA_STATE_ALIVE; +} + +void +rte_keepalive_mark_sleep(struct rte_keepalive *keepcfg) +{ + keepcfg->live_data[rte_lcore_id()].core_state = RTE_KA_STATE_DOZING; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c b/src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c new file mode 100644 index 000000000..f1b73168b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c @@ -0,0 +1,668 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#include <stdint.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <sys/queue.h> + +#include <rte_errno.h> +#include <rte_memcpy.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_branch_prediction.h> +#include <rte_debug.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_spinlock.h> +#include <rte_eal_trace.h> + +#include <rte_malloc.h> +#include "malloc_elem.h" +#include "malloc_heap.h" +#include "eal_memalloc.h" +#include "eal_memcfg.h" +#include "eal_private.h" + + +/* Free the memory space back to heap */ +static void +mem_free(void *addr, const bool trace_ena) +{ + if (trace_ena) + rte_eal_trace_mem_free(addr); + + if (addr == NULL) return; + if (malloc_heap_free(malloc_elem_from_data(addr)) < 0) + RTE_LOG(ERR, EAL, "Error: Invalid memory\n"); +} + +void +rte_free(void *addr) +{ + return mem_free(addr, true); +} + +void +eal_free_no_trace(void *addr) +{ + return mem_free(addr, false); +} + +static void * +malloc_socket(const char *type, size_t size, unsigned int align, + int socket_arg, const bool trace_ena) +{ + void *ptr; + + /* return NULL if size is 0 or alignment is not power-of-2 */ + if (size == 0 || (align && !rte_is_power_of_2(align))) + return NULL; + + /* if there are no hugepages and if we are not allocating from an + * external heap, use memory from any socket available. checking for + * socket being external may return -1 in case of invalid socket, but + * that's OK - if there are no hugepages, it doesn't matter. + */ + if (rte_malloc_heap_socket_is_external(socket_arg) != 1 && + !rte_eal_has_hugepages()) + socket_arg = SOCKET_ID_ANY; + + ptr = malloc_heap_alloc(type, size, socket_arg, 0, + align == 0 ? 1 : align, 0, false); + + if (trace_ena) + rte_eal_trace_mem_malloc(type, size, align, socket_arg, ptr); + return ptr; +} + +/* + * Allocate memory on specified heap. + */ +void * +rte_malloc_socket(const char *type, size_t size, unsigned int align, + int socket_arg) +{ + return malloc_socket(type, size, align, socket_arg, true); +} + +void * +eal_malloc_no_trace(const char *type, size_t size, unsigned int align) +{ + return malloc_socket(type, size, align, SOCKET_ID_ANY, false); +} + +/* + * Allocate memory on default heap. + */ +void * +rte_malloc(const char *type, size_t size, unsigned align) +{ + return rte_malloc_socket(type, size, align, SOCKET_ID_ANY); +} + +/* + * Allocate zero'd memory on specified heap. + */ +void * +rte_zmalloc_socket(const char *type, size_t size, unsigned align, int socket) +{ + void *ptr = rte_malloc_socket(type, size, align, socket); + +#ifdef RTE_MALLOC_DEBUG + /* + * If DEBUG is enabled, then freed memory is marked with poison + * value and set to zero on allocation. + * If DEBUG is not enabled then memory is already zeroed. + */ + if (ptr != NULL) + memset(ptr, 0, size); +#endif + + rte_eal_trace_mem_zmalloc(type, size, align, socket, ptr); + return ptr; +} + +/* + * Allocate zero'd memory on default heap. + */ +void * +rte_zmalloc(const char *type, size_t size, unsigned align) +{ + return rte_zmalloc_socket(type, size, align, SOCKET_ID_ANY); +} + +/* + * Allocate zero'd memory on specified heap. + */ +void * +rte_calloc_socket(const char *type, size_t num, size_t size, unsigned align, int socket) +{ + return rte_zmalloc_socket(type, num * size, align, socket); +} + +/* + * Allocate zero'd memory on default heap. + */ +void * +rte_calloc(const char *type, size_t num, size_t size, unsigned align) +{ + return rte_zmalloc(type, num * size, align); +} + +/* + * Resize allocated memory on specified heap. + */ +void * +rte_realloc_socket(void *ptr, size_t size, unsigned int align, int socket) +{ + if (ptr == NULL) + return rte_malloc_socket(NULL, size, align, socket); + + struct malloc_elem *elem = malloc_elem_from_data(ptr); + if (elem == NULL) { + RTE_LOG(ERR, EAL, "Error: memory corruption detected\n"); + return NULL; + } + + size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align); + + /* check requested socket id and alignment matches first, and if ok, + * see if we can resize block + */ + if ((socket == SOCKET_ID_ANY || + (unsigned int)socket == elem->heap->socket_id) && + RTE_PTR_ALIGN(ptr, align) == ptr && + malloc_heap_resize(elem, size) == 0) { + rte_eal_trace_mem_realloc(size, align, socket, ptr); + return ptr; + } + + /* either requested socket id doesn't match, alignment is off + * or we have no room to expand, + * so move the data. + */ + void *new_ptr = rte_malloc_socket(NULL, size, align, socket); + if (new_ptr == NULL) + return NULL; + /* elem: |pad|data_elem|data|trailer| */ + const size_t old_size = elem->size - elem->pad - MALLOC_ELEM_OVERHEAD; + rte_memcpy(new_ptr, ptr, old_size < size ? old_size : size); + rte_free(ptr); + + rte_eal_trace_mem_realloc(size, align, socket, new_ptr); + return new_ptr; +} + +/* + * Resize allocated memory. + */ +void * +rte_realloc(void *ptr, size_t size, unsigned int align) +{ + return rte_realloc_socket(ptr, size, align, SOCKET_ID_ANY); +} + +int +rte_malloc_validate(const void *ptr, size_t *size) +{ + const struct malloc_elem *elem = malloc_elem_from_data(ptr); + if (!malloc_elem_cookies_ok(elem)) + return -1; + if (size != NULL) + *size = elem->size - elem->pad - MALLOC_ELEM_OVERHEAD; + return 0; +} + +/* + * Function to retrieve data for heap on given socket + */ +int +rte_malloc_get_socket_stats(int socket, + struct rte_malloc_socket_stats *socket_stats) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int heap_idx; + + heap_idx = malloc_socket_to_heap_id(socket); + if (heap_idx < 0) + return -1; + + return malloc_heap_get_stats(&mcfg->malloc_heaps[heap_idx], + socket_stats); +} + +/* + * Function to dump contents of all heaps + */ +void +rte_malloc_dump_heaps(FILE *f) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int idx; + + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + fprintf(f, "Heap id: %u\n", idx); + malloc_heap_dump(&mcfg->malloc_heaps[idx], f); + } +} + +int +rte_malloc_heap_get_socket(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + unsigned int idx; + int ret; + + if (name == NULL || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_read_lock(); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if (!strncmp(name, tmp->name, RTE_HEAP_NAME_MAX_LEN)) { + heap = tmp; + break; + } + } + + if (heap != NULL) { + ret = heap->socket_id; + } else { + rte_errno = ENOENT; + ret = -1; + } + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_malloc_heap_socket_is_external(int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int idx; + int ret = -1; + + if (socket_id == SOCKET_ID_ANY) + return 0; + + rte_mcfg_mem_read_lock(); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if ((int)tmp->socket_id == socket_id) { + /* external memory always has large socket ID's */ + ret = tmp->socket_id >= RTE_MAX_NUMA_NODES; + break; + } + } + rte_mcfg_mem_read_unlock(); + + return ret; +} + +/* + * Print stats on memory type. If type is NULL, info on all types is printed + */ +void +rte_malloc_dump_stats(FILE *f, __rte_unused const char *type) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int heap_id; + struct rte_malloc_socket_stats sock_stats; + + /* Iterate through all initialised heaps */ + for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; + + malloc_heap_get_stats(heap, &sock_stats); + + fprintf(f, "Heap id:%u\n", heap_id); + fprintf(f, "\tHeap name:%s\n", heap->name); + fprintf(f, "\tHeap_size:%zu,\n", sock_stats.heap_totalsz_bytes); + fprintf(f, "\tFree_size:%zu,\n", sock_stats.heap_freesz_bytes); + fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes); + fprintf(f, "\tGreatest_free_size:%zu,\n", + sock_stats.greatest_free_size); + fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count); + fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count); + } + return; +} + +/* + * TODO: Set limit to memory that can be allocated to memory type + */ +int +rte_malloc_set_limit(__rte_unused const char *type, + __rte_unused size_t max) +{ + return 0; +} + +/* + * Return the IO address of a virtual address obtained through rte_malloc + */ +rte_iova_t +rte_malloc_virt2iova(const void *addr) +{ + const struct rte_memseg *ms; + struct malloc_elem *elem = malloc_elem_from_data(addr); + + if (elem == NULL) + return RTE_BAD_IOVA; + + if (!elem->msl->external && rte_eal_iova_mode() == RTE_IOVA_VA) + return (uintptr_t) addr; + + ms = rte_mem_virt2memseg(addr, elem->msl); + if (ms == NULL) + return RTE_BAD_IOVA; + + if (ms->iova == RTE_BAD_IOVA) + return RTE_BAD_IOVA; + + return ms->iova + RTE_PTR_DIFF(addr, ms->addr); +} + +static struct malloc_heap * +find_named_heap(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (!strncmp(name, heap->name, RTE_HEAP_NAME_MAX_LEN)) + return heap; + } + return NULL; +} + +int +rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz) +{ + struct malloc_heap *heap = NULL; + struct rte_memseg_list *msl; + unsigned int n; + int ret; + + if (heap_name == NULL || va_addr == NULL || + page_sz == 0 || !rte_is_power_of_2(page_sz) || + RTE_ALIGN(len, page_sz) != len || + !rte_is_aligned(va_addr, page_sz) || + ((len / page_sz) != n_pages && iova_addrs != NULL) || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot add memory to internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + n = len / page_sz; + + msl = malloc_heap_create_external_seg(va_addr, iova_addrs, n, page_sz, + heap_name, heap->socket_id); + if (msl == NULL) { + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_add_external_memory(heap, msl); + msl->heap = 1; /* mark it as heap segment */ + rte_spinlock_unlock(&heap->lock); + +unlock: + rte_mcfg_mem_write_unlock(); + + return ret; +} + +int +rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len) +{ + struct malloc_heap *heap = NULL; + struct rte_memseg_list *msl; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot remove memory from internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + msl = malloc_heap_find_external_seg(va_addr, len); + if (msl == NULL) { + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_remove_external_memory(heap, va_addr, len); + rte_spinlock_unlock(&heap->lock); + if (ret != 0) + goto unlock; + + ret = malloc_heap_destroy_external_seg(msl); + +unlock: + rte_mcfg_mem_write_unlock(); + + return ret; +} + +static int +sync_memory(const char *heap_name, void *va_addr, size_t len, bool attach) +{ + struct malloc_heap *heap = NULL; + struct rte_memseg_list *msl; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_read_lock(); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to sync to internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + /* find corresponding memseg list to sync to */ + msl = malloc_heap_find_external_seg(va_addr, len); + if (msl == NULL) { + ret = -1; + goto unlock; + } + + if (attach) { + ret = rte_fbarray_attach(&msl->memseg_arr); + if (ret == 0) { + /* notify all subscribers that a new memory area was + * added. + */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + va_addr, len); + } else { + ret = -1; + goto unlock; + } + } else { + /* notify all subscribers that a memory area is about to + * be removed. + */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + msl->base_va, msl->len); + ret = rte_fbarray_detach(&msl->memseg_arr); + if (ret < 0) { + ret = -1; + goto unlock; + } + } +unlock: + rte_mcfg_mem_read_unlock(); + return ret; +} + +int +rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, true); +} + +int +rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, false); +} + +int +rte_malloc_heap_create(const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int i, ret; + + if (heap_name == NULL || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + /* check if there is space in the heap list, or if heap with this name + * already exists. + */ + rte_mcfg_mem_write_lock(); + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[i]; + /* existing heap */ + if (strncmp(heap_name, tmp->name, + RTE_HEAP_NAME_MAX_LEN) == 0) { + RTE_LOG(ERR, EAL, "Heap %s already exists\n", + heap_name); + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + /* empty heap */ + if (strnlen(tmp->name, RTE_HEAP_NAME_MAX_LEN) == 0) { + heap = tmp; + break; + } + } + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Cannot create new heap: no space\n"); + rte_errno = ENOSPC; + ret = -1; + goto unlock; + } + + /* we're sure that we can create a new heap, so do it */ + ret = malloc_heap_create(heap, heap_name); +unlock: + rte_mcfg_mem_write_unlock(); + + return ret; +} + +int +rte_malloc_heap_destroy(const char *heap_name) +{ + struct malloc_heap *heap = NULL; + int ret; + + if (heap_name == NULL || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* start from non-socket heaps */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Heap %s not found\n", heap_name); + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to destroy internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + /* sanity checks done, now we can destroy the heap */ + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_destroy(heap); + + /* if we failed, lock is still active */ + if (ret < 0) + rte_spinlock_unlock(&heap->lock); +unlock: + rte_mcfg_mem_write_unlock(); + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_random.c b/src/spdk/dpdk/lib/librte_eal/common/rte_random.c new file mode 100644 index 000000000..b7a089ac4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_random.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Ericsson AB + */ + +#ifdef RTE_MACHINE_CPUFLAG_RDSEED +#include <x86intrin.h> +#endif +#include <stdlib.h> +#include <unistd.h> + +#include <rte_branch_prediction.h> +#include <rte_cycles.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_random.h> + +struct rte_rand_state { + uint64_t z1; + uint64_t z2; + uint64_t z3; + uint64_t z4; + uint64_t z5; +} __rte_cache_aligned; + +static struct rte_rand_state rand_states[RTE_MAX_LCORE]; + +static uint32_t +__rte_rand_lcg32(uint32_t *seed) +{ + *seed = 1103515245U * *seed + 12345U; + + return *seed; +} + +static uint64_t +__rte_rand_lcg64(uint32_t *seed) +{ + uint64_t low; + uint64_t high; + + /* A 64-bit LCG would have been much cleaner, but good + * multiplier/increments for such seem hard to come by. + */ + + low = __rte_rand_lcg32(seed); + high = __rte_rand_lcg32(seed); + + return low | (high << 32); +} + +static uint64_t +__rte_rand_lfsr258_gen_seed(uint32_t *seed, uint64_t min_value) +{ + uint64_t res; + + res = __rte_rand_lcg64(seed); + + if (res < min_value) + res += min_value; + + return res; +} + +static void +__rte_srand_lfsr258(uint64_t seed, struct rte_rand_state *state) +{ + uint32_t lcg_seed; + + lcg_seed = (uint32_t)(seed ^ (seed >> 32)); + + state->z1 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 2UL); + state->z2 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 512UL); + state->z3 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 4096UL); + state->z4 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 131072UL); + state->z5 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 8388608UL); +} + +void +rte_srand(uint64_t seed) +{ + unsigned int lcore_id; + + /* add lcore_id to seed to avoid having the same sequence */ + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) + __rte_srand_lfsr258(seed + lcore_id, &rand_states[lcore_id]); +} + +static __rte_always_inline uint64_t +__rte_rand_lfsr258_comp(uint64_t z, uint64_t a, uint64_t b, uint64_t c, + uint64_t d) +{ + return ((z & c) << d) ^ (((z << a) ^ z) >> b); +} + +/* Based on L’Ecuyer, P.: Tables of maximally equidistributed combined + * LFSR generators. + */ + +static __rte_always_inline uint64_t +__rte_rand_lfsr258(struct rte_rand_state *state) +{ + state->z1 = __rte_rand_lfsr258_comp(state->z1, 1UL, 53UL, + 18446744073709551614UL, 10UL); + state->z2 = __rte_rand_lfsr258_comp(state->z2, 24UL, 50UL, + 18446744073709551104UL, 5UL); + state->z3 = __rte_rand_lfsr258_comp(state->z3, 3UL, 23UL, + 18446744073709547520UL, 29UL); + state->z4 = __rte_rand_lfsr258_comp(state->z4, 5UL, 24UL, + 18446744073709420544UL, 23UL); + state->z5 = __rte_rand_lfsr258_comp(state->z5, 3UL, 33UL, + 18446744073701163008UL, 8UL); + + return state->z1 ^ state->z2 ^ state->z3 ^ state->z4 ^ state->z5; +} + +static __rte_always_inline +struct rte_rand_state *__rte_rand_get_state(void) +{ + unsigned int lcore_id; + + lcore_id = rte_lcore_id(); + + if (unlikely(lcore_id == LCORE_ID_ANY)) + lcore_id = rte_get_master_lcore(); + + return &rand_states[lcore_id]; +} + +uint64_t +rte_rand(void) +{ + struct rte_rand_state *state; + + state = __rte_rand_get_state(); + + return __rte_rand_lfsr258(state); +} + +uint64_t +rte_rand_max(uint64_t upper_bound) +{ + struct rte_rand_state *state; + uint8_t ones; + uint8_t leading_zeros; + uint64_t mask = ~((uint64_t)0); + uint64_t res; + + if (unlikely(upper_bound < 2)) + return 0; + + state = __rte_rand_get_state(); + + ones = __builtin_popcountll(upper_bound); + + /* Handle power-of-2 upper_bound as a special case, since it + * has no bias issues. + */ + if (unlikely(ones == 1)) + return __rte_rand_lfsr258(state) & (upper_bound - 1); + + /* The approach to avoiding bias is to create a mask that + * stretches beyond the request value range, and up to the + * next power-of-2. In case the masked generated random value + * is equal to or greater than the upper bound, just discard + * the value and generate a new one. + */ + + leading_zeros = __builtin_clzll(upper_bound); + mask >>= leading_zeros; + + do { + res = __rte_rand_lfsr258(state) & mask; + } while (unlikely(res >= upper_bound)); + + return res; +} + +static uint64_t +__rte_random_initial_seed(void) +{ +#ifdef RTE_LIBEAL_USE_GETENTROPY + int ge_rc; + uint64_t ge_seed; + + ge_rc = getentropy(&ge_seed, sizeof(ge_seed)); + + if (ge_rc == 0) + return ge_seed; +#endif +#ifdef RTE_MACHINE_CPUFLAG_RDSEED + unsigned int rdseed_low; + unsigned int rdseed_high; + + /* first fallback: rdseed instruction, if available */ + if (_rdseed32_step(&rdseed_low) == 1 && + _rdseed32_step(&rdseed_high) == 1) + return (uint64_t)rdseed_low | ((uint64_t)rdseed_high << 32); +#endif + /* second fallback: seed using rdtsc */ + return rte_get_tsc_cycles(); +} + +RTE_INIT(rte_rand_init) +{ + uint64_t seed; + + seed = __rte_random_initial_seed(); + + rte_srand(seed); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c b/src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c new file mode 100644 index 000000000..42dfa44eb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + * Copyright(c) Hannes Frederic Sowa + * All rights reserved. + */ + +#include <stdio.h> +#include <stdint.h> + +#include <rte_common.h> + +#include "rte_reciprocal.h" + +struct rte_reciprocal rte_reciprocal_value(uint32_t d) +{ + struct rte_reciprocal R; + uint64_t m; + int l; + + l = rte_fls_u32(d - 1); + m = ((1ULL << 32) * ((1ULL << l) - d)); + m /= d; + + ++m; + R.m = m; + R.sh1 = RTE_MIN(l, 1); + R.sh2 = RTE_MAX(l - 1, 0); + + return R; +} + +/* + * Code taken from Hacker's Delight: + * http://www.hackersdelight.org/hdcodetxt/divlu.c.txt + * License permits inclusion here per: + * http://www.hackersdelight.org/permissions.htm + */ +static uint64_t +divide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) +{ + const uint64_t b = (1ULL << 32); /* Number base (16 bits). */ + uint64_t un1, un0, /* Norm. dividend LSD's. */ + vn1, vn0, /* Norm. divisor digits. */ + q1, q0, /* Quotient digits. */ + un64, un21, un10, /* Dividend digit pairs. */ + rhat; /* A remainder. */ + int s; /* Shift amount for norm. */ + + /* If overflow, set rem. to an impossible value. */ + if (u1 >= v) { + if (r != NULL) + *r = (uint64_t) -1; + return (uint64_t) -1; + } + + /* Count leading zeros. */ + s = __builtin_clzll(v); + if (s > 0) { + v = v << s; + un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31)); + un10 = u0 << s; + } else { + + un64 = u1 | u0; + un10 = u0; + } + + vn1 = v >> 32; + vn0 = v & 0xFFFFFFFF; + + un1 = un10 >> 32; + un0 = un10 & 0xFFFFFFFF; + + q1 = un64/vn1; + rhat = un64 - q1*vn1; +again1: + if (q1 >= b || q1*vn0 > b*rhat + un1) { + q1 = q1 - 1; + rhat = rhat + vn1; + if (rhat < b) + goto again1; + } + + un21 = un64*b + un1 - q1*v; + + q0 = un21/vn1; + rhat = un21 - q0*vn1; +again2: + if (q0 >= b || q0*vn0 > b*rhat + un0) { + q0 = q0 - 1; + rhat = rhat + vn1; + if (rhat < b) + goto again2; + } + + if (r != NULL) + *r = (un21*b + un0 - q0*v) >> s; + return q1*b + q0; +} + +struct rte_reciprocal_u64 +rte_reciprocal_value_u64(uint64_t d) +{ + struct rte_reciprocal_u64 R; + uint64_t m; + uint64_t r; + int l; + + l = 63 - __builtin_clzll(d); + + m = divide_128_div_64_to_64((1ULL << l), 0, d, &r) << 1; + if (r << 1 < r || r << 1 >= d) + m++; + m = (1ULL << l) - d ? m + 1 : 1; + R.m = m; + + R.sh1 = l > 1 ? 1 : l; + R.sh2 = (l > 0) ? l : 0; + R.sh2 -= R.sh2 && (m == 1) ? 1 : 0; + + return R; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_service.c b/src/spdk/dpdk/lib/librte_eal/common/rte_service.c new file mode 100644 index 000000000..6123a2124 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_service.c @@ -0,0 +1,919 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include <stdio.h> +#include <unistd.h> +#include <inttypes.h> +#include <limits.h> +#include <string.h> + +#include <rte_compat.h> +#include <rte_service.h> +#include <rte_service_component.h> + +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_debug.h> +#include <rte_cycles.h> +#include <rte_atomic.h> +#include <rte_memory.h> +#include <rte_malloc.h> +#include <rte_spinlock.h> + +#include "eal_private.h" + +#define RTE_SERVICE_NUM_MAX 64 + +#define SERVICE_F_REGISTERED (1 << 0) +#define SERVICE_F_STATS_ENABLED (1 << 1) +#define SERVICE_F_START_CHECK (1 << 2) + +/* runstates for services and lcores, denoting if they are active or not */ +#define RUNSTATE_STOPPED 0 +#define RUNSTATE_RUNNING 1 + +/* internal representation of a service */ +struct rte_service_spec_impl { + /* public part of the struct */ + struct rte_service_spec spec; + + /* spin lock that when set indicates a service core is currently + * running this service callback. When not set, a core may take the + * lock and then run the service callback. + */ + rte_spinlock_t execute_lock; + + /* API set/get-able variables */ + int8_t app_runstate; + int8_t comp_runstate; + uint8_t internal_flags; + + /* per service statistics */ + /* Indicates how many cores the service is mapped to run on. + * It does not indicate the number of cores the service is running + * on currently. + */ + uint32_t num_mapped_cores; + uint64_t calls; + uint64_t cycles_spent; +} __rte_cache_aligned; + +/* the internal values of a service core */ +struct core_state { + /* map of services IDs are run on this core */ + uint64_t service_mask; + uint8_t runstate; /* running or stopped */ + uint8_t is_service_core; /* set if core is currently a service core */ + uint8_t service_active_on_lcore[RTE_SERVICE_NUM_MAX]; + uint64_t loops; + uint64_t calls_per_service[RTE_SERVICE_NUM_MAX]; +} __rte_cache_aligned; + +static uint32_t rte_service_count; +static struct rte_service_spec_impl *rte_services; +static struct core_state *lcore_states; +static uint32_t rte_service_library_initialized; + +int32_t +rte_service_init(void) +{ + if (rte_service_library_initialized) { + RTE_LOG(NOTICE, EAL, + "service library init() called, init flag %d\n", + rte_service_library_initialized); + return -EALREADY; + } + + rte_services = rte_calloc("rte_services", RTE_SERVICE_NUM_MAX, + sizeof(struct rte_service_spec_impl), + RTE_CACHE_LINE_SIZE); + if (!rte_services) { + RTE_LOG(ERR, EAL, "error allocating rte services array\n"); + goto fail_mem; + } + + lcore_states = rte_calloc("rte_service_core_states", RTE_MAX_LCORE, + sizeof(struct core_state), RTE_CACHE_LINE_SIZE); + if (!lcore_states) { + RTE_LOG(ERR, EAL, "error allocating core states array\n"); + goto fail_mem; + } + + int i; + int count = 0; + struct rte_config *cfg = rte_eal_get_configuration(); + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lcore_config[i].core_role == ROLE_SERVICE) { + if ((unsigned int)i == cfg->master_lcore) + continue; + rte_service_lcore_add(i); + count++; + } + } + + rte_service_library_initialized = 1; + return 0; +fail_mem: + rte_free(rte_services); + rte_free(lcore_states); + return -ENOMEM; +} + +void +rte_service_finalize(void) +{ + if (!rte_service_library_initialized) + return; + + rte_service_lcore_reset_all(); + rte_eal_mp_wait_lcore(); + + rte_free(rte_services); + rte_free(lcore_states); + + rte_service_library_initialized = 0; +} + +/* returns 1 if service is registered and has not been unregistered + * Returns 0 if service never registered, or has been unregistered + */ +static inline int +service_valid(uint32_t id) +{ + return !!(rte_services[id].internal_flags & SERVICE_F_REGISTERED); +} + +static struct rte_service_spec_impl * +service_get(uint32_t id) +{ + return &rte_services[id]; +} + +/* validate ID and retrieve service pointer, or return error value */ +#define SERVICE_VALID_GET_OR_ERR_RET(id, service, retval) do { \ + if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id)) \ + return retval; \ + service = &rte_services[id]; \ +} while (0) + +/* returns 1 if statistics should be collected for service + * Returns 0 if statistics should not be collected for service + */ +static inline int +service_stats_enabled(struct rte_service_spec_impl *impl) +{ + return !!(impl->internal_flags & SERVICE_F_STATS_ENABLED); +} + +static inline int +service_mt_safe(struct rte_service_spec_impl *s) +{ + return !!(s->spec.capabilities & RTE_SERVICE_CAP_MT_SAFE); +} + +int32_t +rte_service_set_stats_enable(uint32_t id, int32_t enabled) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, 0); + + if (enabled) + s->internal_flags |= SERVICE_F_STATS_ENABLED; + else + s->internal_flags &= ~(SERVICE_F_STATS_ENABLED); + + return 0; +} + +int32_t +rte_service_set_runstate_mapped_check(uint32_t id, int32_t enabled) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, 0); + + if (enabled) + s->internal_flags |= SERVICE_F_START_CHECK; + else + s->internal_flags &= ~(SERVICE_F_START_CHECK); + + return 0; +} + +uint32_t +rte_service_get_count(void) +{ + return rte_service_count; +} + +int32_t +rte_service_get_by_name(const char *name, uint32_t *service_id) +{ + if (!service_id) + return -EINVAL; + + int i; + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (service_valid(i) && + strcmp(name, rte_services[i].spec.name) == 0) { + *service_id = i; + return 0; + } + } + + return -ENODEV; +} + +const char * +rte_service_get_name(uint32_t id) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, 0); + return s->spec.name; +} + +int32_t +rte_service_probe_capability(uint32_t id, uint32_t capability) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + return !!(s->spec.capabilities & capability); +} + +int32_t +rte_service_component_register(const struct rte_service_spec *spec, + uint32_t *id_ptr) +{ + uint32_t i; + int32_t free_slot = -1; + + if (spec->callback == NULL || strlen(spec->name) == 0) + return -EINVAL; + + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) { + free_slot = i; + break; + } + } + + if ((free_slot < 0) || (i == RTE_SERVICE_NUM_MAX)) + return -ENOSPC; + + struct rte_service_spec_impl *s = &rte_services[free_slot]; + s->spec = *spec; + s->internal_flags |= SERVICE_F_REGISTERED | SERVICE_F_START_CHECK; + + rte_service_count++; + + if (id_ptr) + *id_ptr = free_slot; + + return 0; +} + +int32_t +rte_service_component_unregister(uint32_t id) +{ + uint32_t i; + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + rte_service_count--; + + s->internal_flags &= ~(SERVICE_F_REGISTERED); + + /* clear the run-bit in all cores */ + for (i = 0; i < RTE_MAX_LCORE; i++) + lcore_states[i].service_mask &= ~(UINT64_C(1) << id); + + memset(&rte_services[id], 0, sizeof(struct rte_service_spec_impl)); + + return 0; +} + +int32_t +rte_service_component_runstate_set(uint32_t id, uint32_t runstate) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + /* comp_runstate act as the guard variable. Use store-release + * memory order. This synchronizes with load-acquire in + * service_run and service_runstate_get function. + */ + if (runstate) + __atomic_store_n(&s->comp_runstate, RUNSTATE_RUNNING, + __ATOMIC_RELEASE); + else + __atomic_store_n(&s->comp_runstate, RUNSTATE_STOPPED, + __ATOMIC_RELEASE); + + return 0; +} + +int32_t +rte_service_runstate_set(uint32_t id, uint32_t runstate) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + /* app_runstate act as the guard variable. Use store-release + * memory order. This synchronizes with load-acquire in + * service_run runstate_get function. + */ + if (runstate) + __atomic_store_n(&s->app_runstate, RUNSTATE_RUNNING, + __ATOMIC_RELEASE); + else + __atomic_store_n(&s->app_runstate, RUNSTATE_STOPPED, + __ATOMIC_RELEASE); + + return 0; +} + +int32_t +rte_service_runstate_get(uint32_t id) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + /* comp_runstate and app_runstate act as the guard variables. + * Use load-acquire memory order. This synchronizes with + * store-release in service state set functions. + */ + if (__atomic_load_n(&s->comp_runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_RUNNING && + __atomic_load_n(&s->app_runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_RUNNING) { + int check_disabled = !(s->internal_flags & + SERVICE_F_START_CHECK); + int lcore_mapped = (__atomic_load_n(&s->num_mapped_cores, + __ATOMIC_RELAXED) > 0); + + return (check_disabled | lcore_mapped); + } else + return 0; + +} + +static inline void +service_runner_do_callback(struct rte_service_spec_impl *s, + struct core_state *cs, uint32_t service_idx) +{ + void *userdata = s->spec.callback_userdata; + + if (service_stats_enabled(s)) { + uint64_t start = rte_rdtsc(); + s->spec.callback(userdata); + uint64_t end = rte_rdtsc(); + s->cycles_spent += end - start; + cs->calls_per_service[service_idx]++; + s->calls++; + } else + s->spec.callback(userdata); +} + + +/* Expects the service 's' is valid. */ +static int32_t +service_run(uint32_t i, struct core_state *cs, uint64_t service_mask, + struct rte_service_spec_impl *s, uint32_t serialize_mt_unsafe) +{ + if (!s) + return -EINVAL; + + /* comp_runstate and app_runstate act as the guard variables. + * Use load-acquire memory order. This synchronizes with + * store-release in service state set functions. + */ + if (__atomic_load_n(&s->comp_runstate, __ATOMIC_ACQUIRE) != + RUNSTATE_RUNNING || + __atomic_load_n(&s->app_runstate, __ATOMIC_ACQUIRE) != + RUNSTATE_RUNNING || + !(service_mask & (UINT64_C(1) << i))) { + cs->service_active_on_lcore[i] = 0; + return -ENOEXEC; + } + + cs->service_active_on_lcore[i] = 1; + + if ((service_mt_safe(s) == 0) && (serialize_mt_unsafe == 1)) { + if (!rte_spinlock_trylock(&s->execute_lock)) + return -EBUSY; + + service_runner_do_callback(s, cs, i); + rte_spinlock_unlock(&s->execute_lock); + } else + service_runner_do_callback(s, cs, i); + + return 0; +} + +int32_t +rte_service_may_be_active(uint32_t id) +{ + uint32_t ids[RTE_MAX_LCORE] = {0}; + int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE); + int i; + + if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id)) + return -EINVAL; + + for (i = 0; i < lcore_count; i++) { + if (lcore_states[i].service_active_on_lcore[id]) + return 1; + } + + return 0; +} + +int32_t +rte_service_run_iter_on_app_lcore(uint32_t id, uint32_t serialize_mt_unsafe) +{ + struct core_state *cs = &lcore_states[rte_lcore_id()]; + struct rte_service_spec_impl *s; + + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + /* Increment num_mapped_cores to reflect that this core is + * now mapped capable of running the service. + */ + __atomic_add_fetch(&s->num_mapped_cores, 1, __ATOMIC_RELAXED); + + int ret = service_run(id, cs, UINT64_MAX, s, serialize_mt_unsafe); + + __atomic_sub_fetch(&s->num_mapped_cores, 1, __ATOMIC_RELAXED); + + return ret; +} + +static int32_t +service_runner_func(void *arg) +{ + RTE_SET_USED(arg); + uint32_t i; + const int lcore = rte_lcore_id(); + struct core_state *cs = &lcore_states[lcore]; + + /* runstate act as the guard variable. Use load-acquire + * memory order here to synchronize with store-release + * in runstate update functions. + */ + while (__atomic_load_n(&cs->runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_RUNNING) { + const uint64_t service_mask = cs->service_mask; + + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) + continue; + /* return value ignored as no change to code flow */ + service_run(i, cs, service_mask, service_get(i), 1); + } + + cs->loops++; + } + + lcore_config[lcore].state = WAIT; + + return 0; +} + +int32_t +rte_service_lcore_count(void) +{ + int32_t count = 0; + uint32_t i; + for (i = 0; i < RTE_MAX_LCORE; i++) + count += lcore_states[i].is_service_core; + return count; +} + +int32_t +rte_service_lcore_list(uint32_t array[], uint32_t n) +{ + uint32_t count = rte_service_lcore_count(); + if (count > n) + return -ENOMEM; + + if (!array) + return -EINVAL; + + uint32_t i; + uint32_t idx = 0; + for (i = 0; i < RTE_MAX_LCORE; i++) { + struct core_state *cs = &lcore_states[i]; + if (cs->is_service_core) { + array[idx] = i; + idx++; + } + } + + return count; +} + +int32_t +rte_service_lcore_count_services(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + struct core_state *cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + return __builtin_popcountll(cs->service_mask); +} + +int32_t +rte_service_start_with_defaults(void) +{ + /* create a default mapping from cores to services, then start the + * services to make them transparent to unaware applications. + */ + uint32_t i; + int ret; + uint32_t count = rte_service_get_count(); + + int32_t lcore_iter = 0; + uint32_t ids[RTE_MAX_LCORE] = {0}; + int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE); + + if (lcore_count == 0) + return -ENOTSUP; + + for (i = 0; (int)i < lcore_count; i++) + rte_service_lcore_start(ids[i]); + + for (i = 0; i < count; i++) { + /* do 1:1 core mapping here, with each service getting + * assigned a single core by default. Adding multiple services + * should multiplex to a single core, or 1:1 if there are the + * same amount of services as service-cores + */ + ret = rte_service_map_lcore_set(i, ids[lcore_iter], 1); + if (ret) + return -ENODEV; + + lcore_iter++; + if (lcore_iter >= lcore_count) + lcore_iter = 0; + + ret = rte_service_runstate_set(i, 1); + if (ret) + return -ENOEXEC; + } + + return 0; +} + +static int32_t +service_update(uint32_t sid, uint32_t lcore, uint32_t *set, uint32_t *enabled) +{ + /* validate ID, or return error value */ + if (sid >= RTE_SERVICE_NUM_MAX || !service_valid(sid) || + lcore >= RTE_MAX_LCORE || !lcore_states[lcore].is_service_core) + return -EINVAL; + + uint64_t sid_mask = UINT64_C(1) << sid; + if (set) { + uint64_t lcore_mapped = lcore_states[lcore].service_mask & + sid_mask; + + if (*set && !lcore_mapped) { + lcore_states[lcore].service_mask |= sid_mask; + __atomic_add_fetch(&rte_services[sid].num_mapped_cores, + 1, __ATOMIC_RELAXED); + } + if (!*set && lcore_mapped) { + lcore_states[lcore].service_mask &= ~(sid_mask); + __atomic_sub_fetch(&rte_services[sid].num_mapped_cores, + 1, __ATOMIC_RELAXED); + } + } + + if (enabled) + *enabled = !!(lcore_states[lcore].service_mask & (sid_mask)); + + return 0; +} + +int32_t +rte_service_map_lcore_set(uint32_t id, uint32_t lcore, uint32_t enabled) +{ + uint32_t on = enabled > 0; + return service_update(id, lcore, &on, 0); +} + +int32_t +rte_service_map_lcore_get(uint32_t id, uint32_t lcore) +{ + uint32_t enabled; + int ret = service_update(id, lcore, 0, &enabled); + if (ret == 0) + return enabled; + return ret; +} + +static void +set_lcore_state(uint32_t lcore, int32_t state) +{ + /* mark core state in hugepage backed config */ + struct rte_config *cfg = rte_eal_get_configuration(); + cfg->lcore_role[lcore] = state; + + /* mark state in process local lcore_config */ + lcore_config[lcore].core_role = state; + + /* update per-lcore optimized state tracking */ + lcore_states[lcore].is_service_core = (state == ROLE_SERVICE); +} + +int32_t +rte_service_lcore_reset_all(void) +{ + /* loop over cores, reset all to mask 0 */ + uint32_t i; + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lcore_states[i].is_service_core) { + lcore_states[i].service_mask = 0; + set_lcore_state(i, ROLE_RTE); + /* runstate act as guard variable Use + * store-release memory order here to synchronize + * with load-acquire in runstate read functions. + */ + __atomic_store_n(&lcore_states[i].runstate, + RUNSTATE_STOPPED, __ATOMIC_RELEASE); + } + } + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) + __atomic_store_n(&rte_services[i].num_mapped_cores, 0, + __ATOMIC_RELAXED); + + return 0; +} + +int32_t +rte_service_lcore_add(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + if (lcore_states[lcore].is_service_core) + return -EALREADY; + + set_lcore_state(lcore, ROLE_SERVICE); + + /* ensure that after adding a core the mask and state are defaults */ + lcore_states[lcore].service_mask = 0; + /* Use store-release memory order here to synchronize with + * load-acquire in runstate read functions. + */ + __atomic_store_n(&lcore_states[lcore].runstate, RUNSTATE_STOPPED, + __ATOMIC_RELEASE); + + return rte_eal_wait_lcore(lcore); +} + +int32_t +rte_service_lcore_del(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + struct core_state *cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -EINVAL; + + /* runstate act as the guard variable. Use load-acquire + * memory order here to synchronize with store-release + * in runstate update functions. + */ + if (__atomic_load_n(&cs->runstate, __ATOMIC_ACQUIRE) != + RUNSTATE_STOPPED) + return -EBUSY; + + set_lcore_state(lcore, ROLE_RTE); + + rte_smp_wmb(); + return 0; +} + +int32_t +rte_service_lcore_start(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + struct core_state *cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -EINVAL; + + /* runstate act as the guard variable. Use load-acquire + * memory order here to synchronize with store-release + * in runstate update functions. + */ + if (__atomic_load_n(&cs->runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_RUNNING) + return -EALREADY; + + /* set core to run state first, and then launch otherwise it will + * return immediately as runstate keeps it in the service poll loop + */ + /* Use load-acquire memory order here to synchronize with + * store-release in runstate update functions. + */ + __atomic_store_n(&cs->runstate, RUNSTATE_RUNNING, __ATOMIC_RELEASE); + + int ret = rte_eal_remote_launch(service_runner_func, 0, lcore); + /* returns -EBUSY if the core is already launched, 0 on success */ + return ret; +} + +int32_t +rte_service_lcore_stop(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + /* runstate act as the guard variable. Use load-acquire + * memory order here to synchronize with store-release + * in runstate update functions. + */ + if (__atomic_load_n(&lcore_states[lcore].runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_STOPPED) + return -EALREADY; + + uint32_t i; + uint64_t service_mask = lcore_states[lcore].service_mask; + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + int32_t enabled = service_mask & (UINT64_C(1) << i); + int32_t service_running = rte_service_runstate_get(i); + int32_t only_core = (1 == + __atomic_load_n(&rte_services[i].num_mapped_cores, + __ATOMIC_RELAXED)); + + /* if the core is mapped, and the service is running, and this + * is the only core that is mapped, the service would cease to + * run if this core stopped, so fail instead. + */ + if (enabled && service_running && only_core) + return -EBUSY; + } + + /* Use store-release memory order here to synchronize with + * load-acquire in runstate read functions. + */ + __atomic_store_n(&lcore_states[lcore].runstate, RUNSTATE_STOPPED, + __ATOMIC_RELEASE); + + return 0; +} + +int32_t +rte_service_attr_get(uint32_t id, uint32_t attr_id, uint64_t *attr_value) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + if (!attr_value) + return -EINVAL; + + switch (attr_id) { + case RTE_SERVICE_ATTR_CYCLES: + *attr_value = s->cycles_spent; + return 0; + case RTE_SERVICE_ATTR_CALL_COUNT: + *attr_value = s->calls; + return 0; + default: + return -EINVAL; + } +} + +int32_t +rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id, + uint64_t *attr_value) +{ + struct core_state *cs; + + if (lcore >= RTE_MAX_LCORE || !attr_value) + return -EINVAL; + + cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + switch (attr_id) { + case RTE_SERVICE_LCORE_ATTR_LOOPS: + *attr_value = cs->loops; + return 0; + default: + return -EINVAL; + } +} + +static void +service_dump_one(FILE *f, struct rte_service_spec_impl *s, uint32_t reset) +{ + /* avoid divide by zero */ + int calls = 1; + if (s->calls != 0) + calls = s->calls; + + if (reset) { + s->cycles_spent = 0; + s->calls = 0; + return; + } + + if (f == NULL) + return; + + fprintf(f, " %s: stats %d\tcalls %"PRIu64"\tcycles %" + PRIu64"\tavg: %"PRIu64"\n", + s->spec.name, service_stats_enabled(s), s->calls, + s->cycles_spent, s->cycles_spent / calls); +} + +int32_t +rte_service_attr_reset_all(uint32_t id) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + int reset = 1; + service_dump_one(NULL, s, reset); + return 0; +} + +int32_t +rte_service_lcore_attr_reset_all(uint32_t lcore) +{ + struct core_state *cs; + + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + cs->loops = 0; + + return 0; +} + +static void +service_dump_calls_per_lcore(FILE *f, uint32_t lcore, uint32_t reset) +{ + uint32_t i; + struct core_state *cs = &lcore_states[lcore]; + + fprintf(f, "%02d\t", lcore); + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) + continue; + fprintf(f, "%"PRIu64"\t", cs->calls_per_service[i]); + if (reset) + cs->calls_per_service[i] = 0; + } + fprintf(f, "\n"); +} + +int32_t +rte_service_dump(FILE *f, uint32_t id) +{ + uint32_t i; + int print_one = (id != UINT32_MAX); + + /* print only the specified service */ + if (print_one) { + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + fprintf(f, "Service %s Summary\n", s->spec.name); + uint32_t reset = 0; + service_dump_one(f, s, reset); + return 0; + } + + /* print all services, as UINT32_MAX was passed as id */ + fprintf(f, "Services Summary\n"); + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) + continue; + uint32_t reset = 0; + service_dump_one(f, &rte_services[i], reset); + } + + fprintf(f, "Service Cores Summary\n"); + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lcore_config[i].core_role != ROLE_SERVICE) + continue; + + uint32_t reset = 0; + service_dump_calls_per_lcore(f, i, reset); + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/Makefile b/src/spdk/dpdk/lib/librte_eal/freebsd/Makefile new file mode 100644 index 000000000..af95386d4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/Makefile @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2019 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_eal.a + +ARCH_DIR ?= $(RTE_ARCH) +VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR) +VPATH += $(RTE_SDK)/lib/librte_eal/common + +CFLAGS += -I$(SRCDIR)/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -lexecinfo +LDLIBS += -lpthread +LDLIBS += -lgcc_s +LDLIBS += -lrte_kvargs +LDLIBS += -lrte_telemetry + +EXPORT_MAP := ../rte_eal_version.map + +# specific to freebsd exec-env +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_hugepage_info.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_debug.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_interrupts.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_alarm.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_dev.c + +# from common dir +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memzone.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_mcfg.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_tailqs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_errno.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hypervisor.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_string_fns.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hexdump.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_class.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_bus.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_dev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_options.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_fbarray.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_uuid.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_trace.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_trace_ctf.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_trace_points.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_trace_utils.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += hotplug_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_elem.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_service.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_random.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c + +# from arch dir +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_hypervisor.c +SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c +SRCS-y += rte_cycles.c + +CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_eal_thread.o += -Wno-return-type +CFLAGS_eal_hpet.o += -Wno-return-type +endif + +INC := rte_os.h + +SYMLINK-$(CONFIG_RTE_EXEC_ENV_FREEBSD)-include := $(addprefix include/,$(INC)) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal.c new file mode 100644 index 000000000..c41f265fa --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal.c @@ -0,0 +1,1105 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stdarg.h> +#include <unistd.h> +#include <pthread.h> +#include <syslog.h> +#include <getopt.h> +#include <sys/file.h> +#include <stddef.h> +#include <errno.h> +#include <limits.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <sys/stat.h> + +#include <rte_compat.h> +#include <rte_common.h> +#include <rte_debug.h> +#include <rte_memory.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_service_component.h> +#include <rte_log.h> +#include <rte_random.h> +#include <rte_cycles.h> +#include <rte_string_fns.h> +#include <rte_cpuflags.h> +#include <rte_interrupts.h> +#include <rte_bus.h> +#include <rte_dev.h> +#include <rte_devargs.h> +#include <rte_version.h> +#include <rte_vfio.h> +#include <rte_atomic.h> +#include <malloc_heap.h> +#include <rte_telemetry.h> + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_options.h" +#include "eal_memcfg.h" +#include "eal_trace.h" + +#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) + +/* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook = NULL; +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc */ +static int mem_cfg_fd = -1; + +static struct flock wr_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = offsetof(struct rte_mem_config, memsegs), + .l_len = sizeof(early_mem_config.memsegs), +}; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* used by rte_rdtsc() */ +int rte_cycles_vmware_tsc_map; + +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +static const char *default_runtime_dir = "/var/run"; + +int +eal_create_runtime_dir(void) +{ + const char *directory = default_runtime_dir; + const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); + const char *fallback = "/tmp"; + char tmp[PATH_MAX]; + int ret; + + if (getuid() != 0) { + /* try XDG path first, fall back to /tmp */ + if (xdg_runtime_dir != NULL) + directory = xdg_runtime_dir; + else + directory = fallback; + } + /* create DPDK subdirectory under runtime dir */ + ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); + if (ret < 0 || ret == sizeof(tmp)) { + RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); + return -1; + } + + /* create prefix-specific subdirectory under DPDK runtime dir */ + ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", + tmp, eal_get_hugefile_prefix()); + if (ret < 0 || ret == sizeof(runtime_dir)) { + RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); + return -1; + } + + /* create the path if it doesn't exist. no "mkdir -p" here, so do it + * step by step. + */ + ret = mkdir(tmp, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + tmp, strerror(errno)); + return -1; + } + + ret = mkdir(runtime_dir, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + runtime_dir, strerror(errno)); + return -1; + } + + return 0; +} + +int +eal_clean_runtime_dir(void) +{ + /* FreeBSD doesn't need this implemented for now, because, unlike Linux, + * FreeBSD doesn't create per-process files, so no need to clean up. + */ + return 0; +} + + +const char * +rte_eal_get_runtime_dir(void) +{ + return runtime_dir; +} + +/* Return user provided mbuf pool ops name */ +const char * +rte_eal_mbuf_user_pool_ops(void) +{ + return internal_config.user_mbuf_pool_ops_name; +} + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +enum rte_iova_mode +rte_eal_iova_mode(void) +{ + return rte_eal_get_configuration()->iova_mode; +} + +/* parse a sysfs (or other) file containing one integer value */ +int +eal_parse_sysfs_value(const char *filename, unsigned long *val) +{ + FILE *f; + char buf[BUFSIZ]; + char *end = NULL; + + if ((f = fopen(filename, "r")) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + *val = strtoul(buf, &end, 0); + if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + fclose(f); + return 0; +} + + +/* create memory configuration in shared/mmap memory. Take out + * a write lock on the memsegs, so we can auto-detect primary/secondary. + * This means we never close the file while running (auto-close on exit). + * We also don't lock the whole file, so that in future we can use read-locks + * on other parts, e.g. memzones, to detect if there are running secondary + * processes. */ +static int +rte_eal_config_create(void) +{ + size_t page_sz = sysconf(_SC_PAGE_SIZE); + size_t cfg_len = sizeof(*rte_config.mem_config); + size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz); + void *rte_mem_cfg_addr, *mapped_mem_cfg_addr; + int retval; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return 0; + + /* map the config before base address so that we don't waste a page */ + if (internal_config.base_virtaddr != 0) + rte_mem_cfg_addr = (void *) + RTE_ALIGN_FLOOR(internal_config.base_virtaddr - + sizeof(struct rte_mem_config), page_sz); + else + rte_mem_cfg_addr = NULL; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600); + if (mem_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", + pathname); + return -1; + } + } + + retval = ftruncate(mem_cfg_fd, cfg_len); + if (retval < 0){ + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n", + pathname); + return -1; + } + + retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); + if (retval < 0){ + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary " + "process running?\n", pathname); + return -1; + } + + /* reserve space for config */ + rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr, + &cfg_len_aligned, page_sz, 0, 0); + if (rte_mem_cfg_addr == NULL) { + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n"); + close(mem_cfg_fd); + mem_cfg_fd = -1; + return -1; + } + + /* remap the actual file into the space we've just reserved */ + mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr, + cfg_len_aligned, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0); + if (mapped_mem_cfg_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n"); + munmap(rte_mem_cfg_addr, cfg_len); + close(mem_cfg_fd); + mem_cfg_fd = -1; + return -1; + } + + memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); + rte_config.mem_config = rte_mem_cfg_addr; + + /* store address of the config in the config itself so that secondary + * processes could later map the config into this exact location + */ + rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + + return 0; +} + +/* attach to an existing shared memory config */ +static int +rte_eal_config_attach(void) +{ + void *rte_mem_cfg_addr; + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return 0; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR); + if (mem_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", + pathname); + return -1; + } + } + + rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config), + PROT_READ, MAP_SHARED, mem_cfg_fd, 0); + /* don't close the fd here, it will be closed on reattach */ + if (rte_mem_cfg_addr == MAP_FAILED) { + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + rte_config.mem_config = rte_mem_cfg_addr; + + return 0; +} + +/* reattach the shared config at exact memory location primary process has it */ +static int +rte_eal_config_reattach(void) +{ + struct rte_mem_config *mem_config; + void *rte_mem_cfg_addr; + + if (internal_config.no_shconf) + return 0; + + /* save the address primary process has mapped shared config to */ + rte_mem_cfg_addr = + (void *)(uintptr_t)rte_config.mem_config->mem_cfg_addr; + + /* unmap original config */ + munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); + + /* remap the config at proper address */ + mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, + sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, + mem_cfg_fd, 0); + close(mem_cfg_fd); + mem_cfg_fd = -1; + + if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { + if (mem_config != MAP_FAILED) { + /* errno is stale, don't use */ + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]" + " - please use '--" OPT_BASE_VIRTADDR + "' option\n", + rte_mem_cfg_addr, mem_config); + munmap(mem_config, sizeof(struct rte_mem_config)); + return -1; + } + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + rte_config.mem_config = mem_config; + + return 0; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if there no shared config, there can be no secondary processes */ + if (!internal_config.no_shconf) { + /* if we can open the file but not get a write-lock we are a + * secondary process. NOTE: if we get a file handle back, we + * keep that open and don't close it to prevent a race condition + * between multiple opens. + */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + } + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* Sets up rte_config structure with the pointer to shared memory config.*/ +static int +rte_config_init(void) +{ + rte_config.process_type = internal_config.process_type; + + switch (rte_config.process_type){ + case RTE_PROC_PRIMARY: + if (rte_eal_config_create() < 0) + return -1; + eal_mcfg_update_from_internal(); + break; + case RTE_PROC_SECONDARY: + if (rte_eal_config_attach() < 0) + return -1; + eal_mcfg_wait_complete(); + if (eal_mcfg_check_version() < 0) { + RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n"); + return -1; + } + if (rte_eal_config_reattach() < 0) + return -1; + eal_mcfg_update_internal(); + break; + case RTE_PROC_AUTO: + case RTE_PROC_INVALID: + RTE_LOG(ERR, EAL, "Invalid process type %d\n", + rte_config.process_type); + return -1; + } + + return 0; +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + /* Allow the application to print its usage message too if hook is set */ + if ( rte_application_usage_hook ) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Set a per-application usage message */ +rte_usage_hook_t +rte_set_application_usage_hook( rte_usage_hook_t usage_func ) +{ + rte_usage_hook_t old_func; + + /* Will be NULL on the first call to denote the last usage routine. */ + old_func = rte_application_usage_hook; + rte_application_usage_hook = usage_func; + + return old_func; +} + +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + const int old_optind = optind; + const int old_optopt = optopt; + const int old_optreset = optreset; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + optreset = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? + eal_parse_common_option(opt, optarg, &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optreset = old_optreset; + optarg = old_optarg; +} + +/* Parse the argument given in the command line of the application */ +static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + const int old_optind = optind; + const int old_optopt = optopt; + const int old_optreset = optreset; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + optreset = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + /* getopt didn't recognise the option */ + if (opt == '?') { + eal_usage(prgname); + ret = -1; + goto out; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case OPT_MBUF_POOL_OPS_NAME_NUM: + { + char *ops_name = strdup(optarg); + if (ops_name == NULL) + RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n"); + else { + /* free old ops name */ + if (internal_config.user_mbuf_pool_ops_name != + NULL) + free(internal_config.user_mbuf_pool_ops_name); + + internal_config.user_mbuf_pool_ops_name = + ops_name; + } + break; + } + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on FreeBSD\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on FreeBSD\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on FreeBSD\n", opt); + } + eal_usage(prgname); + ret = -1; + goto out; + } + } + + /* create runtime data directory */ + if (internal_config.no_shconf == 0 && + eal_create_runtime_dir() < 0) { + RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); + ret = -1; + goto out; + } + + if (eal_adjust_config(&internal_config) != 0) { + ret = -1; + goto out; + } + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + + if (optind >= 0) + argv[optind-1] = prgname; + ret = optind-1; + +out: + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optreset = old_optreset; + optarg = old_optarg; + + return ret; +} + +static int +check_socket(const struct rte_memseg_list *msl, void *arg) +{ + int *socket_id = arg; + + if (msl->external) + return 0; + + if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0) + return 1; + + return 0; +} + +static void +eal_check_mem_on_local_socket(void) +{ + int socket_id; + + socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); + + if (rte_memseg_list_walk(check_socket, &socket_id) == 0) + RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); +} + + +static int +sync_func(__rte_unused void *arg) +{ + return 0; +} + +/* return non-zero if hugepages are enabled. */ +int rte_eal_has_hugepages(void) +{ + return !internal_config.no_hugetlbfs; +} + +/* Abstraction for port I/0 privilege */ +int +rte_eal_iopl_init(void) +{ + static int fd = -1; + + if (fd < 0) + fd = open("/dev/io", O_RDWR); + + if (fd < 0) + return -1; + /* keep fd open for iopl */ + return 0; +} + +static void rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + +/* Launch threads, called at application init(). */ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret, ret; + pthread_t thread_id; + static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + /* checks if the machine is adequate */ + if (!rte_cpu_is_supported()) { + rte_eal_init_alert("unsupported cpu type."); + rte_errno = ENOTSUP; + return -1; + } + + if (!rte_atomic32_test_and_set(&run_once)) { + rte_eal_init_alert("already called initialization."); + rte_errno = EALREADY; + return -1; + } + + thread_id = pthread_self(); + + eal_reset_internal_config(&internal_config); + + /* clone argv to report out later in telemetry */ + eal_save_args(argc, argv); + + /* set log level as early as possible */ + eal_log_level_parse(argc, argv); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + return -1; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) { + rte_eal_init_alert("Invalid 'command line' arguments."); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + /* FreeBSD always uses legacy memory model */ + internal_config.legacy_mem = true; + + if (eal_plugins_init() < 0) { + rte_eal_init_alert("Cannot init plugins"); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_trace_init() < 0) { + rte_eal_init_alert("Cannot init trace"); + rte_errno = EFAULT; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_option_device_parse()) { + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + if (rte_config_init() < 0) { + rte_eal_init_alert("Cannot init config"); + return -1; + } + + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread"); + return -1; + } + + if (rte_eal_alarm_init() < 0) { + rte_eal_init_alert("Cannot init alarm"); + /* rte_eal_alarm_init sets rte_errno on failure. */ + return -1; + } + + /* Put mp channel init before bus scan so that we can init the vdev + * bus through mp channel in the secondary process before the bus scan. + */ + if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) { + rte_eal_init_alert("failed to init mp channel"); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_errno = EFAULT; + return -1; + } + } + + if (rte_bus_scan()) { + rte_eal_init_alert("Cannot scan the buses for devices"); + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ + enum rte_iova_mode iova_mode = rte_bus_get_iommu_class(); + + if (iova_mode == RTE_IOVA_DC) + iova_mode = RTE_IOVA_PA; + rte_eal_get_configuration()->iova_mode = iova_mode; + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; + } + + RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n", + rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA"); + + if (internal_config.no_hugetlbfs == 0) { + /* rte_config isn't initialized yet */ + ret = internal_config.process_type == RTE_PROC_PRIMARY ? + eal_hugepage_info_init() : + eal_hugepage_info_read(); + if (ret < 0) { + rte_eal_init_alert("Cannot get hugepage information."); + rte_errno = EACCES; + rte_atomic32_clear(&run_once); + return -1; + } + } + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) { + if (internal_config.no_hugetlbfs) + internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; + else + internal_config.memory = eal_get_hugepage_mem_size(); + } + + if (internal_config.vmware_tsc_map == 1) { +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + rte_cycles_vmware_tsc_map = 1; + RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " + "you must have monitor_control.pseudo_perfctr = TRUE\n"); +#else + RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " + "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); +#endif + } + + /* in secondary processes, memory init may allocate additional fbarrays + * not present in primary processes, so to avoid any potential issues, + * initialize memzones first. + */ + if (rte_eal_memzone_init() < 0) { + rte_eal_init_alert("Cannot init memzone"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_memory_init() < 0) { + rte_eal_init_alert("Cannot init memory"); + rte_errno = ENOMEM; + return -1; + } + + if (rte_eal_malloc_heap_init() < 0) { + rte_eal_init_alert("Cannot init malloc heap"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_tailqs_init() < 0) { + rte_eal_init_alert("Cannot init tail queues for objects"); + rte_errno = EFAULT; + return -1; + } + + if (rte_eal_timer_init() < 0) { + rte_eal_init_alert("Cannot init HPET or TSC timers"); + rte_errno = ENOTSUP; + return -1; + } + + eal_check_mem_on_local_socket(); + + eal_thread_init_master(rte_config.master_lcore); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%p;cpuset=[%s%s])\n", + rte_config.master_lcore, thread_id, cpuset, + ret == 0 ? "" : "..."); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (pipe(lcore_config[i].pipe_master2slave) < 0) + rte_panic("Cannot create pipe\n"); + if (pipe(lcore_config[i].pipe_slave2master) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + ret = pthread_create(&lcore_config[i].thread_id, NULL, + eal_thread_loop, NULL); + if (ret != 0) + rte_panic("Cannot create thread\n"); + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, sizeof(thread_name), + "lcore-slave-%d", i); + rte_thread_setname(lcore_config[i].thread_id, thread_name); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + + /* initialize services so vdevs register service during bus_probe. */ + ret = rte_service_init(); + if (ret) { + rte_eal_init_alert("rte_service_init() failed"); + rte_errno = ENOEXEC; + return -1; + } + + /* Probe all the buses and devices/drivers on them */ + if (rte_bus_probe()) { + rte_eal_init_alert("Cannot probe devices"); + rte_errno = ENOTSUP; + return -1; + } + + /* initialize default service/lcore mappings and start running. Ignore + * -ENOTSUP, as it indicates no service coremask passed to EAL. + */ + ret = rte_service_start_with_defaults(); + if (ret < 0 && ret != -ENOTSUP) { + rte_errno = ENOEXEC; + return -1; + } + + /* + * Clean up unused files in runtime directory. We do this at the end of + * init and not at the beginning because we want to clean stuff up + * whether we are primary or secondary process, but we cannot remove + * primary process' files because secondary should be able to run even + * if primary process is dead. + * + * In no_shconf mode, no runtime directory is created in the first + * place, so no cleanup needed. + */ + if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) { + rte_eal_init_alert("Cannot clear runtime directory\n"); + return -1; + } + if (!internal_config.no_telemetry) { + const char *error_str = NULL; + if (rte_telemetry_init(rte_eal_get_runtime_dir(), + &internal_config.ctrl_cpuset, &error_str) + != 0) { + rte_eal_init_alert(error_str); + return -1; + } + if (error_str != NULL) + RTE_LOG(NOTICE, EAL, "%s\n", error_str); + } + + eal_mcfg_complete(); + + return fctret; +} + +int +rte_eal_cleanup(void) +{ + rte_service_finalize(); + rte_mp_channel_cleanup(); + rte_trace_save(); + eal_trace_fini(); + eal_cleanup_config(&internal_config); + return 0; +} + +enum rte_proc_type_t +rte_eal_process_type(void) +{ + return rte_config.process_type; +} + +int rte_eal_has_pci(void) +{ + return !internal_config.no_pci; +} + +int rte_eal_create_uio_dev(void) +{ + return internal_config.create_uio_dev; +} + +enum rte_intr_mode +rte_eal_vfio_intr_mode(void) +{ + return RTE_INTR_MODE_NONE; +} + +int rte_vfio_setup_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *vfio_dev_fd, + __rte_unused struct vfio_device_info *device_info) +{ + return -1; +} + +int rte_vfio_release_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int fd) +{ + return -1; +} + +int rte_vfio_enable(__rte_unused const char *modname) +{ + return -1; +} + +int rte_vfio_is_enabled(__rte_unused const char *modname) +{ + return 0; +} + +int rte_vfio_noiommu_is_enabled(void) +{ + return 0; +} + +int rte_vfio_clear_group(__rte_unused int vfio_group_fd) +{ + return 0; +} + +int +rte_vfio_get_group_num(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *iommu_group_num) +{ + return -1; +} + +int +rte_vfio_get_container_fd(void) +{ + return -1; +} + +int +rte_vfio_get_group_fd(__rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_create(void) +{ + return -1; +} + +int +rte_vfio_container_destroy(__rte_unused int container_fd) +{ + return -1; +} + +int +rte_vfio_container_group_bind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_group_unbind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_dma_map(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_container_dma_unmap(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_alarm.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_alarm.c new file mode 100644 index 000000000..c38b2e04f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_alarm.c @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <errno.h> + +#include <rte_alarm.h> +#include <rte_cycles.h> +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_interrupts.h> +#include <rte_spinlock.h> +#include <rte_eal_trace.h> + +#include "eal_private.h" +#include "eal_alarm_private.h" + +#define NS_PER_US 1000 + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) next; + struct rte_intr_handle handle; + struct timespec time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static void eal_alarm_callback(void *arg); + +int +rte_eal_alarm_init(void) +{ + intr_handle.type = RTE_INTR_HANDLE_ALARM; + + /* on FreeBSD, timers don't use fd's, and their identifiers are stored + * in separate namespace from fd's, so using any value is OK. however, + * EAL interrupts handler expects fd's to be unique, so use an actual fd + * to guarantee unique timer identifier. + */ + intr_handle.fd = open("/dev/zero", O_RDONLY); + + return 0; +} + +static inline int +timespec_cmp(const struct timespec *now, const struct timespec *at) +{ + if (now->tv_sec < at->tv_sec) + return -1; + if (now->tv_sec > at->tv_sec) + return 1; + if (now->tv_nsec < at->tv_nsec) + return -1; + if (now->tv_nsec > at->tv_nsec) + return 1; + return 0; +} + +static inline uint64_t +diff_ns(struct timespec *now, struct timespec *at) +{ + uint64_t now_ns, at_ns; + + if (timespec_cmp(now, at) >= 0) + return 0; + + now_ns = now->tv_sec * NS_PER_S + now->tv_nsec; + at_ns = at->tv_sec * NS_PER_S + at->tv_nsec; + + return at_ns - now_ns; +} + +int +eal_alarm_get_timeout_ns(uint64_t *val) +{ + struct alarm_entry *ap; + struct timespec now; + + if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) + return -1; + + if (LIST_EMPTY(&alarm_list)) + return -1; + + ap = LIST_FIRST(&alarm_list); + + *val = diff_ns(&now, &ap->time); + + return 0; +} + +static int +unregister_current_callback(void) +{ + struct alarm_entry *ap; + int ret = 0; + + if (!LIST_EMPTY(&alarm_list)) { + ap = LIST_FIRST(&alarm_list); + + do { + ret = rte_intr_callback_unregister(&intr_handle, + eal_alarm_callback, &ap->time); + } while (ret == -EAGAIN); + } + + return ret; +} + +static int +register_first_callback(void) +{ + struct alarm_entry *ap; + int ret = 0; + + if (!LIST_EMPTY(&alarm_list)) { + ap = LIST_FIRST(&alarm_list); + + /* register a new callback */ + ret = rte_intr_callback_register(&intr_handle, + eal_alarm_callback, &ap->time); + } + return ret; +} + +static void +eal_alarm_callback(void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + ap = LIST_FIRST(&alarm_list); + + if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) + return; + + while (ap != NULL && timespec_cmp(&now, &ap->time) >= 0) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + free(ap); + + ap = LIST_FIRST(&alarm_list); + } + + /* timer has been deleted from the kqueue, so recreate it if needed */ + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); +} + + +int +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *new_alarm; + struct timespec now; + uint64_t ns; + int ret = 0; + + /* check parameters, also ensure us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = calloc(1, sizeof(*new_alarm)); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + ns = us * NS_PER_US; + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_nsec = (now.tv_nsec + ns) % NS_PER_S; + new_alarm->time.tv_sec = now.tv_sec + ((now.tv_nsec + ns) / NS_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (timespec_cmp(&new_alarm->time, &ap->time) < 0) { + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + /* re-register first callback just in case */ + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); + + rte_eal_trace_alarm_set(us, cb_fn, cb_arg, ret); + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while (1) { + ap = LIST_FIRST(&alarm_list); + if (ap == NULL) + break; + if (cb_fn != ap->cb_fn) + break; + if (cb_arg != ap->cb_arg && cb_arg != (void *) -1) + break; + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + } else { + /* If calling from other context, mark that + * alarm is executing so loop can spin till it + * finish. Otherwise we are trying to cancel + * ourselves - mark it by EINPROGRESS. + */ + if (pthread_equal(ap->executing_id, + pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || + cb_arg == ap->cb_arg)) { + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, + pthread_self()) == 0) { + executing++; + } else { + err = EINPROGRESS; + } + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + rte_spinlock_lock(&alarm_list_lk); + + /* unregister if no alarms left, otherwise re-register first */ + if (LIST_EMPTY(&alarm_list)) + unregister_current_callback(); + else + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); + + rte_eal_trace_alarm_cancel(cb_fn, cb_arg, count); + return count; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_alarm_private.h b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_alarm_private.h new file mode 100644 index 000000000..65c711518 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_alarm_private.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef EAL_ALARM_PRIVATE_H +#define EAL_ALARM_PRIVATE_H + +#include <inttypes.h> + +/* + * FreeBSD needs a back-channel communication mechanism between interrupt and + * alarm thread, because on FreeBSD, timer period is set up inside the interrupt + * API and not inside alarm API like on Linux. + */ + +int +eal_alarm_get_timeout_ns(uint64_t *val); + +#endif // EAL_ALARM_PRIVATE_H diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_cpuflags.c new file mode 100644 index 000000000..69b161ea6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_cpuflags.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <rte_common.h> +#include <rte_cpuflags.h> + +unsigned long +rte_cpu_getauxval(unsigned long type __rte_unused) +{ + /* not implemented */ + return 0; +} + +int +rte_cpu_strcmp_auxval(unsigned long type __rte_unused, + const char *str __rte_unused) +{ + /* not implemented */ + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_debug.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_debug.c new file mode 100644 index 000000000..5d92500bf --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_debug.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifdef RTE_BACKTRACE +#include <execinfo.h> +#endif +#include <stdarg.h> +#include <signal.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> + +#include <rte_log.h> +#include <rte_debug.h> +#include <rte_common.h> +#include <rte_eal.h> + +#define BACKTRACE_SIZE 256 + +/* dump the stack of the calling core */ +void rte_dump_stack(void) +{ +#ifdef RTE_BACKTRACE + void *func[BACKTRACE_SIZE]; + char **symb = NULL; + int size; + + size = backtrace(func, BACKTRACE_SIZE); + symb = backtrace_symbols(func, size); + + if (symb == NULL) + return; + + while (size > 0) { + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, + "%d: [%s]\n", size, symb[size - 1]); + size --; + } + + free(symb); +#endif /* RTE_BACKTRACE */ +} + +/* not implemented in this environment */ +void rte_dump_registers(void) +{ + return; +} + +/* call abort(), it will generate a coredump if enabled */ +void __rte_panic(const char *funcname, const char *format, ...) +{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + rte_dump_stack(); + rte_dump_registers(); + abort(); +} + +/* + * Like rte_panic this terminates the application. However, no traceback is + * provided and no core-dump is generated. + */ +void +rte_exit(int exit_code, const char *format, ...) +{ + va_list ap; + + if (exit_code != 0) + RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" + " Cause: ", exit_code); + + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + +#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR + if (rte_eal_cleanup() != 0) + RTE_LOG(CRIT, EAL, + "EAL could not release all resources\n"); + exit(exit_code); +#else + rte_dump_stack(); + rte_dump_registers(); + abort(); +#endif +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_dev.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_dev.c new file mode 100644 index 000000000..8e06e7089 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_dev.c @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include <rte_log.h> +#include <rte_compat.h> +#include <rte_dev.h> + +int +rte_dev_event_monitor_start(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} + +int +rte_dev_event_monitor_stop(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} + +int +rte_dev_hotplug_handle_enable(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} + +int +rte_dev_hotplug_handle_disable(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_hugepage_info.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_hugepage_info.c new file mode 100644 index 000000000..32012e142 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_hugepage_info.c @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <sys/types.h> +#include <sys/sysctl.h> +#include <sys/mman.h> +#include <string.h> + +#include <rte_log.h> +#include <fcntl.h> +#include "eal_hugepages.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" + +#define CONTIGMEM_DEV "/dev/contigmem" + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +map_shared_memory(const char *filename, const size_t mem_size, int flags) +{ + void *retval; + int fd = open(filename, flags, 0600); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + return retval; +} + +static void * +open_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR); +} + +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); +} + +/* + * No hugepage support on freebsd, but we dummy it, using contigmem driver + */ +int +eal_hugepage_info_init(void) +{ + size_t sysctl_size; + int num_buffers, fd, error; + int64_t buffer_size; + /* re-use the linux "internal config" structure for our memory data */ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + unsigned int i; + + internal_config.num_hugepage_sizes = 1; + + sysctl_size = sizeof(num_buffers); + error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers, + &sysctl_size, NULL, 0); + + if (error != 0) { + RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers\n"); + return -1; + } + + sysctl_size = sizeof(buffer_size); + error = sysctlbyname("hw.contigmem.buffer_size", &buffer_size, + &sysctl_size, NULL, 0); + + if (error != 0) { + RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size\n"); + return -1; + } + + fd = open(CONTIGMEM_DEV, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "could not open "CONTIGMEM_DEV"\n"); + return -1; + } + + if (buffer_size >= 1<<30) + RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dGB\n", + num_buffers, (int)(buffer_size>>30)); + else if (buffer_size >= 1<<20) + RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dMB\n", + num_buffers, (int)(buffer_size>>20)); + else + RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n", + num_buffers, (int)(buffer_size>>10)); + + strlcpy(hpi->hugedir, CONTIGMEM_DEV, sizeof(hpi->hugedir)); + hpi->hugepage_sz = buffer_size; + hpi->num_pages[0] = num_buffers; + hpi->lock_descriptor = fd; + + /* for no shared files mode, do not create shared memory config */ + if (internal_config.no_shconf) + return 0; + + tmp_hpi = create_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL ) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + return -1; + } + + memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); + + /* we've copied file descriptors along with everything else, but they + * will be invalid in secondary process, so overwrite them + */ + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + struct hugepage_info *tmp = &tmp_hpi[i]; + tmp->lock_descriptor = -1; + } + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + + return 0; +} + +/* copy stuff from shared info into internal config */ +int +eal_hugepage_info_read(void) +{ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + + internal_config.num_hugepage_sizes = 1; + + tmp_hpi = open_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); + return -1; + } + + memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_interrupts.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_interrupts.c new file mode 100644 index 000000000..6d53d33c8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_interrupts.c @@ -0,0 +1,713 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include <string.h> +#include <sys/types.h> +#include <sys/event.h> +#include <sys/queue.h> +#include <unistd.h> + +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_spinlock.h> +#include <rte_common.h> +#include <rte_interrupts.h> +#include <rte_eal_trace.h> + +#include "eal_private.h" +#include "eal_alarm_private.h" + +#define MAX_INTR_EVENTS 16 + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ + uint8_t pending_delete; /**< delete after callback is called */ + rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + +/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +static volatile int kq = -1; + +static int +intr_source_to_kevent(const struct rte_intr_handle *ih, struct kevent *ke) +{ + /* alarm callbacks are special case */ + if (ih->type == RTE_INTR_HANDLE_ALARM) { + uint64_t timeout_ns; + + /* get soonest alarm timeout */ + if (eal_alarm_get_timeout_ns(&timeout_ns) < 0) + return -1; + + ke->filter = EVFILT_TIMER; + /* timers are one shot */ + ke->flags |= EV_ONESHOT; + ke->fflags = NOTE_NSECONDS; + ke->data = timeout_ns; + } else { + ke->filter = EVFILT_READ; + } + ke->ident = ih->fd; + + return 0; +} + +int +rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) +{ + struct rte_intr_callback *callback; + struct rte_intr_source *src; + int ret = 0, add_event = 0; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + return -EINVAL; + } + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active: %d\n", kq); + return -ENODEV; + } + + rte_spinlock_lock(&intr_lock); + + /* find the source for this intr_handle */ + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->intr_handle.fd == intr_handle->fd) + break; + } + + /* if this is an alarm interrupt and it already has a callback, + * then we don't want to create a new callback because the only + * thing on the list should be eal_alarm_callback() and we may + * be called just to reset the timer. + */ + if (src != NULL && src->intr_handle.type == RTE_INTR_HANDLE_ALARM && + !TAILQ_EMPTY(&src->callbacks)) { + callback = NULL; + } else { + /* allocate a new interrupt callback entity */ + callback = calloc(1, sizeof(*callback)); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + ret = -ENOMEM; + goto fail; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + callback->pending_delete = 0; + callback->ucb_fn = NULL; + + if (src == NULL) { + src = calloc(1, sizeof(*src)); + if (src == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + ret = -ENOMEM; + goto fail; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + } + } + + /* we had no interrupts for this */ + if (TAILQ_EMPTY(&src->callbacks)) + add_event = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + } + + /* add events to the queue. timer events are special as we need to + * re-set the timer. + */ + if (add_event || src->intr_handle.type == RTE_INTR_HANDLE_ALARM) { + struct kevent ke; + + memset(&ke, 0, sizeof(ke)); + ke.flags = EV_ADD; /* mark for addition to the queue */ + + if (intr_source_to_kevent(intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert interrupt handle to kevent\n"); + ret = -ENODEV; + goto fail; + } + + /** + * add the intr file descriptor into wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + /* currently, nic_uio does not support interrupts, so + * this error will always be triggered and output to the + * user. so, don't output it unless debug log level set. + */ + if (errno == ENODEV) + RTE_LOG(DEBUG, EAL, "Interrupt handle %d not supported\n", + src->intr_handle.fd); + else + RTE_LOG(ERR, EAL, "Error adding fd %d " + "kevent, %s\n", + src->intr_handle.fd, + strerror(errno)); + ret = -errno; + goto fail; + } + } + rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret); + rte_spinlock_unlock(&intr_lock); + + return 0; +fail: + /* clean up */ + if (src != NULL) { + if (callback != NULL) + TAILQ_REMOVE(&(src->callbacks), callback, next); + if (TAILQ_EMPTY(&(src->callbacks))) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } + free(callback); + rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret); + rte_spinlock_unlock(&intr_lock); + return ret; +} + +int +rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg, + rte_intr_unregister_callback_fn ucb_fn) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active\n"); + return -ENODEV; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* only usable if the source is active */ + } else if (src->active == 0) { + ret = -EAGAIN; + + } else { + ret = 0; + + /* walk through the callbacks and mark all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + cb->pending_delete = 1; + cb->ucb_fn = ucb_fn; + ret++; + } + } + } + + rte_spinlock_unlock(&intr_lock); + + return ret; +} + +int +rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active\n"); + return -ENODEV; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + struct kevent ke; + + ret = 0; + + /* remove it from the kqueue */ + memset(&ke, 0, sizeof(ke)); + ke.flags = EV_DELETE; /* mark for deletion from the queue */ + + if (intr_source_to_kevent(intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert to kevent\n"); + ret = -ENODEV; + goto out; + } + + /** + * remove intr file descriptor from wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + RTE_LOG(ERR, EAL, "Error removing fd %d kevent, %s\n", + src->intr_handle.fd, strerror(errno)); + /* removing non-existent even is an expected condition + * in some circumstances (e.g. oneshot events). + */ + } + + /*walk through the callbacks and remove all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } +out: + rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg, + ret); + rte_spinlock_unlock(&intr_lock); + + return ret; +} + +int +rte_intr_enable(const struct rte_intr_handle *intr_handle) +{ + int rc = 0; + + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) { + rc = 0; + goto out; + } + + if (!intr_handle || intr_handle->fd < 0 || + intr_handle->uio_cfg_fd < 0) { + rc = -1; + goto out; + } + + switch (intr_handle->type) { + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + rc = -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + rc = -1; + break; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + rc = -1; + break; + } + +out: + rte_eal_trace_intr_enable(intr_handle, rc); + return rc; +} + +int +rte_intr_disable(const struct rte_intr_handle *intr_handle) +{ + int rc = 0; + + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) { + rc = 0; + goto out; + } + + if (!intr_handle || intr_handle->fd < 0 || + intr_handle->uio_cfg_fd < 0) { + rc = -1; + goto out; + } + + switch (intr_handle->type) { + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + rc = -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + rc = -1; + break; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + rc = -1; + break; + } +out: + rte_eal_trace_intr_disable(intr_handle, rc); + return rc; +} + +int +rte_intr_ack(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + return -1; +} + +static void +eal_intr_process_interrupts(struct kevent *events, int nfds) +{ + struct rte_intr_callback active_cb; + union rte_intr_read_buffer buf; + struct rte_intr_callback *cb, *next; + struct rte_intr_source *src; + bool call = false; + int n, bytes_read; + struct kevent ke; + + for (n = 0; n < nfds; n++) { + int event_fd = events[n].ident; + + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == event_fd) + break; + if (src == NULL) { + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. */ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_ALARM: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_VDEV: + case RTE_INTR_HANDLE_EXT: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_DEV_EVENT: + bytes_read = 0; + call = true; + break; + default: + bytes_read = 1; + break; + } + + if (bytes_read > 0) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(event_fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + event_fd, + strerror(errno)); + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", event_fd); + else + call = true; + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (call) { + /* Finally, call all callbacks. */ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. */ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + + /* we done with that interrupt source, release it. */ + src->active = 0; + + /* check if any callback are supposed to be removed */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->pending_delete) { + /* remove it from the kqueue */ + memset(&ke, 0, sizeof(ke)); + /* mark for deletion from the queue */ + ke.flags = EV_DELETE; + + if (intr_source_to_kevent(&src->intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert to kevent\n"); + rte_spinlock_unlock(&intr_lock); + return; + } + + /** + * remove intr file descriptor from wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + RTE_LOG(ERR, EAL, "Error removing fd %d kevent, " + "%s\n", src->intr_handle.fd, + strerror(errno)); + /* removing non-existent even is an expected + * condition in some circumstances + * (e.g. oneshot events). + */ + } + + TAILQ_REMOVE(&src->callbacks, cb, next); + if (cb->ucb_fn) + cb->ucb_fn(&src->intr_handle, cb->cb_arg); + free(cb); + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + + rte_spinlock_unlock(&intr_lock); + } +} + +static void * +eal_intr_thread_main(void *arg __rte_unused) +{ + struct kevent events[MAX_INTR_EVENTS]; + int nfds; + + /* host thread, never break out */ + for (;;) { + /* do not change anything, just wait */ + nfds = kevent(kq, NULL, 0, events, MAX_INTR_EVENTS, NULL); + + /* kevent fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "kevent returns with fail\n"); + break; + } + /* kevent timeout, will never happen here */ + else if (nfds == 0) + continue; + + /* kevent has at least one fd ready to read */ + eal_intr_process_interrupts(events, nfds); + } + close(kq); + kq = -1; + return NULL; +} + +int +rte_eal_intr_init(void) +{ + int ret = 0; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + kq = kqueue(); + if (kq < 0) { + RTE_LOG(ERR, EAL, "Cannot create kqueue instance\n"); + return -1; + } + + /* create the host thread to wait/handle the interrupt */ + ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + rte_errno = -ret; + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } + + return ret; +} + +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, + int epfd, int op, unsigned int vec, void *data) +{ + RTE_SET_USED(intr_handle); + RTE_SET_USED(epfd); + RTE_SET_USED(op); + RTE_SET_USED(vec); + RTE_SET_USED(data); + + return -ENOTSUP; +} + +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) +{ + RTE_SET_USED(intr_handle); + RTE_SET_USED(nb_efd); + + return 0; +} + +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); +} + +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); + return 0; +} + +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); + return 1; +} + +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); + return 0; +} + +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout) +{ + RTE_SET_USED(epfd); + RTE_SET_USED(events); + RTE_SET_USED(maxevents); + RTE_SET_USED(timeout); + + return -ENOTSUP; +} + +int +rte_epoll_ctl(int epfd, int op, int fd, struct rte_epoll_event *event) +{ + RTE_SET_USED(epfd); + RTE_SET_USED(op); + RTE_SET_USED(fd); + RTE_SET_USED(event); + + return -ENOTSUP; +} + +int +rte_intr_tls_epfd(void) +{ + return -ENOTSUP; +} + +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); +} + +int rte_thread_is_intr(void) +{ + return pthread_equal(intr_thread, pthread_self()); +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_lcore.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_lcore.c new file mode 100644 index 000000000..d9ef4bc9c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_lcore.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <unistd.h> +#include <sys/sysctl.h> + +#include <rte_log.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_thread.h" + +/* No topology information available on FreeBSD including NUMA info */ +unsigned +eal_cpu_core_id(__rte_unused unsigned lcore_id) +{ + return 0; +} + +static int +eal_get_ncpus(void) +{ + static int ncpu = -1; + int mib[2] = {CTL_HW, HW_NCPU}; + size_t len = sizeof(ncpu); + + if (ncpu < 0) { + sysctl(mib, 2, &ncpu, &len, NULL, 0); + RTE_LOG(INFO, EAL, "Sysctl reports %d cpus\n", ncpu); + } + return ncpu; +} + +unsigned +eal_cpu_socket_id(__rte_unused unsigned cpu_id) +{ + return 0; +} + +/* Check if a cpu is present by the presence of the + * cpu information for it. + */ +int +eal_cpu_detected(unsigned lcore_id) +{ + const unsigned ncpus = eal_get_ncpus(); + return lcore_id < ncpus; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_memalloc.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_memalloc.c new file mode 100644 index 000000000..6893448db --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_memalloc.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include <inttypes.h> + +#include <rte_errno.h> +#include <rte_log.h> +#include <rte_memory.h> + +#include "eal_memalloc.h" + +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms __rte_unused, + int __rte_unused n_segs, size_t __rte_unused page_sz, + int __rte_unused socket, bool __rte_unused exact) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +struct rte_memseg * +eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return NULL; +} + +int +eal_memalloc_free_seg(struct rte_memseg *ms __rte_unused) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms __rte_unused, + int n_segs __rte_unused) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +int +eal_memalloc_sync_with_primary(void) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +int +eal_memalloc_get_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_set_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused, + int fd __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_set_seg_list_fd(int list_idx __rte_unused, int fd __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx __rte_unused, + int seg_idx __rte_unused, size_t *offset __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_init(void) +{ + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_memory.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_memory.c new file mode 100644 index 000000000..5bc2da160 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_memory.c @@ -0,0 +1,536 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <sys/mman.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/sysctl.h> +#include <inttypes.h> +#include <errno.h> +#include <string.h> +#include <fcntl.h> + +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_log.h> +#include <rte_string_fns.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_memcfg.h" +#include "eal_options.h" + +#define EAL_PAGE_SIZE (sysconf(_SC_PAGESIZE)) + +uint64_t eal_get_baseaddr(void) +{ + /* + * FreeBSD may allocate something in the space we will be mapping things + * before we get a chance to do that, so use a base address that's far + * away from where malloc() et al usually map things. + */ + return 0x1000000000ULL; +} + +/* + * Get physical address of any mapped virtual address in the current process. + */ +phys_addr_t +rte_mem_virt2phy(const void *virtaddr) +{ + /* XXX not implemented. This function is only used by + * rte_mempool_virt2iova() when hugepages are disabled. */ + (void)virtaddr; + return RTE_BAD_IOVA; +} +rte_iova_t +rte_mem_virt2iova(const void *virtaddr) +{ + return rte_mem_virt2phy(virtaddr); +} + +int +rte_eal_hugepage_init(void) +{ + struct rte_mem_config *mcfg; + uint64_t total_mem = 0; + void *addr; + unsigned int i, j, seg_idx = 0; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + /* for debug purposes, hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + struct rte_memseg *ms; + uint64_t page_sz; + int n_segs, cur_seg; + + /* create a memseg list */ + msl = &mcfg->memsegs[0]; + + page_sz = RTE_PGSIZE_4K; + n_segs = internal_config.memory / page_sz; + + if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + addr = mmap(NULL, internal_config.memory, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + return -1; + } + msl->base_va = addr; + msl->page_sz = page_sz; + msl->len = internal_config.memory; + msl->socket_id = 0; + msl->heap = 1; + + /* populate memsegs. each memseg is 1 page long */ + for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { + arr = &msl->memseg_arr; + + ms = rte_fbarray_get(arr, cur_seg); + if (rte_eal_iova_mode() == RTE_IOVA_VA) + ms->iova = (uintptr_t)addr; + else + ms->iova = RTE_BAD_IOVA; + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->len = page_sz; + ms->socket_id = 0; + + rte_fbarray_set_used(arr, cur_seg); + + addr = RTE_PTR_ADD(addr, page_sz); + } + return 0; + } + + /* map all hugepages and sort them */ + for (i = 0; i < internal_config.num_hugepage_sizes; i ++){ + struct hugepage_info *hpi; + rte_iova_t prev_end = 0; + int prev_ms_idx = -1; + uint64_t page_sz, mem_needed; + unsigned int n_pages, max_pages; + + hpi = &internal_config.hugepage_info[i]; + page_sz = hpi->hugepage_sz; + max_pages = hpi->num_pages[0]; + mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem, + page_sz); + + n_pages = RTE_MIN(mem_needed / page_sz, max_pages); + + for (j = 0; j < n_pages; j++) { + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + struct rte_memseg *seg; + int msl_idx, ms_idx; + rte_iova_t physaddr; + int error; + size_t sysctl_size = sizeof(physaddr); + char physaddr_str[64]; + bool is_adjacent; + + /* first, check if this segment is IOVA-adjacent to + * the previous one. + */ + snprintf(physaddr_str, sizeof(physaddr_str), + "hw.contigmem.physaddr.%d", j); + error = sysctlbyname(physaddr_str, &physaddr, + &sysctl_size, NULL, 0); + if (error < 0) { + RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u " + "from %s\n", j, hpi->hugedir); + return -1; + } + + is_adjacent = prev_end != 0 && physaddr == prev_end; + prev_end = physaddr + hpi->hugepage_sz; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; + msl_idx++) { + bool empty, need_hole; + msl = &mcfg->memsegs[msl_idx]; + arr = &msl->memseg_arr; + + if (msl->page_sz != page_sz) + continue; + + empty = arr->count == 0; + + /* we need a hole if this isn't an empty memseg + * list, and if previous segment was not + * adjacent to current one. + */ + need_hole = !empty && !is_adjacent; + + /* we need 1, plus hole if not adjacent */ + ms_idx = rte_fbarray_find_next_n_free(arr, + 0, 1 + (need_hole ? 1 : 0)); + + /* memseg list is full? */ + if (ms_idx < 0) + continue; + + if (need_hole && prev_ms_idx == ms_idx - 1) + ms_idx++; + prev_ms_idx = ms_idx; + + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), + RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); + return -1; + } + arr = &msl->memseg_arr; + seg = rte_fbarray_get(arr, ms_idx); + + addr = RTE_PTR_ADD(msl->base_va, + (size_t)msl->page_sz * ms_idx); + + /* address is already mapped in memseg list, so using + * MAP_FIXED here is safe. + */ + addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_FIXED, + hpi->lock_descriptor, + j * EAL_PAGE_SIZE); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", + j, hpi->hugedir); + return -1; + } + + seg->addr = addr; + seg->iova = physaddr; + seg->hugepage_sz = page_sz; + seg->len = page_sz; + seg->nchannel = mcfg->nchannel; + seg->nrank = mcfg->nrank; + seg->socket_id = 0; + + rte_fbarray_set_used(arr, ms_idx); + + RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%" + PRIx64", len %zu\n", + seg_idx++, addr, physaddr, page_sz); + + total_mem += seg->len; + } + if (total_mem >= internal_config.memory) + break; + } + if (total_mem < internal_config.memory) { + RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, " + "requested: %" PRIu64 "M " + "available: %" PRIu64 "M\n", + internal_config.memory >> 20, total_mem >> 20); + return -1; + } + return 0; +} + +struct attach_walk_args { + int fd_hugepage; + int seg_idx; +}; +static int +attach_segment(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + struct attach_walk_args *wa = arg; + void *addr; + + if (msl->external) + return 0; + + addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, wa->fd_hugepage, + wa->seg_idx * EAL_PAGE_SIZE); + if (addr == MAP_FAILED || addr != ms->addr) + return -1; + wa->seg_idx++; + + return 0; +} + +int +rte_eal_hugepage_attach(void) +{ + const struct hugepage_info *hpi; + int fd_hugepage = -1; + unsigned int i; + + hpi = &internal_config.hugepage_info[0]; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + const struct hugepage_info *cur_hpi = &hpi[i]; + struct attach_walk_args wa; + + memset(&wa, 0, sizeof(wa)); + + /* Obtain a file descriptor for contiguous memory */ + fd_hugepage = open(cur_hpi->hugedir, O_RDWR); + if (fd_hugepage < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", + cur_hpi->hugedir); + goto error; + } + wa.fd_hugepage = fd_hugepage; + wa.seg_idx = 0; + + /* Map the contiguous memory into each memory segment */ + if (rte_memseg_walk(attach_segment, &wa) < 0) { + RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", + wa.seg_idx, cur_hpi->hugedir); + goto error; + } + + close(fd_hugepage); + fd_hugepage = -1; + } + + /* hugepage_info is no longer required */ + return 0; + +error: + if (fd_hugepage >= 0) + close(fd_hugepage); + return -1; +} + +int +rte_eal_using_phys_addrs(void) +{ + return 0; +} + +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + int n_segs, int socket_id, int type_msl_idx) +{ + char name[RTE_FBARRAY_NAME_LEN]; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->base_va = NULL; + + RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + +#ifdef RTE_ARCH_PPC_64 + flags |= MAP_HUGETLB; +#endif + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - " + "please use '--" OPT_BASE_VIRTADDR "' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + msl->len = mem_sz; + + return 0; +} + + +static int +memseg_primary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int hpi_idx, msl_idx = 0; + struct rte_memseg_list *msl; + uint64_t max_mem, total_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* FreeBSD has an issue where core dump will dump the entire memory + * contents, including anonymous zero-page memory. Therefore, while we + * will be limiting total amount of memory to RTE_MAX_MEM_MB, we will + * also be further limiting total memory amount to whatever memory is + * available to us through contigmem driver (plus spacing blocks). + * + * so, at each stage, we will be checking how much memory we are + * preallocating, and adjust all the values accordingly. + */ + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + total_mem = 0; + + /* create memseg lists */ + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + uint64_t max_type_mem, total_type_mem = 0; + uint64_t avail_mem; + int type_msl_idx, max_segs, avail_segs, total_segs = 0; + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* no NUMA support on FreeBSD */ + + /* check if we've already exceeded total memory amount */ + if (total_mem >= max_mem) + break; + + /* first, calculate theoretical limits according to config */ + max_type_mem = RTE_MIN(max_mem - total_mem, + (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + + /* now, limit all of that to whatever will actually be + * available to us, because without dynamic allocation support, + * all of that extra memory will be sitting there being useless + * and slowing down core dumps in case of a crash. + * + * we need (N*2)-1 segments because we cannot guarantee that + * each segment will be IOVA-contiguous with the previous one, + * so we will allocate more and put spaces between segments + * that are non-contiguous. + */ + avail_segs = (hpi->num_pages[0] * 2) - 1; + avail_mem = avail_segs * hugepage_sz; + + max_type_mem = RTE_MIN(avail_mem, max_type_mem); + max_segs = RTE_MIN(avail_segs, max_segs); + + type_msl_idx = 0; + while (total_type_mem < max_type_mem && + total_segs < max_segs) { + uint64_t cur_max_mem, cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx++]; + + cur_max_mem = max_type_mem - total_type_mem; + + cur_mem = get_mem_amount(hugepage_sz, + cur_max_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + 0, type_msl_idx)) + return -1; + + total_segs += msl->memseg_arr.len; + total_type_mem = total_segs * hugepage_sz; + type_msl_idx++; + + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + return -1; + } + } + total_mem += total_type_mem; + } + return 0; +} + +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +int +rte_eal_memseg_init(void) +{ + return rte_eal_process_type() == RTE_PROC_PRIMARY ? + memseg_primary_init() : + memseg_secondary_init(); +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_thread.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_thread.c new file mode 100644 index 000000000..b52019782 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_thread.c @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <sched.h> +#include <pthread_np.h> +#include <sys/queue.h> +#include <sys/thr.h> + +#include <rte_debug.h> +#include <rte_atomic.h> +#include <rte_launch.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_per_lcore.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_eal_trace.h> + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. + */ +int +rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + int rc = -EBUSY; + + if (lcore_config[slave_id].state != WAIT) + goto finish; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + rc = 0; +finish: + rte_eal_trace_thread_remote_launch(f, arg, slave_id, rc); + return rc; +} + +/* set affinity for current thread */ +static int +eal_thread_set_affinity(void) +{ + unsigned lcore_id = rte_lcore_id(); + + /* acquire system unique id */ + rte_gettid(); + + /* update EAL thread core affinity */ + return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); +} + +void eal_thread_init_master(unsigned lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); +} + +/* main loop of threads */ +__rte_noreturn void * +eal_thread_loop(__rte_unused void *arg) +{ + char c; + int n, ret; + unsigned lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = pthread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%p;cpuset=[%s%s])\n", + lcore_id, thread_id, cpuset, ret == 0 ? "" : "..."); + + __rte_trace_mem_per_thread_alloc(); + rte_eal_trace_thread_lcore_ready(lcore_id, cpuset); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + lcore_config[lcore_id].state = FINISHED; + } + + /* never reached */ + /* pthread_exit(NULL); */ + /* return NULL; */ +} + +/* require calling thread tid by gettid() */ +int rte_sys_gettid(void) +{ + long lwpid; + thr_self(&lwpid); + return (int)lwpid; +} + +int rte_thread_setname(pthread_t id, const char *name) +{ + /* this BSD function returns no error */ + pthread_set_name_np(id, name); + return 0; +} + +int rte_thread_getname(pthread_t id, char *name, size_t len) +{ + RTE_SET_USED(id); + RTE_SET_USED(name); + RTE_SET_USED(len); + + return -ENOTSUP; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/eal_timer.c b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_timer.c new file mode 100644 index 000000000..beff755a4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/eal_timer.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <string.h> +#include <stdio.h> +#include <unistd.h> +#include <inttypes.h> +#include <sys/types.h> +#include <sys/sysctl.h> +#include <errno.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_cycles.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" + +#ifdef RTE_LIBEAL_USE_HPET +#warning HPET is not supported in FreeBSD +#endif + +enum timer_source eal_timer_source = EAL_TIMER_TSC; + +uint64_t +get_tsc_freq(void) +{ + size_t sz; + int tmp; + uint64_t tsc_hz; + + sz = sizeof(tmp); + tmp = 0; + + if (sysctlbyname("kern.timecounter.smp_tsc", &tmp, &sz, NULL, 0)) + RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); + else if (tmp != 1) + RTE_LOG(WARNING, EAL, "TSC is not safe to use in SMP mode\n"); + + tmp = 0; + + if (sysctlbyname("kern.timecounter.invariant_tsc", &tmp, &sz, NULL, 0)) + RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); + else if (tmp != 1) + RTE_LOG(WARNING, EAL, "TSC is not invariant\n"); + + sz = sizeof(tsc_hz); + if (sysctlbyname("machdep.tsc_freq", &tsc_hz, &sz, NULL, 0)) { + RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); + return 0; + } + + return tsc_hz; +} + +int +rte_eal_timer_init(void) +{ + set_tsc_freq(); + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/include/meson.build b/src/spdk/dpdk/lib/librte_eal/freebsd/include/meson.build new file mode 100644 index 000000000..7d18dd52f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/include/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020 Mellanox Technologies, Ltd + +includes += include_directories('.') + +headers += files( + 'rte_os.h', +) diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/include/rte_os.h b/src/spdk/dpdk/lib/librte_eal/freebsd/include/rte_os.h new file mode 100644 index 000000000..eeb750cd8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/include/rte_os.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_OS_H_ +#define _RTE_OS_H_ + +/** + * This is header should contain any function/macro definition + * which are not supported natively or named differently in the + * freebsd OS. Functions will be added in future releases. + */ + +#include <pthread_np.h> + +typedef cpuset_t rte_cpuset_t; +#define RTE_CPU_AND(dst, src1, src2) do \ +{ \ + cpuset_t tmp; \ + CPU_COPY(src1, &tmp); \ + CPU_AND(&tmp, src2); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#define RTE_CPU_OR(dst, src1, src2) do \ +{ \ + cpuset_t tmp; \ + CPU_COPY(src1, &tmp); \ + CPU_OR(&tmp, src2); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#define RTE_CPU_FILL(set) CPU_FILL(set) + +/* In FreeBSD 13 CPU_NAND macro is CPU_ANDNOT */ +#ifdef CPU_NAND +#define RTE_CPU_NOT(dst, src) do \ +{ \ + cpuset_t tmp; \ + CPU_FILL(&tmp); \ + CPU_NAND(&tmp, src); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#else +#define RTE_CPU_NOT(dst, src) do \ +{ \ + cpuset_t tmp; \ + CPU_FILL(&tmp); \ + CPU_ANDNOT(&tmp, src); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#endif + +#endif /* _RTE_OS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/freebsd/meson.build b/src/spdk/dpdk/lib/librte_eal/freebsd/meson.build new file mode 100644 index 000000000..e10fd8a16 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/freebsd/meson.build @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +subdir('include') + +sources += files( + 'eal.c', + 'eal_alarm.c', + 'eal_cpuflags.c', + 'eal_debug.c', + 'eal_dev.c', + 'eal_hugepage_info.c', + 'eal_interrupts.c', + 'eal_lcore.c', + 'eal_memalloc.c', + 'eal_memory.c', + 'eal_thread.c', + 'eal_timer.c', +) + +deps += ['kvargs', 'telemetry'] diff --git a/src/spdk/dpdk/lib/librte_eal/include/Makefile b/src/spdk/dpdk/lib/librte_eal/include/Makefile new file mode 100644 index 000000000..eb99190d1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +SYMLINK-$(CONFIG_RTE_LIBRTE_EAL)-include := \ + $(sort $(notdir \ + $(wildcard $(RTE_SDK)/lib/librte_eal/include/*.h))) + +SYMLINK-$(CONFIG_RTE_LIBRTE_EAL)-include/generic := \ + $(sort $(addprefix generic/, $(notdir \ + $(wildcard $(RTE_SDK)/lib/librte_eal/include/generic/*.h)))) + +ARCH_DIR ?= $(RTE_ARCH) +SYMLINK-$(CONFIG_RTE_LIBRTE_EAL)-include += \ + $(sort $(addprefix ../$(ARCH_DIR)/include/, $(notdir \ + $(wildcard $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR)/include/*.h)))) + +include $(RTE_SDK)/mk/rte.install.mk diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_atomic.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_atomic.h new file mode 100644 index 000000000..e6ab15a97 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_atomic.h @@ -0,0 +1,1150 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ATOMIC_H_ +#define _RTE_ATOMIC_H_ + +/** + * @file + * Atomic Operations + * + * This file defines a generic API for atomic operations. + */ + +#include <stdint.h> +#include <rte_common.h> + +#ifdef __DOXYGEN__ + +/** @name Memory Barrier + */ +///@{ +/** + * General memory barrier. + * + * Guarantees that the LOAD and STORE operations generated before the + * barrier occur before the LOAD and STORE operations generated after. + */ +static inline void rte_mb(void); + +/** + * Write memory barrier. + * + * Guarantees that the STORE operations generated before the barrier + * occur before the STORE operations generated after. + */ +static inline void rte_wmb(void); + +/** + * Read memory barrier. + * + * Guarantees that the LOAD operations generated before the barrier + * occur before the LOAD operations generated after. + */ +static inline void rte_rmb(void); +///@} + +/** @name SMP Memory Barrier + */ +///@{ +/** + * General memory barrier between lcores + * + * Guarantees that the LOAD and STORE operations that precede the + * rte_smp_mb() call are globally visible across the lcores + * before the LOAD and STORE operations that follows it. + */ +static inline void rte_smp_mb(void); + +/** + * Write memory barrier between lcores + * + * Guarantees that the STORE operations that precede the + * rte_smp_wmb() call are globally visible across the lcores + * before the STORE operations that follows it. + */ +static inline void rte_smp_wmb(void); + +/** + * Read memory barrier between lcores + * + * Guarantees that the LOAD operations that precede the + * rte_smp_rmb() call are globally visible across the lcores + * before the LOAD operations that follows it. + */ +static inline void rte_smp_rmb(void); +///@} + +/** @name I/O Memory Barrier + */ +///@{ +/** + * General memory barrier for I/O device + * + * Guarantees that the LOAD and STORE operations that precede the + * rte_io_mb() call are visible to I/O device or CPU before the + * LOAD and STORE operations that follow it. + */ +static inline void rte_io_mb(void); + +/** + * Write memory barrier for I/O device + * + * Guarantees that the STORE operations that precede the + * rte_io_wmb() call are visible to I/O device before the STORE + * operations that follow it. + */ +static inline void rte_io_wmb(void); + +/** + * Read memory barrier for IO device + * + * Guarantees that the LOAD operations on I/O device that precede the + * rte_io_rmb() call are visible to CPU before the LOAD + * operations that follow it. + */ +static inline void rte_io_rmb(void); +///@} + +/** @name Coherent I/O Memory Barrier + * + * Coherent I/O memory barrier is a lightweight version of I/O memory + * barriers which are system-wide data synchronization barriers. This + * is for only coherent memory domain between lcore and I/O device but + * it is same as the I/O memory barriers in most of architectures. + * However, some architecture provides even lighter barriers which are + * somewhere in between I/O memory barriers and SMP memory barriers. + * For example, in case of ARMv8, DMB(data memory barrier) instruction + * can have different shareability domains - inner-shareable and + * outer-shareable. And inner-shareable DMB fits for SMP memory + * barriers and outer-shareable DMB for coherent I/O memory barriers, + * which acts on coherent memory. + * + * In most cases, I/O memory barriers are safer but if operations are + * on coherent memory instead of incoherent MMIO region of a device, + * then coherent I/O memory barriers can be used and this could bring + * performance gain depending on architectures. + */ +///@{ +/** + * Write memory barrier for coherent memory between lcore and I/O device + * + * Guarantees that the STORE operations on coherent memory that + * precede the rte_cio_wmb() call are visible to I/O device before the + * STORE operations that follow it. + */ +static inline void rte_cio_wmb(void); + +/** + * Read memory barrier for coherent memory between lcore and I/O device + * + * Guarantees that the LOAD operations on coherent memory updated by + * I/O device that precede the rte_cio_rmb() call are visible to CPU + * before the LOAD operations that follow it. + */ +static inline void rte_cio_rmb(void); +///@} + +#endif /* __DOXYGEN__ */ + +/** + * Compiler barrier. + * + * Guarantees that operation reordering does not occur at compile time + * for operations directly before and after the barrier. + */ +#define rte_compiler_barrier() do { \ + asm volatile ("" : : : "memory"); \ +} while(0) + +/*------------------------- 16 bit atomic operations -------------------------*/ + +/** + * Atomic compare and set. + * + * (atomic) equivalent to: + * if (*dst == exp) + * *dst = src (all 16-bit words) + * + * @param dst + * The destination location into which the value will be written. + * @param exp + * The expected value. + * @param src + * The new value. + * @return + * Non-zero on success; 0 on failure. + */ +static inline int +rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src); + +#ifdef RTE_FORCE_INTRINSICS +static inline int +rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src) +{ + return __sync_bool_compare_and_swap(dst, exp, src); +} +#endif + +/** + * Atomic exchange. + * + * (atomic) equivalent to: + * ret = *dst + * *dst = val; + * return ret; + * + * @param dst + * The destination location into which the value will be written. + * @param val + * The new value. + * @return + * The original value at that location + */ +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val); + +#ifdef RTE_FORCE_INTRINSICS +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) +{ +#if defined(__clang__) + return __atomic_exchange_n(dst, val, __ATOMIC_SEQ_CST); +#else + return __atomic_exchange_2(dst, val, __ATOMIC_SEQ_CST); +#endif +} +#endif + +/** + * The atomic counter structure. + */ +typedef struct { + volatile int16_t cnt; /**< An internal counter value. */ +} rte_atomic16_t; + +/** + * Static initializer for an atomic counter. + */ +#define RTE_ATOMIC16_INIT(val) { (val) } + +/** + * Initialize an atomic counter. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic16_init(rte_atomic16_t *v) +{ + v->cnt = 0; +} + +/** + * Atomically read a 16-bit value from a counter. + * + * @param v + * A pointer to the atomic counter. + * @return + * The value of the counter. + */ +static inline int16_t +rte_atomic16_read(const rte_atomic16_t *v) +{ + return v->cnt; +} + +/** + * Atomically set a counter to a 16-bit value. + * + * @param v + * A pointer to the atomic counter. + * @param new_value + * The new value for the counter. + */ +static inline void +rte_atomic16_set(rte_atomic16_t *v, int16_t new_value) +{ + v->cnt = new_value; +} + +/** + * Atomically add a 16-bit value to an atomic counter. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + */ +static inline void +rte_atomic16_add(rte_atomic16_t *v, int16_t inc) +{ + __sync_fetch_and_add(&v->cnt, inc); +} + +/** + * Atomically subtract a 16-bit value from an atomic counter. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + */ +static inline void +rte_atomic16_sub(rte_atomic16_t *v, int16_t dec) +{ + __sync_fetch_and_sub(&v->cnt, dec); +} + +/** + * Atomically increment a counter by one. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic16_inc(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic16_inc(rte_atomic16_t *v) +{ + rte_atomic16_add(v, 1); +} +#endif + +/** + * Atomically decrement a counter by one. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic16_dec(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic16_dec(rte_atomic16_t *v) +{ + rte_atomic16_sub(v, 1); +} +#endif + +/** + * Atomically add a 16-bit value to a counter and return the result. + * + * Atomically adds the 16-bits value (inc) to the atomic counter (v) and + * returns the value of v after addition. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + * @return + * The value of v after the addition. + */ +static inline int16_t +rte_atomic16_add_return(rte_atomic16_t *v, int16_t inc) +{ + return __sync_add_and_fetch(&v->cnt, inc); +} + +/** + * Atomically subtract a 16-bit value from a counter and return + * the result. + * + * Atomically subtracts the 16-bit value (inc) from the atomic counter + * (v) and returns the value of v after the subtraction. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + * @return + * The value of v after the subtraction. + */ +static inline int16_t +rte_atomic16_sub_return(rte_atomic16_t *v, int16_t dec) +{ + return __sync_sub_and_fetch(&v->cnt, dec); +} + +/** + * Atomically increment a 16-bit counter by one and test. + * + * Atomically increments the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the increment operation is 0; false otherwise. + */ +static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v) +{ + return __sync_add_and_fetch(&v->cnt, 1) == 0; +} +#endif + +/** + * Atomically decrement a 16-bit counter by one and test. + * + * Atomically decrements the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the decrement operation is 0; false otherwise. + */ +static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v) +{ + return __sync_sub_and_fetch(&v->cnt, 1) == 0; +} +#endif + +/** + * Atomically test and set a 16-bit atomic counter. + * + * If the counter value is already set, return 0 (failed). Otherwise, set + * the counter value to 1 and return 1 (success). + * + * @param v + * A pointer to the atomic counter. + * @return + * 0 if failed; else 1, success. + */ +static inline int rte_atomic16_test_and_set(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic16_test_and_set(rte_atomic16_t *v) +{ + return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1); +} +#endif + +/** + * Atomically set a 16-bit counter to 0. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void rte_atomic16_clear(rte_atomic16_t *v) +{ + v->cnt = 0; +} + +/*------------------------- 32 bit atomic operations -------------------------*/ + +/** + * Atomic compare and set. + * + * (atomic) equivalent to: + * if (*dst == exp) + * *dst = src (all 32-bit words) + * + * @param dst + * The destination location into which the value will be written. + * @param exp + * The expected value. + * @param src + * The new value. + * @return + * Non-zero on success; 0 on failure. + */ +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src); + +#ifdef RTE_FORCE_INTRINSICS +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) +{ + return __sync_bool_compare_and_swap(dst, exp, src); +} +#endif + +/** + * Atomic exchange. + * + * (atomic) equivalent to: + * ret = *dst + * *dst = val; + * return ret; + * + * @param dst + * The destination location into which the value will be written. + * @param val + * The new value. + * @return + * The original value at that location + */ +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val); + +#ifdef RTE_FORCE_INTRINSICS +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) +{ +#if defined(__clang__) + return __atomic_exchange_n(dst, val, __ATOMIC_SEQ_CST); +#else + return __atomic_exchange_4(dst, val, __ATOMIC_SEQ_CST); +#endif +} +#endif + +/** + * The atomic counter structure. + */ +typedef struct { + volatile int32_t cnt; /**< An internal counter value. */ +} rte_atomic32_t; + +/** + * Static initializer for an atomic counter. + */ +#define RTE_ATOMIC32_INIT(val) { (val) } + +/** + * Initialize an atomic counter. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic32_init(rte_atomic32_t *v) +{ + v->cnt = 0; +} + +/** + * Atomically read a 32-bit value from a counter. + * + * @param v + * A pointer to the atomic counter. + * @return + * The value of the counter. + */ +static inline int32_t +rte_atomic32_read(const rte_atomic32_t *v) +{ + return v->cnt; +} + +/** + * Atomically set a counter to a 32-bit value. + * + * @param v + * A pointer to the atomic counter. + * @param new_value + * The new value for the counter. + */ +static inline void +rte_atomic32_set(rte_atomic32_t *v, int32_t new_value) +{ + v->cnt = new_value; +} + +/** + * Atomically add a 32-bit value to an atomic counter. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + */ +static inline void +rte_atomic32_add(rte_atomic32_t *v, int32_t inc) +{ + __sync_fetch_and_add(&v->cnt, inc); +} + +/** + * Atomically subtract a 32-bit value from an atomic counter. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + */ +static inline void +rte_atomic32_sub(rte_atomic32_t *v, int32_t dec) +{ + __sync_fetch_and_sub(&v->cnt, dec); +} + +/** + * Atomically increment a counter by one. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic32_inc(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic32_inc(rte_atomic32_t *v) +{ + rte_atomic32_add(v, 1); +} +#endif + +/** + * Atomically decrement a counter by one. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic32_dec(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic32_dec(rte_atomic32_t *v) +{ + rte_atomic32_sub(v,1); +} +#endif + +/** + * Atomically add a 32-bit value to a counter and return the result. + * + * Atomically adds the 32-bits value (inc) to the atomic counter (v) and + * returns the value of v after addition. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + * @return + * The value of v after the addition. + */ +static inline int32_t +rte_atomic32_add_return(rte_atomic32_t *v, int32_t inc) +{ + return __sync_add_and_fetch(&v->cnt, inc); +} + +/** + * Atomically subtract a 32-bit value from a counter and return + * the result. + * + * Atomically subtracts the 32-bit value (inc) from the atomic counter + * (v) and returns the value of v after the subtraction. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + * @return + * The value of v after the subtraction. + */ +static inline int32_t +rte_atomic32_sub_return(rte_atomic32_t *v, int32_t dec) +{ + return __sync_sub_and_fetch(&v->cnt, dec); +} + +/** + * Atomically increment a 32-bit counter by one and test. + * + * Atomically increments the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the increment operation is 0; false otherwise. + */ +static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v) +{ + return __sync_add_and_fetch(&v->cnt, 1) == 0; +} +#endif + +/** + * Atomically decrement a 32-bit counter by one and test. + * + * Atomically decrements the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the decrement operation is 0; false otherwise. + */ +static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v) +{ + return __sync_sub_and_fetch(&v->cnt, 1) == 0; +} +#endif + +/** + * Atomically test and set a 32-bit atomic counter. + * + * If the counter value is already set, return 0 (failed). Otherwise, set + * the counter value to 1 and return 1 (success). + * + * @param v + * A pointer to the atomic counter. + * @return + * 0 if failed; else 1, success. + */ +static inline int rte_atomic32_test_and_set(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic32_test_and_set(rte_atomic32_t *v) +{ + return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1); +} +#endif + +/** + * Atomically set a 32-bit counter to 0. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void rte_atomic32_clear(rte_atomic32_t *v) +{ + v->cnt = 0; +} + +/*------------------------- 64 bit atomic operations -------------------------*/ + +/** + * An atomic compare and set function used by the mutex functions. + * (atomic) equivalent to: + * if (*dst == exp) + * *dst = src (all 64-bit words) + * + * @param dst + * The destination into which the value will be written. + * @param exp + * The expected value. + * @param src + * The new value. + * @return + * Non-zero on success; 0 on failure. + */ +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src); + +#ifdef RTE_FORCE_INTRINSICS +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) +{ + return __sync_bool_compare_and_swap(dst, exp, src); +} +#endif + +/** + * Atomic exchange. + * + * (atomic) equivalent to: + * ret = *dst + * *dst = val; + * return ret; + * + * @param dst + * The destination location into which the value will be written. + * @param val + * The new value. + * @return + * The original value at that location + */ +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val); + +#ifdef RTE_FORCE_INTRINSICS +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val) +{ +#if defined(__clang__) + return __atomic_exchange_n(dst, val, __ATOMIC_SEQ_CST); +#else + return __atomic_exchange_8(dst, val, __ATOMIC_SEQ_CST); +#endif +} +#endif + +/** + * The atomic counter structure. + */ +typedef struct { + volatile int64_t cnt; /**< Internal counter value. */ +} rte_atomic64_t; + +/** + * Static initializer for an atomic counter. + */ +#define RTE_ATOMIC64_INIT(val) { (val) } + +/** + * Initialize the atomic counter. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic64_init(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_init(rte_atomic64_t *v) +{ +#ifdef __LP64__ + v->cnt = 0; +#else + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, 0); + } +#endif +} +#endif + +/** + * Atomically read a 64-bit counter. + * + * @param v + * A pointer to the atomic counter. + * @return + * The value of the counter. + */ +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v) +{ +#ifdef __LP64__ + return v->cnt; +#else + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + /* replace the value by itself */ + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp); + } + return tmp; +#endif +} +#endif + +/** + * Atomically set a 64-bit counter. + * + * @param v + * A pointer to the atomic counter. + * @param new_value + * The new value of the counter. + */ +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value) +{ +#ifdef __LP64__ + v->cnt = new_value; +#else + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, new_value); + } +#endif +} +#endif + +/** + * Atomically add a 64-bit value to a counter. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + */ +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc) +{ + __sync_fetch_and_add(&v->cnt, inc); +} +#endif + +/** + * Atomically subtract a 64-bit value from a counter. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + */ +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec) +{ + __sync_fetch_and_sub(&v->cnt, dec); +} +#endif + +/** + * Atomically increment a 64-bit counter by one and test. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic64_inc(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_inc(rte_atomic64_t *v) +{ + rte_atomic64_add(v, 1); +} +#endif + +/** + * Atomically decrement a 64-bit counter by one and test. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic64_dec(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_dec(rte_atomic64_t *v) +{ + rte_atomic64_sub(v, 1); +} +#endif + +/** + * Add a 64-bit value to an atomic counter and return the result. + * + * Atomically adds the 64-bit value (inc) to the atomic counter (v) and + * returns the value of v after the addition. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + * @return + * The value of v after the addition. + */ +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc); + +#ifdef RTE_FORCE_INTRINSICS +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc) +{ + return __sync_add_and_fetch(&v->cnt, inc); +} +#endif + +/** + * Subtract a 64-bit value from an atomic counter and return the result. + * + * Atomically subtracts the 64-bit value (dec) from the atomic counter (v) + * and returns the value of v after the subtraction. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + * @return + * The value of v after the subtraction. + */ +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec); + +#ifdef RTE_FORCE_INTRINSICS +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec) +{ + return __sync_sub_and_fetch(&v->cnt, dec); +} +#endif + +/** + * Atomically increment a 64-bit counter by one and test. + * + * Atomically increments the atomic counter (v) by one and returns + * true if the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the addition is 0; false otherwise. + */ +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v) +{ + return rte_atomic64_add_return(v, 1) == 0; +} +#endif + +/** + * Atomically decrement a 64-bit counter by one and test. + * + * Atomically decrements the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after subtraction is 0; false otherwise. + */ +static inline int rte_atomic64_dec_and_test(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic64_dec_and_test(rte_atomic64_t *v) +{ + return rte_atomic64_sub_return(v, 1) == 0; +} +#endif + +/** + * Atomically test and set a 64-bit atomic counter. + * + * If the counter value is already set, return 0 (failed). Otherwise, set + * the counter value to 1 and return 1 (success). + * + * @param v + * A pointer to the atomic counter. + * @return + * 0 if failed; else 1, success. + */ +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v) +{ + return rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, 0, 1); +} +#endif + +/** + * Atomically set a 64-bit counter to 0. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void rte_atomic64_clear(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void rte_atomic64_clear(rte_atomic64_t *v) +{ + rte_atomic64_set(v, 0); +} +#endif + +/*------------------------ 128 bit atomic operations -------------------------*/ + +/** + * 128-bit integer structure. + */ +RTE_STD_C11 +typedef struct { + RTE_STD_C11 + union { + uint64_t val[2]; +#ifdef RTE_ARCH_64 + __extension__ __int128 int128; +#endif + }; +} __rte_aligned(16) rte_int128_t; + +#ifdef __DOXYGEN__ + +/** + * An atomic compare and set function used by the mutex functions. + * (Atomically) Equivalent to: + * @code + * if (*dst == *exp) + * *dst = *src + * else + * *exp = *dst + * @endcode + * + * @note This function is currently available for the x86-64 and aarch64 + * platforms. + * + * @note The success and failure arguments must be one of the __ATOMIC_* values + * defined in the C++11 standard. For details on their behavior, refer to the + * standard. + * + * @param dst + * The destination into which the value will be written. + * @param exp + * Pointer to the expected value. If the operation fails, this memory is + * updated with the actual value. + * @param src + * Pointer to the new value. + * @param weak + * A value of true allows the comparison to spuriously fail and allows the + * 'exp' update to occur non-atomically (i.e. a torn read may occur). + * Implementations may ignore this argument and only implement the strong + * variant. + * @param success + * If successful, the operation's memory behavior conforms to this (or a + * stronger) model. + * @param failure + * If unsuccessful, the operation's memory behavior conforms to this (or a + * stronger) model. This argument cannot be __ATOMIC_RELEASE, + * __ATOMIC_ACQ_REL, or a stronger model than success. + * @return + * Non-zero on success; 0 on failure. + */ +__rte_experimental +static inline int +rte_atomic128_cmp_exchange(rte_int128_t *dst, + rte_int128_t *exp, + const rte_int128_t *src, + unsigned int weak, + int success, + int failure); + +#endif /* __DOXYGEN__ */ + +#endif /* _RTE_ATOMIC_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_byteorder.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_byteorder.h new file mode 100644 index 000000000..9ca960932 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_byteorder.h @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_BYTEORDER_H_ +#define _RTE_BYTEORDER_H_ + +/** + * @file + * + * Byte Swap Operations + * + * This file defines a generic API for byte swap operations. Part of + * the implementation is architecture-specific. + */ + +#include <stdint.h> +#ifdef RTE_EXEC_ENV_FREEBSD +#include <sys/endian.h> +#else +#include <endian.h> +#endif + +#include <rte_common.h> +#include <rte_config.h> + +/* + * Compile-time endianness detection + */ +#define RTE_BIG_ENDIAN 1 +#define RTE_LITTLE_ENDIAN 2 +#if defined __BYTE_ORDER__ +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define RTE_BYTE_ORDER RTE_BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define RTE_BYTE_ORDER RTE_LITTLE_ENDIAN +#endif /* __BYTE_ORDER__ */ +#elif defined __BYTE_ORDER +#if __BYTE_ORDER == __BIG_ENDIAN +#define RTE_BYTE_ORDER RTE_BIG_ENDIAN +#elif __BYTE_ORDER == __LITTLE_ENDIAN +#define RTE_BYTE_ORDER RTE_LITTLE_ENDIAN +#endif /* __BYTE_ORDER */ +#elif defined __BIG_ENDIAN__ +#define RTE_BYTE_ORDER RTE_BIG_ENDIAN +#elif defined __LITTLE_ENDIAN__ +#define RTE_BYTE_ORDER RTE_LITTLE_ENDIAN +#endif +#if !defined(RTE_BYTE_ORDER) +#error Unknown endianness. +#endif + +#define RTE_STATIC_BSWAP16(v) \ + ((((uint16_t)(v) & UINT16_C(0x00ff)) << 8) | \ + (((uint16_t)(v) & UINT16_C(0xff00)) >> 8)) + +#define RTE_STATIC_BSWAP32(v) \ + ((((uint32_t)(v) & UINT32_C(0x000000ff)) << 24) | \ + (((uint32_t)(v) & UINT32_C(0x0000ff00)) << 8) | \ + (((uint32_t)(v) & UINT32_C(0x00ff0000)) >> 8) | \ + (((uint32_t)(v) & UINT32_C(0xff000000)) >> 24)) + +#define RTE_STATIC_BSWAP64(v) \ + ((((uint64_t)(v) & UINT64_C(0x00000000000000ff)) << 56) | \ + (((uint64_t)(v) & UINT64_C(0x000000000000ff00)) << 40) | \ + (((uint64_t)(v) & UINT64_C(0x0000000000ff0000)) << 24) | \ + (((uint64_t)(v) & UINT64_C(0x00000000ff000000)) << 8) | \ + (((uint64_t)(v) & UINT64_C(0x000000ff00000000)) >> 8) | \ + (((uint64_t)(v) & UINT64_C(0x0000ff0000000000)) >> 24) | \ + (((uint64_t)(v) & UINT64_C(0x00ff000000000000)) >> 40) | \ + (((uint64_t)(v) & UINT64_C(0xff00000000000000)) >> 56)) + +/* + * These macros are functionally similar to rte_cpu_to_(be|le)(16|32|64)(), + * they take values in host CPU order and return them converted to the + * intended endianness. + * + * They resolve at compilation time to integer constants which can safely be + * used with static initializers, since those cannot involve function calls. + * + * On the other hand, they are not as optimized as their rte_cpu_to_*() + * counterparts, therefore applications should refrain from using them on + * variable values, particularly inside performance-sensitive code. + */ +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN +#define RTE_BE16(v) (rte_be16_t)(v) +#define RTE_BE32(v) (rte_be32_t)(v) +#define RTE_BE64(v) (rte_be64_t)(v) +#define RTE_LE16(v) (rte_le16_t)(RTE_STATIC_BSWAP16(v)) +#define RTE_LE32(v) (rte_le32_t)(RTE_STATIC_BSWAP32(v)) +#define RTE_LE64(v) (rte_le64_t)(RTE_STATIC_BSWAP64(v)) +#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +#define RTE_BE16(v) (rte_be16_t)(RTE_STATIC_BSWAP16(v)) +#define RTE_BE32(v) (rte_be32_t)(RTE_STATIC_BSWAP32(v)) +#define RTE_BE64(v) (rte_be64_t)(RTE_STATIC_BSWAP64(v)) +#define RTE_LE16(v) (rte_le16_t)(v) +#define RTE_LE32(v) (rte_le32_t)(v) +#define RTE_LE64(v) (rte_le64_t)(v) +#else +#error Unsupported endianness. +#endif + +/* + * The following types should be used when handling values according to a + * specific byte ordering, which may differ from that of the host CPU. + * + * Libraries, public APIs and applications are encouraged to use them for + * documentation purposes. + */ +typedef uint16_t rte_be16_t; /**< 16-bit big-endian value. */ +typedef uint32_t rte_be32_t; /**< 32-bit big-endian value. */ +typedef uint64_t rte_be64_t; /**< 64-bit big-endian value. */ +typedef uint16_t rte_le16_t; /**< 16-bit little-endian value. */ +typedef uint32_t rte_le32_t; /**< 32-bit little-endian value. */ +typedef uint64_t rte_le64_t; /**< 64-bit little-endian value. */ + +/* + * An internal function to swap bytes in a 16-bit value. + * + * It is used by rte_bswap16() when the value is constant. Do not use + * this function directly; rte_bswap16() is preferred. + */ +static inline uint16_t +rte_constant_bswap16(uint16_t x) +{ + return (uint16_t)RTE_STATIC_BSWAP16(x); +} + +/* + * An internal function to swap bytes in a 32-bit value. + * + * It is used by rte_bswap32() when the value is constant. Do not use + * this function directly; rte_bswap32() is preferred. + */ +static inline uint32_t +rte_constant_bswap32(uint32_t x) +{ + return (uint32_t)RTE_STATIC_BSWAP32(x); +} + +/* + * An internal function to swap bytes of a 64-bit value. + * + * It is used by rte_bswap64() when the value is constant. Do not use + * this function directly; rte_bswap64() is preferred. + */ +static inline uint64_t +rte_constant_bswap64(uint64_t x) +{ + return (uint64_t)RTE_STATIC_BSWAP64(x); +} + + +#ifdef __DOXYGEN__ + +/** + * Swap bytes in a 16-bit value. + */ +static uint16_t rte_bswap16(uint16_t _x); + +/** + * Swap bytes in a 32-bit value. + */ +static uint32_t rte_bswap32(uint32_t x); + +/** + * Swap bytes in a 64-bit value. + */ +static uint64_t rte_bswap64(uint64_t x); + +/** + * Convert a 16-bit value from CPU order to little endian. + */ +static rte_le16_t rte_cpu_to_le_16(uint16_t x); + +/** + * Convert a 32-bit value from CPU order to little endian. + */ +static rte_le32_t rte_cpu_to_le_32(uint32_t x); + +/** + * Convert a 64-bit value from CPU order to little endian. + */ +static rte_le64_t rte_cpu_to_le_64(uint64_t x); + + +/** + * Convert a 16-bit value from CPU order to big endian. + */ +static rte_be16_t rte_cpu_to_be_16(uint16_t x); + +/** + * Convert a 32-bit value from CPU order to big endian. + */ +static rte_be32_t rte_cpu_to_be_32(uint32_t x); + +/** + * Convert a 64-bit value from CPU order to big endian. + */ +static rte_be64_t rte_cpu_to_be_64(uint64_t x); + + +/** + * Convert a 16-bit value from little endian to CPU order. + */ +static uint16_t rte_le_to_cpu_16(rte_le16_t x); + +/** + * Convert a 32-bit value from little endian to CPU order. + */ +static uint32_t rte_le_to_cpu_32(rte_le32_t x); + +/** + * Convert a 64-bit value from little endian to CPU order. + */ +static uint64_t rte_le_to_cpu_64(rte_le64_t x); + + +/** + * Convert a 16-bit value from big endian to CPU order. + */ +static uint16_t rte_be_to_cpu_16(rte_be16_t x); + +/** + * Convert a 32-bit value from big endian to CPU order. + */ +static uint32_t rte_be_to_cpu_32(rte_be32_t x); + +/** + * Convert a 64-bit value from big endian to CPU order. + */ +static uint64_t rte_be_to_cpu_64(rte_be64_t x); + +#endif /* __DOXYGEN__ */ + +#ifdef RTE_FORCE_INTRINSICS +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +#define rte_bswap16(x) __builtin_bswap16(x) +#endif + +#define rte_bswap32(x) __builtin_bswap32(x) + +#define rte_bswap64(x) __builtin_bswap64(x) + +#endif + +#endif /* _RTE_BYTEORDER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_cpuflags.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_cpuflags.h new file mode 100644 index 000000000..872f0ebe3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_cpuflags.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_CPUFLAGS_H_ +#define _RTE_CPUFLAGS_H_ + +/** + * @file + * Architecture specific API to determine available CPU features at runtime. + */ + +#include "rte_common.h" +#include <errno.h> + +/** + * Enumeration of all CPU features supported + */ +__extension__ +enum rte_cpu_flag_t; + +/** + * Get name of CPU flag + * + * @param feature + * CPU flag ID + * @return + * flag name + * NULL if flag ID is invalid + */ +__extension__ +const char * +rte_cpu_get_flag_name(enum rte_cpu_flag_t feature); + +/** + * Function for checking a CPU flag availability + * + * @param feature + * CPU flag to query CPU for + * @return + * 1 if flag is available + * 0 if flag is not available + * -ENOENT if flag is invalid + */ +__extension__ +int +rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature); + +/** + * This function checks that the currently used CPU supports the CPU features + * that were specified at compile time. It is called automatically within the + * EAL, so does not need to be used by applications. This version returns a + * result so that decisions may be made (for instance, graceful shutdowns). + */ +int +rte_cpu_is_supported(void); + +/** + * This function attempts to retrieve a value from the auxiliary vector. + * If it is unsuccessful, the result will be 0, and errno will be set. + * + * @return A value from the auxiliary vector. When the value is 0, check + * errno to determine if an error occurred. + */ +unsigned long +rte_cpu_getauxval(unsigned long type); + +/** + * This function retrieves a value from the auxiliary vector, and compares it + * as a string against the value retrieved. + * + * @return The result of calling strcmp() against the value retrieved from + * the auxiliary vector. When the value is 0 (meaning a match is found), + * check errno to determine if an error occurred. + */ +int +rte_cpu_strcmp_auxval(unsigned long type, const char *str); + +#endif /* _RTE_CPUFLAGS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_cycles.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_cycles.h new file mode 100644 index 000000000..73d1fa7b9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_cycles.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2013 6WIND S.A. + */ + +#ifndef _RTE_CYCLES_H_ +#define _RTE_CYCLES_H_ + +/** + * @file + * + * Simple Time Reference Functions (Cycles and HPET). + */ + +#include <stdint.h> +#include <rte_compat.h> +#include <rte_debug.h> +#include <rte_atomic.h> + +#define MS_PER_S 1000 +#define US_PER_S 1000000 +#define NS_PER_S 1000000000 + +enum timer_source { + EAL_TIMER_TSC = 0, + EAL_TIMER_HPET +}; +extern enum timer_source eal_timer_source; + +/** + * Get the measured frequency of the RDTSC counter + * + * @return + * The TSC frequency for this lcore + */ +uint64_t +rte_get_tsc_hz(void); + +/** + * Return the number of TSC cycles since boot + * + * @return + * the number of cycles + */ +static inline uint64_t +rte_get_tsc_cycles(void); + +#ifdef RTE_LIBEAL_USE_HPET +/** + * Return the number of HPET cycles since boot + * + * This counter is global for all execution units. The number of + * cycles in one second can be retrieved using rte_get_hpet_hz(). + * + * @return + * the number of cycles + */ +uint64_t +rte_get_hpet_cycles(void); + +/** + * Get the number of HPET cycles in one second. + * + * @return + * The number of cycles in one second. + */ +uint64_t +rte_get_hpet_hz(void); + +/** + * Initialise the HPET for use. This must be called before the rte_get_hpet_hz + * and rte_get_hpet_cycles APIs are called. If this function does not succeed, + * then the HPET functions are unavailable and should not be called. + * + * @param make_default + * If set, the hpet timer becomes the default timer whose values are + * returned by the rte_get_timer_hz/cycles API calls + * + * @return + * 0 on success, + * -1 on error, and the make_default parameter is ignored. + */ +int rte_eal_hpet_init(int make_default); + +#endif + +/** + * Get the number of cycles since boot from the default timer. + * + * @return + * The number of cycles + */ +static inline uint64_t +rte_get_timer_cycles(void) +{ +#ifdef RTE_LIBEAL_USE_HPET + switch(eal_timer_source) { + case EAL_TIMER_TSC: +#endif + return rte_get_tsc_cycles(); +#ifdef RTE_LIBEAL_USE_HPET + case EAL_TIMER_HPET: + return rte_get_hpet_cycles(); + default: rte_panic("Invalid timer source specified\n"); + } +#endif +} + +/** + * Get the number of cycles in one second for the default timer. + * + * @return + * The number of cycles in one second. + */ +static inline uint64_t +rte_get_timer_hz(void) +{ +#ifdef RTE_LIBEAL_USE_HPET + switch(eal_timer_source) { + case EAL_TIMER_TSC: +#endif + return rte_get_tsc_hz(); +#ifdef RTE_LIBEAL_USE_HPET + case EAL_TIMER_HPET: + return rte_get_hpet_hz(); + default: rte_panic("Invalid timer source specified\n"); + } +#endif +} +/** + * Wait at least us microseconds. + * This function can be replaced with user-defined function. + * @see rte_delay_us_callback_register + * + * @param us + * The number of microseconds to wait. + */ +extern void +(*rte_delay_us)(unsigned int us); + +/** + * Wait at least ms milliseconds. + * + * @param ms + * The number of milliseconds to wait. + */ +static inline void +rte_delay_ms(unsigned ms) +{ + rte_delay_us(ms * 1000); +} + +/** + * Blocking delay function. + * + * @param us + * Number of microseconds to wait. + */ +void rte_delay_us_block(unsigned int us); + +/** + * Delay function that uses system sleep. + * Does not block the CPU core. + * + * @param us + * Number of microseconds to wait. + */ +__rte_experimental +void +rte_delay_us_sleep(unsigned int us); + +/** + * Replace rte_delay_us with user defined function. + * + * @param userfunc + * User function which replaces rte_delay_us. rte_delay_us_block restores + * builtin block delay function. + */ +void rte_delay_us_callback_register(void(*userfunc)(unsigned int)); + +#endif /* _RTE_CYCLES_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_io.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_io.h new file mode 100644 index 000000000..da457f7f7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_io.h @@ -0,0 +1,350 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_H_ +#define _RTE_IO_H_ + +/** + * @file + * I/O device memory operations + * + * This file defines the generic API for I/O device memory read/write operations + */ + +#include <stdint.h> +#include <rte_common.h> +#include <rte_atomic.h> + +#ifdef __DOXYGEN__ + +/** + * Read a 8-bit value from I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint8_t +rte_read8_relaxed(const volatile void *addr); + +/** + * Read a 16-bit value from I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint16_t +rte_read16_relaxed(const volatile void *addr); + +/** + * Read a 32-bit value from I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint32_t +rte_read32_relaxed(const volatile void *addr); + +/** + * Read a 64-bit value from I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint64_t +rte_read64_relaxed(const volatile void *addr); + +/** + * Write a 8-bit value to I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ + +static inline void +rte_write8_relaxed(uint8_t value, volatile void *addr); + +/** + * Write a 16-bit value to I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write16_relaxed(uint16_t value, volatile void *addr); + +/** + * Write a 32-bit value to I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write32_relaxed(uint32_t value, volatile void *addr); + +/** + * Write a 64-bit value to I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write64_relaxed(uint64_t value, volatile void *addr); + +/** + * Read a 8-bit value from I/O device memory address *addr*. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint8_t +rte_read8(const volatile void *addr); + +/** + * Read a 16-bit value from I/O device memory address *addr*. + * + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint16_t +rte_read16(const volatile void *addr); + +/** + * Read a 32-bit value from I/O device memory address *addr*. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint32_t +rte_read32(const volatile void *addr); + +/** + * Read a 64-bit value from I/O device memory address *addr*. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint64_t +rte_read64(const volatile void *addr); + +/** + * Write a 8-bit value to I/O device memory address *addr*. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ + +static inline void +rte_write8(uint8_t value, volatile void *addr); + +/** + * Write a 16-bit value to I/O device memory address *addr*. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write16(uint16_t value, volatile void *addr); + +/** + * Write a 32-bit value to I/O device memory address *addr*. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write32(uint32_t value, volatile void *addr); + +/** + * Write a 64-bit value to I/O device memory address *addr*. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write64(uint64_t value, volatile void *addr); + +#endif /* __DOXYGEN__ */ + +#ifndef RTE_OVERRIDE_IO_H + +static __rte_always_inline uint8_t +rte_read8_relaxed(const volatile void *addr) +{ + return *(const volatile uint8_t *)addr; +} + +static __rte_always_inline uint16_t +rte_read16_relaxed(const volatile void *addr) +{ + return *(const volatile uint16_t *)addr; +} + +static __rte_always_inline uint32_t +rte_read32_relaxed(const volatile void *addr) +{ + return *(const volatile uint32_t *)addr; +} + +static __rte_always_inline uint64_t +rte_read64_relaxed(const volatile void *addr) +{ + return *(const volatile uint64_t *)addr; +} + +static __rte_always_inline void +rte_write8_relaxed(uint8_t value, volatile void *addr) +{ + *(volatile uint8_t *)addr = value; +} + +static __rte_always_inline void +rte_write16_relaxed(uint16_t value, volatile void *addr) +{ + *(volatile uint16_t *)addr = value; +} + +static __rte_always_inline void +rte_write32_relaxed(uint32_t value, volatile void *addr) +{ + *(volatile uint32_t *)addr = value; +} + +static __rte_always_inline void +rte_write64_relaxed(uint64_t value, volatile void *addr) +{ + *(volatile uint64_t *)addr = value; +} + +static __rte_always_inline uint8_t +rte_read8(const volatile void *addr) +{ + uint8_t val; + val = rte_read8_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint16_t +rte_read16(const volatile void *addr) +{ + uint16_t val; + val = rte_read16_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint32_t +rte_read32(const volatile void *addr) +{ + uint32_t val; + val = rte_read32_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint64_t +rte_read64(const volatile void *addr) +{ + uint64_t val; + val = rte_read64_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline void +rte_write8(uint8_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write8_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write16(uint16_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write16_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write32(uint32_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write32_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write64(uint64_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write64_relaxed(value, addr); +} + +#endif /* RTE_OVERRIDE_IO_H */ + +#endif /* _RTE_IO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_mcslock.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_mcslock.h new file mode 100644 index 000000000..2bef28351 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_mcslock.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_MCSLOCK_H_ +#define _RTE_MCSLOCK_H_ + +/** + * @file + * + * RTE MCS lock + * + * This file defines the main data structure and APIs for MCS queued lock. + * + * The MCS lock (proposed by John M. Mellor-Crummey and Michael L. Scott) + * provides scalability by spinning on a CPU/thread local variable which + * avoids expensive cache bouncings. It provides fairness by maintaining + * a list of acquirers and passing the lock to each CPU/thread in the order + * they acquired the lock. + */ + +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_pause.h> + +/** + * The rte_mcslock_t type. + */ +typedef struct rte_mcslock { + struct rte_mcslock *next; + int locked; /* 1 if the queue locked, 0 otherwise */ +} rte_mcslock_t; + +/** + * @warning + * @b EXPERIMENTAL: This API may change without prior notice + * + * Take the MCS lock. + * + * @param msl + * A pointer to the pointer of a MCS lock. + * When the lock is initialized or declared, the msl pointer should be + * set to NULL. + * @param me + * A pointer to a new node of MCS lock. Each CPU/thread acquiring the + * lock should use its 'own node'. + */ +__rte_experimental +static inline void +rte_mcslock_lock(rte_mcslock_t **msl, rte_mcslock_t *me) +{ + rte_mcslock_t *prev; + + /* Init me node */ + __atomic_store_n(&me->locked, 1, __ATOMIC_RELAXED); + __atomic_store_n(&me->next, NULL, __ATOMIC_RELAXED); + + /* If the queue is empty, the exchange operation is enough to acquire + * the lock. Hence, the exchange operation requires acquire semantics. + * The store to me->next above should complete before the node is + * visible to other CPUs/threads. Hence, the exchange operation requires + * release semantics as well. + */ + prev = __atomic_exchange_n(msl, me, __ATOMIC_ACQ_REL); + if (likely(prev == NULL)) { + /* Queue was empty, no further action required, + * proceed with lock taken. + */ + return; + } + __atomic_store_n(&prev->next, me, __ATOMIC_RELAXED); + + /* The while-load of me->locked should not move above the previous + * store to prev->next. Otherwise it will cause a deadlock. Need a + * store-load barrier. + */ + __atomic_thread_fence(__ATOMIC_ACQ_REL); + /* If the lock has already been acquired, it first atomically + * places the node at the end of the queue and then proceeds + * to spin on me->locked until the previous lock holder resets + * the me->locked using mcslock_unlock(). + */ + while (__atomic_load_n(&me->locked, __ATOMIC_ACQUIRE)) + rte_pause(); +} + +/** + * @warning + * @b EXPERIMENTAL: This API may change without prior notice + * + * Release the MCS lock. + * + * @param msl + * A pointer to the pointer of a MCS lock. + * @param me + * A pointer to the node of MCS lock passed in rte_mcslock_lock. + */ +__rte_experimental +static inline void +rte_mcslock_unlock(rte_mcslock_t **msl, rte_mcslock_t *me) +{ + /* Check if there are more nodes in the queue. */ + if (likely(__atomic_load_n(&me->next, __ATOMIC_RELAXED) == NULL)) { + /* No, last member in the queue. */ + rte_mcslock_t *save_me = __atomic_load_n(&me, __ATOMIC_RELAXED); + + /* Release the lock by setting it to NULL */ + if (likely(__atomic_compare_exchange_n(msl, &save_me, NULL, 0, + __ATOMIC_RELEASE, __ATOMIC_RELAXED))) + return; + + /* Speculative execution would be allowed to read in the + * while-loop first. This has the potential to cause a + * deadlock. Need a load barrier. + */ + __atomic_thread_fence(__ATOMIC_ACQUIRE); + /* More nodes added to the queue by other CPUs. + * Wait until the next pointer is set. + */ + while (__atomic_load_n(&me->next, __ATOMIC_RELAXED) == NULL) + rte_pause(); + } + + /* Pass lock to next waiter. */ + __atomic_store_n(&me->next->locked, 0, __ATOMIC_RELEASE); +} + +/** + * @warning + * @b EXPERIMENTAL: This API may change without prior notice + * + * Try to take the lock. + * + * @param msl + * A pointer to the pointer of a MCS lock. + * @param me + * A pointer to a new node of MCS lock. + * @return + * 1 if the lock is successfully taken; 0 otherwise. + */ +__rte_experimental +static inline int +rte_mcslock_trylock(rte_mcslock_t **msl, rte_mcslock_t *me) +{ + /* Init me node */ + __atomic_store_n(&me->next, NULL, __ATOMIC_RELAXED); + + /* Try to lock */ + rte_mcslock_t *expected = NULL; + + /* The lock can be taken only when the queue is empty. Hence, + * the compare-exchange operation requires acquire semantics. + * The store to me->next above should complete before the node + * is visible to other CPUs/threads. Hence, the compare-exchange + * operation requires release semantics as well. + */ + return __atomic_compare_exchange_n(msl, &expected, me, 0, + __ATOMIC_ACQ_REL, __ATOMIC_RELAXED); +} + +/** + * @warning + * @b EXPERIMENTAL: This API may change without prior notice + * + * Test if the lock is taken. + * + * @param msl + * A pointer to a MCS lock node. + * @return + * 1 if the lock is currently taken; 0 otherwise. + */ +__rte_experimental +static inline int +rte_mcslock_is_locked(rte_mcslock_t *msl) +{ + return (__atomic_load_n(&msl, __ATOMIC_RELAXED) != NULL); +} + +#endif /* _RTE_MCSLOCK_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_memcpy.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_memcpy.h new file mode 100644 index 000000000..701e550c3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_memcpy.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MEMCPY_H_ +#define _RTE_MEMCPY_H_ + +/** + * @file + * + * Functions for vectorised implementation of memcpy(). + */ + +/** + * Copy 16 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src); + +/** + * Copy 32 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src); + +#ifdef __DOXYGEN__ + +/** + * Copy 48 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src); + +#endif /* __DOXYGEN__ */ + +/** + * Copy 64 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src); + +/** + * Copy 128 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src); + +/** + * Copy 256 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src); + +#ifdef __DOXYGEN__ + +/** + * Copy bytes from one location to another. The locations must not overlap. + * + * @note This is implemented as a macro, so it's address should not be taken + * and care is needed as parameter expressions may be evaluated multiple times. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + * @param n + * Number of bytes to copy. + * @return + * Pointer to the destination data. + */ +static void * +rte_memcpy(void *dst, const void *src, size_t n); + +#endif /* __DOXYGEN__ */ + +#endif /* _RTE_MEMCPY_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_pause.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_pause.h new file mode 100644 index 000000000..7422785f1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_pause.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_PAUSE_H_ +#define _RTE_PAUSE_H_ + +/** + * @file + * + * CPU pause operation. + * + */ + +#include <stdint.h> +#include <assert.h> +#include <rte_common.h> +#include <rte_atomic.h> +#include <rte_compat.h> + +/** + * Pause CPU execution for a short while + * + * This call is intended for tight loops which poll a shared resource or wait + * for an event. A short pause within the loop may reduce the power consumption. + */ +static inline void rte_pause(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Wait for *addr to be updated with a 16-bit expected value, with a relaxed + * memory ordering model meaning the loads around this API can be reordered. + * + * @param addr + * A pointer to the memory location. + * @param expected + * A 16-bit expected value to be in the memory location. + * @param memorder + * Two different memory orders that can be specified: + * __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to + * C++11 memory orders with the same names, see the C++11 standard or + * the GCC wiki on atomic synchronization for detailed definition. + */ +__rte_experimental +static __rte_always_inline void +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected, + int memorder); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Wait for *addr to be updated with a 32-bit expected value, with a relaxed + * memory ordering model meaning the loads around this API can be reordered. + * + * @param addr + * A pointer to the memory location. + * @param expected + * A 32-bit expected value to be in the memory location. + * @param memorder + * Two different memory orders that can be specified: + * __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to + * C++11 memory orders with the same names, see the C++11 standard or + * the GCC wiki on atomic synchronization for detailed definition. + */ +__rte_experimental +static __rte_always_inline void +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected, + int memorder); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Wait for *addr to be updated with a 64-bit expected value, with a relaxed + * memory ordering model meaning the loads around this API can be reordered. + * + * @param addr + * A pointer to the memory location. + * @param expected + * A 64-bit expected value to be in the memory location. + * @param memorder + * Two different memory orders that can be specified: + * __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to + * C++11 memory orders with the same names, see the C++11 standard or + * the GCC wiki on atomic synchronization for detailed definition. + */ +__rte_experimental +static __rte_always_inline void +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected, + int memorder); + +#ifndef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED +static __rte_always_inline void +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected, + int memorder) +{ + assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED); + + while (__atomic_load_n(addr, memorder) != expected) + rte_pause(); +} + +static __rte_always_inline void +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected, + int memorder) +{ + assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED); + + while (__atomic_load_n(addr, memorder) != expected) + rte_pause(); +} + +static __rte_always_inline void +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected, + int memorder) +{ + assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED); + + while (__atomic_load_n(addr, memorder) != expected) + rte_pause(); +} +#endif + +#endif /* _RTE_PAUSE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_prefetch.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_prefetch.h new file mode 100644 index 000000000..6e47bdfba --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_prefetch.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef _RTE_PREFETCH_H_ +#define _RTE_PREFETCH_H_ + +/** + * @file + * + * Prefetch operations. + * + * This file defines an API for prefetch macros / inline-functions, + * which are architecture-dependent. Prefetching occurs when a + * processor requests an instruction or data from memory to cache + * before it is actually needed, potentially speeding up the execution of the + * program. + */ + +/** + * Prefetch a cache line into all cache levels. + * @param p + * Address to prefetch + */ +static inline void rte_prefetch0(const volatile void *p); + +/** + * Prefetch a cache line into all cache levels except the 0th cache level. + * @param p + * Address to prefetch + */ +static inline void rte_prefetch1(const volatile void *p); + +/** + * Prefetch a cache line into all cache levels except the 0th and 1th cache + * levels. + * @param p + * Address to prefetch + */ +static inline void rte_prefetch2(const volatile void *p); + +/** + * Prefetch a cache line into all cache levels (non-temporal/transient version) + * + * The non-temporal prefetch is intended as a prefetch hint that processor will + * use the prefetched data only once or short period, unlike the + * rte_prefetch0() function which imply that prefetched data to use repeatedly. + * + * @param p + * Address to prefetch + */ +static inline void rte_prefetch_non_temporal(const volatile void *p); + +#endif /* _RTE_PREFETCH_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_rwlock.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_rwlock.h new file mode 100644 index 000000000..da9bc3e9c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_rwlock.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_RWLOCK_H_ +#define _RTE_RWLOCK_H_ + +/** + * @file + * + * RTE Read-Write Locks + * + * This file defines an API for read-write locks. The lock is used to + * protect data that allows multiple readers in parallel, but only + * one writer. All readers are blocked until the writer is finished + * writing. + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include <rte_atomic.h> +#include <rte_pause.h> + +/** + * The rte_rwlock_t type. + * + * cnt is -1 when write lock is held, and > 0 when read locks are held. + */ +typedef struct { + volatile int32_t cnt; /**< -1 when W lock held, > 0 when R locks held. */ +} rte_rwlock_t; + +/** + * A static rwlock initializer. + */ +#define RTE_RWLOCK_INITIALIZER { 0 } + +/** + * Initialize the rwlock to an unlocked state. + * + * @param rwl + * A pointer to the rwlock structure. + */ +static inline void +rte_rwlock_init(rte_rwlock_t *rwl) +{ + rwl->cnt = 0; +} + +/** + * Take a read lock. Loop until the lock is held. + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_read_lock(rte_rwlock_t *rwl) +{ + int32_t x; + int success = 0; + + while (success == 0) { + x = __atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED); + /* write lock is held */ + if (x < 0) { + rte_pause(); + continue; + } + success = __atomic_compare_exchange_n(&rwl->cnt, &x, x + 1, 1, + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); + } +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * try to take a read lock. + * + * @param rwl + * A pointer to a rwlock structure. + * @return + * - zero if the lock is successfully taken + * - -EBUSY if lock could not be acquired for reading because a + * writer holds the lock + */ +__rte_experimental +static inline int +rte_rwlock_read_trylock(rte_rwlock_t *rwl) +{ + int32_t x; + int success = 0; + + while (success == 0) { + x = __atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED); + /* write lock is held */ + if (x < 0) + return -EBUSY; + success = __atomic_compare_exchange_n(&rwl->cnt, &x, x + 1, 1, + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); + } + + return 0; +} + +/** + * Release a read lock. + * + * @param rwl + * A pointer to the rwlock structure. + */ +static inline void +rte_rwlock_read_unlock(rte_rwlock_t *rwl) +{ + __atomic_fetch_sub(&rwl->cnt, 1, __ATOMIC_RELEASE); +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * try to take a write lock. + * + * @param rwl + * A pointer to a rwlock structure. + * @return + * - zero if the lock is successfully taken + * - -EBUSY if lock could not be acquired for writing because + * it was already locked for reading or writing + */ +__rte_experimental +static inline int +rte_rwlock_write_trylock(rte_rwlock_t *rwl) +{ + int32_t x; + + x = __atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED); + if (x != 0 || __atomic_compare_exchange_n(&rwl->cnt, &x, -1, 1, + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED) == 0) + return -EBUSY; + + return 0; +} + +/** + * Take a write lock. Loop until the lock is held. + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_write_lock(rte_rwlock_t *rwl) +{ + int32_t x; + int success = 0; + + while (success == 0) { + x = __atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED); + /* a lock is held */ + if (x != 0) { + rte_pause(); + continue; + } + success = __atomic_compare_exchange_n(&rwl->cnt, &x, -1, 1, + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); + } +} + +/** + * Release a write lock. + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_write_unlock(rte_rwlock_t *rwl) +{ + __atomic_store_n(&rwl->cnt, 0, __ATOMIC_RELEASE); +} + +/** + * Try to execute critical section in a hardware memory transaction, if it + * fails or not available take a read lock + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_read_lock_tm(rte_rwlock_t *rwl); + +/** + * Commit hardware memory transaction or release the read lock if the lock is used as a fall-back + * + * @param rwl + * A pointer to the rwlock structure. + */ +static inline void +rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl); + +/** + * Try to execute critical section in a hardware memory transaction, if it + * fails or not available take a write lock + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_write_lock_tm(rte_rwlock_t *rwl); + +/** + * Commit hardware memory transaction or release the write lock if the lock is used as a fall-back + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RWLOCK_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_spinlock.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_spinlock.h new file mode 100644 index 000000000..87ae7a4f1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_spinlock.h @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_SPINLOCK_H_ +#define _RTE_SPINLOCK_H_ + +/** + * @file + * + * RTE Spinlocks + * + * This file defines an API for read-write locks, which are implemented + * in an architecture-specific way. This kind of lock simply waits in + * a loop repeatedly checking until the lock becomes available. + * + * All locks must be initialised before use, and only initialised once. + * + */ + +#include <rte_lcore.h> +#ifdef RTE_FORCE_INTRINSICS +#include <rte_common.h> +#endif +#include <rte_pause.h> + +/** + * The rte_spinlock_t type. + */ +typedef struct { + volatile int locked; /**< lock status 0 = unlocked, 1 = locked */ +} rte_spinlock_t; + +/** + * A static spinlock initializer. + */ +#define RTE_SPINLOCK_INITIALIZER { 0 } + +/** + * Initialize the spinlock to an unlocked state. + * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_init(rte_spinlock_t *sl) +{ + sl->locked = 0; +} + +/** + * Take the spinlock. + * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_lock(rte_spinlock_t *sl); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_spinlock_lock(rte_spinlock_t *sl) +{ + int exp = 0; + + while (!__atomic_compare_exchange_n(&sl->locked, &exp, 1, 0, + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) { + while (__atomic_load_n(&sl->locked, __ATOMIC_RELAXED)) + rte_pause(); + exp = 0; + } +} +#endif + +/** + * Release the spinlock. + * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_unlock (rte_spinlock_t *sl); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_spinlock_unlock (rte_spinlock_t *sl) +{ + __atomic_store_n(&sl->locked, 0, __ATOMIC_RELEASE); +} +#endif + +/** + * Try to take the lock. + * + * @param sl + * A pointer to the spinlock. + * @return + * 1 if the lock is successfully taken; 0 otherwise. + */ +static inline int +rte_spinlock_trylock (rte_spinlock_t *sl); + +#ifdef RTE_FORCE_INTRINSICS +static inline int +rte_spinlock_trylock (rte_spinlock_t *sl) +{ + int exp = 0; + return __atomic_compare_exchange_n(&sl->locked, &exp, 1, + 0, /* disallow spurious failure */ + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); +} +#endif + +/** + * Test if the lock is taken. + * + * @param sl + * A pointer to the spinlock. + * @return + * 1 if the lock is currently taken; 0 otherwise. + */ +static inline int rte_spinlock_is_locked (rte_spinlock_t *sl) +{ + return __atomic_load_n(&sl->locked, __ATOMIC_ACQUIRE); +} + +/** + * Test if hardware transactional memory (lock elision) is supported + * + * @return + * 1 if the hardware transactional memory is supported; 0 otherwise. + */ +static inline int rte_tm_supported(void); + +/** + * Try to execute critical section in a hardware memory transaction, + * if it fails or not available take the spinlock. + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_lock_tm(rte_spinlock_t *sl); + +/** + * Commit hardware memory transaction or release the spinlock if + * the spinlock is used as a fall-back + * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_unlock_tm(rte_spinlock_t *sl); + +/** + * Try to execute critical section in a hardware memory transaction, + * if it fails or not available try to take the lock. + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param sl + * A pointer to the spinlock. + * @return + * 1 if the hardware memory transaction is successfully started + * or lock is successfully taken; 0 otherwise. + */ +static inline int +rte_spinlock_trylock_tm(rte_spinlock_t *sl); + +/** + * The rte_spinlock_recursive_t type. + */ +typedef struct { + rte_spinlock_t sl; /**< the actual spinlock */ + volatile int user; /**< core id using lock, -1 for unused */ + volatile int count; /**< count of time this lock has been called */ +} rte_spinlock_recursive_t; + +/** + * A static recursive spinlock initializer. + */ +#define RTE_SPINLOCK_RECURSIVE_INITIALIZER {RTE_SPINLOCK_INITIALIZER, -1, 0} + +/** + * Initialize the recursive spinlock to an unlocked state. + * + * @param slr + * A pointer to the recursive spinlock. + */ +static inline void rte_spinlock_recursive_init(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_init(&slr->sl); + slr->user = -1; + slr->count = 0; +} + +/** + * Take the recursive spinlock. + * + * @param slr + * A pointer to the recursive spinlock. + */ +static inline void rte_spinlock_recursive_lock(rte_spinlock_recursive_t *slr) +{ + int id = rte_gettid(); + + if (slr->user != id) { + rte_spinlock_lock(&slr->sl); + slr->user = id; + } + slr->count++; +} +/** + * Release the recursive spinlock. + * + * @param slr + * A pointer to the recursive spinlock. + */ +static inline void rte_spinlock_recursive_unlock(rte_spinlock_recursive_t *slr) +{ + if (--(slr->count) == 0) { + slr->user = -1; + rte_spinlock_unlock(&slr->sl); + } + +} + +/** + * Try to take the recursive lock. + * + * @param slr + * A pointer to the recursive spinlock. + * @return + * 1 if the lock is successfully taken; 0 otherwise. + */ +static inline int rte_spinlock_recursive_trylock(rte_spinlock_recursive_t *slr) +{ + int id = rte_gettid(); + + if (slr->user != id) { + if (rte_spinlock_trylock(&slr->sl) == 0) + return 0; + slr->user = id; + } + slr->count++; + return 1; +} + + +/** + * Try to execute critical section in a hardware memory transaction, + * if it fails or not available take the recursive spinlocks + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param slr + * A pointer to the recursive spinlock. + */ +static inline void rte_spinlock_recursive_lock_tm( + rte_spinlock_recursive_t *slr); + +/** + * Commit hardware memory transaction or release the recursive spinlock + * if the recursive spinlock is used as a fall-back + * + * @param slr + * A pointer to the recursive spinlock. + */ +static inline void rte_spinlock_recursive_unlock_tm( + rte_spinlock_recursive_t *slr); + +/** + * Try to execute critical section in a hardware memory transaction, + * if it fails or not available try to take the recursive lock + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param slr + * A pointer to the recursive spinlock. + * @return + * 1 if the hardware memory transaction is successfully started + * or lock is successfully taken; 0 otherwise. + */ +static inline int rte_spinlock_recursive_trylock_tm( + rte_spinlock_recursive_t *slr); + +#endif /* _RTE_SPINLOCK_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_ticketlock.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_ticketlock.h new file mode 100644 index 000000000..c295ae7f7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_ticketlock.h @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_TICKETLOCK_H_ +#define _RTE_TICKETLOCK_H_ + +/** + * @file + * + * RTE ticket locks + * + * This file defines an API for ticket locks, which give each waiting + * thread a ticket and take the lock one by one, first come, first + * serviced. + * + * All locks must be initialised before use, and only initialised once. + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include <rte_lcore.h> +#include <rte_pause.h> + +/** + * The rte_ticketlock_t type. + */ +typedef union { + uint32_t tickets; + struct { + uint16_t current; + uint16_t next; + } s; +} rte_ticketlock_t; + +/** + * A static ticketlock initializer. + */ +#define RTE_TICKETLOCK_INITIALIZER { 0 } + +/** + * Initialize the ticketlock to an unlocked state. + * + * @param tl + * A pointer to the ticketlock. + */ +__rte_experimental +static inline void +rte_ticketlock_init(rte_ticketlock_t *tl) +{ + __atomic_store_n(&tl->tickets, 0, __ATOMIC_RELAXED); +} + +/** + * Take the ticketlock. + * + * @param tl + * A pointer to the ticketlock. + */ +__rte_experimental +static inline void +rte_ticketlock_lock(rte_ticketlock_t *tl) +{ + uint16_t me = __atomic_fetch_add(&tl->s.next, 1, __ATOMIC_RELAXED); + rte_wait_until_equal_16(&tl->s.current, me, __ATOMIC_ACQUIRE); +} + +/** + * Release the ticketlock. + * + * @param tl + * A pointer to the ticketlock. + */ +__rte_experimental +static inline void +rte_ticketlock_unlock(rte_ticketlock_t *tl) +{ + uint16_t i = __atomic_load_n(&tl->s.current, __ATOMIC_RELAXED); + __atomic_store_n(&tl->s.current, i + 1, __ATOMIC_RELEASE); +} + +/** + * Try to take the lock. + * + * @param tl + * A pointer to the ticketlock. + * @return + * 1 if the lock is successfully taken; 0 otherwise. + */ +__rte_experimental +static inline int +rte_ticketlock_trylock(rte_ticketlock_t *tl) +{ + rte_ticketlock_t old, new; + old.tickets = __atomic_load_n(&tl->tickets, __ATOMIC_RELAXED); + new.tickets = old.tickets; + new.s.next++; + if (old.s.next == old.s.current) { + if (__atomic_compare_exchange_n(&tl->tickets, &old.tickets, + new.tickets, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) + return 1; + } + + return 0; +} + +/** + * Test if the lock is taken. + * + * @param tl + * A pointer to the ticketlock. + * @return + * 1 if the lock is currently taken; 0 otherwise. + */ +__rte_experimental +static inline int +rte_ticketlock_is_locked(rte_ticketlock_t *tl) +{ + rte_ticketlock_t tic; + tic.tickets = __atomic_load_n(&tl->tickets, __ATOMIC_ACQUIRE); + return (tic.s.current != tic.s.next); +} + +/** + * The rte_ticketlock_recursive_t type. + */ +#define TICKET_LOCK_INVALID_ID -1 + +typedef struct { + rte_ticketlock_t tl; /**< the actual ticketlock */ + int user; /**< core id using lock, TICKET_LOCK_INVALID_ID for unused */ + unsigned int count; /**< count of time this lock has been called */ +} rte_ticketlock_recursive_t; + +/** + * A static recursive ticketlock initializer. + */ +#define RTE_TICKETLOCK_RECURSIVE_INITIALIZER {RTE_TICKETLOCK_INITIALIZER, \ + TICKET_LOCK_INVALID_ID, 0} + +/** + * Initialize the recursive ticketlock to an unlocked state. + * + * @param tlr + * A pointer to the recursive ticketlock. + */ +__rte_experimental +static inline void +rte_ticketlock_recursive_init(rte_ticketlock_recursive_t *tlr) +{ + rte_ticketlock_init(&tlr->tl); + __atomic_store_n(&tlr->user, TICKET_LOCK_INVALID_ID, __ATOMIC_RELAXED); + tlr->count = 0; +} + +/** + * Take the recursive ticketlock. + * + * @param tlr + * A pointer to the recursive ticketlock. + */ +__rte_experimental +static inline void +rte_ticketlock_recursive_lock(rte_ticketlock_recursive_t *tlr) +{ + int id = rte_gettid(); + + if (__atomic_load_n(&tlr->user, __ATOMIC_RELAXED) != id) { + rte_ticketlock_lock(&tlr->tl); + __atomic_store_n(&tlr->user, id, __ATOMIC_RELAXED); + } + tlr->count++; +} + +/** + * Release the recursive ticketlock. + * + * @param tlr + * A pointer to the recursive ticketlock. + */ +__rte_experimental +static inline void +rte_ticketlock_recursive_unlock(rte_ticketlock_recursive_t *tlr) +{ + if (--(tlr->count) == 0) { + __atomic_store_n(&tlr->user, TICKET_LOCK_INVALID_ID, + __ATOMIC_RELAXED); + rte_ticketlock_unlock(&tlr->tl); + } +} + +/** + * Try to take the recursive lock. + * + * @param tlr + * A pointer to the recursive ticketlock. + * @return + * 1 if the lock is successfully taken; 0 otherwise. + */ +__rte_experimental +static inline int +rte_ticketlock_recursive_trylock(rte_ticketlock_recursive_t *tlr) +{ + int id = rte_gettid(); + + if (__atomic_load_n(&tlr->user, __ATOMIC_RELAXED) != id) { + if (rte_ticketlock_trylock(&tlr->tl) == 0) + return 0; + __atomic_store_n(&tlr->user, id, __ATOMIC_RELAXED); + } + tlr->count++; + return 1; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TICKETLOCK_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/generic/rte_vect.h b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_vect.h new file mode 100644 index 000000000..3fc47979f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/generic/rte_vect.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + */ + +#ifndef _RTE_VECT_H_ +#define _RTE_VECT_H_ + +/** + * @file + * SIMD vector types + * + * This file defines types to use vector instructions with generic C code. + */ + +#include <stdint.h> + +/* Unsigned vector types */ + +/** + * 64 bits vector size to use with unsigned 8 bits elements. + * + * a = (rte_v64u8_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef uint8_t rte_v64u8_t __attribute__((vector_size(8), aligned(8))); + +/** + * 64 bits vector size to use with unsigned 16 bits elements. + * + * a = (rte_v64u16_t){ a0, a1, a2, a3 } + */ +typedef uint16_t rte_v64u16_t __attribute__((vector_size(8), aligned(8))); + +/** + * 64 bits vector size to use with unsigned 32 bits elements. + * + * a = (rte_v64u32_t){ a0, a1 } + */ +typedef uint32_t rte_v64u32_t __attribute__((vector_size(8), aligned(8))); + +/** + * 128 bits vector size to use with unsigned 8 bits elements. + * + * a = (rte_v128u8_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15 } + */ +typedef uint8_t rte_v128u8_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with unsigned 16 bits elements. + * + * a = (rte_v128u16_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef uint16_t rte_v128u16_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with unsigned 32 bits elements. + * + * a = (rte_v128u32_t){ a0, a1, a2, a3 } + */ +typedef uint32_t rte_v128u32_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with unsigned 64 bits elements. + * + * a = (rte_v128u64_t){ a0, a1 } + */ +typedef uint64_t rte_v128u64_t __attribute__((vector_size(16), aligned(16))); + +/** + * 256 bits vector size to use with unsigned 8 bits elements. + * + * a = (rte_v256u8_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15, + * a16, a17, a18, a19, a20, a21, a22, a23, + * a24, a25, a26, a27, a28, a29, a30, a31 } + */ +typedef uint8_t rte_v256u8_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with unsigned 16 bits elements. + * + * a = (rte_v256u16_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15 } + */ +typedef uint16_t rte_v256u16_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with unsigned 32 bits elements. + * + * a = (rte_v256u32_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef uint32_t rte_v256u32_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with unsigned 64 bits elements. + * + * a = (rte_v256u64_t){ a0, a1, a2, a3 } + */ +typedef uint64_t rte_v256u64_t __attribute__((vector_size(32), aligned(32))); + + +/* Signed vector types */ + +/** + * 64 bits vector size to use with 8 bits elements. + * + * a = (rte_v64s8_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef int8_t rte_v64s8_t __attribute__((vector_size(8), aligned(8))); + +/** + * 64 bits vector size to use with 16 bits elements. + * + * a = (rte_v64s16_t){ a0, a1, a2, a3 } + */ +typedef int16_t rte_v64s16_t __attribute__((vector_size(8), aligned(8))); + +/** + * 64 bits vector size to use with 32 bits elements. + * + * a = (rte_v64s32_t){ a0, a1 } + */ +typedef int32_t rte_v64s32_t __attribute__((vector_size(8), aligned(8))); + +/** + * 128 bits vector size to use with 8 bits elements. + * + * a = (rte_v128s8_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15 } + */ +typedef int8_t rte_v128s8_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with 16 bits elements. + * + * a = (rte_v128s16_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef int16_t rte_v128s16_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with 32 bits elements. + * + * a = (rte_v128s32_t){ a0, a1, a2, a3 } + */ +typedef int32_t rte_v128s32_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with 64 bits elements. + * + * a = (rte_v128s64_t){ a1, a2 } + */ +typedef int64_t rte_v128s64_t __attribute__((vector_size(16), aligned(16))); + +/** + * 256 bits vector size to use with 8 bits elements. + * + * a = (rte_v256s8_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15, + * a16, a17, a18, a19, a20, a21, a22, a23, + * a24, a25, a26, a27, a28, a29, a30, a31 } + */ +typedef int8_t rte_v256s8_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with 16 bits elements. + * + * a = (rte_v256s16_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15 } + */ +typedef int16_t rte_v256s16_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with 32 bits elements. + * + * a = (rte_v256s32_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef int32_t rte_v256s32_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with 64 bits elements. + * + * a = (rte_v256s64_t){ a0, a1, a2, a3 } + */ +typedef int64_t rte_v256s64_t __attribute__((vector_size(32), aligned(32))); + +#endif /* _RTE_VECT_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/meson.build b/src/spdk/dpdk/lib/librte_eal/include/meson.build new file mode 100644 index 000000000..bc73ec2c5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/meson.build @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +includes += include_directories('.') + +headers += files( + 'rte_alarm.h', + 'rte_bitmap.h', + 'rte_branch_prediction.h', + 'rte_bus.h', + 'rte_class.h', + 'rte_common.h', + 'rte_compat.h', + 'rte_debug.h', + 'rte_dev.h', + 'rte_devargs.h', + 'rte_eal.h', + 'rte_eal_interrupts.h', + 'rte_eal_memconfig.h', + 'rte_eal_trace.h', + 'rte_errno.h', + 'rte_fbarray.h', + 'rte_hexdump.h', + 'rte_hypervisor.h', + 'rte_interrupts.h', + 'rte_keepalive.h', + 'rte_launch.h', + 'rte_lcore.h', + 'rte_log.h', + 'rte_malloc.h', + 'rte_memory.h', + 'rte_memzone.h', + 'rte_pci_dev_feature_defs.h', + 'rte_pci_dev_features.h', + 'rte_per_lcore.h', + 'rte_random.h', + 'rte_reciprocal.h', + 'rte_service.h', + 'rte_service_component.h', + 'rte_string_fns.h', + 'rte_tailq.h', + 'rte_time.h', + 'rte_trace.h', + 'rte_trace_point.h', + 'rte_trace_point_register.h', + 'rte_uuid.h', + 'rte_version.h', + 'rte_vfio.h', +) + +# special case install the generic headers, since they go in a subdir +generic_headers = files( + 'generic/rte_atomic.h', + 'generic/rte_byteorder.h', + 'generic/rte_cpuflags.h', + 'generic/rte_cycles.h', + 'generic/rte_io.h', + 'generic/rte_mcslock.h', + 'generic/rte_memcpy.h', + 'generic/rte_pause.h', + 'generic/rte_prefetch.h', + 'generic/rte_rwlock.h', + 'generic/rte_spinlock.h', + 'generic/rte_ticketlock.h', + 'generic/rte_vect.h', +) +install_headers(generic_headers, subdir: 'generic') diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_alarm.h b/src/spdk/dpdk/lib/librte_eal/include/rte_alarm.h new file mode 100644 index 000000000..7e4d0b240 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_alarm.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ALARM_H_ +#define _RTE_ALARM_H_ + +/** + * @file + * + * Alarm functions + * + * Simple alarm-clock functionality supplied by eal. + * Does not require hpet support. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> + +/** + * Signature of callback back function called when an alarm goes off. + */ +typedef void (*rte_eal_alarm_callback)(void *arg); + +/** + * Function to set a callback to be triggered when us microseconds + * have expired. Accuracy of timing to the microsecond is not guaranteed. The + * alarm function will not be called *before* the requested time, but may + * be called a short period of time afterwards. + * The alarm handler will be called only once. There is no need to call + * "rte_eal_alarm_cancel" from within the callback function. + * + * @param us + * The time in microseconds before the callback is called + * @param cb + * The function to be called when the alarm expires + * @param cb_arg + * Pointer parameter to be passed to the callback function + * + * @return + * On success, zero. + * On failure, a negative error number + */ +int rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb, void *cb_arg); + +/** + * Function to cancel an alarm callback which has been registered before. If + * used outside alarm callback it wait for all callbacks to finish execution. + * + * @param cb_fn + * alarm callback + * @param cb_arg + * Pointer parameter to be passed to the callback function. To remove all + * copies of a given callback function, irrespective of parameter, (void *)-1 + * can be used here. + * + * @return + * - value greater than 0 and rte_errno not changed - returned value is + * the number of canceled alarm callback functions + * - value greater or equal 0 and rte_errno set to EINPROGRESS, at least one + * alarm could not be canceled because cancellation was requested from alarm + * callback context. Returned value is the number of successfully canceled + * alarm callbacks + * - 0 and rte_errno set to ENOENT - no alarm found + * - -1 and rte_errno set to EINVAL - invalid parameter (NULL callback) + */ +int rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg); + +#ifdef __cplusplus +} +#endif + + +#endif /* _RTE_ALARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_bitmap.h b/src/spdk/dpdk/lib/librte_eal/include/rte_bitmap.h new file mode 100644 index 000000000..7c90ef333 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_bitmap.h @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_BITMAP_H__ +#define __INCLUDE_RTE_BITMAP_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Bitmap + * + * The bitmap component provides a mechanism to manage large arrays of bits + * through bit get/set/clear and bit array scan operations. + * + * The bitmap scan operation is optimized for 64-bit CPUs using 64/128 byte cache + * lines. The bitmap is hierarchically organized using two arrays (array1 and + * array2), with each bit in array1 being associated with a full cache line + * (512/1024 bits) of bitmap bits, which are stored in array2: the bit in array1 + * is set only when there is at least one bit set within its associated array2 + * bits, otherwise the bit in array1 is cleared. The read and write operations + * for array1 and array2 are always done in slabs of 64 bits. + * + * This bitmap is not thread safe. For lock free operation on a specific bitmap + * instance, a single writer thread performing bit set/clear operations is + * allowed, only the writer thread can do bitmap scan operations, while there + * can be several reader threads performing bit get operations in parallel with + * the writer thread. When the use of locking primitives is acceptable, the + * serialization of the bit set/clear and bitmap scan operations needs to be + * enforced by the caller, while the bit get operation does not require locking + * the bitmap. + * + ***/ + +#include <string.h> +#include <rte_common.h> +#include <rte_config.h> +#include <rte_debug.h> +#include <rte_memory.h> +#include <rte_branch_prediction.h> +#include <rte_prefetch.h> + +/* Slab */ +#define RTE_BITMAP_SLAB_BIT_SIZE 64 +#define RTE_BITMAP_SLAB_BIT_SIZE_LOG2 6 +#define RTE_BITMAP_SLAB_BIT_MASK (RTE_BITMAP_SLAB_BIT_SIZE - 1) + +/* Cache line (CL) */ +#define RTE_BITMAP_CL_BIT_SIZE (RTE_CACHE_LINE_SIZE * 8) +#define RTE_BITMAP_CL_BIT_SIZE_LOG2 (RTE_CACHE_LINE_SIZE_LOG2 + 3) +#define RTE_BITMAP_CL_BIT_MASK (RTE_BITMAP_CL_BIT_SIZE - 1) + +#define RTE_BITMAP_CL_SLAB_SIZE (RTE_BITMAP_CL_BIT_SIZE / RTE_BITMAP_SLAB_BIT_SIZE) +#define RTE_BITMAP_CL_SLAB_SIZE_LOG2 (RTE_BITMAP_CL_BIT_SIZE_LOG2 - RTE_BITMAP_SLAB_BIT_SIZE_LOG2) +#define RTE_BITMAP_CL_SLAB_MASK (RTE_BITMAP_CL_SLAB_SIZE - 1) + +/** Bitmap data structure */ +struct rte_bitmap { + /* Context for array1 and array2 */ + uint64_t *array1; /**< Bitmap array1 */ + uint64_t *array2; /**< Bitmap array2 */ + uint32_t array1_size; /**< Number of 64-bit slabs in array1 that are actually used */ + uint32_t array2_size; /**< Number of 64-bit slabs in array2 */ + + /* Context for the "scan next" operation */ + uint32_t index1; /**< Bitmap scan: Index of current array1 slab */ + uint32_t offset1; /**< Bitmap scan: Offset of current bit within current array1 slab */ + uint32_t index2; /**< Bitmap scan: Index of current array2 slab */ + uint32_t go2; /**< Bitmap scan: Go/stop condition for current array2 cache line */ + + /* Storage space for array1 and array2 */ + uint8_t memory[]; +}; + +static inline void +__rte_bitmap_index1_inc(struct rte_bitmap *bmp) +{ + bmp->index1 = (bmp->index1 + 1) & (bmp->array1_size - 1); +} + +static inline uint64_t +__rte_bitmap_mask1_get(struct rte_bitmap *bmp) +{ + return (~1llu) << bmp->offset1; +} + +static inline void +__rte_bitmap_index2_set(struct rte_bitmap *bmp) +{ + bmp->index2 = (((bmp->index1 << RTE_BITMAP_SLAB_BIT_SIZE_LOG2) + bmp->offset1) << RTE_BITMAP_CL_SLAB_SIZE_LOG2); +} + +static inline uint32_t +__rte_bitmap_get_memory_footprint(uint32_t n_bits, + uint32_t *array1_byte_offset, uint32_t *array1_slabs, + uint32_t *array2_byte_offset, uint32_t *array2_slabs) +{ + uint32_t n_slabs_context, n_slabs_array1, n_cache_lines_context_and_array1; + uint32_t n_cache_lines_array2; + uint32_t n_bytes_total; + + n_cache_lines_array2 = (n_bits + RTE_BITMAP_CL_BIT_SIZE - 1) / RTE_BITMAP_CL_BIT_SIZE; + n_slabs_array1 = (n_cache_lines_array2 + RTE_BITMAP_SLAB_BIT_SIZE - 1) / RTE_BITMAP_SLAB_BIT_SIZE; + n_slabs_array1 = rte_align32pow2(n_slabs_array1); + n_slabs_context = (sizeof(struct rte_bitmap) + (RTE_BITMAP_SLAB_BIT_SIZE / 8) - 1) / (RTE_BITMAP_SLAB_BIT_SIZE / 8); + n_cache_lines_context_and_array1 = (n_slabs_context + n_slabs_array1 + RTE_BITMAP_CL_SLAB_SIZE - 1) / RTE_BITMAP_CL_SLAB_SIZE; + n_bytes_total = (n_cache_lines_context_and_array1 + n_cache_lines_array2) * RTE_CACHE_LINE_SIZE; + + if (array1_byte_offset) { + *array1_byte_offset = n_slabs_context * (RTE_BITMAP_SLAB_BIT_SIZE / 8); + } + if (array1_slabs) { + *array1_slabs = n_slabs_array1; + } + if (array2_byte_offset) { + *array2_byte_offset = n_cache_lines_context_and_array1 * RTE_CACHE_LINE_SIZE; + } + if (array2_slabs) { + *array2_slabs = n_cache_lines_array2 * RTE_BITMAP_CL_SLAB_SIZE; + } + + return n_bytes_total; +} + +static inline void +__rte_bitmap_scan_init(struct rte_bitmap *bmp) +{ + bmp->index1 = bmp->array1_size - 1; + bmp->offset1 = RTE_BITMAP_SLAB_BIT_SIZE - 1; + __rte_bitmap_index2_set(bmp); + bmp->index2 += RTE_BITMAP_CL_SLAB_SIZE; + + bmp->go2 = 0; +} + +/** + * Bitmap memory footprint calculation + * + * @param n_bits + * Number of bits in the bitmap + * @return + * Bitmap memory footprint measured in bytes on success, 0 on error + */ +static inline uint32_t +rte_bitmap_get_memory_footprint(uint32_t n_bits) { + /* Check input arguments */ + if (n_bits == 0) { + return 0; + } + + return __rte_bitmap_get_memory_footprint(n_bits, NULL, NULL, NULL, NULL); +} + +/** + * Bitmap initialization + * + * @param n_bits + * Number of pre-allocated bits in array2. + * @param mem + * Base address of array1 and array2. + * @param mem_size + * Minimum expected size of bitmap. + * @return + * Handle to bitmap instance. + */ +static inline struct rte_bitmap * +rte_bitmap_init(uint32_t n_bits, uint8_t *mem, uint32_t mem_size) +{ + struct rte_bitmap *bmp; + uint32_t array1_byte_offset, array1_slabs, array2_byte_offset, array2_slabs; + uint32_t size; + + /* Check input arguments */ + if (n_bits == 0) { + return NULL; + } + + if ((mem == NULL) || (((uintptr_t) mem) & RTE_CACHE_LINE_MASK)) { + return NULL; + } + + size = __rte_bitmap_get_memory_footprint(n_bits, + &array1_byte_offset, &array1_slabs, + &array2_byte_offset, &array2_slabs); + if (size < mem_size) { + return NULL; + } + + /* Setup bitmap */ + memset(mem, 0, size); + bmp = (struct rte_bitmap *) mem; + + bmp->array1 = (uint64_t *) &mem[array1_byte_offset]; + bmp->array1_size = array1_slabs; + bmp->array2 = (uint64_t *) &mem[array2_byte_offset]; + bmp->array2_size = array2_slabs; + + __rte_bitmap_scan_init(bmp); + + return bmp; +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Bitmap clear slab overhead bits. + * + * @param slabs + * Slab array. + * @param slab_size + * Number of 64-bit slabs in the slabs array. + * @param pos + * The start bit position in the slabs to be cleared. + */ +__rte_experimental +static inline void +__rte_bitmap_clear_slab_overhead_bits(uint64_t *slabs, uint32_t slab_size, + uint32_t pos) +{ + uint32_t i; + uint32_t index = pos / RTE_BITMAP_SLAB_BIT_SIZE; + uint32_t offset = pos & RTE_BITMAP_SLAB_BIT_MASK; + + if (offset) { + for (i = offset; i < RTE_BITMAP_SLAB_BIT_SIZE; i++) + slabs[index] &= ~(1llu << i); + index++; + } + if (index < slab_size) + memset(&slabs[index], 0, sizeof(slabs[0]) * + (slab_size - index)); +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Bitmap initialization with all bits set + * + * @param n_bits + * Number of pre-allocated bits in array2. + * @param mem + * Base address of array1 and array2. + * @param mem_size + * Minimum expected size of bitmap. + * @return + * Handle to bitmap instance. + */ +__rte_experimental +static inline struct rte_bitmap * +rte_bitmap_init_with_all_set(uint32_t n_bits, uint8_t *mem, uint32_t mem_size) +{ + struct rte_bitmap *bmp; + uint32_t array1_byte_offset, array1_slabs; + uint32_t array2_byte_offset, array2_slabs; + uint32_t size; + + /* Check input arguments */ + if (!n_bits || !mem || (((uintptr_t) mem) & RTE_CACHE_LINE_MASK)) + return NULL; + + size = __rte_bitmap_get_memory_footprint(n_bits, + &array1_byte_offset, &array1_slabs, + &array2_byte_offset, &array2_slabs); + if (size < mem_size) + return NULL; + + /* Setup bitmap */ + bmp = (struct rte_bitmap *) mem; + bmp->array1 = (uint64_t *) &mem[array1_byte_offset]; + bmp->array1_size = array1_slabs; + bmp->array2 = (uint64_t *) &mem[array2_byte_offset]; + bmp->array2_size = array2_slabs; + + __rte_bitmap_scan_init(bmp); + + memset(bmp->array1, 0xff, bmp->array1_size * sizeof(bmp->array1[0])); + memset(bmp->array2, 0xff, bmp->array2_size * sizeof(bmp->array2[0])); + /* Clear overhead bits. */ + __rte_bitmap_clear_slab_overhead_bits(bmp->array1, bmp->array1_size, + bmp->array2_size >> RTE_BITMAP_CL_SLAB_SIZE_LOG2); + __rte_bitmap_clear_slab_overhead_bits(bmp->array2, bmp->array2_size, + n_bits); + return bmp; +} + +/** + * Bitmap free + * + * @param bmp + * Handle to bitmap instance + * @return + * 0 upon success, error code otherwise + */ +static inline int +rte_bitmap_free(struct rte_bitmap *bmp) +{ + /* Check input arguments */ + if (bmp == NULL) { + return -1; + } + + return 0; +} + +/** + * Bitmap reset + * + * @param bmp + * Handle to bitmap instance + */ +static inline void +rte_bitmap_reset(struct rte_bitmap *bmp) +{ + memset(bmp->array1, 0, bmp->array1_size * sizeof(uint64_t)); + memset(bmp->array2, 0, bmp->array2_size * sizeof(uint64_t)); + __rte_bitmap_scan_init(bmp); +} + +/** + * Bitmap location prefetch into CPU L1 cache + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position + * @return + * 0 upon success, error code otherwise + */ +static inline void +rte_bitmap_prefetch0(struct rte_bitmap *bmp, uint32_t pos) +{ + uint64_t *slab2; + uint32_t index2; + + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + slab2 = bmp->array2 + index2; + rte_prefetch0((void *) slab2); +} + +/** + * Bitmap bit get + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position + * @return + * 0 when bit is cleared, non-zero when bit is set + */ +static inline uint64_t +rte_bitmap_get(struct rte_bitmap *bmp, uint32_t pos) +{ + uint64_t *slab2; + uint32_t index2, offset2; + + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK; + slab2 = bmp->array2 + index2; + return (*slab2) & (1llu << offset2); +} + +/** + * Bitmap bit set + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position + */ +static inline void +rte_bitmap_set(struct rte_bitmap *bmp, uint32_t pos) +{ + uint64_t *slab1, *slab2; + uint32_t index1, index2, offset1, offset2; + + /* Set bit in array2 slab and set bit in array1 slab */ + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK; + index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2); + offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK; + slab2 = bmp->array2 + index2; + slab1 = bmp->array1 + index1; + + *slab2 |= 1llu << offset2; + *slab1 |= 1llu << offset1; +} + +/** + * Bitmap slab set + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position identifying the array2 slab + * @param slab + * Value to be assigned to the 64-bit slab in array2 + */ +static inline void +rte_bitmap_set_slab(struct rte_bitmap *bmp, uint32_t pos, uint64_t slab) +{ + uint64_t *slab1, *slab2; + uint32_t index1, index2, offset1; + + /* Set bits in array2 slab and set bit in array1 slab */ + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2); + offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK; + slab2 = bmp->array2 + index2; + slab1 = bmp->array1 + index1; + + *slab2 |= slab; + *slab1 |= 1llu << offset1; +} + +static inline uint64_t +__rte_bitmap_line_not_empty(uint64_t *slab2) +{ + uint64_t v1, v2, v3, v4; + + v1 = slab2[0] | slab2[1]; + v2 = slab2[2] | slab2[3]; + v3 = slab2[4] | slab2[5]; + v4 = slab2[6] | slab2[7]; + v1 |= v2; + v3 |= v4; + + return v1 | v3; +} + +/** + * Bitmap bit clear + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position + */ +static inline void +rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos) +{ + uint64_t *slab1, *slab2; + uint32_t index1, index2, offset1, offset2; + + /* Clear bit in array2 slab */ + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK; + slab2 = bmp->array2 + index2; + + /* Return if array2 slab is not all-zeros */ + *slab2 &= ~(1llu << offset2); + if (*slab2){ + return; + } + + /* Check the entire cache line of array2 for all-zeros */ + index2 &= ~ RTE_BITMAP_CL_SLAB_MASK; + slab2 = bmp->array2 + index2; + if (__rte_bitmap_line_not_empty(slab2)) { + return; + } + + /* The array2 cache line is all-zeros, so clear bit in array1 slab */ + index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2); + offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK; + slab1 = bmp->array1 + index1; + *slab1 &= ~(1llu << offset1); + + return; +} + +static inline int +__rte_bitmap_scan_search(struct rte_bitmap *bmp) +{ + uint64_t value1; + uint32_t i; + + /* Check current array1 slab */ + value1 = bmp->array1[bmp->index1]; + value1 &= __rte_bitmap_mask1_get(bmp); + + if (rte_bsf64_safe(value1, &bmp->offset1)) + return 1; + + __rte_bitmap_index1_inc(bmp); + bmp->offset1 = 0; + + /* Look for another array1 slab */ + for (i = 0; i < bmp->array1_size; i ++, __rte_bitmap_index1_inc(bmp)) { + value1 = bmp->array1[bmp->index1]; + + if (rte_bsf64_safe(value1, &bmp->offset1)) + return 1; + } + + return 0; +} + +static inline void +__rte_bitmap_scan_read_init(struct rte_bitmap *bmp) +{ + __rte_bitmap_index2_set(bmp); + bmp->go2 = 1; + rte_prefetch1((void *)(bmp->array2 + bmp->index2 + 8)); +} + +static inline int +__rte_bitmap_scan_read(struct rte_bitmap *bmp, uint32_t *pos, uint64_t *slab) +{ + uint64_t *slab2; + + slab2 = bmp->array2 + bmp->index2; + for ( ; bmp->go2 ; bmp->index2 ++, slab2 ++, bmp->go2 = bmp->index2 & RTE_BITMAP_CL_SLAB_MASK) { + if (*slab2) { + *pos = bmp->index2 << RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + *slab = *slab2; + + bmp->index2 ++; + slab2 ++; + bmp->go2 = bmp->index2 & RTE_BITMAP_CL_SLAB_MASK; + return 1; + } + } + + return 0; +} + +/** + * Bitmap scan (with automatic wrap-around) + * + * @param bmp + * Handle to bitmap instance + * @param pos + * When function call returns 1, pos contains the position of the next set + * bit, otherwise not modified + * @param slab + * When function call returns 1, slab contains the value of the entire 64-bit + * slab where the bit indicated by pos is located. Slabs are always 64-bit + * aligned, so the position of the first bit of the slab (this bit is not + * necessarily set) is pos / 64. Once a slab has been returned by the bitmap + * scan operation, the internal pointers of the bitmap are updated to point + * after this slab, so the same slab will not be returned again if it + * contains more than one bit which is set. When function call returns 0, + * slab is not modified. + * @return + * 0 if there is no bit set in the bitmap, 1 otherwise + */ +static inline int +rte_bitmap_scan(struct rte_bitmap *bmp, uint32_t *pos, uint64_t *slab) +{ + /* Return data from current array2 line if available */ + if (__rte_bitmap_scan_read(bmp, pos, slab)) { + return 1; + } + + /* Look for non-empty array2 line */ + if (__rte_bitmap_scan_search(bmp)) { + __rte_bitmap_scan_read_init(bmp); + __rte_bitmap_scan_read(bmp, pos, slab); + return 1; + } + + /* Empty bitmap */ + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_BITMAP_H__ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_branch_prediction.h b/src/spdk/dpdk/lib/librte_eal/include/rte_branch_prediction.h new file mode 100644 index 000000000..854ef9e5d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_branch_prediction.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Branch Prediction Helpers in RTE + */ + +#ifndef _RTE_BRANCH_PREDICTION_H_ +#define _RTE_BRANCH_PREDICTION_H_ + +/** + * Check if a branch is likely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * likely to be taken. Example: + * + * if (likely(x > 1)) + * do_stuff(); + * + */ +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif /* likely */ + +/** + * Check if a branch is unlikely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * unlikely to be taken. Example: + * + * if (unlikely(x < 1)) + * do_stuff(); + * + */ +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* unlikely */ + +#endif /* _RTE_BRANCH_PREDICTION_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_bus.h b/src/spdk/dpdk/lib/librte_eal/include/rte_bus.h new file mode 100644 index 000000000..d3034d0ed --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_bus.h @@ -0,0 +1,389 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 NXP + */ + +#ifndef _RTE_BUS_H_ +#define _RTE_BUS_H_ + +/** + * @file + * + * DPDK device bus interface + * + * This file exposes API and interfaces for bus abstraction + * over the devices and drivers in EAL. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdio.h> +#include <sys/queue.h> + +#include <rte_log.h> +#include <rte_dev.h> + +/** Double linked list of buses */ +TAILQ_HEAD(rte_bus_list, rte_bus); + + +/** + * IOVA mapping mode. + * + * IOVA mapping mode is iommu programming mode of a device. + * That device (for example: IOMMU backed DMA device) based + * on rte_iova_mode will generate physical or virtual address. + * + */ +enum rte_iova_mode { + RTE_IOVA_DC = 0, /* Don't care mode */ + RTE_IOVA_PA = (1 << 0), /* DMA using physical address */ + RTE_IOVA_VA = (1 << 1) /* DMA using virtual address */ +}; + +/** + * Bus specific scan for devices attached on the bus. + * For each bus object, the scan would be responsible for finding devices and + * adding them to its private device list. + * + * A bus should mandatorily implement this method. + * + * @return + * 0 for successful scan + * <0 for unsuccessful scan with error value + */ +typedef int (*rte_bus_scan_t)(void); + +/** + * Implementation specific probe function which is responsible for linking + * devices on that bus with applicable drivers. + * + * This is called while iterating over each registered bus. + * + * @return + * 0 for successful probe + * !0 for any error while probing + */ +typedef int (*rte_bus_probe_t)(void); + +/** + * Device iterator to find a device on a bus. + * + * This function returns an rte_device if one of those held by the bus + * matches the data passed as parameter. + * + * If the comparison function returns zero this function should stop iterating + * over any more devices. To continue a search the device of a previous search + * can be passed via the start parameter. + * + * @param cmp + * Comparison function. + * + * @param data + * Data to compare each device against. + * + * @param start + * starting point for the iteration + * + * @return + * The first device matching the data, NULL if none exists. + */ +typedef struct rte_device * +(*rte_bus_find_device_t)(const struct rte_device *start, rte_dev_cmp_t cmp, + const void *data); + +/** + * Implementation specific probe function which is responsible for linking + * devices on that bus with applicable drivers. + * + * @param dev + * Device pointer that was returned by a previous call to find_device. + * + * @return + * 0 on success. + * !0 on error. + */ +typedef int (*rte_bus_plug_t)(struct rte_device *dev); + +/** + * Implementation specific remove function which is responsible for unlinking + * devices on that bus from assigned driver. + * + * @param dev + * Device pointer that was returned by a previous call to find_device. + * + * @return + * 0 on success. + * !0 on error. + */ +typedef int (*rte_bus_unplug_t)(struct rte_device *dev); + +/** + * Bus specific parsing function. + * Validates the syntax used in the textual representation of a device, + * If the syntax is valid and ``addr`` is not NULL, writes the bus-specific + * device representation to ``addr``. + * + * @param[in] name + * device textual description + * + * @param[out] addr + * device information location address, into which parsed info + * should be written. If NULL, nothing should be written, which + * is not an error. + * + * @return + * 0 if parsing was successful. + * !0 for any error. + */ +typedef int (*rte_bus_parse_t)(const char *name, void *addr); + +/** + * Device level DMA map function. + * After a successful call, the memory segment will be mapped to the + * given device. + * + * @param dev + * Device pointer. + * @param addr + * Virtual address to map. + * @param iova + * IOVA address to map. + * @param len + * Length of the memory segment being mapped. + * + * @return + * 0 if mapping was successful. + * Negative value and rte_errno is set otherwise. + */ +typedef int (*rte_dev_dma_map_t)(struct rte_device *dev, void *addr, + uint64_t iova, size_t len); + +/** + * Device level DMA unmap function. + * After a successful call, the memory segment will no longer be + * accessible by the given device. + * + * @param dev + * Device pointer. + * @param addr + * Virtual address to unmap. + * @param iova + * IOVA address to unmap. + * @param len + * Length of the memory segment being mapped. + * + * @return + * 0 if un-mapping was successful. + * Negative value and rte_errno is set otherwise. + */ +typedef int (*rte_dev_dma_unmap_t)(struct rte_device *dev, void *addr, + uint64_t iova, size_t len); + +/** + * Implement a specific hot-unplug handler, which is responsible for + * handle the failure when device be hot-unplugged. When the event of + * hot-unplug be detected, it could call this function to handle + * the hot-unplug failure and avoid app crash. + * @param dev + * Pointer of the device structure. + * + * @return + * 0 on success. + * !0 on error. + */ +typedef int (*rte_bus_hot_unplug_handler_t)(struct rte_device *dev); + +/** + * Implement a specific sigbus handler, which is responsible for handling + * the sigbus error which is either original memory error, or specific memory + * error that caused of device be hot-unplugged. When sigbus error be captured, + * it could call this function to handle sigbus error. + * @param failure_addr + * Pointer of the fault address of the sigbus error. + * + * @return + * 0 for success handle the sigbus for hot-unplug. + * 1 for not process it, because it is a generic sigbus error. + * -1 for failed to handle the sigbus for hot-unplug. + */ +typedef int (*rte_bus_sigbus_handler_t)(const void *failure_addr); + +/** + * Bus scan policies + */ +enum rte_bus_scan_mode { + RTE_BUS_SCAN_UNDEFINED, + RTE_BUS_SCAN_WHITELIST, + RTE_BUS_SCAN_BLACKLIST, +}; + +/** + * A structure used to configure bus operations. + */ +struct rte_bus_conf { + enum rte_bus_scan_mode scan_mode; /**< Scan policy. */ +}; + + +/** + * Get common iommu class of the all the devices on the bus. The bus may + * check that those devices are attached to iommu driver. + * If no devices are attached to the bus. The bus may return with don't care + * (_DC) value. + * Otherwise, The bus will return appropriate _pa or _va iova mode. + * + * @return + * enum rte_iova_mode value. + */ +typedef enum rte_iova_mode (*rte_bus_get_iommu_class_t)(void); + + +/** + * A structure describing a generic bus. + */ +struct rte_bus { + TAILQ_ENTRY(rte_bus) next; /**< Next bus object in linked list */ + const char *name; /**< Name of the bus */ + rte_bus_scan_t scan; /**< Scan for devices attached to bus */ + rte_bus_probe_t probe; /**< Probe devices on bus */ + rte_bus_find_device_t find_device; /**< Find a device on the bus */ + rte_bus_plug_t plug; /**< Probe single device for drivers */ + rte_bus_unplug_t unplug; /**< Remove single device from driver */ + rte_bus_parse_t parse; /**< Parse a device name */ + rte_dev_dma_map_t dma_map; /**< DMA map for device in the bus */ + rte_dev_dma_unmap_t dma_unmap; /**< DMA unmap for device in the bus */ + struct rte_bus_conf conf; /**< Bus configuration */ + rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */ + rte_dev_iterate_t dev_iterate; /**< Device iterator. */ + rte_bus_hot_unplug_handler_t hot_unplug_handler; + /**< handle hot-unplug failure on the bus */ + rte_bus_sigbus_handler_t sigbus_handler; + /**< handle sigbus error on the bus */ + +}; + +/** + * Register a Bus handler. + * + * @param bus + * A pointer to a rte_bus structure describing the bus + * to be registered. + */ +void rte_bus_register(struct rte_bus *bus); + +/** + * Unregister a Bus handler. + * + * @param bus + * A pointer to a rte_bus structure describing the bus + * to be unregistered. + */ +void rte_bus_unregister(struct rte_bus *bus); + +/** + * Scan all the buses. + * + * @return + * 0 in case of success in scanning all buses + * !0 in case of failure to scan + */ +int rte_bus_scan(void); + +/** + * For each device on the buses, perform a driver 'match' and call the + * driver-specific probe for device initialization. + * + * @return + * 0 for successful match/probe + * !0 otherwise + */ +int rte_bus_probe(void); + +/** + * Dump information of all the buses registered with EAL. + * + * @param f + * A valid and open output stream handle + */ +void rte_bus_dump(FILE *f); + +/** + * Bus comparison function. + * + * @param bus + * Bus under test. + * + * @param data + * Data to compare against. + * + * @return + * 0 if the bus matches the data. + * !0 if the bus does not match. + * <0 if ordering is possible and the bus is lower than the data. + * >0 if ordering is possible and the bus is greater than the data. + */ +typedef int (*rte_bus_cmp_t)(const struct rte_bus *bus, const void *data); + +/** + * Bus iterator to find a particular bus. + * + * This function compares each registered bus to find one that matches + * the data passed as parameter. + * + * If the comparison function returns zero this function will stop iterating + * over any more buses. To continue a search the bus of a previous search can + * be passed via the start parameter. + * + * @param start + * Starting point for the iteration. + * + * @param cmp + * Comparison function. + * + * @param data + * Data to pass to comparison function. + * + * @return + * A pointer to a rte_bus structure or NULL in case no bus matches + */ +struct rte_bus *rte_bus_find(const struct rte_bus *start, rte_bus_cmp_t cmp, + const void *data); + +/** + * Find the registered bus for a particular device. + */ +struct rte_bus *rte_bus_find_by_device(const struct rte_device *dev); + +/** + * Find the registered bus for a given name. + */ +struct rte_bus *rte_bus_find_by_name(const char *busname); + + +/** + * Get the common iommu class of devices bound on to buses available in the + * system. RTE_IOVA_DC means that no preference has been expressed. + * + * @return + * enum rte_iova_mode value. + */ +enum rte_iova_mode rte_bus_get_iommu_class(void); + +/** + * Helper for Bus registration. + * The constructor has higher priority than PMD constructors. + */ +#define RTE_REGISTER_BUS(nm, bus) \ +RTE_INIT_PRIO(businitfn_ ##nm, BUS) \ +{\ + (bus).name = RTE_STR(nm);\ + rte_bus_register(&bus); \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BUS_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_class.h b/src/spdk/dpdk/lib/librte_eal/include/rte_class.h new file mode 100644 index 000000000..856d09b22 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_class.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Gaëtan Rivet + */ + +#ifndef _RTE_CLASS_H_ +#define _RTE_CLASS_H_ + +/** + * @file + * + * DPDK device class interface. + * + * This file describes the interface of the device class + * abstraction layer. + * + * A device class defines the type of function a device + * will be used for e.g.: Ethernet adapter (eth), + * cryptographic co-processor (crypto), etc. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/queue.h> + +#include <rte_dev.h> + +/** Double linked list of classes */ +TAILQ_HEAD(rte_class_list, rte_class); + +/** + * A structure describing a generic device class. + */ +struct rte_class { + TAILQ_ENTRY(rte_class) next; /**< Next device class in linked list */ + const char *name; /**< Name of the class */ + rte_dev_iterate_t dev_iterate; /**< Device iterator. */ +}; + +/** + * Class comparison function. + * + * @param cls + * Class under test. + * + * @param data + * Data to compare against. + * + * @return + * 0 if the class matches the data. + * !0 if the class does not match. + * <0 if ordering is possible and the class is lower than the data. + * >0 if ordering is possible and the class is greater than the data. + */ +typedef int (*rte_class_cmp_t)(const struct rte_class *cls, const void *data); + +/** + * Class iterator to find a particular class. + * + * This function compares each registered class to find one that matches + * the data passed as parameter. + * + * If the comparison function returns zero this function will stop iterating + * over any more classes. To continue a search the class of a previous search + * can be passed via the start parameter. + * + * @param start + * Starting point for the iteration. + * + * @param cmp + * Comparison function. + * + * @param data + * Data to pass to comparison function. + * + * @return + * A pointer to a rte_class structure or NULL in case no class matches + */ +__rte_experimental +struct rte_class * +rte_class_find(const struct rte_class *start, rte_class_cmp_t cmp, + const void *data); + +/** + * Find the registered class for a given name. + */ +__rte_experimental +struct rte_class * +rte_class_find_by_name(const char *name); + +/** + * Register a Class handle. + * + * @param cls + * A pointer to a rte_class structure describing the class + * to be registered. + */ +__rte_experimental +void rte_class_register(struct rte_class *cls); + +/** + * Unregister a Class handle. + * + * @param cls + * A pointer to a rte_class structure describing the class + * to be unregistered. + */ +__rte_experimental +void rte_class_unregister(struct rte_class *cls); + +/** + * Helper for Class registration. + * The constructor has lower priority than Bus constructors. + * The constructor has higher priority than PMD constructors. + */ +#define RTE_REGISTER_CLASS(nm, cls) \ +RTE_INIT_PRIO(classinitfn_ ##nm, CLASS) \ +{\ + (cls).name = RTE_STR(nm); \ + rte_class_register(&cls); \ +} + +#define RTE_UNREGISTER_CLASS(nm, cls) \ +RTE_FINI_PRIO(classfinifn_ ##nm, CLASS) \ +{ \ + rte_class_unregister(&cls); \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CLASS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_common.h b/src/spdk/dpdk/lib/librte_eal/include/rte_common.h new file mode 100644 index 000000000..0843ce69e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_common.h @@ -0,0 +1,842 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_COMMON_H_ +#define _RTE_COMMON_H_ + +/** + * @file + * + * Generic, commonly-used macro and inline function definitions + * for DPDK. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <stdlib.h> +#include <ctype.h> +#include <errno.h> +#include <limits.h> + +#include <rte_config.h> + +/* OS specific include */ +#include <rte_os.h> + +#ifndef typeof +#define typeof __typeof__ +#endif + +#ifndef asm +#define asm __asm__ +#endif + +/** C extension macro for environments lacking C11 features. */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L +#define RTE_STD_C11 __extension__ +#else +#define RTE_STD_C11 +#endif + +/* + * RTE_TOOLCHAIN_GCC is defined if the target is built with GCC, + * while a host application (like pmdinfogen) may have another compiler. + * RTE_CC_IS_GNU is true if the file is compiled with GCC, + * no matter it is a target or host application. + */ +#define RTE_CC_IS_GNU 0 +#if defined __clang__ +#define RTE_CC_CLANG +#elif defined __INTEL_COMPILER +#define RTE_CC_ICC +#elif defined __GNUC__ +#define RTE_CC_GCC +#undef RTE_CC_IS_GNU +#define RTE_CC_IS_GNU 1 +#endif +#if RTE_CC_IS_GNU +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + \ + __GNUC_PATCHLEVEL__) +#endif + +/** + * Force alignment + */ +#define __rte_aligned(a) __attribute__((__aligned__(a))) + +#ifdef RTE_ARCH_STRICT_ALIGN +typedef uint64_t unaligned_uint64_t __rte_aligned(1); +typedef uint32_t unaligned_uint32_t __rte_aligned(1); +typedef uint16_t unaligned_uint16_t __rte_aligned(1); +#else +typedef uint64_t unaligned_uint64_t; +typedef uint32_t unaligned_uint32_t; +typedef uint16_t unaligned_uint16_t; +#endif + +/** + * Force a structure to be packed + */ +#define __rte_packed __attribute__((__packed__)) + +/******* Macro to mark functions and fields scheduled for removal *****/ +#define __rte_deprecated __attribute__((__deprecated__)) + +/** + * Mark a function or variable to a weak reference. + */ +#define __rte_weak __attribute__((__weak__)) + +/** + * Force symbol to be generated even if it appears to be unused. + */ +#define __rte_used __attribute__((used)) + +/*********** Macros to eliminate unused variable warnings ********/ + +/** + * short definition to mark a function parameter unused + */ +#define __rte_unused __attribute__((__unused__)) + +/** + * definition to mark a variable or function parameter as used so + * as to avoid a compiler warning + */ +#define RTE_SET_USED(x) (void)(x) + +/** + * Check format string and its arguments at compile-time. + * + * GCC on Windows assumes MS-specific format string by default, + * even if the underlying stdio implementation is ANSI-compliant, + * so this must be overridden. + */ +#if RTE_CC_IS_GNU +#define __rte_format_printf(format_index, first_arg) \ + __attribute__((format(gnu_printf, format_index, first_arg))) +#else +#define __rte_format_printf(format_index, first_arg) \ + __attribute__((format(printf, format_index, first_arg))) +#endif + +#define RTE_PRIORITY_LOG 101 +#define RTE_PRIORITY_BUS 110 +#define RTE_PRIORITY_CLASS 120 +#define RTE_PRIORITY_LAST 65535 + +#define RTE_PRIO(prio) \ + RTE_PRIORITY_ ## prio + +/** + * Run function before main() with high priority. + * + * @param func + * Constructor function. + * @param prio + * Priority number must be above 100. + * Lowest number is the first to run. + */ +#ifndef RTE_INIT_PRIO /* Allow to override from EAL */ +#define RTE_INIT_PRIO(func, prio) \ +static void __attribute__((constructor(RTE_PRIO(prio)), used)) func(void) +#endif + +/** + * Run function before main() with low priority. + * + * The constructor will be run after prioritized constructors. + * + * @param func + * Constructor function. + */ +#define RTE_INIT(func) \ + RTE_INIT_PRIO(func, LAST) + +/** + * Run after main() with low priority. + * + * @param func + * Destructor function name. + * @param prio + * Priority number must be above 100. + * Lowest number is the last to run. + */ +#ifndef RTE_FINI_PRIO /* Allow to override from EAL */ +#define RTE_FINI_PRIO(func, prio) \ +static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void) +#endif + +/** + * Run after main() with high priority. + * + * The destructor will be run *before* prioritized destructors. + * + * @param func + * Destructor function name. + */ +#define RTE_FINI(func) \ + RTE_FINI_PRIO(func, LAST) + +/** + * Hint never returning function + */ +#define __rte_noreturn __attribute__((noreturn)) + +/** + * Force a function to be inlined + */ +#define __rte_always_inline inline __attribute__((always_inline)) + +/** + * Force a function to be noinlined + */ +#define __rte_noinline __attribute__((noinline)) + +/** + * Hint function in the hot path + */ +#define __rte_hot __attribute__((hot)) + +/** + * Hint function in the cold path + */ +#define __rte_cold __attribute__((cold)) + +/*********** Macros for pointer arithmetic ********/ + +/** + * add a byte-value offset to a pointer + */ +#define RTE_PTR_ADD(ptr, x) ((void*)((uintptr_t)(ptr) + (x))) + +/** + * subtract a byte-value offset from a pointer + */ +#define RTE_PTR_SUB(ptr, x) ((void*)((uintptr_t)ptr - (x))) + +/** + * get the difference between two pointer values, i.e. how far apart + * in bytes are the locations they point two. It is assumed that + * ptr1 is greater than ptr2. + */ +#define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2)) + +/** + * Workaround to cast a const field of a structure to non-const type. + */ +#define RTE_CAST_FIELD(var, field, type) \ + (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field))) + +/*********** Macros/static functions for doing alignment ********/ + + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no higher than the first parameter. Second parameter + * must be a power-of-two value. + */ +#define RTE_PTR_ALIGN_FLOOR(ptr, align) \ + ((typeof(ptr))RTE_ALIGN_FLOOR((uintptr_t)ptr, align)) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no + * bigger than the first parameter. Second parameter must be a + * power-of-two value. + */ +#define RTE_ALIGN_FLOOR(val, align) \ + (typeof(val))((val) & (~((typeof(val))((align) - 1)))) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + */ +#define RTE_PTR_ALIGN_CEIL(ptr, align) \ + RTE_PTR_ALIGN_FLOOR((typeof(ptr))RTE_PTR_ADD(ptr, (align) - 1), align) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no lower + * than the first parameter. Second parameter must be a power-of-two + * value. + */ +#define RTE_ALIGN_CEIL(val, align) \ + RTE_ALIGN_FLOOR(((val) + ((typeof(val)) (align) - 1)), align) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + * This function is the same as RTE_PTR_ALIGN_CEIL + */ +#define RTE_PTR_ALIGN(ptr, align) RTE_PTR_ALIGN_CEIL(ptr, align) + +/** + * Macro to align a value to a given power-of-two. The resultant + * value will be of the same type as the first parameter, and + * will be no lower than the first parameter. Second parameter + * must be a power-of-two value. + * This function is the same as RTE_ALIGN_CEIL + */ +#define RTE_ALIGN(val, align) RTE_ALIGN_CEIL(val, align) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no lower + * than the first parameter. + */ +#define RTE_ALIGN_MUL_CEIL(v, mul) \ + (((v + (typeof(v))(mul) - 1) / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no higher + * than the first parameter. + */ +#define RTE_ALIGN_MUL_FLOOR(v, mul) \ + ((v / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align value to the nearest multiple of the given value. + * The resultant value might be greater than or less than the first parameter + * whichever difference is the lowest. + */ +#define RTE_ALIGN_MUL_NEAR(v, mul) \ + ({ \ + typeof(v) ceil = RTE_ALIGN_MUL_CEIL(v, mul); \ + typeof(v) floor = RTE_ALIGN_MUL_FLOOR(v, mul); \ + (ceil - v) > (v - floor) ? floor : ceil; \ + }) + +/** + * Checks if a pointer is aligned to a given power-of-two value + * + * @param ptr + * The pointer whose alignment is to be checked + * @param align + * The power-of-two value to which the ptr should be aligned + * + * @return + * True(1) where the pointer is correctly aligned, false(0) otherwise + */ +static inline int +rte_is_aligned(void *ptr, unsigned align) +{ + return RTE_PTR_ALIGN(ptr, align) == ptr; +} + +/*********** Macros for compile type checks ********/ + +/** + * Triggers an error at compilation time if the condition is true. + */ +#define RTE_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +/*********** Cache line related macros ********/ + +/** Cache line mask. */ +#define RTE_CACHE_LINE_MASK (RTE_CACHE_LINE_SIZE-1) + +/** Return the first cache-aligned value greater or equal to size. */ +#define RTE_CACHE_LINE_ROUNDUP(size) \ + (RTE_CACHE_LINE_SIZE * ((size + RTE_CACHE_LINE_SIZE - 1) / \ + RTE_CACHE_LINE_SIZE)) + +/** Cache line size in terms of log2 */ +#if RTE_CACHE_LINE_SIZE == 64 +#define RTE_CACHE_LINE_SIZE_LOG2 6 +#elif RTE_CACHE_LINE_SIZE == 128 +#define RTE_CACHE_LINE_SIZE_LOG2 7 +#else +#error "Unsupported cache line size" +#endif + +/** Minimum Cache line size. */ +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/** Force alignment to cache line. */ +#define __rte_cache_aligned __rte_aligned(RTE_CACHE_LINE_SIZE) + +/** Force minimum cache line alignment. */ +#define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE) + +/*********** PA/IOVA type definitions ********/ + +/** Physical address */ +typedef uint64_t phys_addr_t; +#define RTE_BAD_PHYS_ADDR ((phys_addr_t)-1) + +/** + * IO virtual address type. + * When the physical addressing mode (IOVA as PA) is in use, + * the translation from an IO virtual address (IOVA) to a physical address + * is a direct mapping, i.e. the same value. + * Otherwise, in virtual mode (IOVA as VA), an IOMMU may do the translation. + */ +typedef uint64_t rte_iova_t; +#define RTE_BAD_IOVA ((rte_iova_t)-1) + +/*********** Structure alignment markers ********/ + +/** Generic marker for any place in a structure. */ +__extension__ typedef void *RTE_MARKER[0]; +/** Marker for 1B alignment in a structure. */ +__extension__ typedef uint8_t RTE_MARKER8[0]; +/** Marker for 2B alignment in a structure. */ +__extension__ typedef uint16_t RTE_MARKER16[0]; +/** Marker for 4B alignment in a structure. */ +__extension__ typedef uint32_t RTE_MARKER32[0]; +/** Marker for 8B alignment in a structure. */ +__extension__ typedef uint64_t RTE_MARKER64[0]; + +/** + * Combines 32b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param x + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. + */ +static inline uint32_t +rte_combine32ms1b(uint32_t x) +{ + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + + return x; +} + +/** + * Combines 64b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param v + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. + */ +static inline uint64_t +rte_combine64ms1b(uint64_t v) +{ + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + + return v; +} + +/*********** Macros to work with powers of 2 ********/ + +/** + * Macro to return 1 if n is a power of 2, 0 otherwise + */ +#define RTE_IS_POWER_OF_2(n) ((n) && !(((n) - 1) & (n))) + +/** + * Returns true if n is a power of 2 + * @param n + * Number to check + * @return 1 if true, 0 otherwise + */ +static inline int +rte_is_power_of_2(uint32_t n) +{ + return n && !(n & (n - 1)); +} + +/** + * Aligns input parameter to the next power of 2 + * + * @param x + * The integer value to align + * + * @return + * Input parameter aligned to the next power of 2 + */ +static inline uint32_t +rte_align32pow2(uint32_t x) +{ + x--; + x = rte_combine32ms1b(x); + + return x + 1; +} + +/** + * Aligns input parameter to the previous power of 2 + * + * @param x + * The integer value to align + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint32_t +rte_align32prevpow2(uint32_t x) +{ + x = rte_combine32ms1b(x); + + return x - (x >> 1); +} + +/** + * Aligns 64b input parameter to the next power of 2 + * + * @param v + * The 64b value to align + * + * @return + * Input parameter aligned to the next power of 2 + */ +static inline uint64_t +rte_align64pow2(uint64_t v) +{ + v--; + v = rte_combine64ms1b(v); + + return v + 1; +} + +/** + * Aligns 64b input parameter to the previous power of 2 + * + * @param v + * The 64b value to align + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint64_t +rte_align64prevpow2(uint64_t v) +{ + v = rte_combine64ms1b(v); + + return v - (v >> 1); +} + +/*********** Macros for calculating min and max **********/ + +/** + * Macro to return the minimum of two numbers + */ +#define RTE_MIN(a, b) \ + __extension__ ({ \ + typeof (a) _a = (a); \ + typeof (b) _b = (b); \ + _a < _b ? _a : _b; \ + }) + +/** + * Macro to return the maximum of two numbers + */ +#define RTE_MAX(a, b) \ + __extension__ ({ \ + typeof (a) _a = (a); \ + typeof (b) _b = (b); \ + _a > _b ? _a : _b; \ + }) + +/*********** Other general functions / macros ********/ + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). + * If a least significant 1 bit is found, its bit index is returned. + * If the content of the input parameter is zero, then the content of the return + * value is undefined. + * @param v + * input parameter, should not be zero. + * @return + * least significant set bit in the input parameter. + */ +static inline uint32_t +rte_bsf32(uint32_t v) +{ + return (uint32_t)__builtin_ctz(v); +} + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). Safe version (checks for input parameter being zero). + * + * @warning ``pos`` must be a valid pointer. It is not checked! + * + * @param v + * The input parameter. + * @param pos + * If ``v`` was not 0, this value will contain position of least significant + * bit within the input parameter. + * @return + * Returns 0 if ``v`` was 0, otherwise returns 1. + */ +static inline int +rte_bsf32_safe(uint64_t v, uint32_t *pos) +{ + if (v == 0) + return 0; + + *pos = rte_bsf32(v); + return 1; +} + +/** + * Return the rounded-up log2 of a integer. + * + * @note Contrary to the logarithm mathematical operation, + * rte_log2_u32(0) == 0 and not -inf. + * + * @param v + * The input parameter. + * @return + * The rounded-up log2 of the input, or 0 if the input is 0. + */ +static inline uint32_t +rte_log2_u32(uint32_t v) +{ + if (v == 0) + return 0; + v = rte_align32pow2(v); + return rte_bsf32(v); +} + + +/** + * Return the last (most-significant) bit set. + * + * @note The last (most significant) bit is at position 32. + * @note rte_fls_u32(0) = 0, rte_fls_u32(1) = 1, rte_fls_u32(0x80000000) = 32 + * + * @param x + * The input parameter. + * @return + * The last (most-significant) bit set, or 0 if the input is 0. + */ +static inline int +rte_fls_u32(uint32_t x) +{ + return (x == 0) ? 0 : 32 - __builtin_clz(x); +} + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). + * If a least significant 1 bit is found, its bit index is returned. + * If the content of the input parameter is zero, then the content of the return + * value is undefined. + * @param v + * input parameter, should not be zero. + * @return + * least significant set bit in the input parameter. + */ +static inline int +rte_bsf64(uint64_t v) +{ + return (uint32_t)__builtin_ctzll(v); +} + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). Safe version (checks for input parameter being zero). + * + * @warning ``pos`` must be a valid pointer. It is not checked! + * + * @param v + * The input parameter. + * @param pos + * If ``v`` was not 0, this value will contain position of least significant + * bit within the input parameter. + * @return + * Returns 0 if ``v`` was 0, otherwise returns 1. + */ +static inline int +rte_bsf64_safe(uint64_t v, uint32_t *pos) +{ + if (v == 0) + return 0; + + *pos = rte_bsf64(v); + return 1; +} + +/** + * Return the last (most-significant) bit set. + * + * @note The last (most significant) bit is at position 64. + * @note rte_fls_u64(0) = 0, rte_fls_u64(1) = 1, + * rte_fls_u64(0x8000000000000000) = 64 + * + * @param x + * The input parameter. + * @return + * The last (most-significant) bit set, or 0 if the input is 0. + */ +static inline int +rte_fls_u64(uint64_t x) +{ + return (x == 0) ? 0 : 64 - __builtin_clzll(x); +} + +/** + * Return the rounded-up log2 of a 64-bit integer. + * + * @note Contrary to the logarithm mathematical operation, + * rte_log2_u64(0) == 0 and not -inf. + * + * @param v + * The input parameter. + * @return + * The rounded-up log2 of the input, or 0 if the input is 0. + */ +static inline uint32_t +rte_log2_u64(uint64_t v) +{ + if (v == 0) + return 0; + v = rte_align64pow2(v); + /* we checked for v being 0 already, so no undefined behavior */ + return rte_bsf64(v); +} + +#ifndef offsetof +/** Return the offset of a field in a structure. */ +#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER) +#endif + +/** + * Return pointer to the wrapping struct instance. + * + * Example: + * + * struct wrapper { + * ... + * struct child c; + * ... + * }; + * + * struct child *x = obtain(...); + * struct wrapper *w = container_of(x, struct wrapper, c); + */ +#ifndef container_of +#define container_of(ptr, type, member) __extension__ ({ \ + const typeof(((type *)0)->member) *_ptr = (ptr); \ + __rte_unused type *_target_ptr = \ + (type *)(ptr); \ + (type *)(((uintptr_t)_ptr) - offsetof(type, member)); \ + }) +#endif + +/** + * Get the size of a field in a structure. + * + * @param type + * The type of the structure. + * @param field + * The field in the structure. + * @return + * The size of the field in the structure, in bytes. + */ +#define RTE_SIZEOF_FIELD(type, field) (sizeof(((type *)0)->field)) + +#define _RTE_STR(x) #x +/** Take a macro value and get a string version of it */ +#define RTE_STR(x) _RTE_STR(x) + +/** + * ISO C helpers to modify format strings using variadic macros. + * This is a replacement for the ", ## __VA_ARGS__" GNU extension. + * An empty %s argument is appended to avoid a dangling comma. + */ +#define RTE_FMT(fmt, ...) fmt "%.0s", __VA_ARGS__ "" +#define RTE_FMT_HEAD(fmt, ...) fmt +#define RTE_FMT_TAIL(fmt, ...) __VA_ARGS__ + +/** Mask value of type "tp" for the first "ln" bit set. */ +#define RTE_LEN2MASK(ln, tp) \ + ((tp)((uint64_t)-1 >> (sizeof(uint64_t) * CHAR_BIT - (ln)))) + +/** Number of elements in the array. */ +#define RTE_DIM(a) (sizeof (a) / sizeof ((a)[0])) + +/** + * Converts a numeric string to the equivalent uint64_t value. + * As well as straight number conversion, also recognises the suffixes + * k, m and g for kilobytes, megabytes and gigabytes respectively. + * + * If a negative number is passed in i.e. a string with the first non-black + * character being "-", zero is returned. Zero is also returned in the case of + * an error with the strtoull call in the function. + * + * @param str + * String containing number to convert. + * @return + * Number. + */ +static inline uint64_t +rte_str_to_size(const char *str) +{ + char *endptr; + unsigned long long size; + + while (isspace((int)*str)) + str++; + if (*str == '-') + return 0; + + errno = 0; + size = strtoull(str, &endptr, 0); + if (errno) + return 0; + + if (*endptr == ' ') + endptr++; /* allow 1 space gap */ + + switch (*endptr){ + case 'G': case 'g': size *= 1024; /* fall-through */ + case 'M': case 'm': size *= 1024; /* fall-through */ + case 'K': case 'k': size *= 1024; /* fall-through */ + default: + break; + } + return size; +} + +/** + * Function to terminate the application immediately, printing an error + * message and returning the exit_code back to the shell. + * + * This function never returns + * + * @param exit_code + * The exit code to be returned by the application + * @param format + * The format string to be used for printing the message. This can include + * printf format characters which will be expanded using any further parameters + * to the function. + */ +__rte_noreturn void +rte_exit(int exit_code, const char *format, ...) + __rte_format_printf(2, 3); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_compat.h b/src/spdk/dpdk/lib/librte_eal/include/rte_compat.h new file mode 100644 index 000000000..4cd8f68d6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_compat.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Neil Horman <nhorman@tuxdriver.com>. + * All rights reserved. + */ + +#ifndef _RTE_COMPAT_H_ +#define _RTE_COMPAT_H_ + +#ifndef ALLOW_EXPERIMENTAL_API + +#define __rte_experimental \ +__attribute__((deprecated("Symbol is not yet part of stable ABI"), \ +section(".text.experimental"))) + +#else + +#define __rte_experimental \ +__attribute__((section(".text.experimental"))) + +#endif + +#ifndef ALLOW_INTERNAL_API + +#define __rte_internal \ +__attribute__((error("Symbol is not public ABI"), \ +section(".text.internal"))) + +#else + +#define __rte_internal \ +__attribute__((section(".text.internal"))) + +#endif + +#endif /* _RTE_COMPAT_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_debug.h b/src/spdk/dpdk/lib/librte_eal/include/rte_debug.h new file mode 100644 index 000000000..50052c5a9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_debug.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_DEBUG_H_ +#define _RTE_DEBUG_H_ + +/** + * @file + * + * Debug Functions in RTE + * + * This file defines a generic API for debug operations. Part of + * the implementation is architecture-specific. + */ + +#include "rte_log.h" +#include "rte_branch_prediction.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Dump the stack of the calling core to the console. + */ +void rte_dump_stack(void); + +/** + * Dump the registers of the calling core to the console. + * + * Note: Not implemented in a userapp environment; use gdb instead. + */ +void rte_dump_registers(void); + +/** + * Provide notification of a critical non-recoverable error and terminate + * execution abnormally. + * + * Display the format string and its expanded arguments (printf-like). + * + * In a linux environment, this function dumps the stack and calls + * abort() resulting in a core dump if enabled. + * + * The function never returns. + * + * @param ... + * The format string, followed by the variable list of arguments. + */ +#define rte_panic(...) rte_panic_(__func__, __VA_ARGS__, "dummy") +#define rte_panic_(func, format, ...) __rte_panic(func, format "%.0s", __VA_ARGS__) + +#ifdef RTE_ENABLE_ASSERT +#define RTE_ASSERT(exp) RTE_VERIFY(exp) +#else +#define RTE_ASSERT(exp) do {} while (0) +#endif +#define RTE_VERIFY(exp) do { \ + if (unlikely(!(exp))) \ + rte_panic("line %d\tassert \"%s\" failed\n", __LINE__, #exp); \ +} while (0) + +/* + * Provide notification of a critical non-recoverable error and stop. + * + * This function should not be called directly. Refer to rte_panic() macro + * documentation. + */ +void __rte_panic(const char *funcname , const char *format, ...) +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __rte_cold +#endif +#endif + __rte_noreturn + __rte_format_printf(2, 3); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_DEBUG_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_dev.h b/src/spdk/dpdk/lib/librte_eal/include/rte_dev.h new file mode 100644 index 000000000..c8d985fb5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_dev.h @@ -0,0 +1,518 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2014 6WIND S.A. + */ + +#ifndef _RTE_DEV_H_ +#define _RTE_DEV_H_ + +/** + * @file + * + * RTE PMD Driver Registration Interface + * + * This file manages the list of device drivers. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdio.h> +#include <sys/queue.h> + +#include <rte_config.h> +#include <rte_compat.h> +#include <rte_log.h> + +/** + * The device event type. + */ +enum rte_dev_event_type { + RTE_DEV_EVENT_ADD, /**< device being added */ + RTE_DEV_EVENT_REMOVE, /**< device being removed */ + RTE_DEV_EVENT_MAX /**< max value of this enum */ +}; + +struct rte_dev_event { + enum rte_dev_event_type type; /**< device event type */ + int subsystem; /**< subsystem id */ + char *devname; /**< device name */ +}; + +typedef void (*rte_dev_event_cb_fn)(const char *device_name, + enum rte_dev_event_type event, + void *cb_arg); + +/* Macros to check for invalid function pointers */ +#define RTE_FUNC_PTR_OR_ERR_RET(func, retval) do { \ + if ((func) == NULL) \ + return retval; \ +} while (0) + +#define RTE_FUNC_PTR_OR_RET(func) do { \ + if ((func) == NULL) \ + return; \ +} while (0) + +/** + * Device driver. + */ +enum rte_kernel_driver { + RTE_KDRV_UNKNOWN = 0, + RTE_KDRV_IGB_UIO, + RTE_KDRV_VFIO, + RTE_KDRV_UIO_GENERIC, + RTE_KDRV_NIC_UIO, + RTE_KDRV_NONE, +}; + +/** + * Device policies. + */ +enum rte_dev_policy { + RTE_DEV_WHITELISTED, + RTE_DEV_BLACKLISTED, +}; + +/** + * A generic memory resource representation. + */ +struct rte_mem_resource { + uint64_t phys_addr; /**< Physical address, 0 if not resource. */ + uint64_t len; /**< Length of the resource. */ + void *addr; /**< Virtual address, NULL when not mapped. */ +}; + +/** + * A structure describing a device driver. + */ +struct rte_driver { + TAILQ_ENTRY(rte_driver) next; /**< Next in list. */ + const char *name; /**< Driver name. */ + const char *alias; /**< Driver alias. */ +}; + +/* + * Internal identifier length + * Sufficiently large to allow for UUID or PCI address + */ +#define RTE_DEV_NAME_MAX_LEN 64 + +/** + * A structure describing a generic device. + */ +struct rte_device { + TAILQ_ENTRY(rte_device) next; /**< Next device */ + const char *name; /**< Device name */ + const struct rte_driver *driver; /**< Driver assigned after probing */ + const struct rte_bus *bus; /**< Bus handle assigned on scan */ + int numa_node; /**< NUMA node connection */ + struct rte_devargs *devargs; /**< Arguments for latest probing */ +}; + +/** + * Query status of a device. + * + * @param dev + * Generic device pointer. + * @return + * (int)true if already probed successfully, 0 otherwise. + */ +int rte_dev_is_probed(const struct rte_device *dev); + +/** + * Hotplug add a given device to a specific bus. + * + * In multi-process, it will request other processes to add the same device. + * A failure, in any process, will rollback the action + * + * @param busname + * The bus name the device is added to. + * @param devname + * The device name. Based on this device name, eal will identify a driver + * capable of handling it and pass it to the driver probing function. + * @param drvargs + * Device arguments to be passed to the driver. + * @return + * 0 on success, negative on error. + */ +int rte_eal_hotplug_add(const char *busname, const char *devname, + const char *drvargs); + +/** + * Add matching devices. + * + * In multi-process, it will request other processes to add the same device. + * A failure, in any process, will rollback the action + * + * @param devargs + * Device arguments including bus, class and driver properties. + * @return + * 0 on success, negative on error. + */ +int rte_dev_probe(const char *devargs); + +/** + * Hotplug remove a given device from a specific bus. + * + * In multi-process, it will request other processes to remove the same device. + * A failure, in any process, will rollback the action + * + * @param busname + * The bus name the device is removed from. + * @param devname + * The device name being removed. + * @return + * 0 on success, negative on error. + */ +int rte_eal_hotplug_remove(const char *busname, const char *devname); + +/** + * Remove one device. + * + * In multi-process, it will request other processes to remove the same device. + * A failure, in any process, will rollback the action + * + * @param dev + * Data structure of the device to remove. + * @return + * 0 on success, negative on error. + */ +int rte_dev_remove(struct rte_device *dev); + +/** + * Device comparison function. + * + * This type of function is used to compare an rte_device with arbitrary + * data. + * + * @param dev + * Device handle. + * + * @param data + * Data to compare against. The type of this parameter is determined by + * the kind of comparison performed by the function. + * + * @return + * 0 if the device matches the data. + * !0 if the device does not match. + * <0 if ordering is possible and the device is lower than the data. + * >0 if ordering is possible and the device is greater than the data. + */ +typedef int (*rte_dev_cmp_t)(const struct rte_device *dev, const void *data); + +#define RTE_PMD_EXPORT_NAME_ARRAY(n, idx) n##idx[] + +#define RTE_PMD_EXPORT_NAME(name, idx) \ +static const char RTE_PMD_EXPORT_NAME_ARRAY(this_pmd_name, idx) \ +__rte_used = RTE_STR(name) + +#define DRV_EXP_TAG(name, tag) __##name##_##tag + +#define RTE_PMD_REGISTER_PCI_TABLE(name, table) \ +static const char DRV_EXP_TAG(name, pci_tbl_export)[] __rte_used = \ +RTE_STR(table) + +#define RTE_PMD_REGISTER_PARAM_STRING(name, str) \ +static const char DRV_EXP_TAG(name, param_string_export)[] \ +__rte_used = str + +/** + * Advertise the list of kernel modules required to run this driver + * + * This string lists the kernel modules required for the devices + * associated to a PMD. The format of each line of the string is: + * "<device-pattern> <kmod-expression>". + * + * The possible formats for the device pattern are: + * "*" all devices supported by this driver + * "pci:*" all PCI devices supported by this driver + * "pci:v8086:d*:sv*:sd*" all PCI devices supported by this driver + * whose vendor id is 0x8086. + * + * The format of the kernel modules list is a parenthesized expression + * containing logical-and (&) and logical-or (|). + * + * The device pattern and the kmod expression are separated by a space. + * + * Example: + * - "* igb_uio | uio_pci_generic | vfio" + */ +#define RTE_PMD_REGISTER_KMOD_DEP(name, str) \ +static const char DRV_EXP_TAG(name, kmod_dep_export)[] \ +__rte_used = str + +/** + * Iteration context. + * + * This context carries over the current iteration state. + */ +struct rte_dev_iterator { + const char *dev_str; /**< device string. */ + const char *bus_str; /**< bus-related part of device string. */ + const char *cls_str; /**< class-related part of device string. */ + struct rte_bus *bus; /**< bus handle. */ + struct rte_class *cls; /**< class handle. */ + struct rte_device *device; /**< current position. */ + void *class_device; /**< additional specialized context. */ +}; + +/** + * Device iteration function. + * + * Find the next device matching properties passed in parameters. + * The function takes an additional ``start`` parameter, that is + * used as starting context when relevant. + * + * The function returns the current element in the iteration. + * This return value will potentially be used as a start parameter + * in subsequent calls to the function. + * + * The additional iterator parameter is only there if a specific + * implementation needs additional context. It must not be modified by + * the iteration function itself. + * + * @param start + * Starting iteration context. + * + * @param devstr + * Device description string. + * + * @param it + * Device iterator. + * + * @return + * The address of the current element matching the device description + * string. + */ +typedef void *(*rte_dev_iterate_t)(const void *start, + const char *devstr, + const struct rte_dev_iterator *it); + +/** + * Initializes a device iterator. + * + * This iterator allows accessing a list of devices matching a criteria. + * The device matching is made among all buses and classes currently registered, + * filtered by the device description given as parameter. + * + * This function will not allocate any memory. It is safe to stop the + * iteration at any moment and let the iterator go out of context. + * + * @param it + * Device iterator handle. + * + * @param str + * Device description string. + * + * @return + * 0 on successful initialization. + * <0 on error. + */ +__rte_experimental +int +rte_dev_iterator_init(struct rte_dev_iterator *it, const char *str); + +/** + * Iterates on a device iterator. + * + * Generates a new rte_device handle corresponding to the next element + * in the list described in comprehension by the iterator. + * + * The next object is returned, and the iterator is updated. + * + * @param it + * Device iterator handle. + * + * @return + * An rte_device handle if found. + * NULL if an error occurred (rte_errno is set). + * NULL if no device could be found (rte_errno is not set). + */ +__rte_experimental +struct rte_device * +rte_dev_iterator_next(struct rte_dev_iterator *it); + +#define RTE_DEV_FOREACH(dev, devstr, it) \ + for (rte_dev_iterator_init(it, devstr), \ + dev = rte_dev_iterator_next(it); \ + dev != NULL; \ + dev = rte_dev_iterator_next(it)) + +#ifdef __cplusplus +} +#endif + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * It registers the callback for the specific device. + * Multiple callbacks can be registered at the same time. + * + * @param device_name + * The device name, that is the param name of the struct rte_device, + * null value means for all devices. + * @param cb_fn + * callback address. + * @param cb_arg + * address of parameter for callback. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +__rte_experimental +int +rte_dev_event_callback_register(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * It unregisters the callback according to the specified device. + * + * @param device_name + * The device name, that is the param name of the struct rte_device, + * null value means for all devices and their callbacks. + * @param cb_fn + * callback address. + * @param cb_arg + * address of parameter for callback, (void *)-1 means to remove all + * registered which has the same callback address. + * + * @return + * - On success, return the number of callback entities removed. + * - On failure, a negative value. + */ +__rte_experimental +int +rte_dev_event_callback_unregister(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Executes all the user application registered callbacks for + * the specific device. + * + * @param device_name + * The device name. + * @param event + * the device event type. + */ +__rte_experimental +void +rte_dev_event_callback_process(const char *device_name, + enum rte_dev_event_type event); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Start the device event monitoring. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +__rte_experimental +int +rte_dev_event_monitor_start(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Stop the device event monitoring. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +__rte_experimental +int +rte_dev_event_monitor_stop(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Enable hotplug handling for devices. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +__rte_experimental +int +rte_dev_hotplug_handle_enable(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Disable hotplug handling for devices. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +__rte_experimental +int +rte_dev_hotplug_handle_disable(void); + +/** + * Device level DMA map function. + * After a successful call, the memory segment will be mapped to the + * given device. + * + * @note: Memory must be registered in advance using rte_extmem_* APIs. + * + * @param dev + * Device pointer. + * @param addr + * Virtual address to map. + * @param iova + * IOVA address to map. + * @param len + * Length of the memory segment being mapped. + * + * @return + * 0 if mapping was successful. + * Negative value and rte_errno is set otherwise. + */ +__rte_experimental +int +rte_dev_dma_map(struct rte_device *dev, void *addr, uint64_t iova, size_t len); + +/** + * Device level DMA unmap function. + * After a successful call, the memory segment will no longer be + * accessible by the given device. + * + * @note: Memory must be registered in advance using rte_extmem_* APIs. + * + * @param dev + * Device pointer. + * @param addr + * Virtual address to unmap. + * @param iova + * IOVA address to unmap. + * @param len + * Length of the memory segment being mapped. + * + * @return + * 0 if un-mapping was successful. + * Negative value and rte_errno is set otherwise. + */ +__rte_experimental +int +rte_dev_dma_unmap(struct rte_device *dev, void *addr, uint64_t iova, + size_t len); + +#endif /* _RTE_DEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_devargs.h b/src/spdk/dpdk/lib/librte_eal/include/rte_devargs.h new file mode 100644 index 000000000..898efa0d6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_devargs.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2014 6WIND S.A. + */ + +#ifndef _RTE_DEVARGS_H_ +#define _RTE_DEVARGS_H_ + +/** + * @file + * + * RTE devargs: list of devices and their user arguments + * + * This file stores a list of devices and their arguments given by + * the user when a DPDK application is started. These devices can be PCI + * devices or virtual devices. These devices are stored at startup in a + * list of rte_devargs structures. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdio.h> +#include <sys/queue.h> +#include <rte_compat.h> +#include <rte_bus.h> + +/** + * Type of generic device + */ +enum rte_devtype { + RTE_DEVTYPE_WHITELISTED_PCI, + RTE_DEVTYPE_BLACKLISTED_PCI, + RTE_DEVTYPE_VIRTUAL, +}; + +/** + * Structure that stores a device given by the user with its arguments + * + * A user device is a physical or a virtual device given by the user to + * the DPDK application at startup through command line arguments. + * + * The structure stores the configuration of the device, its PCI + * identifier if it's a PCI device or the driver name if it's a virtual + * device. + */ +struct rte_devargs { + /** Next in list. */ + TAILQ_ENTRY(rte_devargs) next; + /** Type of device. */ + enum rte_devtype type; + /** Device policy. */ + enum rte_dev_policy policy; + /** Name of the device. */ + char name[RTE_DEV_NAME_MAX_LEN]; + RTE_STD_C11 + union { + /** Arguments string as given by user or "" for no argument. */ + char *args; + const char *drv_str; + }; + struct rte_bus *bus; /**< bus handle. */ + struct rte_class *cls; /**< class handle. */ + const char *bus_str; /**< bus-related part of device string. */ + const char *cls_str; /**< class-related part of device string. */ + const char *data; /**< Device string storage. */ +}; + +/** + * Parse a device string. + * + * Verify that a bus is capable of handling the device passed + * in argument. Store which bus will handle the device, its name + * and the eventual device parameters. + * + * The syntax is: + * + * bus:device_identifier,arg1=val1,arg2=val2 + * + * where "bus:" is the bus name followed by any character separator. + * The bus name is optional. If no bus name is specified, each bus + * will attempt to recognize the device identifier. The first one + * to succeed will be used. + * + * Examples: + * + * pci:0000:05.00.0,arg=val + * 05.00.0,arg=val + * vdev:net_ring0 + * + * @param da + * The devargs structure holding the device information. + * + * @param dev + * String describing a device. + * + * @return + * - 0 on success. + * - Negative errno on error. + */ +int +rte_devargs_parse(struct rte_devargs *da, const char *dev); + +/** + * Parse a device string. + * + * Verify that a bus is capable of handling the device passed + * in argument. Store which bus will handle the device, its name + * and the eventual device parameters. + * + * The device string is built with a printf-like syntax. + * + * The syntax is: + * + * bus:device_identifier,arg1=val1,arg2=val2 + * + * where "bus:" is the bus name followed by any character separator. + * The bus name is optional. If no bus name is specified, each bus + * will attempt to recognize the device identifier. The first one + * to succeed will be used. + * + * Examples: + * + * pci:0000:05.00.0,arg=val + * 05.00.0,arg=val + * vdev:net_ring0 + * + * @param da + * The devargs structure holding the device information. + * @param format + * Format string describing a device. + * + * @return + * - 0 on success. + * - Negative errno on error. + */ +int +rte_devargs_parsef(struct rte_devargs *da, + const char *format, ...) +__rte_format_printf(2, 0); + +/** + * Insert an rte_devargs in the global list. + * + * @param da + * The devargs structure to insert. + * If a devargs for the same device is already inserted, + * it will be updated and returned. It means *da pointer can change. + * + * @return + * - 0 on success + * - Negative on error. + */ +int +rte_devargs_insert(struct rte_devargs **da); + +/** + * Add a device to the user device list + * See rte_devargs_parse() for details. + * + * @param devtype + * The type of the device. + * @param devargs_str + * The arguments as given by the user. + * + * @return + * - 0 on success + * - A negative value on error + */ +int rte_devargs_add(enum rte_devtype devtype, const char *devargs_str); + +/** + * Remove a device from the user device list. + * Its resources are freed. + * If the devargs cannot be found, nothing happens. + * + * @param devargs + * The instance or a copy of devargs to remove. + * + * @return + * 0 on success. + * <0 on error. + * >0 if the devargs was not within the user device list. + */ +int rte_devargs_remove(struct rte_devargs *devargs); + +/** + * Count the number of user devices of a specified type + * + * @param devtype + * The type of the devices to counted. + * + * @return + * The number of devices. + */ +unsigned int +rte_devargs_type_count(enum rte_devtype devtype); + +/** + * This function dumps the list of user device and their arguments. + * + * @param f + * A pointer to a file for output + */ +void rte_devargs_dump(FILE *f); + +/** + * Find next rte_devargs matching the provided bus name. + * + * @param busname + * Limit the iteration to devargs related to buses + * matching this name. + * Will return any next rte_devargs if NULL. + * + * @param start + * Starting iteration point. The iteration will start at + * the first rte_devargs if NULL. + * + * @return + * Next rte_devargs entry matching the requested bus, + * NULL if there is none. + */ +struct rte_devargs * +rte_devargs_next(const char *busname, const struct rte_devargs *start); + +/** + * Iterate over all rte_devargs for a specific bus. + */ +#define RTE_EAL_DEVARGS_FOREACH(busname, da) \ + for (da = rte_devargs_next(busname, NULL); \ + da != NULL; \ + da = rte_devargs_next(busname, da)) \ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_DEVARGS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_eal.h b/src/spdk/dpdk/lib/librte_eal/include/rte_eal.h new file mode 100644 index 000000000..2f9ed298d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_eal.h @@ -0,0 +1,495 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef _RTE_EAL_H_ +#define _RTE_EAL_H_ + +/** + * @file + * + * EAL Configuration API + */ + +#include <stdint.h> +#include <sched.h> +#include <time.h> + +#include <rte_config.h> +#include <rte_compat.h> +#include <rte_per_lcore.h> +#include <rte_bus.h> + +#include <rte_pci_dev_feature_defs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_MAGIC 19820526 /**< Magic number written by the main partition when ready. */ + +/* Maximum thread_name length. */ +#define RTE_MAX_THREAD_NAME_LEN 16 + +/** + * The lcore role (used in RTE or not). + */ +enum rte_lcore_role_t { + ROLE_RTE, + ROLE_OFF, + ROLE_SERVICE, +}; + +/** + * The type of process in a linux, multi-process setup + */ +enum rte_proc_type_t { + RTE_PROC_AUTO = -1, /* allow auto-detection of primary/secondary */ + RTE_PROC_PRIMARY = 0, /* set to zero, so primary is the default */ + RTE_PROC_SECONDARY, + + RTE_PROC_INVALID +}; + +/** + * Get the process type in a multi-process setup + * + * @return + * The process type + */ +enum rte_proc_type_t rte_eal_process_type(void); + +/** + * Request iopl privilege for all RPL. + * + * This function should be called by pmds which need access to ioports. + + * @return + * - On success, returns 0. + * - On failure, returns -1. + */ +int rte_eal_iopl_init(void); + +/** + * Initialize the Environment Abstraction Layer (EAL). + * + * This function is to be executed on the MASTER lcore only, as soon + * as possible in the application's main() function. + * + * The function finishes the initialization process before main() is called. + * It puts the SLAVE lcores in the WAIT state. + * + * When the multi-partition feature is supported, depending on the + * configuration (if CONFIG_RTE_EAL_MAIN_PARTITION is disabled), this + * function waits to ensure that the magic number is set before + * returning. See also the rte_eal_get_configuration() function. Note: + * This behavior may change in the future. + * + * @param argc + * A non-negative value. If it is greater than 0, the array members + * for argv[0] through argv[argc] (non-inclusive) shall contain pointers + * to strings. + * @param argv + * An array of strings. The contents of the array, as well as the strings + * which are pointed to by the array, may be modified by this function. + * @return + * - On success, the number of parsed arguments, which is greater or + * equal to zero. After the call to rte_eal_init(), + * all arguments argv[x] with x < ret may have been modified by this + * function call and should not be further interpreted by the + * application. The EAL does not take any ownership of the memory used + * for either the argv array, or its members. + * - On failure, -1 and rte_errno is set to a value indicating the cause + * for failure. In some instances, the application will need to be + * restarted as part of clearing the issue. + * + * Error codes returned via rte_errno: + * EACCES indicates a permissions issue. + * + * EAGAIN indicates either a bus or system resource was not available, + * setup may be attempted again. + * + * EALREADY indicates that the rte_eal_init function has already been + * called, and cannot be called again. + * + * EFAULT indicates the tailq configuration name was not found in + * memory configuration. + * + * EINVAL indicates invalid parameters were passed as argv/argc. + * + * ENOMEM indicates failure likely caused by an out-of-memory condition. + * + * ENODEV indicates memory setup issues. + * + * ENOTSUP indicates that the EAL cannot initialize on this system. + * + * EPROTO indicates that the PCI bus is either not present, or is not + * readable by the eal. + * + * ENOEXEC indicates that a service core failed to launch successfully. + */ +int rte_eal_init(int argc, char **argv); + +/** + * Clean up the Environment Abstraction Layer (EAL) + * + * This function must be called to release any internal resources that EAL has + * allocated during rte_eal_init(). After this call, no DPDK function calls may + * be made. It is expected that common usage of this function is to call it + * just before terminating the process. + * + * @return 0 Successfully released all internal EAL resources + * @return -EFAULT There was an error in releasing all resources. + */ +int rte_eal_cleanup(void); + +/** + * Check if a primary process is currently alive + * + * This function returns true when a primary process is currently + * active. + * + * @param config_file_path + * The config_file_path argument provided should point at the location + * that the primary process will create its config file. If NULL, the default + * config file path is used. + * + * @return + * - If alive, returns 1. + * - If dead, returns 0. + */ +int rte_eal_primary_proc_alive(const char *config_file_path); + +#define RTE_MP_MAX_FD_NUM 8 /* The max amount of fds */ +#define RTE_MP_MAX_NAME_LEN 64 /* The max length of action name */ +#define RTE_MP_MAX_PARAM_LEN 256 /* The max length of param */ +struct rte_mp_msg { + char name[RTE_MP_MAX_NAME_LEN]; + int len_param; + int num_fds; + uint8_t param[RTE_MP_MAX_PARAM_LEN]; + int fds[RTE_MP_MAX_FD_NUM]; +}; + +struct rte_mp_reply { + int nb_sent; + int nb_received; + struct rte_mp_msg *msgs; /* caller to free */ +}; + +/** + * Action function typedef used by other components. + * + * As we create socket channel for primary/secondary communication, use + * this function typedef to register action for coming messages. + * + * @note When handling IPC request callbacks, the reply must be sent even in + * cases of error handling. Simply returning success or failure will *not* + * send a response to the requestor. + * Implementation of error signalling mechanism is up to the application. + * + * @note No memory allocations should take place inside the callback. + */ +typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer); + +/** + * Asynchronous reply function typedef used by other components. + * + * As we create socket channel for primary/secondary communication, use + * this function typedef to register action for coming responses to asynchronous + * requests. + * + * @note When handling IPC request callbacks, the reply must be sent even in + * cases of error handling. Simply returning success or failure will *not* + * send a response to the requestor. + * Implementation of error signalling mechanism is up to the application. + * + * @note No memory allocations should take place inside the callback. + */ +typedef int (*rte_mp_async_reply_t)(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register an action function for primary/secondary communication. + * + * Call this function to register an action, if the calling component wants + * to response the messages from the corresponding component in its primary + * process or secondary processes. + * + * @note IPC may be unsupported in certain circumstances, so caller should check + * for ENOTSUP error. + * + * @param name + * The name argument plays as the nonredundant key to find the action. + * + * @param action + * The action argument is the function pointer to the action function. + * + * @return + * - 0 on success. + * - (<0) on failure. + */ +__rte_experimental +int +rte_mp_action_register(const char *name, rte_mp_t action); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Unregister an action function for primary/secondary communication. + * + * Call this function to unregister an action if the calling component does + * not want to response the messages from the corresponding component in its + * primary process or secondary processes. + * + * @note IPC may be unsupported in certain circumstances, so caller should check + * for ENOTSUP error. + * + * @param name + * The name argument plays as the nonredundant key to find the action. + * + */ +__rte_experimental +void +rte_mp_action_unregister(const char *name); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Send a message to the peer process. + * + * This function will send a message which will be responded by the action + * identified by name in the peer process. + * + * @param msg + * The msg argument contains the customized message. + * + * @return + * - On success, return 0. + * - On failure, return -1, and the reason will be stored in rte_errno. + */ +__rte_experimental +int +rte_mp_sendmsg(struct rte_mp_msg *msg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Send a request to the peer process and expect a reply. + * + * This function sends a request message to the peer process, and will + * block until receiving reply message from the peer process. + * + * @note The caller is responsible to free reply->replies. + * + * @note This API must not be used inside memory-related or IPC callbacks, and + * no memory allocations should take place inside such callback. + * + * @note IPC may be unsupported in certain circumstances, so caller should check + * for ENOTSUP error. + * + * @param req + * The req argument contains the customized request message. + * + * @param reply + * The reply argument will be for storing all the replied messages; + * the caller is responsible for free reply->msgs. + * + * @param ts + * The ts argument specifies how long we can wait for the peer(s) to reply. + * + * @return + * - On success, return 0. + * - On failure, return -1, and the reason will be stored in rte_errno. + */ +__rte_experimental +int +rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, + const struct timespec *ts); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Send a request to the peer process and expect a reply in a separate callback. + * + * This function sends a request message to the peer process, and will not + * block. Instead, reply will be received in a separate callback. + * + * @note IPC may be unsupported in certain circumstances, so caller should check + * for ENOTSUP error. + * + * @param req + * The req argument contains the customized request message. + * + * @param ts + * The ts argument specifies how long we can wait for the peer(s) to reply. + * + * @param clb + * The callback to trigger when all responses for this request have arrived. + * + * @return + * - On success, return 0. + * - On failure, return -1, and the reason will be stored in rte_errno. + */ +__rte_experimental +int +rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts, + rte_mp_async_reply_t clb); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Send a reply to the peer process. + * + * This function will send a reply message in response to a request message + * received previously. + * + * @note When handling IPC request callbacks, the reply must be sent even in + * cases of error handling. Simply returning success or failure will *not* + * send a response to the requestor. + * Implementation of error signalling mechanism is up to the application. + * + * @param msg + * The msg argument contains the customized message. + * + * @param peer + * The peer argument is the pointer to the peer socket path. + * + * @return + * - On success, return 0. + * - On failure, return -1, and the reason will be stored in rte_errno. + */ +__rte_experimental +int +rte_mp_reply(struct rte_mp_msg *msg, const char *peer); + +/** + * Usage function typedef used by the application usage function. + * + * Use this function typedef to define and call rte_set_application_usage_hook() + * routine. + */ +typedef void (*rte_usage_hook_t)(const char * prgname); + +/** + * Add application usage routine callout from the eal_usage() routine. + * + * This function allows the application to include its usage message + * in the EAL system usage message. The routine rte_set_application_usage_hook() + * needs to be called before the rte_eal_init() routine in the application. + * + * This routine is optional for the application and will behave as if the set + * routine was never called as the default behavior. + * + * @param usage_func + * The func argument is a function pointer to the application usage routine. + * Called function is defined using rte_usage_hook_t typedef, which is of + * the form void rte_usage_func(const char * prgname). + * + * Calling this routine with a NULL value will reset the usage hook routine and + * return the current value, which could be NULL. + * @return + * - Returns the current value of the rte_application_usage pointer to allow + * the caller to daisy chain the usage routines if needing more then one. + */ +rte_usage_hook_t +rte_set_application_usage_hook(rte_usage_hook_t usage_func); + +/** + * Whether EAL is using huge pages (disabled by --no-huge option). + * The no-huge mode is not compatible with all drivers or features. + * + * @return + * Nonzero if hugepages are enabled. + */ +int rte_eal_has_hugepages(void); + +/** + * Whether EAL is using PCI bus. + * Disabled by --no-pci option. + * + * @return + * Nonzero if the PCI bus is enabled. + */ +int rte_eal_has_pci(void); + +/** + * Whether the EAL was asked to create UIO device. + * + * @return + * Nonzero if true. + */ +int rte_eal_create_uio_dev(void); + +/** + * The user-configured vfio interrupt mode. + * + * @return + * Interrupt mode configured with the command line, + * RTE_INTR_MODE_NONE by default. + */ +enum rte_intr_mode rte_eal_vfio_intr_mode(void); + +/** + * A wrap API for syscall gettid. + * + * @return + * On success, returns the thread ID of calling process. + * It is always successful. + */ +int rte_sys_gettid(void); + +/** + * Get system unique thread id. + * + * @return + * On success, returns the thread ID of calling process. + * It is always successful. + */ +static inline int rte_gettid(void) +{ + static RTE_DEFINE_PER_LCORE(int, _thread_id) = -1; + if (RTE_PER_LCORE(_thread_id) == -1) + RTE_PER_LCORE(_thread_id) = rte_sys_gettid(); + return RTE_PER_LCORE(_thread_id); +} + +/** + * Get the iova mode + * + * @return + * enum rte_iova_mode value. + */ +enum rte_iova_mode rte_eal_iova_mode(void); + +/** + * Get user provided pool ops name for mbuf + * + * @return + * returns user provided pool ops name. + */ +const char * +rte_eal_mbuf_user_pool_ops(void); + +/** + * Get the runtime directory of DPDK + * + * @return + * The runtime directory path of DPDK + */ +const char * +rte_eal_get_runtime_dir(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_EAL_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_eal_interrupts.h b/src/spdk/dpdk/lib/librte_eal/include/rte_eal_interrupts.h new file mode 100644 index 000000000..773a34a42 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_eal_interrupts.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_INTERRUPTS_H_ +#error "don't include this file directly, please include generic <rte_interrupts.h>" +#endif + +/** + * @file rte_eal_interrupts.h + * @internal + * + * Contains function prototypes exposed by the EAL for interrupt handling by + * drivers and other DPDK internal consumers. + */ + +#ifndef _RTE_EAL_INTERRUPTS_H_ +#define _RTE_EAL_INTERRUPTS_H_ + +#define RTE_MAX_RXTX_INTR_VEC_ID 512 +#define RTE_INTR_VEC_ZERO_OFFSET 0 +#define RTE_INTR_VEC_RXTX_OFFSET 1 + +/** + * The interrupt source type, e.g. UIO, VFIO, ALARM etc. + */ +enum rte_intr_handle_type { + RTE_INTR_HANDLE_UNKNOWN = 0, /**< generic unknown handle */ + RTE_INTR_HANDLE_UIO, /**< uio device handle */ + RTE_INTR_HANDLE_UIO_INTX, /**< uio generic handle */ + RTE_INTR_HANDLE_VFIO_LEGACY, /**< vfio device handle (legacy) */ + RTE_INTR_HANDLE_VFIO_MSI, /**< vfio device handle (MSI) */ + RTE_INTR_HANDLE_VFIO_MSIX, /**< vfio device handle (MSIX) */ + RTE_INTR_HANDLE_ALARM, /**< alarm handle */ + RTE_INTR_HANDLE_EXT, /**< external handler */ + RTE_INTR_HANDLE_VDEV, /**< virtual device */ + RTE_INTR_HANDLE_DEV_EVENT, /**< device event handle */ + RTE_INTR_HANDLE_VFIO_REQ, /**< VFIO request handle */ + RTE_INTR_HANDLE_MAX /**< count of elements */ +}; + +#define RTE_INTR_EVENT_ADD 1UL +#define RTE_INTR_EVENT_DEL 2UL + +typedef void (*rte_intr_event_cb_t)(int fd, void *arg); + +struct rte_epoll_data { + uint32_t event; /**< event type */ + void *data; /**< User data */ + rte_intr_event_cb_t cb_fun; /**< IN: callback fun */ + void *cb_arg; /**< IN: callback arg */ +}; + +enum { + RTE_EPOLL_INVALID = 0, + RTE_EPOLL_VALID, + RTE_EPOLL_EXEC, +}; + +/** interrupt epoll event obj, taken by epoll_event.ptr */ +struct rte_epoll_event { + volatile uint32_t status; /**< OUT: event status */ + int fd; /**< OUT: event fd */ + int epfd; /**< OUT: epoll instance the ev associated with */ + struct rte_epoll_data epdata; +}; + +/** Handle for interrupts. */ +struct rte_intr_handle { + RTE_STD_C11 + union { + int vfio_dev_fd; /**< VFIO device file descriptor */ + int uio_cfg_fd; /**< UIO cfg file desc for uio_pci_generic */ + }; + int fd; /**< interrupt event file descriptor */ + enum rte_intr_handle_type type; /**< handle type */ + uint32_t max_intr; /**< max interrupt requested */ + uint32_t nb_efd; /**< number of available efd(event fd) */ + uint8_t efd_counter_size; /**< size of efd counter, used for vdev */ + int efds[RTE_MAX_RXTX_INTR_VEC_ID]; /**< intr vectors/efds mapping */ + struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID]; + /**< intr vector epoll event */ + int *intr_vec; /**< intr vector number array */ +}; + +#define RTE_EPOLL_PER_THREAD -1 /**< to hint using per thread epfd */ + +/** + * It waits for events on the epoll instance. + * + * @param epfd + * Epoll instance fd on which the caller wait for events. + * @param events + * Memory area contains the events that will be available for the caller. + * @param maxevents + * Up to maxevents are returned, must greater than zero. + * @param timeout + * Specifying a timeout of -1 causes a block indefinitely. + * Specifying a timeout equal to zero cause to return immediately. + * @return + * - On success, returns the number of available event. + * - On failure, a negative value. + */ +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout); + +/** + * It performs control operations on epoll instance referred by the epfd. + * It requests that the operation op be performed for the target fd. + * + * @param epfd + * Epoll instance fd on which the caller perform control operations. + * @param op + * The operation be performed for the target fd. + * @param fd + * The target fd on which the control ops perform. + * @param event + * Describes the object linked to the fd. + * Note: The caller must take care the object deletion after CTL_DEL. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event); + +/** + * The function returns the per thread epoll instance. + * + * @return + * epfd the epoll instance referred to. + */ +int +rte_intr_tls_epfd(void); + +/** + * @param intr_handle + * Pointer to the interrupt handle. + * @param epfd + * Epoll instance fd which the intr vector associated to. + * @param op + * The operation be performed for the vector. + * Operation type of {ADD, DEL}. + * @param vec + * RX intr vector number added to the epoll instance wait list. + * @param data + * User raw data. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, + int epfd, int op, unsigned int vec, void *data); + +/** + * It deletes registered eventfds. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle); + +/** + * It enables the packet I/O interrupt event if it's necessary. + * It creates event fd for each interrupt vector when MSIX is used, + * otherwise it multiplexes a single event fd. + * + * @param intr_handle + * Pointer to the interrupt handle. + * @param nb_efd + * Number of interrupt vector trying to enable. + * The value 0 is not allowed. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd); + +/** + * It disables the packet I/O interrupt event. + * It deletes registered eventfds and closes the open fds. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle); + +/** + * The packet I/O interrupt on datapath is enabled or not. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle); + +/** + * The interrupt handle instance allows other causes or not. + * Other causes stand for any none packet I/O interrupts. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle); + +/** + * The multiple interrupt vector capability of interrupt handle instance. + * It returns zero if no multiple interrupt vector support. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * @internal + * Check if currently executing in interrupt context + * + * @return + * - non zero in case of interrupt context + * - zero in case of process context + */ +__rte_experimental +int +rte_thread_is_intr(void); + +#endif /* _RTE_EAL_INTERRUPTS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_eal_memconfig.h b/src/spdk/dpdk/lib/librte_eal/include/rte_eal_memconfig.h new file mode 100644 index 000000000..dede2ee32 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_eal_memconfig.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_EAL_MEMCONFIG_H_ +#define _RTE_EAL_MEMCONFIG_H_ + +#include <stdbool.h> + +#include <rte_compat.h> + +/** + * @file + * + * This API allows access to EAL shared memory configuration through an API. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Lock the internal EAL shared memory configuration for shared access. + */ +void +rte_mcfg_mem_read_lock(void); + +/** + * Unlock the internal EAL shared memory configuration for shared access. + */ +void +rte_mcfg_mem_read_unlock(void); + +/** + * Lock the internal EAL shared memory configuration for exclusive access. + */ +void +rte_mcfg_mem_write_lock(void); + +/** + * Unlock the internal EAL shared memory configuration for exclusive access. + */ +void +rte_mcfg_mem_write_unlock(void); + +/** + * Lock the internal EAL TAILQ list for shared access. + */ +void +rte_mcfg_tailq_read_lock(void); + +/** + * Unlock the internal EAL TAILQ list for shared access. + */ +void +rte_mcfg_tailq_read_unlock(void); + +/** + * Lock the internal EAL TAILQ list for exclusive access. + */ +void +rte_mcfg_tailq_write_lock(void); + +/** + * Unlock the internal EAL TAILQ list for exclusive access. + */ +void +rte_mcfg_tailq_write_unlock(void); + +/** + * Lock the internal EAL Mempool list for shared access. + */ +void +rte_mcfg_mempool_read_lock(void); + +/** + * Unlock the internal EAL Mempool list for shared access. + */ +void +rte_mcfg_mempool_read_unlock(void); + +/** + * Lock the internal EAL Mempool list for exclusive access. + */ +void +rte_mcfg_mempool_write_lock(void); + +/** + * Unlock the internal EAL Mempool list for exclusive access. + */ +void +rte_mcfg_mempool_write_unlock(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Lock the internal EAL Timer Library lock for exclusive access. + */ +__rte_experimental +void +rte_mcfg_timer_lock(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Unlock the internal EAL Timer Library lock for exclusive access. + */ +__rte_experimental +void +rte_mcfg_timer_unlock(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * If true, pages are put in single files (per memseg list), + * as opposed to creating a file per page. + */ +__rte_experimental +bool +rte_mcfg_get_single_file_segments(void); + +#ifdef __cplusplus +} +#endif + +#endif /*__RTE_EAL_MEMCONFIG_H_*/ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_eal_trace.h b/src/spdk/dpdk/lib/librte_eal/include/rte_eal_trace.h new file mode 100644 index 000000000..1ebb2905a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_eal_trace.h @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#ifndef _RTE_EAL_TRACE_H_ +#define _RTE_EAL_TRACE_H_ + +/** + * @file + * + * API for EAL trace support + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_alarm.h> +#include <rte_interrupts.h> +#include <rte_trace_point.h> + +/* Generic */ +RTE_TRACE_POINT( + rte_eal_trace_generic_void, + RTE_TRACE_POINT_ARGS(void), +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_u64, + RTE_TRACE_POINT_ARGS(uint64_t in), + rte_trace_point_emit_u64(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_u32, + RTE_TRACE_POINT_ARGS(uint32_t in), + rte_trace_point_emit_u32(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_u16, + RTE_TRACE_POINT_ARGS(uint16_t in), + rte_trace_point_emit_u16(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_u8, + RTE_TRACE_POINT_ARGS(uint8_t in), + rte_trace_point_emit_u8(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_i64, + RTE_TRACE_POINT_ARGS(int64_t in), + rte_trace_point_emit_i64(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_i32, + RTE_TRACE_POINT_ARGS(int32_t in), + rte_trace_point_emit_i32(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_i16, + RTE_TRACE_POINT_ARGS(int16_t in), + rte_trace_point_emit_i16(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_i8, + RTE_TRACE_POINT_ARGS(int8_t in), + rte_trace_point_emit_i8(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_int, + RTE_TRACE_POINT_ARGS(int in), + rte_trace_point_emit_int(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_long, + RTE_TRACE_POINT_ARGS(long in), + rte_trace_point_emit_long(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_float, + RTE_TRACE_POINT_ARGS(float in), + rte_trace_point_emit_float(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_double, + RTE_TRACE_POINT_ARGS(double in), + rte_trace_point_emit_double(in); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_ptr, + RTE_TRACE_POINT_ARGS(const void *ptr), + rte_trace_point_emit_ptr(ptr); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_str, + RTE_TRACE_POINT_ARGS(const char *str), + rte_trace_point_emit_string(str); +) + +RTE_TRACE_POINT( + rte_eal_trace_generic_func, + RTE_TRACE_POINT_ARGS(const char *func), + rte_trace_point_emit_string(func); +) + +#define RTE_EAL_TRACE_GENERIC_FUNC rte_eal_trace_generic_func(__func__) + +/* Alarm */ +RTE_TRACE_POINT( + rte_eal_trace_alarm_set, + RTE_TRACE_POINT_ARGS(uint64_t us, rte_eal_alarm_callback cb_fn, + void *cb_arg, int rc), + rte_trace_point_emit_u64(us); + rte_trace_point_emit_ptr(cb_fn); + rte_trace_point_emit_ptr(cb_arg); + rte_trace_point_emit_int(rc); +) + +RTE_TRACE_POINT( + rte_eal_trace_alarm_cancel, + RTE_TRACE_POINT_ARGS(rte_eal_alarm_callback cb_fn, void *cb_arg, + int count), + rte_trace_point_emit_ptr(cb_fn); + rte_trace_point_emit_ptr(cb_arg); + rte_trace_point_emit_int(count); +) + +/* Memory */ +RTE_TRACE_POINT( + rte_eal_trace_mem_zmalloc, + RTE_TRACE_POINT_ARGS(const char *type, size_t size, unsigned int align, + int socket, void *ptr), + rte_trace_point_emit_string(type); + rte_trace_point_emit_long(size); + rte_trace_point_emit_u32(align); + rte_trace_point_emit_int(socket); + rte_trace_point_emit_ptr(ptr); +) + +RTE_TRACE_POINT( + rte_eal_trace_mem_malloc, + RTE_TRACE_POINT_ARGS(const char *type, size_t size, unsigned int align, + int socket, void *ptr), + rte_trace_point_emit_string(type); + rte_trace_point_emit_long(size); + rte_trace_point_emit_u32(align); + rte_trace_point_emit_int(socket); + rte_trace_point_emit_ptr(ptr); +) + +RTE_TRACE_POINT( + rte_eal_trace_mem_realloc, + RTE_TRACE_POINT_ARGS(size_t size, unsigned int align, int socket, + void *ptr), + rte_trace_point_emit_long(size); + rte_trace_point_emit_u32(align); + rte_trace_point_emit_int(socket); + rte_trace_point_emit_ptr(ptr); +) + +RTE_TRACE_POINT( + rte_eal_trace_mem_free, + RTE_TRACE_POINT_ARGS(void *ptr), + rte_trace_point_emit_ptr(ptr); +) + +/* Memzone */ +RTE_TRACE_POINT( + rte_eal_trace_memzone_reserve, + RTE_TRACE_POINT_ARGS(const char *name, size_t len, int socket_id, + unsigned int flags, unsigned int align, unsigned int bound, + const void *mz), + rte_trace_point_emit_string(name); + rte_trace_point_emit_long(len); + rte_trace_point_emit_int(socket_id); + rte_trace_point_emit_u32(flags); + rte_trace_point_emit_u32(align); + rte_trace_point_emit_u32(bound); + rte_trace_point_emit_ptr(mz); +) + +RTE_TRACE_POINT( + rte_eal_trace_memzone_lookup, + RTE_TRACE_POINT_ARGS(const char *name, const void *memzone), + rte_trace_point_emit_string(name); + rte_trace_point_emit_ptr(memzone); +) + +RTE_TRACE_POINT( + rte_eal_trace_memzone_free, + RTE_TRACE_POINT_ARGS(const char *name, void *addr, int rc), + rte_trace_point_emit_string(name); + rte_trace_point_emit_ptr(addr); + rte_trace_point_emit_int(rc); +) + +/* Thread */ +RTE_TRACE_POINT( + rte_eal_trace_thread_remote_launch, + RTE_TRACE_POINT_ARGS(int (*f)(void *), void *arg, + unsigned int slave_id, int rc), + rte_trace_point_emit_ptr(f); + rte_trace_point_emit_ptr(arg); + rte_trace_point_emit_u32(slave_id); + rte_trace_point_emit_int(rc); +) +RTE_TRACE_POINT( + rte_eal_trace_thread_lcore_ready, + RTE_TRACE_POINT_ARGS(unsigned int lcore_id, const char *cpuset), + rte_trace_point_emit_u32(lcore_id); + rte_trace_point_emit_string(cpuset); +) + +/* Interrupt */ +RTE_TRACE_POINT( + rte_eal_trace_intr_callback_register, + RTE_TRACE_POINT_ARGS(const struct rte_intr_handle *handle, + rte_intr_callback_fn cb, void *cb_arg, int rc), + rte_trace_point_emit_int(rc); + rte_trace_point_emit_int(handle->vfio_dev_fd); + rte_trace_point_emit_int(handle->fd); + rte_trace_point_emit_int(handle->type); + rte_trace_point_emit_u32(handle->max_intr); + rte_trace_point_emit_u32(handle->nb_efd); + rte_trace_point_emit_ptr(cb); + rte_trace_point_emit_ptr(cb_arg); +) +RTE_TRACE_POINT( + rte_eal_trace_intr_callback_unregister, + RTE_TRACE_POINT_ARGS(const struct rte_intr_handle *handle, + rte_intr_callback_fn cb, void *cb_arg, int rc), + rte_trace_point_emit_int(rc); + rte_trace_point_emit_int(handle->vfio_dev_fd); + rte_trace_point_emit_int(handle->fd); + rte_trace_point_emit_int(handle->type); + rte_trace_point_emit_u32(handle->max_intr); + rte_trace_point_emit_u32(handle->nb_efd); + rte_trace_point_emit_ptr(cb); + rte_trace_point_emit_ptr(cb_arg); +) +RTE_TRACE_POINT( + rte_eal_trace_intr_enable, + RTE_TRACE_POINT_ARGS(const struct rte_intr_handle *handle, int rc), + rte_trace_point_emit_int(rc); + rte_trace_point_emit_int(handle->vfio_dev_fd); + rte_trace_point_emit_int(handle->fd); + rte_trace_point_emit_int(handle->type); + rte_trace_point_emit_u32(handle->max_intr); + rte_trace_point_emit_u32(handle->nb_efd); +) +RTE_TRACE_POINT( + rte_eal_trace_intr_disable, + RTE_TRACE_POINT_ARGS(const struct rte_intr_handle *handle, int rc), + rte_trace_point_emit_int(rc); + rte_trace_point_emit_int(handle->vfio_dev_fd); + rte_trace_point_emit_int(handle->fd); + rte_trace_point_emit_int(handle->type); + rte_trace_point_emit_u32(handle->max_intr); + rte_trace_point_emit_u32(handle->nb_efd); +) + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_EAL_TRACE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_errno.h b/src/spdk/dpdk/lib/librte_eal/include/rte_errno.h new file mode 100644 index 000000000..ba45591d2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_errno.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * + * API for error cause tracking + */ + +#ifndef _RTE_ERRNO_H_ +#define _RTE_ERRNO_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_per_lcore.h> + +RTE_DECLARE_PER_LCORE(int, _rte_errno); /**< Per core error number. */ + +/** + * Error number value, stored per-thread, which can be queried after + * calls to certain functions to determine why those functions failed. + * + * Uses standard values from errno.h wherever possible, with a small number + * of additional possible values for RTE-specific conditions. + */ +#define rte_errno RTE_PER_LCORE(_rte_errno) + +/** + * Function which returns a printable string describing a particular + * error code. For non-RTE-specific error codes, this function returns + * the value from the libc strerror function. + * + * @param errnum + * The error number to be looked up - generally the value of rte_errno + * @return + * A pointer to a thread-local string containing the text describing + * the error. + */ +const char *rte_strerror(int errnum); + +#ifndef __ELASTERROR +/** + * Check if we have a defined value for the max system-defined errno values. + * if no max defined, start from 1000 to prevent overlap with standard values + */ +#define __ELASTERROR 1000 +#endif + +/** Error types */ +enum { + RTE_MIN_ERRNO = __ELASTERROR, /**< Start numbering above std errno vals */ + + E_RTE_SECONDARY, /**< Operation not allowed in secondary processes */ + E_RTE_NO_CONFIG, /**< Missing rte_config */ + + RTE_MAX_ERRNO /**< Max RTE error number */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ERRNO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_fbarray.h b/src/spdk/dpdk/lib/librte_eal/include/rte_fbarray.h new file mode 100644 index 000000000..6dccdbec9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_fbarray.h @@ -0,0 +1,565 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef RTE_FBARRAY_H +#define RTE_FBARRAY_H + +/** + * @file + * + * File-backed shared indexed array for DPDK. + * + * Basic workflow is expected to be the following: + * 1) Allocate array either using ``rte_fbarray_init()`` or + * ``rte_fbarray_attach()`` (depending on whether it's shared between + * multiple DPDK processes) + * 2) find free spots using ``rte_fbarray_find_next_free()`` + * 3) get pointer to data in the free spot using ``rte_fbarray_get()``, and + * copy data into the pointer (element size is fixed) + * 4) mark entry as used using ``rte_fbarray_set_used()`` + * + * Calls to ``rte_fbarray_init()`` and ``rte_fbarray_destroy()`` will have + * consequences for all processes, while calls to ``rte_fbarray_attach()`` and + * ``rte_fbarray_detach()`` will only have consequences within a single process. + * Therefore, it is safe to call ``rte_fbarray_attach()`` or + * ``rte_fbarray_detach()`` while another process is using ``rte_fbarray``, + * provided no other thread within the same process will try to use + * ``rte_fbarray`` before attaching or after detaching. It is not safe to call + * ``rte_fbarray_init()`` or ``rte_fbarray_destroy()`` while another thread or + * another process is using ``rte_fbarray``. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdio.h> + +#include <rte_compat.h> +#include <rte_rwlock.h> + +#define RTE_FBARRAY_NAME_LEN 64 + +struct rte_fbarray { + char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */ + unsigned int count; /**< number of entries stored */ + unsigned int len; /**< current length of the array */ + unsigned int elt_sz; /**< size of each element */ + void *data; /**< data pointer */ + rte_rwlock_t rwlock; /**< multiprocess lock */ +}; + +/** + * Set up ``rte_fbarray`` structure and allocate underlying resources. + * + * Call this function to correctly set up ``rte_fbarray`` and allocate + * underlying files that will be backing the data in the current process. Note + * that in order to use and share ``rte_fbarray`` between multiple processes, + * data pointed to by ``arr`` pointer must itself be allocated in shared memory. + * + * @param arr + * Valid pointer to allocated ``rte_fbarray`` structure. + * + * @param name + * Unique name to be assigned to this array. + * + * @param len + * Number of elements initially available in the array. + * + * @param elt_sz + * Size of each element. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len, + unsigned int elt_sz); + + +/** + * Attach to a file backing an already allocated and correctly set up + * ``rte_fbarray`` structure. + * + * Call this function to attach to file that will be backing the data in the + * current process. The structure must have been previously correctly set up + * with a call to ``rte_fbarray_init()``. Calls to ``rte_fbarray_attach()`` are + * usually meant to be performed in a multiprocessing scenario, with data + * pointed to by ``arr`` pointer allocated in shared memory. + * + * @param arr + * Valid pointer to allocated and correctly set up rte_fbarray structure. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_attach(struct rte_fbarray *arr); + + +/** + * Deallocate resources for an already allocated and correctly set up + * ``rte_fbarray`` structure, and remove the underlying file. + * + * Call this function to deallocate all resources associated with an + * ``rte_fbarray`` structure within the current process. This will also + * zero-fill data pointed to by ``arr`` pointer and remove the underlying file + * backing the data, so it is expected that by the time this function is called, + * all other processes have detached from this ``rte_fbarray``. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_destroy(struct rte_fbarray *arr); + + +/** + * Deallocate resources for an already allocated and correctly set up + * ``rte_fbarray`` structure. + * + * Call this function to deallocate all resources associated with an + * ``rte_fbarray`` structure within current process. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_detach(struct rte_fbarray *arr); + + +/** + * Get pointer to element residing at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Index of an element to get a pointer to. + * + * @return + * - non-NULL pointer on success. + * - NULL on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +void * +rte_fbarray_get(const struct rte_fbarray *arr, unsigned int idx); + + +/** + * Find index of a specified element within the array. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param elt + * Pointer to element to find index to. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt); + + +/** + * Mark specified element as used. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Element index to mark as used. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_set_used(struct rte_fbarray *arr, unsigned int idx); + + +/** + * Mark specified element as free. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Element index to mark as free. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_set_free(struct rte_fbarray *arr, unsigned int idx); + + +/** + * Check whether element at specified index is marked as used. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Element index to check as used. + * + * @return + * - 1 if element is used. + * - 0 if element is unused. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_is_used(struct rte_fbarray *arr, unsigned int idx); + + +/** + * Find index of next free element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of next used element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_next_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of next chunk of ``n`` free elements, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of free elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_next_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find index of next chunk of ``n`` used elements, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of used elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_next_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find how many more free entries there are, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_contig_free(struct rte_fbarray *arr, + unsigned int start); + + +/** + * Find how many more used entries there are, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start); + +/** + * Find index of previous free element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_prev_free(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of previous used element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_prev_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find lowest start index of chunk of ``n`` free elements, down from specified + * index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of free elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_prev_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find lowest start index of chunk of ``n`` used elements, down from specified + * index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of used elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_prev_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find how many more free entries there are before specified index (like + * ``rte_fbarray_find_contig_free`` but going in reverse). + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_rev_contig_free(struct rte_fbarray *arr, + unsigned int start); + + +/** + * Find how many more used entries there are before specified index (like + * ``rte_fbarray_find_contig_used`` but going in reverse). + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_rev_contig_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of biggest chunk of free elements, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_biggest_free(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of biggest chunk of used elements, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_biggest_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of biggest chunk of free elements before a specified index (like + * ``rte_fbarray_find_biggest_free``, but going in reverse). + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_rev_biggest_free(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of biggest chunk of used elements before a specified index (like + * ``rte_fbarray_find_biggest_used``, but going in reverse). + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +__rte_experimental +int +rte_fbarray_find_rev_biggest_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Dump ``rte_fbarray`` metadata. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param f + * File object to dump information into. + */ +__rte_experimental +void +rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_FBARRAY_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_function_versioning.h b/src/spdk/dpdk/lib/librte_eal/include/rte_function_versioning.h new file mode 100644 index 000000000..f588f2643 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_function_versioning.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Neil Horman <nhorman@tuxdriver.com>. + * All rights reserved. + */ + +#ifndef _RTE_FUNCTION_VERSIONING_H_ +#define _RTE_FUNCTION_VERSIONING_H_ +#include <rte_common.h> + +#ifndef RTE_USE_FUNCTION_VERSIONING +#error Use of function versioning disabled, is "use_function_versioning=true" in meson.build? +#endif + +#ifdef RTE_BUILD_SHARED_LIB + +/* + * Provides backwards compatibility when updating exported functions. + * When a symol is exported from a library to provide an API, it also provides a + * calling convention (ABI) that is embodied in its name, return type, + * arguments, etc. On occasion that function may need to change to accommodate + * new functionality, behavior, etc. When that occurs, it is desirable to + * allow for backwards compatibility for a time with older binaries that are + * dynamically linked to the dpdk. To support that, the __vsym and + * VERSION_SYMBOL macros are created. They, in conjunction with the + * <library>_version.map file for a given library allow for multiple versions of + * a symbol to exist in a shared library so that older binaries need not be + * immediately recompiled. + * + * Refer to the guidelines document in the docs subdirectory for details on the + * use of these macros + */ + +/* + * Macro Parameters: + * b - function base name + * e - function version extension, to be concatenated with base name + * n - function symbol version string to be applied + * f - function prototype + * p - full function symbol name + */ + +/* + * VERSION_SYMBOL + * Creates a symbol version table entry binding symbol <b>@DPDK_<n> to the internal + * function name <b><e> + */ +#define VERSION_SYMBOL(b, e, n) __asm__(".symver " RTE_STR(b) RTE_STR(e) ", " RTE_STR(b) "@DPDK_" RTE_STR(n)) + +/* + * VERSION_SYMBOL_EXPERIMENTAL + * Creates a symbol version table entry binding the symbol <b>@EXPERIMENTAL to the internal + * function name <b><e>. The macro is used when a symbol matures to become part of the stable ABI, + * to provide an alias to experimental for some time. + */ +#define VERSION_SYMBOL_EXPERIMENTAL(b, e) __asm__(".symver " RTE_STR(b) RTE_STR(e) ", " RTE_STR(b) "@EXPERIMENTAL") + +/* + * BIND_DEFAULT_SYMBOL + * Creates a symbol version entry instructing the linker to bind references to + * symbol <b> to the internal symbol <b><e> + */ +#define BIND_DEFAULT_SYMBOL(b, e, n) __asm__(".symver " RTE_STR(b) RTE_STR(e) ", " RTE_STR(b) "@@DPDK_" RTE_STR(n)) + +/* + * __vsym + * Annotation to be used in declaration of the internal symbol <b><e> to signal + * that it is being used as an implementation of a particular version of symbol + * <b>. + */ +#define __vsym __rte_used + +/* + * MAP_STATIC_SYMBOL + * If a function has been bifurcated into multiple versions, none of which + * are defined as the exported symbol name in the map file, this macro can be + * used to alias a specific version of the symbol to its exported name. For + * example, if you have 2 versions of a function foo_v1 and foo_v2, where the + * former is mapped to foo@DPDK_1 and the latter is mapped to foo@DPDK_2 when + * building a shared library, this macro can be used to map either foo_v1 or + * foo_v2 to the symbol foo when building a static library, e.g.: + * MAP_STATIC_SYMBOL(void foo(), foo_v2); + */ +#define MAP_STATIC_SYMBOL(f, p) + +#else +/* + * No symbol versioning in use + */ +#define VERSION_SYMBOL(b, e, n) +#define VERSION_SYMBOL_EXPERIMENTAL(b, e) +#define __vsym +#define BIND_DEFAULT_SYMBOL(b, e, n) +#define MAP_STATIC_SYMBOL(f, p) f __attribute__((alias(RTE_STR(p)))) +/* + * RTE_BUILD_SHARED_LIB=n + */ +#endif + +#endif /* _RTE_FUNCTION_VERSIONING_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_hexdump.h b/src/spdk/dpdk/lib/librte_eal/include/rte_hexdump.h new file mode 100644 index 000000000..2d03c089c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_hexdump.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_HEXDUMP_H_ +#define _RTE_HEXDUMP_H_ + +/** + * @file + * Simple API to dump out memory in a special hex format. + */ + +#include <stdio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** +* Dump out memory in a special hex dump format. +* +* @param f +* A pointer to a file for output +* @param title +* If not NULL this string is printed as a header to the output. +* @param buf +* This is the buffer address to print out. +* @param len +* The number of bytes to dump out +* @return +* None. +*/ + +extern void +rte_hexdump(FILE *f, const char * title, const void * buf, unsigned int len); + +/** +* Dump out memory in a hex format with colons between bytes. +* +* @param f +* A pointer to a file for output +* @param title +* If not NULL this string is printed as a header to the output. +* @param buf +* This is the buffer address to print out. +* @param len +* The number of bytes to dump out +* @return +* None. +*/ + +void +rte_memdump(FILE *f, const char * title, const void * buf, unsigned int len); + + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_HEXDUMP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_hypervisor.h b/src/spdk/dpdk/lib/librte_eal/include/rte_hypervisor.h new file mode 100644 index 000000000..5fe719c1d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_hypervisor.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef RTE_HYPERVISOR_H +#define RTE_HYPERVISOR_H + +/** + * @file + * Hypervisor awareness. + */ + +enum rte_hypervisor { + RTE_HYPERVISOR_NONE, + RTE_HYPERVISOR_KVM, + RTE_HYPERVISOR_HYPERV, + RTE_HYPERVISOR_VMWARE, + RTE_HYPERVISOR_UNKNOWN +}; + +/** + * Get the id of hypervisor it is running on. + */ +enum rte_hypervisor +rte_hypervisor_get(void); + +/** + * Get the name of a given hypervisor id. + */ +const char * +rte_hypervisor_get_name(enum rte_hypervisor id); + +#endif /* RTE_HYPERVISOR_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_interrupts.h b/src/spdk/dpdk/lib/librte_eal/include/rte_interrupts.h new file mode 100644 index 000000000..e3b406abc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_interrupts.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_INTERRUPTS_H_ +#define _RTE_INTERRUPTS_H_ + +#include <rte_common.h> +#include <rte_compat.h> + +/** + * @file + * + * The RTE interrupt interface provides functions to register/unregister + * callbacks for a specific interrupt. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** Interrupt handle */ +struct rte_intr_handle; + +/** Function to be registered for the specific interrupt */ +typedef void (*rte_intr_callback_fn)(void *cb_arg); + +/** + * Function to call after a callback is unregistered. + * Can be used to close fd and free cb_arg. + */ +typedef void (*rte_intr_unregister_callback_fn)(struct rte_intr_handle *intr_handle, + void *cb_arg); + +#include "rte_eal_interrupts.h" + +/** + * It registers the callback for the specific interrupt. Multiple + * callbacks can be registered at the same time. + * @param intr_handle + * Pointer to the interrupt handle. + * @param cb + * callback address. + * @param cb_arg + * address of parameter for callback. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg); + +/** + * It unregisters the callback according to the specified interrupt handle. + * + * @param intr_handle + * pointer to the interrupt handle. + * @param cb + * callback address. + * @param cb_arg + * address of parameter for callback, (void *)-1 means to remove all + * registered which has the same callback address. + * + * @return + * - On success, return the number of callback entities removed. + * - On failure, a negative value. + */ +int rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg); + +/** + * Unregister the callback according to the specified interrupt handle, + * after it's no longer active. Fail if source is not active. + * + * @param intr_handle + * pointer to the interrupt handle. + * @param cb_fn + * callback address. + * @param cb_arg + * address of parameter for callback, (void *)-1 means to remove all + * registered which has the same callback address. + * @param ucb_fn + * callback to call before cb is unregistered (optional). + * can be used to close fd and free cb_arg. + * + * @return + * - On success, return the number of callback entities marked for remove. + * - On failure, a negative value. + */ +__rte_experimental +int +rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg, + rte_intr_unregister_callback_fn ucb_fn); + +/** + * It enables the interrupt for the specified handle. + * + * @param intr_handle + * pointer to the interrupt handle. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int rte_intr_enable(const struct rte_intr_handle *intr_handle); + +/** + * It disables the interrupt for the specified handle. + * + * @param intr_handle + * pointer to the interrupt handle. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int rte_intr_disable(const struct rte_intr_handle *intr_handle); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * It acknowledges an interrupt raised for the specified handle. + * + * This function should be called at the end of each interrupt handler either + * from application or driver, so that currently raised interrupt is acked and + * further new interrupts are raised. + * + * @param intr_handle + * pointer to the interrupt handle. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +__rte_experimental +int rte_intr_ack(const struct rte_intr_handle *intr_handle); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_keepalive.h b/src/spdk/dpdk/lib/librte_eal/include/rte_keepalive.h new file mode 100644 index 000000000..4bda7ca56 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_keepalive.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015-2016 Intel Corporation. + */ + +/** + * @file rte_keepalive.h + * DPDK RTE LCore Keepalive Monitor. + * + **/ + +#ifndef _KEEPALIVE_H_ +#define _KEEPALIVE_H_ + +#include <rte_config.h> +#include <rte_memory.h> + +#ifndef RTE_KEEPALIVE_MAXCORES +/** + * Number of cores to track. + * @note Must be larger than the highest core id. */ +#define RTE_KEEPALIVE_MAXCORES RTE_MAX_LCORE +#endif + +enum rte_keepalive_state { + RTE_KA_STATE_UNUSED = 0, + RTE_KA_STATE_ALIVE = 1, + RTE_KA_STATE_MISSING = 4, + RTE_KA_STATE_DEAD = 2, + RTE_KA_STATE_GONE = 3, + RTE_KA_STATE_DOZING = 5, + RTE_KA_STATE_SLEEP = 6 +}; + +/** + * Keepalive failure callback. + * + * Receives a data pointer passed to rte_keepalive_create() and the id of the + * failed core. + * @param data Data pointer passed to rte_keepalive_create() + * @param id_core ID of the core that has failed + */ +typedef void (*rte_keepalive_failure_callback_t)( + void *data, + const int id_core); + +/** + * Keepalive relay callback. + * + * Receives a data pointer passed to rte_keepalive_register_relay_callback(), + * the id of the core for which state is to be forwarded, and details of the + * current core state. + * @param data Data pointer passed to rte_keepalive_register_relay_callback() + * @param id_core ID of the core for which state is being reported + * @param core_state The current state of the core + * @param Timestamp of when core was last seen alive + */ +typedef void (*rte_keepalive_relay_callback_t)( + void *data, + const int id_core, + enum rte_keepalive_state core_state, + uint64_t last_seen + ); + +/** + * Keepalive state structure. + * @internal + */ +struct rte_keepalive; + +/** + * Initialise keepalive sub-system. + * @param callback + * Function called upon detection of a dead core. + * @param data + * Data pointer to be passed to function callback. + * @return + * Keepalive structure success, NULL on failure. + */ +struct rte_keepalive *rte_keepalive_create( + rte_keepalive_failure_callback_t callback, + void *data); + +/** + * Checks & handles keepalive state of monitored cores. + * @param *ptr_timer Triggering timer (unused) + * @param *ptr_data Data pointer (keepalive structure) + */ +void rte_keepalive_dispatch_pings(void *ptr_timer, void *ptr_data); + +/** + * Registers a core for keepalive checks. + * @param *keepcfg + * Keepalive structure pointer + * @param id_core + * ID number of core to register. + */ +void rte_keepalive_register_core(struct rte_keepalive *keepcfg, + const int id_core); + +/** + * Per-core keepalive check. + * @param *keepcfg + * Keepalive structure pointer + * + * This function needs to be called from within the main process loop of + * the LCore to be checked. + */ +void +rte_keepalive_mark_alive(struct rte_keepalive *keepcfg); + +/** + * Per-core sleep-time indication. + * @param *keepcfg + * Keepalive structure pointer + * + * If CPU idling is enabled, this function needs to be called from within + * the main process loop of the LCore going to sleep, in order to avoid + * the LCore being mis-detected as dead. + */ +void +rte_keepalive_mark_sleep(struct rte_keepalive *keepcfg); + +/** + * Registers a 'live core' callback. + * + * The complement of the 'dead core' callback. This is called when a + * core is known to be alive, and is intended for cases when an app + * needs to know 'liveness' beyond just knowing when a core has died. + * + * @param *keepcfg + * Keepalive structure pointer + * @param callback + * Function called upon detection of a dead core. + * @param data + * Data pointer to be passed to function callback. + */ +void +rte_keepalive_register_relay_callback(struct rte_keepalive *keepcfg, + rte_keepalive_relay_callback_t callback, + void *data); + +#endif /* _KEEPALIVE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_launch.h b/src/spdk/dpdk/lib/librte_eal/include/rte_launch.h new file mode 100644 index 000000000..06a671752 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_launch.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_LAUNCH_H_ +#define _RTE_LAUNCH_H_ + +/** + * @file + * + * Launch tasks on other lcores + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * State of an lcore. + */ +enum rte_lcore_state_t { + WAIT, /**< waiting a new command */ + RUNNING, /**< executing command */ + FINISHED, /**< command executed */ +}; + +/** + * Definition of a remote launch function. + */ +typedef int (lcore_function_t)(void *); + +/** + * Launch a function on another lcore. + * + * To be executed on the MASTER lcore only. + * + * Sends a message to a slave lcore (identified by the slave_id) that + * is in the WAIT state (this is true after the first call to + * rte_eal_init()). This can be checked by first calling + * rte_eal_wait_lcore(slave_id). + * + * When the remote lcore receives the message, it switches to + * the RUNNING state, then calls the function f with argument arg. Once the + * execution is done, the remote lcore switches to a FINISHED state and + * the return value of f is stored in a local variable to be read using + * rte_eal_wait_lcore(). + * + * The MASTER lcore returns as soon as the message is sent and knows + * nothing about the completion of f. + * + * Note: This function is not designed to offer optimum + * performance. It is just a practical way to launch a function on + * another lcore at initialization time. + * + * @param f + * The function to be called. + * @param arg + * The argument for the function. + * @param slave_id + * The identifier of the lcore on which the function should be executed. + * @return + * - 0: Success. Execution of function f started on the remote lcore. + * - (-EBUSY): The remote lcore is not in a WAIT state. + */ +int rte_eal_remote_launch(lcore_function_t *f, void *arg, unsigned slave_id); + +/** + * This enum indicates whether the master core must execute the handler + * launched on all logical cores. + */ +enum rte_rmt_call_master_t { + SKIP_MASTER = 0, /**< lcore handler not executed by master core. */ + CALL_MASTER, /**< lcore handler executed by master core. */ +}; + +/** + * Launch a function on all lcores. + * + * Check that each SLAVE lcore is in a WAIT state, then call + * rte_eal_remote_launch() for each lcore. + * + * @param f + * The function to be called. + * @param arg + * The argument for the function. + * @param call_master + * If call_master set to SKIP_MASTER, the MASTER lcore does not call + * the function. If call_master is set to CALL_MASTER, the function + * is also called on master before returning. In any case, the master + * lcore returns as soon as it finished its job and knows nothing + * about the completion of f on the other lcores. + * @return + * - 0: Success. Execution of function f started on all remote lcores. + * - (-EBUSY): At least one remote lcore is not in a WAIT state. In this + * case, no message is sent to any of the lcores. + */ +int rte_eal_mp_remote_launch(lcore_function_t *f, void *arg, + enum rte_rmt_call_master_t call_master); + +/** + * Get the state of the lcore identified by slave_id. + * + * To be executed on the MASTER lcore only. + * + * @param slave_id + * The identifier of the lcore. + * @return + * The state of the lcore. + */ +enum rte_lcore_state_t rte_eal_get_lcore_state(unsigned slave_id); + +/** + * Wait until an lcore finishes its job. + * + * To be executed on the MASTER lcore only. + * + * If the slave lcore identified by the slave_id is in a FINISHED state, + * switch to the WAIT state. If the lcore is in RUNNING state, wait until + * the lcore finishes its job and moves to the FINISHED state. + * + * @param slave_id + * The identifier of the lcore. + * @return + * - 0: If the lcore identified by the slave_id is in a WAIT state. + * - The value that was returned by the previous remote launch + * function call if the lcore identified by the slave_id was in a + * FINISHED or RUNNING state. In this case, it changes the state + * of the lcore to WAIT. + */ +int rte_eal_wait_lcore(unsigned slave_id); + +/** + * Wait until all lcores finish their jobs. + * + * To be executed on the MASTER lcore only. Issue an + * rte_eal_wait_lcore() for every lcore. The return values are + * ignored. + * + * After a call to rte_eal_mp_wait_lcore(), the caller can assume + * that all slave lcores are in a WAIT state. + */ +void rte_eal_mp_wait_lcore(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LAUNCH_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_lcore.h b/src/spdk/dpdk/lib/librte_eal/include/rte_lcore.h new file mode 100644 index 000000000..339046bc8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_lcore.h @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_LCORE_H_ +#define _RTE_LCORE_H_ + +/** + * @file + * + * API for lcore and socket manipulation + * + */ +#include <rte_config.h> +#include <rte_per_lcore.h> +#include <rte_eal.h> +#include <rte_launch.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define LCORE_ID_ANY UINT32_MAX /**< Any lcore. */ + +RTE_DECLARE_PER_LCORE(unsigned, _lcore_id); /**< Per thread "lcore id". */ +RTE_DECLARE_PER_LCORE(rte_cpuset_t, _cpuset); /**< Per thread "cpuset". */ + +/** + * Get a lcore's role. + * + * @param lcore_id + * The identifier of the lcore, which MUST be between 0 and RTE_MAX_LCORE-1. + * @return + * The role of the lcore. + */ +enum rte_lcore_role_t rte_eal_lcore_role(unsigned int lcore_id); + +/** + * Return the Application thread ID of the execution unit. + * + * Note: in most cases the lcore id returned here will also correspond + * to the processor id of the CPU on which the thread is pinned, this + * will not be the case if the user has explicitly changed the thread to + * core affinities using --lcores EAL argument e.g. --lcores '(0-3)@10' + * to run threads with lcore IDs 0, 1, 2 and 3 on physical core 10.. + * + * @return + * Logical core ID (in EAL thread) or LCORE_ID_ANY (in non-EAL thread) + */ +static inline unsigned +rte_lcore_id(void) +{ + return RTE_PER_LCORE(_lcore_id); +} + +/** + * Get the id of the master lcore + * + * @return + * the id of the master lcore + */ +unsigned int rte_get_master_lcore(void); + +/** + * Return the number of execution units (lcores) on the system. + * + * @return + * the number of execution units (lcores) on the system. + */ +unsigned int rte_lcore_count(void); + +/** + * Return the index of the lcore starting from zero. + * + * When option -c or -l is given, the index corresponds + * to the order in the list. + * For example: + * -c 0x30, lcore 4 has index 0, and 5 has index 1. + * -l 22,18 lcore 22 has index 0, and 18 has index 1. + * + * @param lcore_id + * The targeted lcore, or -1 for the current one. + * @return + * The relative index, or -1 if not enabled. + */ +int rte_lcore_index(int lcore_id); + +/** + * Return the ID of the physical socket of the logical core we are + * running on. + * @return + * the ID of current lcoreid's physical socket + */ +unsigned int rte_socket_id(void); + +/** + * Return number of physical sockets detected on the system. + * + * Note that number of nodes may not be correspondent to their physical id's: + * for example, a system may report two socket id's, but the actual socket id's + * may be 0 and 8. + * + * @return + * the number of physical sockets as recognized by EAL + */ +unsigned int +rte_socket_count(void); + +/** + * Return socket id with a particular index. + * + * This will return socket id at a particular position in list of all detected + * physical socket id's. For example, on a machine with sockets [0, 8], passing + * 1 as a parameter will return 8. + * + * @param idx + * index of physical socket id to return + * + * @return + * - physical socket id as recognized by EAL + * - -1 on error, with errno set to EINVAL + */ +int +rte_socket_id_by_idx(unsigned int idx); + +/** + * Get the ID of the physical socket of the specified lcore + * + * @param lcore_id + * the targeted lcore, which MUST be between 0 and RTE_MAX_LCORE-1. + * @return + * the ID of lcoreid's physical socket + */ +unsigned int +rte_lcore_to_socket_id(unsigned int lcore_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Return the id of the lcore on a socket starting from zero. + * + * @param lcore_id + * The targeted lcore, or -1 for the current one. + * @return + * The relative index, or -1 if not enabled. + */ +__rte_experimental +int +rte_lcore_to_cpu_id(int lcore_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Return the cpuset for a given lcore. + * @param lcore_id + * the targeted lcore, which MUST be between 0 and RTE_MAX_LCORE-1. + * @return + * The cpuset of that lcore + */ +__rte_experimental +rte_cpuset_t +rte_lcore_cpuset(unsigned int lcore_id); + +/** + * Test if an lcore is enabled. + * + * @param lcore_id + * The identifier of the lcore, which MUST be between 0 and + * RTE_MAX_LCORE-1. + * @return + * True if the given lcore is enabled; false otherwise. + */ +int rte_lcore_is_enabled(unsigned int lcore_id); + +/** + * Get the next enabled lcore ID. + * + * @param i + * The current lcore (reference). + * @param skip_master + * If true, do not return the ID of the master lcore. + * @param wrap + * If true, go back to 0 when RTE_MAX_LCORE is reached; otherwise, + * return RTE_MAX_LCORE. + * @return + * The next lcore_id or RTE_MAX_LCORE if not found. + */ +unsigned int rte_get_next_lcore(unsigned int i, int skip_master, int wrap); + +/** + * Macro to browse all running lcores. + */ +#define RTE_LCORE_FOREACH(i) \ + for (i = rte_get_next_lcore(-1, 0, 0); \ + i<RTE_MAX_LCORE; \ + i = rte_get_next_lcore(i, 0, 0)) + +/** + * Macro to browse all running lcores except the master lcore. + */ +#define RTE_LCORE_FOREACH_SLAVE(i) \ + for (i = rte_get_next_lcore(-1, 1, 0); \ + i<RTE_MAX_LCORE; \ + i = rte_get_next_lcore(i, 1, 0)) + +/** + * Set core affinity of the current thread. + * Support both EAL and non-EAL thread and update TLS. + * + * @param cpusetp + * Point to cpu_set_t for setting current thread affinity. + * @return + * On success, return 0; otherwise return -1; + */ +int rte_thread_set_affinity(rte_cpuset_t *cpusetp); + +/** + * Get core affinity of the current thread. + * + * @param cpusetp + * Point to cpu_set_t for getting current thread cpu affinity. + * It presumes input is not NULL, otherwise it causes panic. + * + */ +void rte_thread_get_affinity(rte_cpuset_t *cpusetp); + +/** + * Set thread names. + * + * @note It fails with glibc < 2.12. + * + * @param id + * Thread id. + * @param name + * Thread name to set. + * @return + * On success, return 0; otherwise return a negative value. + */ +int rte_thread_setname(pthread_t id, const char *name); + +/** + * Get thread name. + * + * @note It fails with glibc < 2.12. + * + * @param id + * Thread id. + * @param name + * Thread name to set. + * @param len + * Thread name buffer length. + * @return + * On success, return 0; otherwise return a negative value. + */ +__rte_experimental +int rte_thread_getname(pthread_t id, char *name, size_t len); + +/** + * Create a control thread. + * + * Wrapper to pthread_create(), pthread_setname_np() and + * pthread_setaffinity_np(). The affinity of the new thread is based + * on the CPU affinity retrieved at the time rte_eal_init() was called, + * the dataplane and service lcores are then excluded. + * + * @param thread + * Filled with the thread id of the new created thread. + * @param name + * The name of the control thread (max 16 characters including '\0'). + * @param attr + * Attributes for the new thread. + * @param start_routine + * Function to be executed by the new thread. + * @param arg + * Argument passed to start_routine. + * @return + * On success, returns 0; on error, it returns a negative value + * corresponding to the error number. + */ +int +rte_ctrl_thread_create(pthread_t *thread, const char *name, + const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg); + +/** + * Test if the core supplied has a specific role + * + * @param lcore_id + * The identifier of the lcore, which MUST be between 0 and + * RTE_MAX_LCORE-1. + * @param role + * The role to be checked against. + * @return + * Boolean value: positive if test is true; otherwise returns 0. + */ +int +rte_lcore_has_role(unsigned int lcore_id, enum rte_lcore_role_t role); + +#ifdef __cplusplus +} +#endif + + +#endif /* _RTE_LCORE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_log.h b/src/spdk/dpdk/lib/librte_eal/include/rte_log.h new file mode 100644 index 000000000..1789ede56 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_log.h @@ -0,0 +1,383 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _RTE_LOG_H_ +#define _RTE_LOG_H_ + +/** + * @file + * + * RTE Logs API + * + * This file provides a log API to RTE applications. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <stdio.h> +#include <stdarg.h> +#include <stdbool.h> +#include <sys/queue.h> + +#include <rte_common.h> +#include <rte_config.h> +#include <rte_compat.h> + +struct rte_log_dynamic_type; + +/** The rte_log structure. */ +struct rte_logs { + uint32_t type; /**< Bitfield with enabled logs. */ + uint32_t level; /**< Log level. */ + FILE *file; /**< Output file set by rte_openlog_stream, or NULL. */ + size_t dynamic_types_len; + struct rte_log_dynamic_type *dynamic_types; +}; + +/** Global log information */ +extern struct rte_logs rte_logs; + +/* SDK log type */ +#define RTE_LOGTYPE_EAL 0 /**< Log related to eal. */ +#define RTE_LOGTYPE_MALLOC 1 /**< Log related to malloc. */ +#define RTE_LOGTYPE_RING 2 /**< Log related to ring. */ +#define RTE_LOGTYPE_MEMPOOL 3 /**< Log related to mempool. */ +#define RTE_LOGTYPE_TIMER 4 /**< Log related to timers. */ +#define RTE_LOGTYPE_PMD 5 /**< Log related to poll mode driver. */ +#define RTE_LOGTYPE_HASH 6 /**< Log related to hash table. */ +#define RTE_LOGTYPE_LPM 7 /**< Log related to LPM. */ +#define RTE_LOGTYPE_KNI 8 /**< Log related to KNI. */ +#define RTE_LOGTYPE_ACL 9 /**< Log related to ACL. */ +#define RTE_LOGTYPE_POWER 10 /**< Log related to power. */ +#define RTE_LOGTYPE_METER 11 /**< Log related to QoS meter. */ +#define RTE_LOGTYPE_SCHED 12 /**< Log related to QoS port scheduler. */ +#define RTE_LOGTYPE_PORT 13 /**< Log related to port. */ +#define RTE_LOGTYPE_TABLE 14 /**< Log related to table. */ +#define RTE_LOGTYPE_PIPELINE 15 /**< Log related to pipeline. */ +#define RTE_LOGTYPE_MBUF 16 /**< Log related to mbuf. */ +#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */ +#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */ +#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */ +#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */ + +/* these log types can be used in an application */ +#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */ +#define RTE_LOGTYPE_USER2 25 /**< User-defined log type 2. */ +#define RTE_LOGTYPE_USER3 26 /**< User-defined log type 3. */ +#define RTE_LOGTYPE_USER4 27 /**< User-defined log type 4. */ +#define RTE_LOGTYPE_USER5 28 /**< User-defined log type 5. */ +#define RTE_LOGTYPE_USER6 29 /**< User-defined log type 6. */ +#define RTE_LOGTYPE_USER7 30 /**< User-defined log type 7. */ +#define RTE_LOGTYPE_USER8 31 /**< User-defined log type 8. */ + +/** First identifier for extended logs */ +#define RTE_LOGTYPE_FIRST_EXT_ID 32 + +/* Can't use 0, as it gives compiler warnings */ +#define RTE_LOG_EMERG 1U /**< System is unusable. */ +#define RTE_LOG_ALERT 2U /**< Action must be taken immediately. */ +#define RTE_LOG_CRIT 3U /**< Critical conditions. */ +#define RTE_LOG_ERR 4U /**< Error conditions. */ +#define RTE_LOG_WARNING 5U /**< Warning conditions. */ +#define RTE_LOG_NOTICE 6U /**< Normal but significant condition. */ +#define RTE_LOG_INFO 7U /**< Informational. */ +#define RTE_LOG_DEBUG 8U /**< Debug-level messages. */ + +/** + * Change the stream that will be used by the logging system. + * + * This can be done at any time. The f argument represents the stream + * to be used to send the logs. If f is NULL, the default output is + * used (stderr). + * + * @param f + * Pointer to the stream. + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_openlog_stream(FILE *f); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the stream used by the logging system (see rte_openlog_stream() + * to change it). + * + * @return + * Pointer to the stream. + */ +__rte_experimental +FILE *rte_log_get_stream(void); + +/** + * Set the global log level. + * + * After this call, logs with a level lower or equal than the level + * passed as argument will be displayed. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + */ +void rte_log_set_global_level(uint32_t level); + +/** + * Get the global log level. + * + * @return + * The current global log level. + */ +uint32_t rte_log_get_global_level(void); + +/** + * Get the log level for a given type. + * + * @param logtype + * The log type identifier. + * @return + * 0 on success, a negative value if logtype is invalid. + */ +int rte_log_get_level(uint32_t logtype); + +/** + * For a given `logtype`, check if a log with `loglevel` can be printed. + * + * @param logtype + * The log type identifier + * @param loglevel + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @return + * Returns 'true' if log can be printed and 'false' if it can't. + */ +__rte_experimental +bool rte_log_can_log(uint32_t logtype, uint32_t loglevel); + +/** + * Set the log level for a given type based on globbing pattern. + * + * @param pattern + * The globbing pattern identifying the log type. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_pattern(const char *pattern, uint32_t level); + +/** + * Set the log level for a given type based on regular expression. + * + * @param regex + * The regular expression identifying the log type. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_regexp(const char *regex, uint32_t level); + +/** + * Set the log level for a given type. + * + * @param logtype + * The log type identifier. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if logtype or level is invalid. + */ +int rte_log_set_level(uint32_t logtype, uint32_t level); + +/** + * Get the current loglevel for the message being processed. + * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The loglevel of the message being processed. + */ +int rte_log_cur_msg_loglevel(void); + +/** + * Get the current logtype for the message being processed. + * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The logtype of the message being processed. + */ +int rte_log_cur_msg_logtype(void); + +/** + * Register a dynamic log type + * + * If a log is already registered with the same type, the returned value + * is the same than the previous one. + * + * @param name + * The string identifying the log type. + * @return + * - >0: success, the returned value is the log type identifier. + * - (-ENOMEM): cannot allocate memory. + */ +int rte_log_register(const char *name); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register a dynamic log type and try to pick its level from EAL options + * + * rte_log_register() is called inside. If successful, the function tries + * to search for matching regexp in the list of EAL log level options and + * pick the level from the last matching entry. If nothing can be applied + * from the list, the level will be set to the user-defined default value. + * + * @param name + * Name for the log type to be registered + * @param level_def + * Fallback level to be set if the global list has no matching options + * @return + * - >=0: the newly registered log type + * - <0: rte_log_register() error value + */ +__rte_experimental +int rte_log_register_type_and_pick_level(const char *name, uint32_t level_def); + +/** + * Dump log information. + * + * Dump the global level and the registered log types. + * + * @param f + * The output stream where the dump should be sent. + */ +void rte_log_dump(FILE *f); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). + * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_log(uint32_t level, uint32_t logtype, const char *format, ...) +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __rte_cold +#endif +#endif + __rte_format_printf(3, 4); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). + * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. A trailing + * newline may be added if needed. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @param ap + * The va_list of the variable arguments required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap) + __rte_format_printf(3, 0); + +/** + * Generates a log message. + * + * The RTE_LOG() is a helper that prefixes the string with the log level + * and type, and call rte_log(). + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG(l, t, ...) \ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) + +/** + * Generates a log message for data path. + * + * Similar to RTE_LOG(), except that it is removed at compilation time + * if the RTE_LOG_DP_LEVEL configuration option is lower than the log + * level argument. + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG_DP(l, t, ...) \ + (void)((RTE_LOG_ ## l <= RTE_LOG_DP_LEVEL) ? \ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) : \ + 0) + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LOG_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_malloc.h b/src/spdk/dpdk/lib/librte_eal/include/rte_malloc.h new file mode 100644 index 000000000..42ca05182 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_malloc.h @@ -0,0 +1,560 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_MALLOC_H_ +#define _RTE_MALLOC_H_ + +/** + * @file + * RTE Malloc. This library provides methods for dynamically allocating memory + * from hugepages. + */ + +#include <stdio.h> +#include <stddef.h> +#include <rte_compat.h> +#include <rte_memory.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Structure to hold heap statistics obtained from rte_malloc_get_socket_stats function. + */ +struct rte_malloc_socket_stats { + size_t heap_totalsz_bytes; /**< Total bytes on heap */ + size_t heap_freesz_bytes; /**< Total free bytes on heap */ + size_t greatest_free_size; /**< Size in bytes of largest free block */ + unsigned free_count; /**< Number of free elements on heap */ + unsigned alloc_count; /**< Number of allocated elements on heap */ + size_t heap_allocsz_bytes; /**< Total allocated bytes on heap */ +}; + +/** + * This function allocates memory from the huge-page area of memory. The memory + * is not cleared. In NUMA systems, the memory allocated resides on the same + * NUMA socket as the core that calls this function. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param size + * Size (in bytes) to be allocated. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_malloc(const char *type, size_t size, unsigned align); + +/** + * Allocate zero'ed memory from the heap. + * + * Equivalent to rte_malloc() except that the memory zone is + * initialised with zeros. In NUMA systems, the memory allocated resides on the + * same NUMA socket as the core that calls this function. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param size + * Size (in bytes) to be allocated. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_zmalloc(const char *type, size_t size, unsigned align); + +/** + * Replacement function for calloc(), using huge-page memory. Memory area is + * initialised with zeros. In NUMA systems, the memory allocated resides on the + * same NUMA socket as the core that calls this function. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param num + * Number of elements to be allocated. + * @param size + * Size (in bytes) of a single element. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_calloc(const char *type, size_t num, size_t size, unsigned align); + +/** + * Replacement function for realloc(), using huge-page memory. Reserved area + * memory is resized, preserving contents. In NUMA systems, the new area + * may not reside on the same NUMA node as the old one. + * + * @param ptr + * Pointer to already allocated memory + * @param size + * Size (in bytes) of new area. If this is 0, memory is freed. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the reallocated memory. + */ +void * +rte_realloc(void *ptr, size_t size, unsigned int align); + +/** + * Replacement function for realloc(), using huge-page memory. Reserved area + * memory is resized, preserving contents. In NUMA systems, the new area + * resides on requested NUMA socket. + * + * @param ptr + * Pointer to already allocated memory + * @param size + * Size (in bytes) of new area. If this is 0, memory is freed. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @param socket + * NUMA socket to allocate memory on. + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the reallocated memory. + */ +__rte_experimental +void * +rte_realloc_socket(void *ptr, size_t size, unsigned int align, int socket); + +/** + * This function allocates memory from the huge-page area of memory. The memory + * is not cleared. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param size + * Size (in bytes) to be allocated. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @param socket + * NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this function + * will behave the same as rte_malloc(). + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_malloc_socket(const char *type, size_t size, unsigned align, int socket); + +/** + * Allocate zero'ed memory from the heap. + * + * Equivalent to rte_malloc() except that the memory zone is + * initialised with zeros. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param size + * Size (in bytes) to be allocated. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @param socket + * NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this function + * will behave the same as rte_zmalloc(). + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_zmalloc_socket(const char *type, size_t size, unsigned align, int socket); + +/** + * Replacement function for calloc(), using huge-page memory. Memory area is + * initialised with zeros. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param num + * Number of elements to be allocated. + * @param size + * Size (in bytes) of a single element. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @param socket + * NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this function + * will behave the same as rte_calloc(). + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_calloc_socket(const char *type, size_t num, size_t size, unsigned align, int socket); + +/** + * Frees the memory space pointed to by the provided pointer. + * + * This pointer must have been returned by a previous call to + * rte_malloc(), rte_zmalloc(), rte_calloc() or rte_realloc(). The behaviour of + * rte_free() is undefined if the pointer does not match this requirement. + * + * If the pointer is NULL, the function does nothing. + * + * @param ptr + * The pointer to memory to be freed. + */ +void +rte_free(void *ptr); + +/** + * If malloc debug is enabled, check a memory block for header + * and trailer markers to indicate that all is well with the block. + * If size is non-null, also return the size of the block. + * + * @param ptr + * pointer to the start of a data block, must have been returned + * by a previous call to rte_malloc(), rte_zmalloc(), rte_calloc() + * or rte_realloc() + * @param size + * if non-null, and memory block pointer is valid, returns the size + * of the memory block + * @return + * -1 on error, invalid pointer passed or header and trailer markers + * are missing or corrupted + * 0 on success + */ +int +rte_malloc_validate(const void *ptr, size_t *size); + +/** + * Get heap statistics for the specified heap. + * + * @note This function is not thread-safe with respect to + * ``rte_malloc_heap_create()``/``rte_malloc_heap_destroy()`` functions. + * + * @param socket + * An unsigned integer specifying the socket to get heap statistics for + * @param socket_stats + * A structure which provides memory to store statistics + * @return + * Null on error + * Pointer to structure storing statistics on success + */ +int +rte_malloc_get_socket_stats(int socket, + struct rte_malloc_socket_stats *socket_stats); + +/** + * Add memory chunk to a heap with specified name. + * + * @note Multiple memory chunks can be added to the same heap + * + * @note Before accessing this memory in other processes, it needs to be + * attached in each of those processes by calling + * ``rte_malloc_heap_memory_attach`` in each other process. + * + * @note Memory must be previously allocated for DPDK to be able to use it as a + * malloc heap. Failing to do so will result in undefined behavior, up to and + * including segmentation faults. + * + * @note Calling this function will erase any contents already present at the + * supplied memory address. + * + * @param heap_name + * Name of the heap to add memory chunk to + * @param va_addr + * Start of virtual area to add to the heap. Must be aligned by ``page_sz``. + * @param len + * Length of virtual area to add to the heap. Must be aligned by ``page_sz``. + * @param iova_addrs + * Array of page IOVA addresses corresponding to each page in this memory + * area. Can be NULL, in which case page IOVA addresses will be set to + * RTE_BAD_IOVA. + * @param n_pages + * Number of elements in the iova_addrs array. Ignored if ``iova_addrs`` + * is NULL. + * @param page_sz + * Page size of the underlying memory + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to add memory to a reserved heap + * ENOSPC - no more space in internal config to store a new memory chunk + */ +__rte_experimental +int +rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz); + +/** + * Remove memory chunk from heap with specified name. + * + * @note Memory chunk being removed must be the same as one that was added; + * partially removing memory chunks is not supported + * + * @note Memory area must not contain any allocated elements to allow its + * removal from the heap + * + * @note All other processes must detach from the memory chunk prior to it being + * removed from the heap. + * + * @param heap_name + * Name of the heap to remove memory from + * @param va_addr + * Virtual address to remove from the heap + * @param len + * Length of virtual area to remove from the heap + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to remove memory from a reserved heap + * ENOENT - heap or memory chunk was not found + * EBUSY - memory chunk still contains data + */ +__rte_experimental +int +rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len); + +/** + * Attach to an already existing chunk of external memory in another process. + * + * @note This function must be called before any attempt is made to use an + * already existing external memory chunk. This function does *not* need to + * be called if a call to ``rte_malloc_heap_memory_add`` was made in the + * current process. + * + * @param heap_name + * Heap name to which this chunk of memory belongs + * @param va_addr + * Start address of memory chunk to attach to + * @param len + * Length of memory chunk to attach to + * @return + * 0 on successful attach + * -1 on unsuccessful attach, with rte_errno set to indicate cause for error: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to attach memory to a reserved heap + * ENOENT - heap or memory chunk was not found + */ +__rte_experimental +int +rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len); + +/** + * Detach from a chunk of external memory in secondary process. + * + * @note This function must be called in before any attempt is made to remove + * external memory from the heap in another process. This function does *not* + * need to be called if a call to ``rte_malloc_heap_memory_remove`` will be + * called in current process. + * + * @param heap_name + * Heap name to which this chunk of memory belongs + * @param va_addr + * Start address of memory chunk to attach to + * @param len + * Length of memory chunk to attach to + * @return + * 0 on successful detach + * -1 on unsuccessful detach, with rte_errno set to indicate cause for error: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to detach memory from a reserved heap + * ENOENT - heap or memory chunk was not found + */ +__rte_experimental +int +rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len); + +/** + * Creates a new empty malloc heap with a specified name. + * + * @note Heaps created via this call will automatically get assigned a unique + * socket ID, which can be found using ``rte_malloc_heap_get_socket()`` + * + * @param heap_name + * Name of the heap to create. + * + * @return + * - 0 on successful creation + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``heap_name`` was NULL, empty or too long + * EEXIST - heap by name of ``heap_name`` already exists + * ENOSPC - no more space in internal config to store a new heap + */ +__rte_experimental +int +rte_malloc_heap_create(const char *heap_name); + +/** + * Destroys a previously created malloc heap with specified name. + * + * @note This function will return a failure result if not all memory allocated + * from the heap has been freed back to the heap + * + * @note This function will return a failure result if not all memory segments + * were removed from the heap prior to its destruction + * + * @param heap_name + * Name of the heap to create. + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``heap_name`` was NULL, empty or too long + * ENOENT - heap by the name of ``heap_name`` was not found + * EPERM - attempting to destroy reserved heap + * EBUSY - heap still contains data + */ +__rte_experimental +int +rte_malloc_heap_destroy(const char *heap_name); + +/** + * Find socket ID corresponding to a named heap. + * + * @param name + * Heap name to find socket ID for + * @return + * Socket ID in case of success (a non-negative number) + * -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``name`` was NULL + * ENOENT - heap identified by the name ``name`` was not found + */ +__rte_experimental +int +rte_malloc_heap_get_socket(const char *name); + +/** + * Check if a given socket ID refers to externally allocated memory. + * + * @note Passing SOCKET_ID_ANY will return 0. + * + * @param socket_id + * Socket ID to check + * @return + * 1 if socket ID refers to externally allocated memory + * 0 if socket ID refers to internal DPDK memory + * -1 if socket ID is invalid + */ +__rte_experimental +int +rte_malloc_heap_socket_is_external(int socket_id); + +/** + * Dump statistics. + * + * Dump for the specified type to a file. If the type argument is + * NULL, all memory types will be dumped. + * + * @note This function is not thread-safe with respect to + * ``rte_malloc_heap_create()``/``rte_malloc_heap_destroy()`` functions. + * + * @param f + * A pointer to a file for output + * @param type + * A string identifying the type of objects to dump, or NULL + * to dump all objects. + */ +void +rte_malloc_dump_stats(FILE *f, const char *type); + +/** + * Dump contents of all malloc heaps to a file. + * + * @note This function is not thread-safe with respect to + * ``rte_malloc_heap_create()``/``rte_malloc_heap_destroy()`` functions. + * + * @param f + * A pointer to a file for output + */ +__rte_experimental +void +rte_malloc_dump_heaps(FILE *f); + +/** + * Set the maximum amount of allocated memory for this type. + * + * This is not yet implemented + * + * @param type + * A string identifying the type of allocated objects. + * @param max + * The maximum amount of allocated bytes for this type. + * @return + * - 0: Success. + * - (-1): Error. + */ +__rte_deprecated +int +rte_malloc_set_limit(const char *type, size_t max); + +/** + * Return the IO address of a virtual address obtained through + * rte_malloc + * + * @param addr + * Address obtained from a previous rte_malloc call + * @return + * RTE_BAD_IOVA on error + * otherwise return an address suitable for IO + */ +rte_iova_t +rte_malloc_virt2iova(const void *addr); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MALLOC_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_memory.h b/src/spdk/dpdk/lib/librte_eal/include/rte_memory.h new file mode 100644 index 000000000..3d8d0bd69 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_memory.h @@ -0,0 +1,784 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MEMORY_H_ +#define _RTE_MEMORY_H_ + +/** + * @file + * + * Memory-related RTE API. + */ + +#include <stdint.h> +#include <stddef.h> +#include <stdio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include <rte_compat.h> +#include <rte_config.h> +#include <rte_fbarray.h> + +__extension__ +enum rte_page_sizes { + RTE_PGSIZE_4K = 1ULL << 12, + RTE_PGSIZE_64K = 1ULL << 16, + RTE_PGSIZE_256K = 1ULL << 18, + RTE_PGSIZE_2M = 1ULL << 21, + RTE_PGSIZE_16M = 1ULL << 24, + RTE_PGSIZE_256M = 1ULL << 28, + RTE_PGSIZE_512M = 1ULL << 29, + RTE_PGSIZE_1G = 1ULL << 30, + RTE_PGSIZE_4G = 1ULL << 32, + RTE_PGSIZE_16G = 1ULL << 34, +}; + +#define SOCKET_ID_ANY -1 /**< Any NUMA socket. */ + +/** + * Physical memory segment descriptor. + */ +#define RTE_MEMSEG_FLAG_DO_NOT_FREE (1 << 0) +/**< Prevent this segment from being freed back to the OS. */ +struct rte_memseg { + RTE_STD_C11 + union { + phys_addr_t phys_addr; /**< deprecated - Start physical address. */ + rte_iova_t iova; /**< Start IO address. */ + }; + RTE_STD_C11 + union { + void *addr; /**< Start virtual address. */ + uint64_t addr_64; /**< Makes sure addr is always 64 bits */ + }; + size_t len; /**< Length of the segment. */ + uint64_t hugepage_sz; /**< The pagesize of underlying memory */ + int32_t socket_id; /**< NUMA socket ID. */ + uint32_t nchannel; /**< Number of channels. */ + uint32_t nrank; /**< Number of ranks. */ + uint32_t flags; /**< Memseg-specific flags */ +} __rte_packed; + +/** + * memseg list is a special case as we need to store a bunch of other data + * together with the array itself. + */ +struct rte_memseg_list { + RTE_STD_C11 + union { + void *base_va; + /**< Base virtual address for this memseg list. */ + uint64_t addr_64; + /**< Makes sure addr is always 64-bits */ + }; + uint64_t page_sz; /**< Page size for all memsegs in this list. */ + int socket_id; /**< Socket ID for all memsegs in this list. */ + volatile uint32_t version; /**< version number for multiprocess sync. */ + size_t len; /**< Length of memory area covered by this memseg list. */ + unsigned int external; /**< 1 if this list points to external memory */ + unsigned int heap; /**< 1 if this list points to a heap */ + struct rte_fbarray memseg_arr; +}; + +/** + * Lock page in physical memory and prevent from swapping. + * + * @param virt + * The virtual address. + * @return + * 0 on success, negative on error. + */ +int rte_mem_lock_page(const void *virt); + +/** + * Get physical address of any mapped virtual address in the current process. + * It is found by browsing the /proc/self/pagemap special file. + * The page must be locked. + * + * @param virt + * The virtual address. + * @return + * The physical address or RTE_BAD_IOVA on error. + */ +phys_addr_t rte_mem_virt2phy(const void *virt); + +/** + * Get IO virtual address of any mapped virtual address in the current process. + * + * @note This function will not check internal page table. Instead, in IOVA as + * PA mode, it will fall back to getting real physical address (which may + * not match the expected IOVA, such as what was specified for external + * memory). + * + * @param virt + * The virtual address. + * @return + * The IO address or RTE_BAD_IOVA on error. + */ +rte_iova_t rte_mem_virt2iova(const void *virt); + +/** + * Get virtual memory address corresponding to iova address. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param iova + * The iova address. + * @return + * Virtual address corresponding to iova address (or NULL if address does not + * exist within DPDK memory map). + */ +__rte_experimental +void * +rte_mem_iova2virt(rte_iova_t iova); + +/** + * Get memseg to which a particular virtual address belongs. + * + * @param virt + * The virtual address. + * @param msl + * The memseg list in which to look up based on ``virt`` address + * (can be NULL). + * @return + * Memseg pointer on success, or NULL on error. + */ +__rte_experimental +struct rte_memseg * +rte_mem_virt2memseg(const void *virt, const struct rte_memseg_list *msl); + +/** + * Get memseg list corresponding to virtual memory address. + * + * @param virt + * The virtual address. + * @return + * Memseg list to which this virtual address belongs to. + */ +__rte_experimental +struct rte_memseg_list * +rte_mem_virt2memseg_list(const void *virt); + +/** + * Memseg walk function prototype. + * + * Returning 0 will continue walk + * Returning 1 will stop the walk + * Returning -1 will stop the walk and report error + */ +typedef int (*rte_memseg_walk_t)(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg); + +/** + * Memseg contig walk function prototype. This will trigger a callback on every + * VA-contiguous area starting at memseg ``ms``, so total valid VA space at each + * callback call will be [``ms->addr``, ``ms->addr + len``). + * + * Returning 0 will continue walk + * Returning 1 will stop the walk + * Returning -1 will stop the walk and report error + */ +typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg); + +/** + * Memseg list walk function prototype. This will trigger a callback on every + * allocated memseg list. + * + * Returning 0 will continue walk + * Returning 1 will stop the walk + * Returning -1 will stop the walk and report error + */ +typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl, + void *arg); + +/** + * Walk list of all memsegs. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +__rte_experimental +int +rte_memseg_walk(rte_memseg_walk_t func, void *arg); + +/** + * Walk each VA-contiguous area. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +__rte_experimental +int +rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg); + +/** + * Walk each allocated memseg list. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +__rte_experimental +int +rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg); + +/** + * Walk list of all memsegs without performing any locking. + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +__rte_experimental +int +rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg); + +/** + * Walk each VA-contiguous area without performing any locking. + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +__rte_experimental +int +rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg); + +/** + * Walk each allocated memseg list without performing any locking. + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +__rte_experimental +int +rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg); + +/** + * Return file descriptor associated with a particular memseg (if available). + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @note This returns an internal file descriptor. Performing any operations on + * this file descriptor is inherently dangerous, so it should be treated + * as read-only for all intents and purposes. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +__rte_experimental +int +rte_memseg_get_fd(const struct rte_memseg *ms); + +/** + * Return file descriptor associated with a particular memseg (if available). + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @note This returns an internal file descriptor. Performing any operations on + * this file descriptor is inherently dangerous, so it should be treated + * as read-only for all intents and purposes. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +__rte_experimental +int +rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms); + +/** + * Get offset into segment file descriptor associated with a particular memseg + * (if available). + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * @param offset + * A pointer to offset value where the result will be stored. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - EINVAL - ``offset`` pointer was NULL + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +__rte_experimental +int +rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset); + +/** + * Get offset into segment file descriptor associated with a particular memseg + * (if available). + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * @param offset + * A pointer to offset value where the result will be stored. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - EINVAL - ``offset`` pointer was NULL + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +__rte_experimental +int +rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms, + size_t *offset); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register external memory chunk with DPDK. + * + * @note Using this API is mutually exclusive with ``rte_malloc`` family of + * API's. + * + * @note This API will not perform any DMA mapping. It is expected that user + * will do that themselves. + * + * @note Before accessing this memory in other processes, it needs to be + * attached in each of those processes by calling ``rte_extmem_attach`` in + * each other process. + * + * @param va_addr + * Start of virtual area to register. Must be aligned by ``page_sz``. + * @param len + * Length of virtual area to register. Must be aligned by ``page_sz``. + * @param iova_addrs + * Array of page IOVA addresses corresponding to each page in this memory + * area. Can be NULL, in which case page IOVA addresses will be set to + * RTE_BAD_IOVA. + * @param n_pages + * Number of elements in the iova_addrs array. Ignored if ``iova_addrs`` + * is NULL. + * @param page_sz + * Page size of the underlying memory + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * EEXIST - memory chunk is already registered + * ENOSPC - no more space in internal config to store a new memory chunk + */ +__rte_experimental +int +rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[], + unsigned int n_pages, size_t page_sz); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Unregister external memory chunk with DPDK. + * + * @note Using this API is mutually exclusive with ``rte_malloc`` family of + * API's. + * + * @note This API will not perform any DMA unmapping. It is expected that user + * will do that themselves. + * + * @note Before calling this function, all other processes must call + * ``rte_extmem_detach`` to detach from the memory area. + * + * @param va_addr + * Start of virtual area to unregister + * @param len + * Length of virtual area to unregister + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * ENOENT - memory chunk was not found + */ +__rte_experimental +int +rte_extmem_unregister(void *va_addr, size_t len); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Attach to external memory chunk registered in another process. + * + * @note Using this API is mutually exclusive with ``rte_malloc`` family of + * API's. + * + * @note This API will not perform any DMA mapping. It is expected that user + * will do that themselves. + * + * @param va_addr + * Start of virtual area to register + * @param len + * Length of virtual area to register + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * ENOENT - memory chunk was not found + */ +__rte_experimental +int +rte_extmem_attach(void *va_addr, size_t len); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Detach from external memory chunk registered in another process. + * + * @note Using this API is mutually exclusive with ``rte_malloc`` family of + * API's. + * + * @note This API will not perform any DMA unmapping. It is expected that user + * will do that themselves. + * + * @param va_addr + * Start of virtual area to unregister + * @param len + * Length of virtual area to unregister + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * ENOENT - memory chunk was not found + */ +__rte_experimental +int +rte_extmem_detach(void *va_addr, size_t len); + +/** + * Dump the physical memory layout to a file. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param f + * A pointer to a file for output + */ +void rte_dump_physmem_layout(FILE *f); + +/** + * Get the total amount of available physical memory. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @return + * The total amount of available physical memory in bytes. + */ +uint64_t rte_eal_get_physmem_size(void); + +/** + * Get the number of memory channels. + * + * @return + * The number of memory channels on the system. The value is 0 if unknown + * or not the same on all devices. + */ +unsigned rte_memory_get_nchannel(void); + +/** + * Get the number of memory ranks. + * + * @return + * The number of memory ranks on the system. The value is 0 if unknown or + * not the same on all devices. + */ +unsigned rte_memory_get_nrank(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Check if all currently allocated memory segments are compliant with + * supplied DMA address width. + * + * @param maskbits + * Address width to check against. + */ +__rte_experimental +int rte_mem_check_dma_mask(uint8_t maskbits); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Check if all currently allocated memory segments are compliant with + * supplied DMA address width. This function will use + * rte_memseg_walk_thread_unsafe instead of rte_memseg_walk implying + * memory_hotplug_lock will not be acquired avoiding deadlock during + * memory initialization. + * + * This function is just for EAL core memory internal use. Drivers should + * use the previous rte_mem_check_dma_mask. + * + * @param maskbits + * Address width to check against. + */ +__rte_experimental +int rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Set dma mask to use once memory initialization is done. Previous functions + * rte_mem_check_dma_mask and rte_mem_check_dma_mask_thread_unsafe can not be + * used safely until memory has been initialized. + */ +__rte_experimental +void rte_mem_set_dma_mask(uint8_t maskbits); + +/** + * Drivers based on uio will not load unless physical + * addresses are obtainable. It is only possible to get + * physical addresses when running as a privileged user. + * + * @return + * 1 if the system is able to obtain physical addresses. + * 0 if using DMA addresses through an IOMMU. + */ +int rte_eal_using_phys_addrs(void); + + +/** + * Enum indicating which kind of memory event has happened. Used by callbacks to + * distinguish between memory allocations and deallocations. + */ +enum rte_mem_event { + RTE_MEM_EVENT_ALLOC = 0, /**< Allocation event. */ + RTE_MEM_EVENT_FREE, /**< Deallocation event. */ +}; +#define RTE_MEM_EVENT_CALLBACK_NAME_LEN 64 +/**< maximum length of callback name */ + +/** + * Function typedef used to register callbacks for memory events. + */ +typedef void (*rte_mem_event_callback_t)(enum rte_mem_event event_type, + const void *addr, size_t len, void *arg); + +/** + * Function used to register callbacks for memory events. + * + * @note callbacks will happen while memory hotplug subsystem is write-locked, + * therefore some functions (e.g. `rte_memseg_walk()`) will cause a + * deadlock when called from within such callbacks. + * + * @note mem event callbacks not being supported is an expected error condition, + * so user code needs to handle this situation. In these cases, return + * value will be -1, and rte_errno will be set to ENOTSUP. + * + * @param name + * Name associated with specified callback to be added to the list. + * + * @param clb + * Callback function pointer. + * + * @param arg + * Argument to pass to the callback. + * + * @return + * 0 on successful callback register + * -1 on unsuccessful callback register, with rte_errno value indicating + * reason for failure. + */ +__rte_experimental +int +rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb, + void *arg); + +/** + * Function used to unregister callbacks for memory events. + * + * @param name + * Name associated with specified callback to be removed from the list. + * + * @param arg + * Argument to look for among callbacks with specified callback name. + * + * @return + * 0 on successful callback unregister + * -1 on unsuccessful callback unregister, with rte_errno value indicating + * reason for failure. + */ +__rte_experimental +int +rte_mem_event_callback_unregister(const char *name, void *arg); + + +#define RTE_MEM_ALLOC_VALIDATOR_NAME_LEN 64 +/**< maximum length of alloc validator name */ +/** + * Function typedef used to register memory allocation validation callbacks. + * + * Returning 0 will allow allocation attempt to continue. Returning -1 will + * prevent allocation from succeeding. + */ +typedef int (*rte_mem_alloc_validator_t)(int socket_id, + size_t cur_limit, size_t new_len); + +/** + * @brief Register validator callback for memory allocations. + * + * Callbacks registered by this function will be called right before memory + * allocator is about to trigger allocation of more pages from the system if + * said allocation will bring total memory usage above specified limit on + * specified socket. User will be able to cancel pending allocation if callback + * returns -1. + * + * @note callbacks will happen while memory hotplug subsystem is write-locked, + * therefore some functions (e.g. `rte_memseg_walk()`) will cause a + * deadlock when called from within such callbacks. + * + * @note validator callbacks not being supported is an expected error condition, + * so user code needs to handle this situation. In these cases, return + * value will be -1, and rte_errno will be set to ENOTSUP. + * + * @param name + * Name associated with specified callback to be added to the list. + * + * @param clb + * Callback function pointer. + * + * @param socket_id + * Socket ID on which to watch for allocations. + * + * @param limit + * Limit above which to trigger callbacks. + * + * @return + * 0 on successful callback register + * -1 on unsuccessful callback register, with rte_errno value indicating + * reason for failure. + */ +__rte_experimental +int +rte_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit); + +/** + * @brief Unregister validator callback for memory allocations. + * + * @param name + * Name associated with specified callback to be removed from the list. + * + * @param socket_id + * Socket ID on which to watch for allocations. + * + * @return + * 0 on successful callback unregister + * -1 on unsuccessful callback unregister, with rte_errno value indicating + * reason for failure. + */ +__rte_experimental +int +rte_mem_alloc_validator_unregister(const char *name, int socket_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMORY_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_memzone.h b/src/spdk/dpdk/lib/librte_eal/include/rte_memzone.h new file mode 100644 index 000000000..091c9522f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_memzone.h @@ -0,0 +1,320 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MEMZONE_H_ +#define _RTE_MEMZONE_H_ + +/** + * @file + * RTE Memzone + * + * The goal of the memzone allocator is to reserve contiguous + * portions of physical memory. These zones are identified by a name. + * + * The memzone descriptors are shared by all partitions and are + * located in a known place of physical memory. This zone is accessed + * using rte_eal_get_configuration(). The lookup (by name) of a + * memory zone can be done in any partition and returns the same + * physical address. + * + * A reserved memory zone cannot be unreserved. The reservation shall + * be done at initialization time only. + */ + +#include <stdio.h> +#include <rte_compat.h> +#include <rte_memory.h> +#include <rte_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_MEMZONE_2MB 0x00000001 /**< Use 2MB pages. */ +#define RTE_MEMZONE_1GB 0x00000002 /**< Use 1GB pages. */ +#define RTE_MEMZONE_16MB 0x00000100 /**< Use 16MB pages. */ +#define RTE_MEMZONE_16GB 0x00000200 /**< Use 16GB pages. */ +#define RTE_MEMZONE_256KB 0x00010000 /**< Use 256KB pages. */ +#define RTE_MEMZONE_256MB 0x00020000 /**< Use 256MB pages. */ +#define RTE_MEMZONE_512MB 0x00040000 /**< Use 512MB pages. */ +#define RTE_MEMZONE_4GB 0x00080000 /**< Use 4GB pages. */ +#define RTE_MEMZONE_SIZE_HINT_ONLY 0x00000004 /**< Use available page size */ +#define RTE_MEMZONE_IOVA_CONTIG 0x00100000 /**< Ask for IOVA-contiguous memzone. */ + +/** + * A structure describing a memzone, which is a contiguous portion of + * physical memory identified by a name. + */ +struct rte_memzone { + +#define RTE_MEMZONE_NAMESIZE 32 /**< Maximum length of memory zone name.*/ + char name[RTE_MEMZONE_NAMESIZE]; /**< Name of the memory zone. */ + + RTE_STD_C11 + union { + phys_addr_t phys_addr; /**< deprecated - Start physical address. */ + rte_iova_t iova; /**< Start IO address. */ + }; + RTE_STD_C11 + union { + void *addr; /**< Start virtual address. */ + uint64_t addr_64; /**< Makes sure addr is always 64-bits */ + }; + size_t len; /**< Length of the memzone. */ + + uint64_t hugepage_sz; /**< The page size of underlying memory */ + + int32_t socket_id; /**< NUMA socket ID. */ + + uint32_t flags; /**< Characteristics of this memzone. */ +} __rte_packed; + +/** + * Reserve a portion of physical memory. + * + * This function reserves some memory and returns a pointer to a + * correctly filled memzone descriptor. If the allocation cannot be + * done, return NULL. + * + * @note Reserving memzones with len set to 0 will only attempt to allocate + * memzones from memory that is already available. It will not trigger any + * new allocations. + * + * @note: When reserving memzones with len set to 0, it is preferable to also + * set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but + * will likely not yield expected results. Specifically, the resulting memzone + * may not necessarily be the biggest memzone available, but rather biggest + * memzone available on socket id corresponding to an lcore from which + * reservation was called. + * + * @param name + * The name of the memzone. If it already exists, the function will + * fail and return NULL. + * @param len + * The size of the memory to be reserved. If it + * is 0, the biggest contiguous zone will be reserved. + * @param socket_id + * The socket identifier in the case of + * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA + * constraint for the reserved zone. + * @param flags + * The flags parameter is used to request memzones to be + * taken from specifically sized hugepages. + * - RTE_MEMZONE_2MB - Reserved from 2MB pages + * - RTE_MEMZONE_1GB - Reserved from 1GB pages + * - RTE_MEMZONE_16MB - Reserved from 16MB pages + * - RTE_MEMZONE_16GB - Reserved from 16GB pages + * - RTE_MEMZONE_256KB - Reserved from 256KB pages + * - RTE_MEMZONE_256MB - Reserved from 256MB pages + * - RTE_MEMZONE_512MB - Reserved from 512MB pages + * - RTE_MEMZONE_4GB - Reserved from 4GB pages + * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if + * the requested page size is unavailable. + * If this flag is not set, the function + * will return error on an unavailable size + * request. + * - RTE_MEMZONE_IOVA_CONTIG - Ensure reserved memzone is IOVA-contiguous. + * This option should be used when allocating + * memory intended for hardware rings etc. + * @return + * A pointer to a correctly-filled read-only memzone descriptor, or NULL + * on error. + * On error case, rte_errno will be set appropriately: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + * - EINVAL - invalid parameters + */ +const struct rte_memzone *rte_memzone_reserve(const char *name, + size_t len, int socket_id, + unsigned flags); + +/** + * Reserve a portion of physical memory with alignment on a specified + * boundary. + * + * This function reserves some memory with alignment on a specified + * boundary, and returns a pointer to a correctly filled memzone + * descriptor. If the allocation cannot be done or if the alignment + * is not a power of 2, returns NULL. + * + * @note Reserving memzones with len set to 0 will only attempt to allocate + * memzones from memory that is already available. It will not trigger any + * new allocations. + * + * @note: When reserving memzones with len set to 0, it is preferable to also + * set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but + * will likely not yield expected results. Specifically, the resulting memzone + * may not necessarily be the biggest memzone available, but rather biggest + * memzone available on socket id corresponding to an lcore from which + * reservation was called. + * + * @param name + * The name of the memzone. If it already exists, the function will + * fail and return NULL. + * @param len + * The size of the memory to be reserved. If it + * is 0, the biggest contiguous zone will be reserved. + * @param socket_id + * The socket identifier in the case of + * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA + * constraint for the reserved zone. + * @param flags + * The flags parameter is used to request memzones to be + * taken from specifically sized hugepages. + * - RTE_MEMZONE_2MB - Reserved from 2MB pages + * - RTE_MEMZONE_1GB - Reserved from 1GB pages + * - RTE_MEMZONE_16MB - Reserved from 16MB pages + * - RTE_MEMZONE_16GB - Reserved from 16GB pages + * - RTE_MEMZONE_256KB - Reserved from 256KB pages + * - RTE_MEMZONE_256MB - Reserved from 256MB pages + * - RTE_MEMZONE_512MB - Reserved from 512MB pages + * - RTE_MEMZONE_4GB - Reserved from 4GB pages + * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if + * the requested page size is unavailable. + * If this flag is not set, the function + * will return error on an unavailable size + * request. + * - RTE_MEMZONE_IOVA_CONTIG - Ensure reserved memzone is IOVA-contiguous. + * This option should be used when allocating + * memory intended for hardware rings etc. + * @param align + * Alignment for resulting memzone. Must be a power of 2. + * @return + * A pointer to a correctly-filled read-only memzone descriptor, or NULL + * on error. + * On error case, rte_errno will be set appropriately: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + * - EINVAL - invalid parameters + */ +const struct rte_memzone *rte_memzone_reserve_aligned(const char *name, + size_t len, int socket_id, + unsigned flags, unsigned align); + +/** + * Reserve a portion of physical memory with specified alignment and + * boundary. + * + * This function reserves some memory with specified alignment and + * boundary, and returns a pointer to a correctly filled memzone + * descriptor. If the allocation cannot be done or if the alignment + * or boundary are not a power of 2, returns NULL. + * Memory buffer is reserved in a way, that it wouldn't cross specified + * boundary. That implies that requested length should be less or equal + * then boundary. + * + * @note Reserving memzones with len set to 0 will only attempt to allocate + * memzones from memory that is already available. It will not trigger any + * new allocations. + * + * @note: When reserving memzones with len set to 0, it is preferable to also + * set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but + * will likely not yield expected results. Specifically, the resulting memzone + * may not necessarily be the biggest memzone available, but rather biggest + * memzone available on socket id corresponding to an lcore from which + * reservation was called. + * + * @param name + * The name of the memzone. If it already exists, the function will + * fail and return NULL. + * @param len + * The size of the memory to be reserved. If it + * is 0, the biggest contiguous zone will be reserved. + * @param socket_id + * The socket identifier in the case of + * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA + * constraint for the reserved zone. + * @param flags + * The flags parameter is used to request memzones to be + * taken from specifically sized hugepages. + * - RTE_MEMZONE_2MB - Reserved from 2MB pages + * - RTE_MEMZONE_1GB - Reserved from 1GB pages + * - RTE_MEMZONE_16MB - Reserved from 16MB pages + * - RTE_MEMZONE_16GB - Reserved from 16GB pages + * - RTE_MEMZONE_256KB - Reserved from 256KB pages + * - RTE_MEMZONE_256MB - Reserved from 256MB pages + * - RTE_MEMZONE_512MB - Reserved from 512MB pages + * - RTE_MEMZONE_4GB - Reserved from 4GB pages + * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if + * the requested page size is unavailable. + * If this flag is not set, the function + * will return error on an unavailable size + * request. + * - RTE_MEMZONE_IOVA_CONTIG - Ensure reserved memzone is IOVA-contiguous. + * This option should be used when allocating + * memory intended for hardware rings etc. + * @param align + * Alignment for resulting memzone. Must be a power of 2. + * @param bound + * Boundary for resulting memzone. Must be a power of 2 or zero. + * Zero value implies no boundary condition. + * @return + * A pointer to a correctly-filled read-only memzone descriptor, or NULL + * on error. + * On error case, rte_errno will be set appropriately: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + * - EINVAL - invalid parameters + */ +const struct rte_memzone *rte_memzone_reserve_bounded(const char *name, + size_t len, int socket_id, + unsigned flags, unsigned align, unsigned bound); + +/** + * Free a memzone. + * + * @param mz + * A pointer to the memzone + * @return + * -EINVAL - invalid parameter. + * 0 - success + */ +int rte_memzone_free(const struct rte_memzone *mz); + +/** + * Lookup for a memzone. + * + * Get a pointer to a descriptor of an already reserved memory + * zone identified by the name given as an argument. + * + * @param name + * The name of the memzone. + * @return + * A pointer to a read-only memzone descriptor. + */ +const struct rte_memzone *rte_memzone_lookup(const char *name); + +/** + * Dump all reserved memzones to a file. + * + * @param f + * A pointer to a file for output + */ +void rte_memzone_dump(FILE *f); + +/** + * Walk list of all memzones + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + */ +void rte_memzone_walk(void (*func)(const struct rte_memzone *, void *arg), + void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMZONE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_pci_dev_feature_defs.h b/src/spdk/dpdk/lib/librte_eal/include/rte_pci_dev_feature_defs.h new file mode 100644 index 000000000..e12c22081 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_pci_dev_feature_defs.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0) + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_PCI_DEV_DEFS_H_ +#define _RTE_PCI_DEV_DEFS_H_ + +/* interrupt mode */ +enum rte_intr_mode { + RTE_INTR_MODE_NONE = 0, + RTE_INTR_MODE_LEGACY, + RTE_INTR_MODE_MSI, + RTE_INTR_MODE_MSIX +}; + +#endif /* _RTE_PCI_DEV_DEFS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_pci_dev_features.h b/src/spdk/dpdk/lib/librte_eal/include/rte_pci_dev_features.h new file mode 100644 index 000000000..6104123d2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_pci_dev_features.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0) + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_PCI_DEV_FEATURES_H +#define _RTE_PCI_DEV_FEATURES_H + +#include <rte_pci_dev_feature_defs.h> + +#define RTE_INTR_MODE_NONE_NAME "none" +#define RTE_INTR_MODE_LEGACY_NAME "legacy" +#define RTE_INTR_MODE_MSI_NAME "msi" +#define RTE_INTR_MODE_MSIX_NAME "msix" + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_per_lcore.h b/src/spdk/dpdk/lib/librte_eal/include/rte_per_lcore.h new file mode 100644 index 000000000..eaedf0cb3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_per_lcore.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_PER_LCORE_H_ +#define _RTE_PER_LCORE_H_ + +/** + * @file + * + * Per-lcore variables in RTE + * + * This file defines an API for instantiating per-lcore "global + * variables" that are environment-specific. Note that in all + * environments, a "shared variable" is the default when you use a + * global variable. + * + * Parts of this are execution environment specific. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <pthread.h> + +/** + * Macro to define a per lcore variable "var" of type "type", don't + * use keywords like "static" or "volatile" in type, just prefix the + * whole macro. + */ +#define RTE_DEFINE_PER_LCORE(type, name) \ + __thread __typeof__(type) per_lcore_##name + +/** + * Macro to declare an extern per lcore variable "var" of type "type" + */ +#define RTE_DECLARE_PER_LCORE(type, name) \ + extern __thread __typeof__(type) per_lcore_##name + +/** + * Read/write the per-lcore variable value + */ +#define RTE_PER_LCORE(name) (per_lcore_##name) + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PER_LCORE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_random.h b/src/spdk/dpdk/lib/librte_eal/include/rte_random.h new file mode 100644 index 000000000..2b30ec85c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_random.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_RANDOM_H_ +#define _RTE_RANDOM_H_ + +/** + * @file + * + * Pseudo-random Generators in RTE + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> + +#include <rte_compat.h> + +/** + * Seed the pseudo-random generator. + * + * The generator is automatically seeded by the EAL init with a timer + * value. It may need to be re-seeded by the user with a real random + * value. + * + * This function is not multi-thread safe in regards to other + * rte_srand() calls, nor is it in relation to concurrent rte_rand() + * calls. + * + * @param seedval + * The value of the seed. + */ +void +rte_srand(uint64_t seedval); + +/** + * Get a pseudo-random value. + * + * The generator is not cryptographically secure. + * + * If called from lcore threads, this function is thread-safe. + * + * @return + * A pseudo-random value between 0 and (1<<64)-1. + */ +uint64_t +rte_rand(void); + +/** + * Generates a pseudo-random number with an upper bound. + * + * This function returns an uniformly distributed (unbiased) random + * number less than a user-specified maximum value. + * + * If called from lcore threads, this function is thread-safe. + * + * @param upper_bound + * The upper bound of the generated number. + * @return + * A pseudo-random value between 0 and (upper_bound-1). + */ +__rte_experimental +uint64_t +rte_rand_max(uint64_t upper_bound); + +#ifdef __cplusplus +} +#endif + + +#endif /* _RTE_RANDOM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_reciprocal.h b/src/spdk/dpdk/lib/librte_eal/include/rte_reciprocal.h new file mode 100644 index 000000000..63e16fde0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_reciprocal.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ +/* + * Reciprocal divide + * + * Used with permission from original authors + * Hannes Frederic Sowa and Daniel Borkmann + * + * This algorithm is based on the paper "Division by Invariant + * Integers Using Multiplication" by Torbjörn Granlund and Peter + * L. Montgomery. + * + * The assembler implementation from Agner Fog, which this code is + * based on, can be found here: + * http://www.agner.org/optimize/asmlib.zip + * + * This optimization for A/B is helpful if the divisor B is mostly + * runtime invariant. The reciprocal of B is calculated in the + * slow-path with reciprocal_value(). The fast-path can then just use + * a much faster multiplication operation with a variable dividend A + * to calculate the division A/B. + */ + +#ifndef _RTE_RECIPROCAL_H_ +#define _RTE_RECIPROCAL_H_ + +#include <stdint.h> + +struct rte_reciprocal { + uint32_t m; + uint8_t sh1, sh2; +}; + +struct rte_reciprocal_u64 { + uint64_t m; + uint8_t sh1, sh2; +}; + +static inline uint32_t rte_reciprocal_divide(uint32_t a, struct rte_reciprocal R) +{ + uint32_t t = (uint32_t)(((uint64_t)a * R.m) >> 32); + + return (t + ((a - t) >> R.sh1)) >> R.sh2; +} + +static __rte_always_inline uint64_t +mullhi_u64(uint64_t x, uint64_t y) +{ +#ifdef __SIZEOF_INT128__ + __uint128_t xl = x; + __uint128_t rl = xl * y; + + return (rl >> 64); +#else + uint64_t u0, u1, v0, v1, k, t; + uint64_t w1, w2; + uint64_t whi; + + u1 = x >> 32; u0 = x & 0xFFFFFFFF; + v1 = y >> 32; v0 = y & 0xFFFFFFFF; + + t = u0*v0; + k = t >> 32; + + t = u1*v0 + k; + w1 = t & 0xFFFFFFFF; + w2 = t >> 32; + + t = u0*v1 + w1; + k = t >> 32; + + whi = u1*v1 + w2 + k; + + return whi; +#endif +} + +static __rte_always_inline uint64_t +rte_reciprocal_divide_u64(uint64_t a, const struct rte_reciprocal_u64 *R) +{ + uint64_t t = mullhi_u64(a, R->m); + + return (t + ((a - t) >> R->sh1)) >> R->sh2; +} + +struct rte_reciprocal rte_reciprocal_value(uint32_t d); +struct rte_reciprocal_u64 rte_reciprocal_value_u64(uint64_t d); + +#endif /* _RTE_RECIPROCAL_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_service.h b/src/spdk/dpdk/lib/librte_eal/include/rte_service.h new file mode 100644 index 000000000..3a1c735c5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_service.h @@ -0,0 +1,422 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_SERVICE_H_ +#define _RTE_SERVICE_H_ + +/** + * @file + * + * Service functions + * + * The service functionality provided by this header allows a DPDK component + * to indicate that it requires a function call in order for it to perform + * its processing. + * + * An example usage of this functionality would be a component that registers + * a service to perform a particular packet processing duty: for example the + * eventdev software PMD. At startup the application requests all services + * that have been registered, and the cores in the service-coremask run the + * required services. The EAL removes these number of cores from the available + * runtime cores, and dedicates them to performing service-core workloads. The + * application has access to the remaining lcores as normal. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include<stdio.h> +#include <stdint.h> +#include <sys/queue.h> + +#include <rte_config.h> +#include <rte_lcore.h> + +#define RTE_SERVICE_NAME_MAX 32 + +/* Capabilities of a service. + * + * Use the *rte_service_probe_capability* function to check if a service is + * capable of a specific capability. + */ +/** When set, the service is capable of having multiple threads run it at the + * same time. + */ +#define RTE_SERVICE_CAP_MT_SAFE (1 << 0) + +/** + * Return the number of services registered. + * + * The number of services registered can be passed to *rte_service_get_by_id*, + * enabling the application to retrieve the specification of each service. + * + * @return The number of services registered. + */ +uint32_t rte_service_get_count(void); + +/** + * Return the id of a service by name. + * + * This function provides the id of the service using the service name as + * lookup key. The service id is to be passed to other functions in the + * rte_service_* API. + * + * Example usage: + * @code + * uint32_t service_id; + * int32_t ret = rte_service_get_by_name("service_X", &service_id); + * if (ret) { + * // handle error + * } + * @endcode + * + * @param name The name of the service to retrieve + * @param[out] service_id A pointer to a uint32_t, to be filled in with the id. + * @retval 0 Success. The service id is provided in *service_id*. + * @retval -EINVAL Null *service_id* pointer provided + * @retval -ENODEV No such service registered + */ +int32_t rte_service_get_by_name(const char *name, uint32_t *service_id); + +/** + * Return the name of the service. + * + * @return A pointer to the name of the service. The returned pointer remains + * in ownership of the service, and the application must not free it. + */ +const char *rte_service_get_name(uint32_t id); + +/** + * Check if a service has a specific capability. + * + * This function returns if *service* has implements *capability*. + * See RTE_SERVICE_CAP_* defines for a list of valid capabilities. + * @retval 1 Capability supported by this service instance + * @retval 0 Capability not supported by this service instance + */ +int32_t rte_service_probe_capability(uint32_t id, uint32_t capability); + +/** + * Map or unmap a lcore to a service. + * + * Each core can be added or removed from running a specific service. This + * function enables or disables *lcore* to run *service_id*. + * + * If multiple cores are enabled on a service, a lock is used to ensure that + * only one core runs the service at a time. The exception to this is when + * a service indicates that it is multi-thread safe by setting the capability + * called RTE_SERVICE_CAP_MT_SAFE. With the multi-thread safe capability set, + * the service function can be run on multiple threads at the same time. + * + * If the service is known to be mapped to a single lcore, setting the + * capability of the service to RTE_SERVICE_CAP_MT_SAFE can achieve + * better performance by avoiding the use of lock. + * + * @param service_id the service to apply the lcore to + * @param lcore The lcore that will be mapped to service + * @param enable Zero to unmap or disable the core, non-zero to enable + * + * @retval 0 lcore map updated successfully + * @retval -EINVAL An invalid service or lcore was provided. + */ +int32_t rte_service_map_lcore_set(uint32_t service_id, uint32_t lcore, + uint32_t enable); + +/** + * Retrieve the mapping of an lcore to a service. + * + * @param service_id the service to apply the lcore to + * @param lcore The lcore that will be mapped to service + * + * @retval 1 lcore is mapped to service + * @retval 0 lcore is not mapped to service + * @retval -EINVAL An invalid service or lcore was provided. + */ +int32_t rte_service_map_lcore_get(uint32_t service_id, uint32_t lcore); + +/** + * Set the runstate of the service. + * + * Each service is either running or stopped. Setting a non-zero runstate + * enables the service to run, while setting runstate zero disables it. + * + * @param id The id of the service + * @param runstate The run state to apply to the service + * + * @retval 0 The service was successfully started + * @retval -EINVAL Invalid service id + */ +int32_t rte_service_runstate_set(uint32_t id, uint32_t runstate); + +/** + * Get the runstate for the service with *id*. See *rte_service_runstate_set* + * for details of runstates. A service can call this function to ensure that + * the application has indicated that it will receive CPU cycles. Either a + * service-core is mapped (default case), or the application has explicitly + * disabled the check that a service-cores is mapped to the service and takes + * responsibility to run the service manually using the available function + * *rte_service_run_iter_on_app_lcore* to do so. + * + * @retval 1 Service is running + * @retval 0 Service is stopped + * @retval -EINVAL Invalid service id + */ +int32_t rte_service_runstate_get(uint32_t id); + +/** + * This function returns whether the service may be currently executing on + * at least one lcore, or definitely is not. This function can be used to + * determine if, after setting the service runstate to stopped, the service + * is still executing a service lcore. + * + * Care must be taken if calling this function when the service runstate is + * running, since the result of this function may be incorrect by the time the + * function returns due to service cores running in parallel. + * + * @retval 1 Service may be running on one or more lcores + * @retval 0 Service is not running on any lcore + * @retval -EINVAL Invalid service id + */ +int32_t +rte_service_may_be_active(uint32_t id); + +/** + * Enable or disable the check for a service-core being mapped to the service. + * An application can disable the check when takes the responsibility to run a + * service itself using *rte_service_run_iter_on_app_lcore*. + * + * @param id The id of the service to set the check on + * @param enable When zero, the check is disabled. Non-zero enables the check. + * + * @retval 0 Success + * @retval -EINVAL Invalid service ID + */ +int32_t rte_service_set_runstate_mapped_check(uint32_t id, int32_t enable); + +/** + * This function runs a service callback from a non-service lcore. + * + * This function is designed to enable gradual porting to service cores, and + * to enable unit tests to verify a service behaves as expected. + * + * When called, this function ensures that the service identified by *id* is + * safe to run on this lcore. Multi-thread safe services are invoked even if + * other cores are simultaneously running them as they are multi-thread safe. + * + * Multi-thread unsafe services are handled depending on the variable + * *serialize_multithread_unsafe*: + * - When set, the function will check if a service is already being invoked + * on another lcore, refusing to run it and returning -EBUSY. + * - When zero, the application takes responsibility to ensure that the service + * indicated by *id* is not going to be invoked by another lcore. This setting + * avoids atomic operations, so is likely to be more performant. + * + * @param id The ID of the service to run + * @param serialize_multithread_unsafe This parameter indicates to the service + * cores library if it is required to use atomics to serialize access + * to mult-thread unsafe services. As there is an overhead in using + * atomics, applications can choose to enable or disable this feature + * + * Note that any thread calling this function MUST be a DPDK EAL thread, as + * the *rte_lcore_id* function is used to access internal data structures. + * + * @retval 0 Service was run on the calling thread successfully + * @retval -EBUSY Another lcore is executing the service, and it is not a + * multi-thread safe service, so the service was not run on this lcore + * @retval -ENOEXEC Service is not in a run-able state + * @retval -EINVAL Invalid service id + */ +int32_t rte_service_run_iter_on_app_lcore(uint32_t id, + uint32_t serialize_multithread_unsafe); + +/** + * Start a service core. + * + * Starting a core makes the core begin polling. Any services assigned to it + * will be run as fast as possible. The application must ensure that the lcore + * is in a launchable state: e.g. call *rte_eal_lcore_wait* on the lcore_id + * before calling this function. + * + * @retval 0 Success + * @retval -EINVAL Failed to start core. The *lcore_id* passed in is not + * currently assigned to be a service core. + */ +int32_t rte_service_lcore_start(uint32_t lcore_id); + +/** + * Stop a service core. + * + * Stopping a core makes the core become idle, but remains assigned as a + * service core. + * + * @retval 0 Success + * @retval -EINVAL Invalid *lcore_id* provided + * @retval -EALREADY Already stopped core + * @retval -EBUSY Failed to stop core, as it would cause a service to not + * be run, as this is the only core currently running the service. + * The application must stop the service first, and then stop the + * lcore. + */ +int32_t rte_service_lcore_stop(uint32_t lcore_id); + +/** + * Adds lcore to the list of service cores. + * + * This functions can be used at runtime in order to modify the service core + * mask. + * + * @retval 0 Success + * @retval -EBUSY lcore is busy, and not available for service core duty + * @retval -EALREADY lcore is already added to the service core list + * @retval -EINVAL Invalid lcore provided + */ +int32_t rte_service_lcore_add(uint32_t lcore); + +/** + * Removes lcore from the list of service cores. + * + * This can fail if the core is not stopped, see *rte_service_core_stop*. + * + * @retval 0 Success + * @retval -EBUSY Lcore is not stopped, stop service core before removing. + * @retval -EINVAL failed to add lcore to service core mask. + */ +int32_t rte_service_lcore_del(uint32_t lcore); + +/** + * Retrieve the number of service cores currently available. + * + * This function returns the integer count of service cores available. The + * service core count can be used in mapping logic when creating mappings + * from service cores to services. + * + * See *rte_service_lcore_list* for details on retrieving the lcore_id of each + * service core. + * + * @return The number of service cores currently configured. + */ +int32_t rte_service_lcore_count(void); + +/** + * Resets all service core mappings. This does not remove the service cores + * from duty, just unmaps all services / cores, and stops() the service cores. + * The runstate of services is not modified. + * + * @retval 0 Success + */ +int32_t rte_service_lcore_reset_all(void); + +/** + * Enable or disable statistics collection for *service*. + * + * This function enables per core, per-service cycle count collection. + * @param id The service to enable statistics gathering on. + * @param enable Zero to disable statistics, non-zero to enable. + * @retval 0 Success + * @retval -EINVAL Invalid service pointer passed + */ +int32_t rte_service_set_stats_enable(uint32_t id, int32_t enable); + +/** + * Retrieve the list of currently enabled service cores. + * + * This function fills in an application supplied array, with each element + * indicating the lcore_id of a service core. + * + * Adding and removing service cores can be performed using + * *rte_service_lcore_add* and *rte_service_lcore_del*. + * @param [out] array An array of at least *rte_service_lcore_count* items. + * If statically allocating the buffer, use RTE_MAX_LCORE. + * @param [out] n The size of *array*. + * @retval >=0 Number of service cores that have been populated in the array + * @retval -ENOMEM The provided array is not large enough to fill in the + * service core list. No items have been populated, call this function + * with a size of at least *rte_service_core_count* items. + */ +int32_t rte_service_lcore_list(uint32_t array[], uint32_t n); + +/** + * Get the number of services running on the supplied lcore. + * + * @param lcore Id of the service core. + * @retval >=0 Number of services registered to this core. + * @retval -EINVAL Invalid lcore provided + * @retval -ENOTSUP The provided lcore is not a service core. + */ +int32_t rte_service_lcore_count_services(uint32_t lcore); + +/** + * Dumps any information available about the service. When id is UINT32_MAX, + * this function dumps info for all services. + * + * @retval 0 Statistics have been successfully dumped + * @retval -EINVAL Invalid service id provided + */ +int32_t rte_service_dump(FILE *f, uint32_t id); + +/** + * Returns the number of cycles that this service has consumed + */ +#define RTE_SERVICE_ATTR_CYCLES 0 + +/** + * Returns the count of invocations of this service function + */ +#define RTE_SERVICE_ATTR_CALL_COUNT 1 + +/** + * Get an attribute from a service. + * + * @retval 0 Success, the attribute value has been written to *attr_value*. + * -EINVAL Invalid id, attr_id or attr_value was NULL. + */ +int32_t rte_service_attr_get(uint32_t id, uint32_t attr_id, + uint64_t *attr_value); + +/** + * Reset all attribute values of a service. + * + * @param id The service to reset all statistics of + * @retval 0 Successfully reset attributes + * -EINVAL Invalid service id provided + */ +int32_t rte_service_attr_reset_all(uint32_t id); + +/** + * Returns the number of times the service runner has looped. + */ +#define RTE_SERVICE_LCORE_ATTR_LOOPS 0 + +/** + * Get an attribute from a service core. + * + * @param lcore Id of the service core. + * @param attr_id Id of the attribute to be retrieved. + * @param [out] attr_value Pointer to storage in which to write retrieved value. + * @retval 0 Success, the attribute value has been written to *attr_value*. + * -EINVAL Invalid lcore, attr_id or attr_value was NULL. + * -ENOTSUP lcore is not a service core. + */ +int32_t +rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id, + uint64_t *attr_value); + +/** + * Reset all attribute values of a service core. + * + * @param lcore The service core to reset all the statistics of + * @retval 0 Successfully reset attributes + * -EINVAL Invalid service id provided + * -ENOTSUP lcore is not a service core. + */ +int32_t +rte_service_lcore_attr_reset_all(uint32_t lcore); + +#ifdef __cplusplus +} +#endif + + +#endif /* _RTE_SERVICE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_service_component.h b/src/spdk/dpdk/lib/librte_eal/include/rte_service_component.h new file mode 100644 index 000000000..b75aba11b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_service_component.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _SERVICE_PRIVATE_H_ +#define _SERVICE_PRIVATE_H_ + +/* This file specifies the internal service specification. + * Include this file if you are writing a component that requires CPU cycles to + * operate, and you wish to run the component using service cores + */ +#include <rte_compat.h> +#include <rte_service.h> + +/** + * Signature of callback function to run a service. + */ +typedef int32_t (*rte_service_func)(void *args); + +/** + * The specification of a service. + * + * This struct contains metadata about the service itself, the callback + * function to run one iteration of the service, a userdata pointer, flags etc. + */ +struct rte_service_spec { + /** The name of the service. This should be used by the application to + * understand what purpose this service provides. + */ + char name[RTE_SERVICE_NAME_MAX]; + /** The callback to invoke to run one iteration of the service. */ + rte_service_func callback; + /** The userdata pointer provided to the service callback. */ + void *callback_userdata; + /** Flags to indicate the capabilities of this service. See defines in + * the public header file for values of RTE_SERVICE_CAP_* + */ + uint32_t capabilities; + /** NUMA socket ID that this service is affinitized to */ + int socket_id; +}; + +/** + * Register a new service. + * + * A service represents a component that requires CPU time periodically to + * achieve its purpose. + * + * For example the eventdev SW PMD requires CPU cycles to perform its + * scheduling. This can be achieved by registering it as a service, and the + * application can then assign CPU resources to that service. + * + * Note that when a service component registers itself, it is not permitted to + * add or remove service-core threads, or modify lcore-to-service mappings. The + * only API that may be called by the service-component is + * *rte_service_component_runstate_set*, which indicates that the service + * component is ready to be executed. + * + * If the service is known to be mapped to a single lcore, setting the + * capability of the service to RTE_SERVICE_CAP_MT_SAFE can achieve + * better performance. + * + * @param spec The specification of the service to register + * @param[out] service_id A pointer to a uint32_t, which will be filled in + * during registration of the service. It is set to the integers + * service number given to the service. This parameter may be NULL. + * @retval 0 Successfully registered the service. + * -EINVAL Attempted to register an invalid service (eg, no callback + * set) + */ +int32_t rte_service_component_register(const struct rte_service_spec *spec, + uint32_t *service_id); + +/** + * Unregister a service component. + * + * The service being removed must be stopped before calling this function. + * + * @retval 0 The service was successfully unregistered. + * @retval -EBUSY The service is currently running, stop the service before + * calling unregister. No action has been taken. + */ +int32_t rte_service_component_unregister(uint32_t id); + +/** + * Private function to allow EAL to initialized default mappings. + * + * This function iterates all the services, and maps then to the available + * cores. Based on the capabilities of the services, they are set to run on the + * available cores in a round-robin manner. + * + * @retval 0 Success + * @retval -ENOTSUP No service lcores in use + * @retval -EINVAL Error while iterating over services + * @retval -ENODEV Error in enabling service lcore on a service + * @retval -ENOEXEC Error when starting services + */ +int32_t rte_service_start_with_defaults(void); + +/** + * Set the backend runstate of a component. + * + * This function allows services to be registered at startup, but not yet + * enabled to run by default. When the service has been configured (via the + * usual method; eg rte_eventdev_configure, the service can mark itself as + * ready to run. The differentiation between backend runstate and + * service_runstate is that the backend runstate is set by the service + * component while the service runstate is reserved for application usage. + * + * @retval 0 Success + */ +int32_t rte_service_component_runstate_set(uint32_t id, uint32_t runstate); + +/** + * Initialize the service library. + * + * In order to use the service library, it must be initialized. EAL initializes + * the library at startup. + * + * @retval 0 Success + * @retval -EALREADY Service library is already initialized + */ +int32_t rte_service_init(void); + +/** + * @internal Free up the memory that has been initialized. + * This routine is to be invoked prior to process termination. + * + * @retval None + */ +void rte_service_finalize(void); + +#endif /* _SERVICE_PRIVATE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_string_fns.h b/src/spdk/dpdk/lib/librte_eal/include/rte_string_fns.h new file mode 100644 index 000000000..8bac8243c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_string_fns.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +/** + * @file + * + * String-related functions as replacement for libc equivalents + */ + +#ifndef _RTE_STRING_FNS_H_ +#define _RTE_STRING_FNS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdio.h> +#include <string.h> + +#include <rte_common.h> + +/** + * Takes string "string" parameter and splits it at character "delim" + * up to maxtokens-1 times - to give "maxtokens" resulting tokens. Like + * strtok or strsep functions, this modifies its input string, by replacing + * instances of "delim" with '\\0'. All resultant tokens are returned in the + * "tokens" array which must have enough entries to hold "maxtokens". + * + * @param string + * The input string to be split into tokens + * + * @param stringlen + * The max length of the input buffer + * + * @param tokens + * The array to hold the pointers to the tokens in the string + * + * @param maxtokens + * The number of elements in the tokens array. At most, maxtokens-1 splits + * of the string will be done. + * + * @param delim + * The character on which the split of the data will be done + * + * @return + * The number of tokens in the tokens array. + */ +int +rte_strsplit(char *string, int stringlen, + char **tokens, int maxtokens, char delim); + +/** + * @internal + * DPDK-specific version of strlcpy for systems without + * libc or libbsd copies of the function + */ +static inline size_t +rte_strlcpy(char *dst, const char *src, size_t size) +{ + return (size_t)snprintf(dst, size, "%s", src); +} + +/** + * @internal + * DPDK-specific version of strlcat for systems without + * libc or libbsd copies of the function + */ +static inline size_t +rte_strlcat(char *dst, const char *src, size_t size) +{ + size_t l = strnlen(dst, size); + if (l < size) + return l + rte_strlcpy(&dst[l], src, size - l); + return l + strlen(src); +} + +/* pull in a strlcpy function */ +#ifdef RTE_EXEC_ENV_FREEBSD +#ifndef __BSD_VISIBLE /* non-standard functions are hidden */ +#define strlcpy(dst, src, size) rte_strlcpy(dst, src, size) +#define strlcat(dst, src, size) rte_strlcat(dst, src, size) +#endif + +#else /* non-BSD platforms */ +#ifdef RTE_USE_LIBBSD +#include <bsd/string.h> + +#else /* no BSD header files, create own */ +#define strlcpy(dst, src, size) rte_strlcpy(dst, src, size) +#define strlcat(dst, src, size) rte_strlcat(dst, src, size) + +#endif /* RTE_USE_LIBBSD */ +#endif /* FREEBSD */ + +/** + * Copy string src to buffer dst of size dsize. + * At most dsize-1 chars will be copied. + * Always NUL-terminates, unless (dsize == 0). + * Returns number of bytes copied (terminating NUL-byte excluded) on success ; + * negative errno on error. + * + * @param dst + * The destination string. + * + * @param src + * The input string to be copied. + * + * @param dsize + * Length in bytes of the destination buffer. + * + * @return + * The number of bytes copied on success + * -E2BIG if the destination buffer is too small. + */ +ssize_t +rte_strscpy(char *dst, const char *src, size_t dsize); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_STRING_FNS_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_tailq.h b/src/spdk/dpdk/lib/librte_eal/include/rte_tailq.h new file mode 100644 index 000000000..b6fe4e5f7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_tailq.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_TAILQ_H_ +#define _RTE_TAILQ_H_ + +/** + * @file + * Here defines rte_tailq APIs for only internal use + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/queue.h> +#include <stdio.h> +#include <rte_debug.h> + +/** dummy structure type used by the rte_tailq APIs */ +struct rte_tailq_entry { + TAILQ_ENTRY(rte_tailq_entry) next; /**< Pointer entries for a tailq list */ + void *data; /**< Pointer to the data referenced by this tailq entry */ +}; +/** dummy */ +TAILQ_HEAD(rte_tailq_entry_head, rte_tailq_entry); + +#define RTE_TAILQ_NAMESIZE 32 + +/** + * The structure defining a tailq header entry for storing + * in the rte_config structure in shared memory. Each tailq + * is identified by name. + * Any library storing a set of objects e.g. rings, mempools, hash-tables, + * is recommended to use an entry here, so as to make it easy for + * a multi-process app to find already-created elements in shared memory. + */ +struct rte_tailq_head { + struct rte_tailq_entry_head tailq_head; /**< NOTE: must be first element */ + char name[RTE_TAILQ_NAMESIZE]; +}; + +struct rte_tailq_elem { + /** + * Reference to head in shared mem, updated at init time by + * rte_eal_tailqs_init() + */ + struct rte_tailq_head *head; + TAILQ_ENTRY(rte_tailq_elem) next; + const char name[RTE_TAILQ_NAMESIZE]; +}; + +/** + * Return the first tailq entry cast to the right struct. + */ +#define RTE_TAILQ_CAST(tailq_entry, struct_name) \ + (struct struct_name *)&(tailq_entry)->tailq_head + +/** + * Utility macro to make looking up a tailqueue for a particular struct easier. + * + * @param name + * The name of tailq + * + * @param struct_name + * The name of the list type we are using. (Generally this is the same as the + * first parameter passed to TAILQ_HEAD macro) + * + * @return + * The return value from rte_eal_tailq_lookup, typecast to the appropriate + * structure pointer type. + * NULL on error, since the tailq_head is the first + * element in the rte_tailq_head structure. + */ +#define RTE_TAILQ_LOOKUP(name, struct_name) \ + RTE_TAILQ_CAST(rte_eal_tailq_lookup(name), struct_name) + +/** + * Dump tail queues to a file. + * + * @param f + * A pointer to a file for output + */ +void rte_dump_tailq(FILE *f); + +/** + * Lookup for a tail queue. + * + * Get a pointer to a tail queue header of a tail + * queue identified by the name given as an argument. + * Note: this function is not multi-thread safe, and should only be called from + * a single thread at a time + * + * @param name + * The name of the queue. + * @return + * A pointer to the tail queue head structure. + */ +struct rte_tailq_head *rte_eal_tailq_lookup(const char *name); + +/** + * Register a tail queue. + * + * Register a tail queue from shared memory. + * This function is mainly used by EAL_REGISTER_TAILQ macro which is used to + * register tailq from the different dpdk libraries. Since this macro is a + * constructor, the function has no access to dpdk shared memory, so the + * registered tailq can not be used before call to rte_eal_init() which calls + * rte_eal_tailqs_init(). + * + * @param t + * The tailq element which contains the name of the tailq you want to + * create (/retrieve when in secondary process). + * @return + * 0 on success or -1 in case of an error. + */ +int rte_eal_tailq_register(struct rte_tailq_elem *t); + +#define EAL_REGISTER_TAILQ(t) \ +RTE_INIT(tailqinitfn_ ##t) \ +{ \ + if (rte_eal_tailq_register(&t) < 0) \ + rte_panic("Cannot initialize tailq: %s\n", t.name); \ +} + +/* This macro permits both remove and free var within the loop safely.*/ +#ifndef TAILQ_FOREACH_SAFE +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TAILQ_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_test.h b/src/spdk/dpdk/lib/librte_eal/include/rte_test.h new file mode 100644 index 000000000..89e47f47a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_test.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_TEST_H_ +#define _RTE_TEST_H_ + +#include <rte_log.h> + +/* Before including rte_test.h file you can define + * RTE_TEST_TRACE_FAILURE(_file, _line, _func) macro to better trace/debug test + * failures. Mostly useful in development phase. + */ +#ifndef RTE_TEST_TRACE_FAILURE +#define RTE_TEST_TRACE_FAILURE(_file, _line, _func) +#endif + + +#define RTE_TEST_ASSERT(cond, msg, ...) do { \ + if (!(cond)) { \ + RTE_LOG(DEBUG, EAL, "Test assert %s line %d failed: " \ + msg "\n", __func__, __LINE__, ##__VA_ARGS__); \ + RTE_TEST_TRACE_FAILURE(__FILE__, __LINE__, __func__); \ + return -1; \ + } \ +} while (0) + +#define RTE_TEST_ASSERT_EQUAL(a, b, msg, ...) \ + RTE_TEST_ASSERT(a == b, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_NOT_EQUAL(a, b, msg, ...) \ + RTE_TEST_ASSERT(a != b, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_SUCCESS(val, msg, ...) \ + RTE_TEST_ASSERT(val == 0, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_FAIL(val, msg, ...) \ + RTE_TEST_ASSERT(val != 0, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_NULL(val, msg, ...) \ + RTE_TEST_ASSERT(val == NULL, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_NOT_NULL(val, msg, ...) \ + RTE_TEST_ASSERT(val != NULL, msg, ##__VA_ARGS__) + +#endif /* _RTE_TEST_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_time.h b/src/spdk/dpdk/lib/librte_eal/include/rte_time.h new file mode 100644 index 000000000..5ad7c8841 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_time.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Intel Corporation + */ + +#ifndef _RTE_TIME_H_ +#define _RTE_TIME_H_ + +#include <stdint.h> +#include <time.h> + +#define NSEC_PER_SEC 1000000000L + +/** + * Structure to hold the parameters of a running cycle counter to assist + * in converting cycles to nanoseconds. + */ +struct rte_timecounter { + /** Last cycle counter value read. */ + uint64_t cycle_last; + /** Nanoseconds count. */ + uint64_t nsec; + /** Bitmask separating nanosecond and sub-nanoseconds. */ + uint64_t nsec_mask; + /** Sub-nanoseconds count. */ + uint64_t nsec_frac; + /** Bitmask for two's complement subtraction of non-64 bit counters. */ + uint64_t cc_mask; + /** Cycle to nanosecond divisor (power of two). */ + uint32_t cc_shift; +}; + +/** + * Converts cyclecounter cycles to nanoseconds. + */ +static inline uint64_t +rte_cyclecounter_cycles_to_ns(struct rte_timecounter *tc, uint64_t cycles) +{ + uint64_t ns; + + /* Add fractional nanoseconds. */ + ns = cycles + tc->nsec_frac; + tc->nsec_frac = ns & tc->nsec_mask; + + /* Shift to get only nanoseconds. */ + return ns >> tc->cc_shift; +} + +/** + * Update the internal nanosecond count in the structure. + */ +static inline uint64_t +rte_timecounter_update(struct rte_timecounter *tc, uint64_t cycle_now) +{ + uint64_t cycle_delta, ns_offset; + + /* Calculate the delta since the last call. */ + if (tc->cycle_last <= cycle_now) + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc_mask; + else + /* Handle cycle counts that have wrapped around . */ + cycle_delta = (~(tc->cycle_last - cycle_now) & tc->cc_mask) + 1; + + /* Convert to nanoseconds. */ + ns_offset = rte_cyclecounter_cycles_to_ns(tc, cycle_delta); + + /* Store current cycle counter for next call. */ + tc->cycle_last = cycle_now; + + /* Update the nanosecond count. */ + tc->nsec += ns_offset; + + return tc->nsec; +} + +/** + * Convert from timespec structure into nanosecond units. + */ +static inline uint64_t +rte_timespec_to_ns(const struct timespec *ts) +{ + return ((uint64_t) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; +} + +/** + * Convert from nanosecond units into timespec structure. + */ +static inline struct timespec +rte_ns_to_timespec(uint64_t nsec) +{ + struct timespec ts = {0, 0}; + + if (nsec == 0) + return ts; + + ts.tv_sec = nsec / NSEC_PER_SEC; + ts.tv_nsec = nsec % NSEC_PER_SEC; + + return ts; +} + +#endif /* _RTE_TIME_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_trace.h b/src/spdk/dpdk/lib/librte_eal/include/rte_trace.h new file mode 100644 index 000000000..a6e991fad --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_trace.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#ifndef _RTE_TRACE_H_ +#define _RTE_TRACE_H_ + +/** + * @file + * + * RTE Trace API + * + * This file provides the trace API to RTE applications. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdbool.h> +#include <stdio.h> + +#include <rte_common.h> +#include <rte_compat.h> + +/** + * Test if trace is enabled. + * + * @return + * true if trace is enabled, false otherwise. + */ +__rte_experimental +bool rte_trace_is_enabled(void); + +/** + * Enumerate trace mode operation. + */ +enum rte_trace_mode { + /** + * In this mode, when no space is left in the trace buffer, the + * subsequent events overwrite the old events. + */ + RTE_TRACE_MODE_OVERWRITE, + /** + * In this mode, when no space is left in the trace buffer, the + * subsequent events shall not be recorded. + */ + RTE_TRACE_MODE_DISCARD, +}; + +/** + * Set the trace mode. + * + * @param mode + * Trace mode. + */ +__rte_experimental +void rte_trace_mode_set(enum rte_trace_mode mode); + +/** + * Get the trace mode. + * + * @return + * The current trace mode. + */ +__rte_experimental +enum rte_trace_mode rte_trace_mode_get(void); + +/** + * Enable/Disable a set of tracepoints based on globbing pattern. + * + * @param pattern + * The globbing pattern identifying the tracepoint. + * @param enable + * true to enable tracepoint, false to disable the tracepoint, upon match. + * @return + * - 0: Success and no pattern match. + * - 1: Success and found pattern match. + * - (-ERANGE): Tracepoint object is not registered. + */ +__rte_experimental +int rte_trace_pattern(const char *pattern, bool enable); + +/** + * Enable/Disable a set of tracepoints based on regular expression. + * + * @param regex + * A regular expression identifying the tracepoint. + * @param enable + * true to enable tracepoint, false to disable the tracepoint, upon match. + * @return + * - 0: Success and no pattern match. + * - 1: Success and found pattern match. + * - (-ERANGE): Tracepoint object is not registered. + * - (-EINVAL): Invalid regular expression rule. + */ +__rte_experimental +int rte_trace_regexp(const char *regex, bool enable); + +/** + * Save the trace buffer to the trace directory. + * + * By default, trace directory will be created at $HOME directory and this can + * be overridden by --trace-dir EAL parameter. + * + * @return + * - 0: Success. + * - <0 : Failure. + */ +__rte_experimental +int rte_trace_save(void); + +/** + * Dump the trace metadata to a file. + * + * @param f + * A pointer to a file for output + * @return + * - 0: Success. + * - <0 : Failure. + */ +__rte_experimental +int rte_trace_metadata_dump(FILE *f); + +/** + * Dump the trace subsystem status to a file. + * + * @param f + * A pointer to a file for output + */ +__rte_experimental +void rte_trace_dump(FILE *f); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TRACE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_trace_point.h b/src/spdk/dpdk/lib/librte_eal/include/rte_trace_point.h new file mode 100644 index 000000000..b45171275 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_trace_point.h @@ -0,0 +1,408 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#ifndef _RTE_TRACE_POINT_H_ +#define _RTE_TRACE_POINT_H_ + +/** + * @file + * + * RTE Tracepoint API + * + * This file provides the tracepoint API to RTE applications. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdbool.h> +#include <stdio.h> + +#include <rte_branch_prediction.h> +#include <rte_common.h> +#include <rte_compat.h> +#include <rte_cycles.h> +#include <rte_per_lcore.h> +#include <rte_string_fns.h> +#include <rte_uuid.h> + +/** The tracepoint object. */ +typedef uint64_t rte_trace_point_t; + +/** Macro to define the tracepoint. */ +#define RTE_TRACE_POINT_DEFINE(tp) \ +rte_trace_point_t __attribute__((section("__rte_trace_point"))) __##tp + +/** + * Macro to define the tracepoint arguments in RTE_TRACE_POINT macro. + + * @see RTE_TRACE_POINT, RTE_TRACE_POINT_FP + */ +#define RTE_TRACE_POINT_ARGS + +/** @internal Helper macro to support RTE_TRACE_POINT and RTE_TRACE_POINT_FP */ +#define __RTE_TRACE_POINT(_mode, _tp, _args, ...) \ +extern rte_trace_point_t __##_tp; \ +static __rte_always_inline void \ +_tp _args \ +{ \ + __rte_trace_point_emit_header_##_mode(&__##_tp); \ + __VA_ARGS__ \ +} + +/** + * Create a tracepoint. + * + * A tracepoint is defined by specifying: + * - its input arguments: they are the C function style parameters to define + * the arguments of tracepoint function. These input arguments are embedded + * using the RTE_TRACE_POINT_ARGS macro. + * - its output event fields: they are the sources of event fields that form + * the payload of any event that the execution of the tracepoint macro emits + * for this particular tracepoint. The application uses + * rte_trace_point_emit_* macros to emit the output event fields. + * + * @param tp + * Tracepoint object. Before using the tracepoint, an application needs to + * define the tracepoint using RTE_TRACE_POINT_DEFINE macro. + * @param args + * C function style input arguments to define the arguments to tracepoint + * function. + * @param ... + * Define the payload of trace function. The payload will be formed using + * rte_trace_point_emit_* macros. Use ";" delimiter between two payloads. + * + * @see RTE_TRACE_POINT_ARGS, RTE_TRACE_POINT_DEFINE, rte_trace_point_emit_* + */ +#define RTE_TRACE_POINT(tp, args, ...) \ + __RTE_TRACE_POINT(generic, tp, args, __VA_ARGS__) + +/** + * Create a tracepoint for fast path. + * + * Similar to RTE_TRACE_POINT, except that it is removed at compilation time + * unless the RTE_ENABLE_TRACE_FP configuration parameter is set. + * + * @param tp + * Tracepoint object. Before using the tracepoint, an application needs to + * define the tracepoint using RTE_TRACE_POINT_DEFINE macro. + * @param args + * C function style input arguments to define the arguments to tracepoint. + * function. + * @param ... + * Define the payload of trace function. The payload will be formed using + * rte_trace_point_emit_* macros, Use ";" delimiter between two payloads. + * + * @see RTE_TRACE_POINT + */ +#define RTE_TRACE_POINT_FP(tp, args, ...) \ + __RTE_TRACE_POINT(fp, tp, args, __VA_ARGS__) + +#ifdef __DOXYGEN__ + +/** + * Register a tracepoint. + * + * @param trace + * The tracepoint object created using RTE_TRACE_POINT_DEFINE. + * @param name + * The name of the tracepoint object. + * @return + * - 0: Successfully registered the tracepoint. + * - <0: Failure to register the tracepoint. + */ +#define RTE_TRACE_POINT_REGISTER(trace, name) + +/** Tracepoint function payload for uint64_t datatype */ +#define rte_trace_point_emit_u64(val) +/** Tracepoint function payload for int64_t datatype */ +#define rte_trace_point_emit_i64(val) +/** Tracepoint function payload for uint32_t datatype */ +#define rte_trace_point_emit_u32(val) +/** Tracepoint function payload for int32_t datatype */ +#define rte_trace_point_emit_i32(val) +/** Tracepoint function payload for uint16_t datatype */ +#define rte_trace_point_emit_u16(val) +/** Tracepoint function payload for int16_t datatype */ +#define rte_trace_point_emit_i16(val) +/** Tracepoint function payload for uint8_t datatype */ +#define rte_trace_point_emit_u8(val) +/** Tracepoint function payload for int8_t datatype */ +#define rte_trace_point_emit_i8(val) +/** Tracepoint function payload for int datatype */ +#define rte_trace_point_emit_int(val) +/** Tracepoint function payload for long datatype */ +#define rte_trace_point_emit_long(val) +/** Tracepoint function payload for float datatype */ +#define rte_trace_point_emit_float(val) +/** Tracepoint function payload for double datatype */ +#define rte_trace_point_emit_double(val) +/** Tracepoint function payload for pointer datatype */ +#define rte_trace_point_emit_ptr(val) +/** Tracepoint function payload for string datatype */ +#define rte_trace_point_emit_string(val) + +#endif /* __DOXYGEN__ */ + +/** @internal Macro to define maximum emit length of string datatype. */ +#define __RTE_TRACE_EMIT_STRING_LEN_MAX 32 +/** @internal Macro to define event header size. */ +#define __RTE_TRACE_EVENT_HEADER_SZ sizeof(uint64_t) + +/** + * Enable recording events of the given tracepoint in the trace buffer. + * + * @param tp + * The tracepoint object to enable. + * @return + * - 0: Success. + * - (-ERANGE): Trace object is not registered. + */ +__rte_experimental +int rte_trace_point_enable(rte_trace_point_t *tp); + +/** + * Disable recording events of the given tracepoint in the trace buffer. + * + * @param tp + * The tracepoint object to disable. + * @return + * - 0: Success. + * - (-ERANGE): Trace object is not registered. + */ +__rte_experimental +int rte_trace_point_disable(rte_trace_point_t *tp); + +/** + * Test if recording events from the given tracepoint is enabled. + * + * @param tp + * The tracepoint object. + * @return + * true if tracepoint is enabled, false otherwise. + */ +__rte_experimental +bool rte_trace_point_is_enabled(rte_trace_point_t *tp); + +/** + * Lookup a tracepoint object from its name. + * + * @param name + * The name of the tracepoint. + * @return + * The tracepoint object or NULL if not found. + */ +__rte_experimental +rte_trace_point_t *rte_trace_point_lookup(const char *name); + +/** + * @internal + * + * Test if the tracepoint fast path compile-time option is enabled. + * + * @return + * true if tracepoint fast path enabled, false otherwise. + */ +__rte_experimental +static __rte_always_inline bool +__rte_trace_point_fp_is_enabled(void) +{ +#ifdef RTE_ENABLE_TRACE_FP + return true; +#else + return false; +#endif +} + +/** + * @internal + * + * Allocate trace memory buffer per thread. + * + */ +__rte_experimental +void __rte_trace_mem_per_thread_alloc(void); + +/** + * @internal + * + * Helper function to emit field. + * + * @param sz + * The tracepoint size. + * @param field + * The name of the trace event. + * @param type + * The datatype of the trace event as string. + * @return + * - 0: Success. + * - <0: Failure. + */ +__rte_experimental +void __rte_trace_point_emit_field(size_t sz, const char *field, + const char *type); + +/** + * @internal + * + * Helper function to register a dynamic tracepoint. + * Use RTE_TRACE_POINT_REGISTER macro for tracepoint registration. + * + * @param trace + * The tracepoint object created using RTE_TRACE_POINT_DEFINE. + * @param name + * The name of the tracepoint object. + * @param register_fn + * Trace registration function. + * @return + * - 0: Successfully registered the tracepoint. + * - <0: Failure to register the tracepoint. + */ +__rte_experimental +int __rte_trace_point_register(rte_trace_point_t *trace, const char *name, + void (*register_fn)(void)); + +#ifndef __DOXYGEN__ + +#ifndef _RTE_TRACE_POINT_REGISTER_H_ +#ifdef ALLOW_EXPERIMENTAL_API + +#define __RTE_TRACE_EVENT_HEADER_ID_SHIFT (48) + +#define __RTE_TRACE_FIELD_SIZE_SHIFT 0 +#define __RTE_TRACE_FIELD_SIZE_MASK (0xffffULL << __RTE_TRACE_FIELD_SIZE_SHIFT) +#define __RTE_TRACE_FIELD_ID_SHIFT (16) +#define __RTE_TRACE_FIELD_ID_MASK (0xffffULL << __RTE_TRACE_FIELD_ID_SHIFT) +#define __RTE_TRACE_FIELD_ENABLE_MASK (1ULL << 63) +#define __RTE_TRACE_FIELD_ENABLE_DISCARD (1ULL << 62) + +struct __rte_trace_stream_header { + uint32_t magic; + rte_uuid_t uuid; + uint32_t lcore_id; + char thread_name[__RTE_TRACE_EMIT_STRING_LEN_MAX]; +} __rte_packed; + +struct __rte_trace_header { + uint32_t offset; + uint32_t len; + struct __rte_trace_stream_header stream_header; + uint8_t mem[]; +}; + +RTE_DECLARE_PER_LCORE(void *, trace_mem); + +static __rte_always_inline void * +__rte_trace_mem_get(uint64_t in) +{ + struct __rte_trace_header *trace = RTE_PER_LCORE(trace_mem); + const uint16_t sz = in & __RTE_TRACE_FIELD_SIZE_MASK; + + /* Trace memory is not initialized for this thread */ + if (unlikely(trace == NULL)) { + __rte_trace_mem_per_thread_alloc(); + trace = RTE_PER_LCORE(trace_mem); + if (unlikely(trace == NULL)) + return NULL; + } + /* Check the wrap around case */ + uint32_t offset = trace->offset; + if (unlikely((offset + sz) >= trace->len)) { + /* Disable the trace event if it in DISCARD mode */ + if (unlikely(in & __RTE_TRACE_FIELD_ENABLE_DISCARD)) + return NULL; + + offset = 0; + } + /* Align to event header size */ + offset = RTE_ALIGN_CEIL(offset, __RTE_TRACE_EVENT_HEADER_SZ); + void *mem = RTE_PTR_ADD(&trace->mem[0], offset); + offset += sz; + trace->offset = offset; + + return mem; +} + +static __rte_always_inline void * +__rte_trace_point_emit_ev_header(void *mem, uint64_t in) +{ + uint64_t val; + + /* Event header [63:0] = id [63:48] | timestamp [47:0] */ + val = rte_get_tsc_cycles() & + ~(0xffffULL << __RTE_TRACE_EVENT_HEADER_ID_SHIFT); + val |= ((in & __RTE_TRACE_FIELD_ID_MASK) << + (__RTE_TRACE_EVENT_HEADER_ID_SHIFT - + __RTE_TRACE_FIELD_ID_SHIFT)); + + *(uint64_t *)mem = val; + return RTE_PTR_ADD(mem, __RTE_TRACE_EVENT_HEADER_SZ); +} + +#define __rte_trace_point_emit_header_generic(t) \ +void *mem; \ +do { \ + const uint64_t val = __atomic_load_n(t, __ATOMIC_ACQUIRE); \ + if (likely(!(val & __RTE_TRACE_FIELD_ENABLE_MASK))) \ + return; \ + mem = __rte_trace_mem_get(val); \ + if (unlikely(mem == NULL)) \ + return; \ + mem = __rte_trace_point_emit_ev_header(mem, val); \ +} while (0) + +#define __rte_trace_point_emit_header_fp(t) \ + if (!__rte_trace_point_fp_is_enabled()) \ + return; \ + __rte_trace_point_emit_header_generic(t) + +#define __rte_trace_point_emit(in, type) \ +do { \ + memcpy(mem, &(in), sizeof(in)); \ + mem = RTE_PTR_ADD(mem, sizeof(in)); \ +} while (0) + +#define rte_trace_point_emit_string(in) \ +do { \ + if (unlikely(in == NULL)) \ + return; \ + rte_strscpy(mem, in, __RTE_TRACE_EMIT_STRING_LEN_MAX); \ + mem = RTE_PTR_ADD(mem, __RTE_TRACE_EMIT_STRING_LEN_MAX); \ +} while (0) + +#else + +#define __rte_trace_point_emit_header_generic(t) RTE_SET_USED(t) +#define __rte_trace_point_emit_header_fp(t) RTE_SET_USED(t) +#define __rte_trace_point_emit(in, type) RTE_SET_USED(in) +#define rte_trace_point_emit_string(in) RTE_SET_USED(in) + +#endif /* ALLOW_EXPERIMENTAL_API */ +#endif /* _RTE_TRACE_POINT_REGISTER_H_ */ + +#define rte_trace_point_emit_u64(in) __rte_trace_point_emit(in, uint64_t) +#define rte_trace_point_emit_i64(in) __rte_trace_point_emit(in, int64_t) +#define rte_trace_point_emit_u32(in) __rte_trace_point_emit(in, uint32_t) +#define rte_trace_point_emit_i32(in) __rte_trace_point_emit(in, int32_t) +#define rte_trace_point_emit_u16(in) __rte_trace_point_emit(in, uint16_t) +#define rte_trace_point_emit_i16(in) __rte_trace_point_emit(in, int16_t) +#define rte_trace_point_emit_u8(in) __rte_trace_point_emit(in, uint8_t) +#define rte_trace_point_emit_i8(in) __rte_trace_point_emit(in, int8_t) +#define rte_trace_point_emit_int(in) __rte_trace_point_emit(in, int32_t) +#define rte_trace_point_emit_long(in) __rte_trace_point_emit(in, long) +#define rte_trace_point_emit_float(in) __rte_trace_point_emit(in, float) +#define rte_trace_point_emit_double(in) __rte_trace_point_emit(in, double) +#define rte_trace_point_emit_ptr(in) __rte_trace_point_emit(in, uintptr_t) + +#endif /* __DOXYGEN__ */ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TRACE_POINT_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_trace_point_register.h b/src/spdk/dpdk/lib/librte_eal/include/rte_trace_point_register.h new file mode 100644 index 000000000..4e7c25ba1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_trace_point_register.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#ifndef _RTE_TRACE_POINT_REGISTER_H_ +#define _RTE_TRACE_POINT_REGISTER_H_ + +#ifdef _RTE_TRACE_POINT_H_ +#error for registration, include this file first before <rte_trace_point.h> +#endif + +#include <rte_per_lcore.h> +#include <rte_trace_point.h> + +RTE_DECLARE_PER_LCORE(volatile int, trace_point_sz); + +#define RTE_TRACE_POINT_REGISTER(trace, name) \ + __rte_trace_point_register(&__##trace, RTE_STR(name), \ + (void (*)(void)) trace) + +#define __rte_trace_point_emit_header_generic(t) \ + RTE_PER_LCORE(trace_point_sz) = __RTE_TRACE_EVENT_HEADER_SZ + +#define __rte_trace_point_emit_header_fp(t) \ + __rte_trace_point_emit_header_generic(t) + +#define __rte_trace_point_emit(in, type) \ +do { \ + RTE_BUILD_BUG_ON(sizeof(type) != sizeof(typeof(in))); \ + __rte_trace_point_emit_field(sizeof(type), RTE_STR(in), \ + RTE_STR(type)); \ +} while (0) + +#define rte_trace_point_emit_string(in) \ +do { \ + RTE_SET_USED(in); \ + __rte_trace_point_emit_field(__RTE_TRACE_EMIT_STRING_LEN_MAX, \ + RTE_STR(in)"[32]", "string_bounded_t"); \ +} while (0) + +#endif /* _RTE_TRACE_POINT_REGISTER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_uuid.h b/src/spdk/dpdk/lib/librte_eal/include/rte_uuid.h new file mode 100644 index 000000000..044afbdfa --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_uuid.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) 1996, 1997, 1998 Theodore Ts'o. + */ +/** + * @file + * + * UUID related functions originally from libuuid + */ + +#ifndef _RTE_UUID_H_ +#define _RTE_UUID_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdbool.h> + +/** + * Struct describing a Universal Unique Identifier + */ +typedef unsigned char rte_uuid_t[16]; + +/** + * Helper for defining UUID values for id tables. + */ +#define RTE_UUID_INIT(a, b, c, d, e) { \ + ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, \ + ((a) >> 8) & 0xff, (a) & 0xff, \ + ((b) >> 8) & 0xff, (b) & 0xff, \ + ((c) >> 8) & 0xff, (c) & 0xff, \ + ((d) >> 8) & 0xff, (d) & 0xff, \ + ((e) >> 40) & 0xff, ((e) >> 32) & 0xff, \ + ((e) >> 24) & 0xff, ((e) >> 16) & 0xff, \ + ((e) >> 8) & 0xff, (e) & 0xff \ +} + +/** + * Test if UUID is all zeros. + * + * @param uu + * The uuid to check. + * @return + * true if uuid is NULL value, false otherwise + */ +bool rte_uuid_is_null(const rte_uuid_t uu); + +/** + * Copy uuid. + * + * @param dst + * Destination uuid + * @param src + * Source uuid + */ +static inline void rte_uuid_copy(rte_uuid_t dst, const rte_uuid_t src) +{ + memcpy(dst, src, sizeof(rte_uuid_t)); +} + +/** + * Compare two UUID's + * + * @param a + * A UUID to compare + * @param b + * A UUID to compare + * @return + * returns an integer less than, equal to, or greater than zero if UUID a is + * is less than, equal, or greater than UUID b. + */ +int rte_uuid_compare(const rte_uuid_t a, const rte_uuid_t b); + +/** + * Extract UUID from string + * + * @param in + * Pointer to string of characters to convert + * @param uu + * Destination UUID + * @return + * Returns 0 on success, and -1 if string is not a valid UUID. + */ +int rte_uuid_parse(const char *in, rte_uuid_t uu); + +/** + * Convert UUID to string + * + * @param uu + * UUID to format + * @param out + * Resulting string buffer + * @param len + * Sizeof the available string buffer + */ +#define RTE_UUID_STRLEN (36 + 1) +void rte_uuid_unparse(const rte_uuid_t uu, char *out, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_UUID_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_version.h b/src/spdk/dpdk/lib/librte_eal/include/rte_version.h new file mode 100644 index 000000000..f7a3a1ebc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_version.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Definitions of DPDK version numbers + */ + +#ifndef _RTE_VERSION_H_ +#define _RTE_VERSION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <rte_common.h> + +/** + * Macro to compute a version number usable for comparisons + */ +#define RTE_VERSION_NUM(a,b,c,d) ((a) << 24 | (b) << 16 | (c) << 8 | (d)) + +/** + * All version numbers in one to compare with RTE_VERSION_NUM() + */ +#define RTE_VERSION RTE_VERSION_NUM( \ + RTE_VER_YEAR, \ + RTE_VER_MONTH, \ + RTE_VER_MINOR, \ + RTE_VER_RELEASE) + +/** + * Function returning version string + * @return + * string + */ +static inline const char * +rte_version(void) +{ + static char version[32]; + if (version[0] != 0) + return version; + if (strlen(RTE_VER_SUFFIX) == 0) + snprintf(version, sizeof(version), "%s %d.%02d.%d", + RTE_VER_PREFIX, + RTE_VER_YEAR, + RTE_VER_MONTH, + RTE_VER_MINOR); + else + snprintf(version, sizeof(version), "%s %d.%02d.%d%s%d", + RTE_VER_PREFIX, + RTE_VER_YEAR, + RTE_VER_MONTH, + RTE_VER_MINOR, + RTE_VER_SUFFIX, + RTE_VER_RELEASE); + return version; +} + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_VERSION_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/include/rte_vfio.h b/src/spdk/dpdk/lib/librte_eal/include/rte_vfio.h new file mode 100644 index 000000000..20ed8c45a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/include/rte_vfio.h @@ -0,0 +1,360 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 6WIND S.A. + */ + +#ifndef _RTE_VFIO_H_ +#define _RTE_VFIO_H_ + +/** + * @file + * RTE VFIO. This library provides various VFIO related utility functions. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> + +/* + * determine if VFIO is present on the system + */ +#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO) +#include <linux/version.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#define VFIO_PRESENT +#endif /* kernel version >= 3.6.0 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) +#define HAVE_VFIO_DEV_REQ_INTERFACE +#endif /* kernel version >= 4.0.0 */ +#endif /* RTE_EAL_VFIO */ + +#ifdef VFIO_PRESENT + +#include <linux/vfio.h> + +#define VFIO_DIR "/dev/vfio" +#define VFIO_CONTAINER_PATH "/dev/vfio/vfio" +#define VFIO_GROUP_FMT "/dev/vfio/%u" +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" +#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) +#define VFIO_GET_REGION_IDX(x) (x >> 40) +#define VFIO_NOIOMMU_MODE \ + "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode" + +/* NOIOMMU is defined from kernel version 4.5 onwards */ +#ifdef VFIO_NOIOMMU_IOMMU +#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU +#else +#define RTE_VFIO_NOIOMMU 8 +#endif + +/* + * capabilities are only supported on kernel 4.6+. there were also some API + * changes as well, so add a macro to get cap offset. + */ +#ifdef VFIO_REGION_INFO_FLAG_CAPS +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS +#define VFIO_CAP_OFFSET(x) (x->cap_offset) +#else +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3) +#define VFIO_CAP_OFFSET(x) (x->resv) +struct vfio_info_cap_header { + uint16_t id; + uint16_t version; + uint32_t next; +}; +#endif + +/* kernels 4.16+ can map BAR containing MSI-X table */ +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#else +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3 +#endif + +#else /* not VFIO_PRESENT */ + +/* we don't need an actual definition, only pointer is used */ +struct vfio_device_info; + +#endif /* VFIO_PRESENT */ + +#define RTE_VFIO_DEFAULT_CONTAINER_FD (-1) + +/** + * Setup vfio_cfg for the device identified by its address. + * It discovers the configured I/O MMU groups or sets a new one for the device. + * If a new groups is assigned, the DMA mapping is performed. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param sysfs_base + * sysfs path prefix. + * + * @param dev_addr + * device location. + * + * @param vfio_dev_fd + * VFIO fd. + * + * @param device_info + * Device information. + * + * @return + * 0 on success. + * <0 on failure. + * >1 if the device cannot be managed this way. + */ +int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info *device_info); + +/** + * Release a device mapped to a VFIO-managed I/O MMU group. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param sysfs_base + * sysfs path prefix. + * + * @param dev_addr + * device location. + * + * @param fd + * VFIO fd. + * + * @return + * 0 on success. + * <0 on failure. + */ +int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd); + +/** + * Enable a VFIO-related kmod. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param modname + * kernel module name. + * + * @return + * 0 on success. + * <0 on failure. + */ +int rte_vfio_enable(const char *modname); + +/** + * Check whether a VFIO-related kmod is enabled. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param modname + * kernel module name. + * + * @return + * !0 if true. + * 0 otherwise. + */ +int rte_vfio_is_enabled(const char *modname); + +/** + * Whether VFIO NOIOMMU mode is enabled. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @return + * !0 if true. + * 0 otherwise. + */ +int rte_vfio_noiommu_is_enabled(void); + +/** + * Remove group fd from internal VFIO group fd array/ + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param vfio_group_fd + * VFIO Group FD. + * + * @return + * 0 on success. + * <0 on failure. + */ +int +rte_vfio_clear_group(int vfio_group_fd); + +/** + * Parse IOMMU group number for a device + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param sysfs_base + * sysfs path prefix. + * + * @param dev_addr + * device location. + * + * @param iommu_group_num + * iommu group number + * + * @return + * >0 on success + * 0 for non-existent group or VFIO + * <0 for errors + */ +int +rte_vfio_get_group_num(const char *sysfs_base, + const char *dev_addr, int *iommu_group_num); + +/** + * Open a new VFIO container fd + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @return + * > 0 container fd + * < 0 for errors + */ +int +rte_vfio_get_container_fd(void); + +/** + * Open VFIO group fd or get an existing one + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param iommu_group_num + * iommu group number + * + * @return + * > 0 group fd + * < 0 for errors + */ +int +rte_vfio_get_group_fd(int iommu_group_num); + +/** + * Create a new container for device binding. + * + * @note Any newly allocated DPDK memory will not be mapped into these + * containers by default, user needs to manage DMA mappings for + * any container created by this API. + * + * @note When creating containers using this API, the container will only be + * available in the process that has created it. Sharing containers and + * devices between multiple processes is not supported. + * + * @return + * the container fd if successful + * <0 if failed + */ +int +rte_vfio_container_create(void); + +/** + * Destroy the container, unbind all vfio groups within it. + * + * @param container_fd + * the container fd to destroy + * + * @return + * 0 if successful + * <0 if failed + */ +int +rte_vfio_container_destroy(int container_fd); + +/** + * Bind a IOMMU group to a container. + * + * @param container_fd + * the container's fd + * + * @param iommu_group_num + * the iommu group number to bind to container + * + * @return + * group fd if successful + * <0 if failed + */ +int +rte_vfio_container_group_bind(int container_fd, int iommu_group_num); + +/** + * Unbind a IOMMU group from a container. + * + * @param container_fd + * the container fd of container + * + * @param iommu_group_num + * the iommu group number to delete from container + * + * @return + * 0 if successful + * <0 if failed + */ +int +rte_vfio_container_group_unbind(int container_fd, int iommu_group_num); + +/** + * Perform DMA mapping for devices in a container. + * + * @param container_fd + * the specified container fd. Use RTE_VFIO_DEFAULT_CONTAINER_FD to + * use the default container. + * + * @param vaddr + * Starting virtual address of memory to be mapped. + * + * @param iova + * Starting IOVA address of memory to be mapped. + * + * @param len + * Length of memory segment being mapped. + * + * @return + * 0 if successful + * <0 if failed + */ +int +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, + uint64_t iova, uint64_t len); + +/** + * Perform DMA unmapping for devices in a container. + * + * @param container_fd + * the specified container fd. Use RTE_VFIO_DEFAULT_CONTAINER_FD to + * use the default container. + * + * @param vaddr + * Starting virtual address of memory to be unmapped. + * + * @param iova + * Starting IOVA address of memory to be unmapped. + * + * @param len + * Length of memory segment being unmapped. + * + * @return + * 0 if successful + * <0 if failed + */ +int +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, + uint64_t iova, uint64_t len); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_VFIO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/linux/Makefile b/src/spdk/dpdk/lib/librte_eal/linux/Makefile new file mode 100644 index 000000000..48cc34844 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/Makefile @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2019 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_eal.a + +ARCH_DIR ?= $(RTE_ARCH) +VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR) +VPATH += $(RTE_SDK)/lib/librte_eal/common + +CFLAGS += -I$(SRCDIR)/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -ldl +LDLIBS += -lpthread +LDLIBS += -lgcc_s +LDLIBS += -lrt +LDLIBS += -lrte_kvargs +LDLIBS += -lrte_telemetry +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y) +LDLIBS += -lnuma +endif + +EXPORT_MAP := ../rte_eal_version.map + +# specific to linux exec-env +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_hugepage_info.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio_mp_sync.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_debug.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_interrupts.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_alarm.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_dev.c + +# from common dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memzone.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_mcfg.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_tailqs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_errno.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hypervisor.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_string_fns.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hexdump.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_class.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_bus.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_dev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_options.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_fbarray.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_uuid.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_trace.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_trace_ctf.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_trace_points.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_trace_utils.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += hotplug_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_elem.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_service.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_random.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c + +# from arch dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_hypervisor.c +SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c +SRCS-y += rte_cycles.c + +CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_eal_thread.o += -Wno-return-type +endif + +INC := rte_kni_common.h +INC += rte_os.h + +SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUX)-include := $(addprefix include/,$(INC)) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal.c b/src/spdk/dpdk/lib/librte_eal/linux/eal.c new file mode 100644 index 000000000..f162124a3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal.c @@ -0,0 +1,1405 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation. + * Copyright(c) 2012-2014 6WIND S.A. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stdarg.h> +#include <unistd.h> +#include <pthread.h> +#include <syslog.h> +#include <getopt.h> +#include <sys/file.h> +#include <dirent.h> +#include <fcntl.h> +#include <fnmatch.h> +#include <stddef.h> +#include <errno.h> +#include <limits.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <sys/stat.h> +#if defined(RTE_ARCH_X86) +#include <sys/io.h> +#endif +#include <linux/version.h> + +#include <rte_compat.h> +#include <rte_common.h> +#include <rte_debug.h> +#include <rte_memory.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_service_component.h> +#include <rte_log.h> +#include <rte_random.h> +#include <rte_cycles.h> +#include <rte_string_fns.h> +#include <rte_cpuflags.h> +#include <rte_interrupts.h> +#include <rte_bus.h> +#include <rte_dev.h> +#include <rte_devargs.h> +#include <rte_version.h> +#include <rte_atomic.h> +#include <malloc_heap.h> +#include <rte_vfio.h> +#include <rte_telemetry.h> + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_memcfg.h" +#include "eal_trace.h" +#include "eal_options.h" +#include "eal_vfio.h" +#include "hotplug_mp.h" + +#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) + +#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10) + +#define KERNEL_IOMMU_GROUPS_PATH "/sys/kernel/iommu_groups" + +/* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook = NULL; + +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc */ +static int mem_cfg_fd = -1; + +static struct flock wr_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = offsetof(struct rte_mem_config, memsegs), + .l_len = sizeof(early_mem_config.memsegs), +}; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* used by rte_rdtsc() */ +int rte_cycles_vmware_tsc_map; + +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +static const char *default_runtime_dir = "/var/run"; + +int +eal_create_runtime_dir(void) +{ + const char *directory = default_runtime_dir; + const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); + const char *fallback = "/tmp"; + char tmp[PATH_MAX]; + int ret; + + if (getuid() != 0) { + /* try XDG path first, fall back to /tmp */ + if (xdg_runtime_dir != NULL) + directory = xdg_runtime_dir; + else + directory = fallback; + } + /* create DPDK subdirectory under runtime dir */ + ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); + if (ret < 0 || ret == sizeof(tmp)) { + RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); + return -1; + } + + /* create prefix-specific subdirectory under DPDK runtime dir */ + ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", + tmp, eal_get_hugefile_prefix()); + if (ret < 0 || ret == sizeof(runtime_dir)) { + RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); + return -1; + } + + /* create the path if it doesn't exist. no "mkdir -p" here, so do it + * step by step. + */ + ret = mkdir(tmp, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + tmp, strerror(errno)); + return -1; + } + + ret = mkdir(runtime_dir, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + runtime_dir, strerror(errno)); + return -1; + } + + return 0; +} + +int +eal_clean_runtime_dir(void) +{ + DIR *dir; + struct dirent *dirent; + int dir_fd, fd, lck_result; + static const char * const filters[] = { + "fbarray_*", + "mp_socket_*" + }; + + /* open directory */ + dir = opendir(runtime_dir); + if (!dir) { + RTE_LOG(ERR, EAL, "Unable to open runtime directory %s\n", + runtime_dir); + goto error; + } + dir_fd = dirfd(dir); + + /* lock the directory before doing anything, to avoid races */ + if (flock(dir_fd, LOCK_EX) < 0) { + RTE_LOG(ERR, EAL, "Unable to lock runtime directory %s\n", + runtime_dir); + goto error; + } + + dirent = readdir(dir); + if (!dirent) { + RTE_LOG(ERR, EAL, "Unable to read runtime directory %s\n", + runtime_dir); + goto error; + } + + while (dirent != NULL) { + unsigned int f_idx; + bool skip = true; + + /* skip files that don't match the patterns */ + for (f_idx = 0; f_idx < RTE_DIM(filters); f_idx++) { + const char *filter = filters[f_idx]; + + if (fnmatch(filter, dirent->d_name, 0) == 0) { + skip = false; + break; + } + } + if (skip) { + dirent = readdir(dir); + continue; + } + + /* try and lock the file */ + fd = openat(dir_fd, dirent->d_name, O_RDONLY); + + /* skip to next file */ + if (fd == -1) { + dirent = readdir(dir); + continue; + } + + /* non-blocking lock */ + lck_result = flock(fd, LOCK_EX | LOCK_NB); + + /* if lock succeeds, remove the file */ + if (lck_result != -1) + unlinkat(dir_fd, dirent->d_name, 0); + close(fd); + dirent = readdir(dir); + } + + /* closedir closes dir_fd and drops the lock */ + closedir(dir); + return 0; + +error: + if (dir) + closedir(dir); + + RTE_LOG(ERR, EAL, "Error while clearing runtime dir: %s\n", + strerror(errno)); + + return -1; +} + +const char * +rte_eal_get_runtime_dir(void) +{ + return runtime_dir; +} + +/* Return user provided mbuf pool ops name */ +const char * +rte_eal_mbuf_user_pool_ops(void) +{ + return internal_config.user_mbuf_pool_ops_name; +} + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +enum rte_iova_mode +rte_eal_iova_mode(void) +{ + return rte_eal_get_configuration()->iova_mode; +} + +/* parse a sysfs (or other) file containing one integer value */ +int +eal_parse_sysfs_value(const char *filename, unsigned long *val) +{ + FILE *f; + char buf[BUFSIZ]; + char *end = NULL; + + if ((f = fopen(filename, "r")) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + *val = strtoul(buf, &end, 0); + if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + fclose(f); + return 0; +} + + +/* create memory configuration in shared/mmap memory. Take out + * a write lock on the memsegs, so we can auto-detect primary/secondary. + * This means we never close the file while running (auto-close on exit). + * We also don't lock the whole file, so that in future we can use read-locks + * on other parts, e.g. memzones, to detect if there are running secondary + * processes. */ +static int +rte_eal_config_create(void) +{ + size_t page_sz = sysconf(_SC_PAGE_SIZE); + size_t cfg_len = sizeof(*rte_config.mem_config); + size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz); + void *rte_mem_cfg_addr, *mapped_mem_cfg_addr; + int retval; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return 0; + + /* map the config before hugepage address so that we don't waste a page */ + if (internal_config.base_virtaddr != 0) + rte_mem_cfg_addr = (void *) + RTE_ALIGN_FLOOR(internal_config.base_virtaddr - + sizeof(struct rte_mem_config), page_sz); + else + rte_mem_cfg_addr = NULL; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600); + if (mem_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", + pathname); + return -1; + } + } + + retval = ftruncate(mem_cfg_fd, cfg_len); + if (retval < 0){ + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n", + pathname); + return -1; + } + + retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); + if (retval < 0){ + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary " + "process running?\n", pathname); + return -1; + } + + /* reserve space for config */ + rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr, + &cfg_len_aligned, page_sz, 0, 0); + if (rte_mem_cfg_addr == NULL) { + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n"); + close(mem_cfg_fd); + mem_cfg_fd = -1; + return -1; + } + + /* remap the actual file into the space we've just reserved */ + mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr, + cfg_len_aligned, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0); + if (mapped_mem_cfg_addr == MAP_FAILED) { + munmap(rte_mem_cfg_addr, cfg_len); + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n"); + return -1; + } + + memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); + rte_config.mem_config = rte_mem_cfg_addr; + + /* store address of the config in the config itself so that secondary + * processes could later map the config into this exact location */ + rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + + rte_config.mem_config->dma_maskbits = 0; + + return 0; +} + +/* attach to an existing shared memory config */ +static int +rte_eal_config_attach(void) +{ + struct rte_mem_config *mem_config; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return 0; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR); + if (mem_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", + pathname); + return -1; + } + } + + /* map it as read-only first */ + mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config), + PROT_READ, MAP_SHARED, mem_cfg_fd, 0); + if (mem_config == MAP_FAILED) { + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + rte_config.mem_config = mem_config; + + return 0; +} + +/* reattach the shared config at exact memory location primary process has it */ +static int +rte_eal_config_reattach(void) +{ + struct rte_mem_config *mem_config; + void *rte_mem_cfg_addr; + + if (internal_config.no_shconf) + return 0; + + /* save the address primary process has mapped shared config to */ + rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr; + + /* unmap original config */ + munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); + + /* remap the config at proper address */ + mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, + sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, + mem_cfg_fd, 0); + + close(mem_cfg_fd); + mem_cfg_fd = -1; + + if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { + if (mem_config != MAP_FAILED) { + /* errno is stale, don't use */ + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]" + " - please use '--" OPT_BASE_VIRTADDR + "' option\n", rte_mem_cfg_addr, mem_config); + munmap(mem_config, sizeof(struct rte_mem_config)); + return -1; + } + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + rte_config.mem_config = mem_config; + + return 0; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if there no shared config, there can be no secondary processes */ + if (!internal_config.no_shconf) { + /* if we can open the file but not get a write-lock we are a + * secondary process. NOTE: if we get a file handle back, we + * keep that open and don't close it to prevent a race condition + * between multiple opens. + */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + } + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* Sets up rte_config structure with the pointer to shared memory config.*/ +static int +rte_config_init(void) +{ + rte_config.process_type = internal_config.process_type; + + switch (rte_config.process_type){ + case RTE_PROC_PRIMARY: + if (rte_eal_config_create() < 0) + return -1; + eal_mcfg_update_from_internal(); + break; + case RTE_PROC_SECONDARY: + if (rte_eal_config_attach() < 0) + return -1; + eal_mcfg_wait_complete(); + if (eal_mcfg_check_version() < 0) { + RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n"); + return -1; + } + if (rte_eal_config_reattach() < 0) + return -1; + eal_mcfg_update_internal(); + break; + case RTE_PROC_AUTO: + case RTE_PROC_INVALID: + RTE_LOG(ERR, EAL, "Invalid process type %d\n", + rte_config.process_type); + return -1; + } + + return 0; +} + +/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */ +static void +eal_hugedirs_unlock(void) +{ + int i; + + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) + { + /* skip uninitialized */ + if (internal_config.hugepage_info[i].lock_descriptor < 0) + continue; + /* unlock hugepage file */ + flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN); + close(internal_config.hugepage_info[i].lock_descriptor); + /* reset the field */ + internal_config.hugepage_info[i].lock_descriptor = -1; + } +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + printf("EAL Linux options:\n" + " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n" + " --"OPT_SOCKET_LIMIT" Limit memory allocation on sockets (comma separated values)\n" + " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n" + " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n" + " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" + " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" + " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n" + " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n" + " --"OPT_MATCH_ALLOCATIONS" Free hugepages exactly as allocated\n" + "\n"); + /* Allow the application to print its usage message too if hook is set */ + if ( rte_application_usage_hook ) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Set a per-application usage message */ +rte_usage_hook_t +rte_set_application_usage_hook( rte_usage_hook_t usage_func ) +{ + rte_usage_hook_t old_func; + + /* Will be NULL on the first call to denote the last usage routine. */ + old_func = rte_application_usage_hook; + rte_application_usage_hook = usage_func; + + return old_func; +} + +static int +eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg) +{ + char * arg[RTE_MAX_NUMA_NODES]; + char *end; + int arg_num, i, len; + uint64_t total_mem = 0; + + len = strnlen(strval, SOCKET_MEM_STRLEN); + if (len == SOCKET_MEM_STRLEN) { + RTE_LOG(ERR, EAL, "--socket-mem is too long\n"); + return -1; + } + + /* all other error cases will be caught later */ + if (!isdigit(strval[len-1])) + return -1; + + /* split the optarg into separate socket values */ + arg_num = rte_strsplit(strval, len, + arg, RTE_MAX_NUMA_NODES, ','); + + /* if split failed, or 0 arguments */ + if (arg_num <= 0) + return -1; + + /* parse each defined socket option */ + errno = 0; + for (i = 0; i < arg_num; i++) { + uint64_t val; + end = NULL; + val = strtoull(arg[i], &end, 10); + + /* check for invalid input */ + if ((errno != 0) || + (arg[i][0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + val <<= 20; + total_mem += val; + socket_arg[i] = val; + } + + return 0; +} + +static int +eal_parse_vfio_intr(const char *mode) +{ + unsigned i; + static struct { + const char *name; + enum rte_intr_mode value; + } map[] = { + { "legacy", RTE_INTR_MODE_LEGACY }, + { "msi", RTE_INTR_MODE_MSI }, + { "msix", RTE_INTR_MODE_MSIX }, + }; + + for (i = 0; i < RTE_DIM(map); i++) { + if (!strcmp(mode, map[i].name)) { + internal_config.vfio_intr_mode = map[i].value; + return 0; + } + } + return -1; +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? + eal_parse_common_option(opt, optarg, &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; +} + +/* Parse the argument given in the command line of the application */ +static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + /* getopt didn't recognise the option */ + if (opt == '?') { + eal_usage(prgname); + ret = -1; + goto out; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + + case OPT_HUGE_DIR_NUM: + { + char *hdir = strdup(optarg); + if (hdir == NULL) + RTE_LOG(ERR, EAL, "Could not store hugepage directory\n"); + else { + /* free old hugepage dir */ + if (internal_config.hugepage_dir != NULL) + free(internal_config.hugepage_dir); + internal_config.hugepage_dir = hdir; + } + break; + } + case OPT_FILE_PREFIX_NUM: + { + char *prefix = strdup(optarg); + if (prefix == NULL) + RTE_LOG(ERR, EAL, "Could not store file prefix\n"); + else { + /* free old prefix */ + if (internal_config.hugefile_prefix != NULL) + free(internal_config.hugefile_prefix); + internal_config.hugefile_prefix = prefix; + } + break; + } + case OPT_SOCKET_MEM_NUM: + if (eal_parse_socket_arg(optarg, + internal_config.socket_mem) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_MEM "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + internal_config.force_sockets = 1; + break; + + case OPT_SOCKET_LIMIT_NUM: + if (eal_parse_socket_arg(optarg, + internal_config.socket_limit) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_LIMIT "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + internal_config.force_socket_limits = 1; + break; + + case OPT_VFIO_INTR_NUM: + if (eal_parse_vfio_intr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_VFIO_INTR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_CREATE_UIO_DEV_NUM: + internal_config.create_uio_dev = 1; + break; + + case OPT_MBUF_POOL_OPS_NAME_NUM: + { + char *ops_name = strdup(optarg); + if (ops_name == NULL) + RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n"); + else { + /* free old ops name */ + if (internal_config.user_mbuf_pool_ops_name != + NULL) + free(internal_config.user_mbuf_pool_ops_name); + + internal_config.user_mbuf_pool_ops_name = + ops_name; + } + break; + } + case OPT_MATCH_ALLOCATIONS_NUM: + internal_config.match_allocations = 1; + break; + + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on Linux\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on Linux\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on Linux\n", opt); + } + eal_usage(prgname); + ret = -1; + goto out; + } + } + + /* create runtime data directory */ + if (internal_config.no_shconf == 0 && + eal_create_runtime_dir() < 0) { + RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); + ret = -1; + goto out; + } + + if (eal_adjust_config(&internal_config) != 0) { + ret = -1; + goto out; + } + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + + if (optind >= 0) + argv[optind-1] = prgname; + ret = optind-1; + +out: + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; + + return ret; +} + +static int +check_socket(const struct rte_memseg_list *msl, void *arg) +{ + int *socket_id = arg; + + if (msl->external) + return 0; + + return *socket_id == msl->socket_id; +} + +static void +eal_check_mem_on_local_socket(void) +{ + int socket_id; + + socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); + + if (rte_memseg_list_walk(check_socket, &socket_id) == 0) + RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); +} + +static int +sync_func(__rte_unused void *arg) +{ + return 0; +} + +/* + * Request iopl privilege for all RPL, returns 0 on success + * iopl() call is mostly for the i386 architecture. For other architectures, + * return -1 to indicate IO privilege can't be changed in this way. + */ +int +rte_eal_iopl_init(void) +{ +#if defined(RTE_ARCH_X86) + if (iopl(3) != 0) + return -1; +#endif + return 0; +} + +#ifdef VFIO_PRESENT +static int rte_eal_vfio_setup(void) +{ + if (rte_vfio_enable("vfio")) + return -1; + + return 0; +} +#endif + +static void rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + +/* + * On Linux 3.6+, even if VFIO is not loaded, whenever IOMMU is enabled in the + * BIOS and in the kernel, /sys/kernel/iommu_groups path will contain kernel + * IOMMU groups. If IOMMU is not enabled, that path would be empty. + * Therefore, checking if the path is empty will tell us if IOMMU is enabled. + */ +static bool +is_iommu_enabled(void) +{ + DIR *dir = opendir(KERNEL_IOMMU_GROUPS_PATH); + struct dirent *d; + int n = 0; + + /* if directory doesn't exist, assume IOMMU is not enabled */ + if (dir == NULL) + return false; + + while ((d = readdir(dir)) != NULL) { + /* skip dot and dot-dot */ + if (++n > 2) + break; + } + closedir(dir); + + return n > 2; +} + +/* Launch threads, called at application init(). */ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret, ret; + pthread_t thread_id; + static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); + const char *p; + static char logid[PATH_MAX]; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + bool phys_addrs; + + /* checks if the machine is adequate */ + if (!rte_cpu_is_supported()) { + rte_eal_init_alert("unsupported cpu type."); + rte_errno = ENOTSUP; + return -1; + } + + if (!rte_atomic32_test_and_set(&run_once)) { + rte_eal_init_alert("already called initialization."); + rte_errno = EALREADY; + return -1; + } + + p = strrchr(argv[0], '/'); + strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid)); + thread_id = pthread_self(); + + eal_reset_internal_config(&internal_config); + + /* set log level as early as possible */ + eal_log_level_parse(argc, argv); + + /* clone argv to report out later in telemetry */ + eal_save_args(argc, argv); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + return -1; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) { + rte_eal_init_alert("Invalid 'command line' arguments."); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_plugins_init() < 0) { + rte_eal_init_alert("Cannot init plugins"); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_trace_init() < 0) { + rte_eal_init_alert("Cannot init trace"); + rte_errno = EFAULT; + return -1; + } + + if (eal_option_device_parse()) { + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + if (rte_config_init() < 0) { + rte_eal_init_alert("Cannot init config"); + return -1; + } + + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread"); + return -1; + } + + if (rte_eal_alarm_init() < 0) { + rte_eal_init_alert("Cannot init alarm"); + /* rte_eal_alarm_init sets rte_errno on failure. */ + return -1; + } + + /* Put mp channel init before bus scan so that we can init the vdev + * bus through mp channel in the secondary process before the bus scan. + */ + if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) { + rte_eal_init_alert("failed to init mp channel"); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_errno = EFAULT; + return -1; + } + } + + /* register multi-process action callbacks for hotplug */ + if (eal_mp_dev_hotplug_init() < 0) { + rte_eal_init_alert("failed to register mp callback for hotplug"); + return -1; + } + + if (rte_bus_scan()) { + rte_eal_init_alert("Cannot scan the buses for devices"); + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + phys_addrs = rte_eal_using_phys_addrs() != 0; + + /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode */ + enum rte_iova_mode iova_mode = rte_bus_get_iommu_class(); + + if (iova_mode == RTE_IOVA_DC) { + RTE_LOG(DEBUG, EAL, "Buses did not request a specific IOVA mode.\n"); + + if (!phys_addrs) { + /* if we have no access to physical addresses, + * pick IOVA as VA mode. + */ + iova_mode = RTE_IOVA_VA; + RTE_LOG(DEBUG, EAL, "Physical addresses are unavailable, selecting IOVA as VA mode.\n"); +#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) + } else if (rte_eal_check_module("rte_kni") == 1) { + iova_mode = RTE_IOVA_PA; + RTE_LOG(DEBUG, EAL, "KNI is loaded, selecting IOVA as PA mode for better KNI performance.\n"); +#endif + } else if (is_iommu_enabled()) { + /* we have an IOMMU, pick IOVA as VA mode */ + iova_mode = RTE_IOVA_VA; + RTE_LOG(DEBUG, EAL, "IOMMU is available, selecting IOVA as VA mode.\n"); + } else { + /* physical addresses available, and no IOMMU + * found, so pick IOVA as PA. + */ + iova_mode = RTE_IOVA_PA; + RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n"); + } + } +#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) + /* Workaround for KNI which requires physical address to work + * in kernels < 4.10 + */ + if (iova_mode == RTE_IOVA_VA && + rte_eal_check_module("rte_kni") == 1) { + if (phys_addrs) { + iova_mode = RTE_IOVA_PA; + RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n"); + } else { + RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n"); + } + } +#endif + rte_eal_get_configuration()->iova_mode = iova_mode; + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; + } + + if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) { + rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available"); + rte_errno = EINVAL; + return -1; + } + + RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n", + rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA"); + + if (internal_config.no_hugetlbfs == 0) { + /* rte_config isn't initialized yet */ + ret = internal_config.process_type == RTE_PROC_PRIMARY ? + eal_hugepage_info_init() : + eal_hugepage_info_read(); + if (ret < 0) { + rte_eal_init_alert("Cannot get hugepage information."); + rte_errno = EACCES; + rte_atomic32_clear(&run_once); + return -1; + } + } + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) { + if (internal_config.no_hugetlbfs) + internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; + } + + if (internal_config.vmware_tsc_map == 1) { +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + rte_cycles_vmware_tsc_map = 1; + RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " + "you must have monitor_control.pseudo_perfctr = TRUE\n"); +#else + RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " + "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); +#endif + } + + if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) { + rte_eal_init_alert("Cannot init logging."); + rte_errno = ENOMEM; + rte_atomic32_clear(&run_once); + return -1; + } + +#ifdef VFIO_PRESENT + if (rte_eal_vfio_setup() < 0) { + rte_eal_init_alert("Cannot init VFIO"); + rte_errno = EAGAIN; + rte_atomic32_clear(&run_once); + return -1; + } +#endif + /* in secondary processes, memory init may allocate additional fbarrays + * not present in primary processes, so to avoid any potential issues, + * initialize memzones first. + */ + if (rte_eal_memzone_init() < 0) { + rte_eal_init_alert("Cannot init memzone"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_memory_init() < 0) { + rte_eal_init_alert("Cannot init memory"); + rte_errno = ENOMEM; + return -1; + } + + /* the directories are locked during eal_hugepage_info_init */ + eal_hugedirs_unlock(); + + if (rte_eal_malloc_heap_init() < 0) { + rte_eal_init_alert("Cannot init malloc heap"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_tailqs_init() < 0) { + rte_eal_init_alert("Cannot init tail queues for objects"); + rte_errno = EFAULT; + return -1; + } + + if (rte_eal_timer_init() < 0) { + rte_eal_init_alert("Cannot init HPET or TSC timers"); + rte_errno = ENOTSUP; + return -1; + } + + eal_check_mem_on_local_socket(); + + eal_thread_init_master(rte_config.master_lcore); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + rte_config.master_lcore, (uintptr_t)thread_id, cpuset, + ret == 0 ? "" : "..."); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (pipe(lcore_config[i].pipe_master2slave) < 0) + rte_panic("Cannot create pipe\n"); + if (pipe(lcore_config[i].pipe_slave2master) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + ret = pthread_create(&lcore_config[i].thread_id, NULL, + eal_thread_loop, NULL); + if (ret != 0) + rte_panic("Cannot create thread\n"); + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, sizeof(thread_name), + "lcore-slave-%d", i); + ret = rte_thread_setname(lcore_config[i].thread_id, + thread_name); + if (ret != 0) + RTE_LOG(DEBUG, EAL, + "Cannot set name for lcore thread\n"); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + + /* initialize services so vdevs register service during bus_probe. */ + ret = rte_service_init(); + if (ret) { + rte_eal_init_alert("rte_service_init() failed"); + rte_errno = ENOEXEC; + return -1; + } + + /* Probe all the buses and devices/drivers on them */ + if (rte_bus_probe()) { + rte_eal_init_alert("Cannot probe devices"); + rte_errno = ENOTSUP; + return -1; + } + +#ifdef VFIO_PRESENT + /* Register mp action after probe() so that we got enough info */ + if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0) + return -1; +#endif + + /* initialize default service/lcore mappings and start running. Ignore + * -ENOTSUP, as it indicates no service coremask passed to EAL. + */ + ret = rte_service_start_with_defaults(); + if (ret < 0 && ret != -ENOTSUP) { + rte_errno = ENOEXEC; + return -1; + } + + /* + * Clean up unused files in runtime directory. We do this at the end of + * init and not at the beginning because we want to clean stuff up + * whether we are primary or secondary process, but we cannot remove + * primary process' files because secondary should be able to run even + * if primary process is dead. + * + * In no_shconf mode, no runtime directory is created in the first + * place, so no cleanup needed. + */ + if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) { + rte_eal_init_alert("Cannot clear runtime directory\n"); + return -1; + } + if (!internal_config.no_telemetry) { + const char *error_str = NULL; + if (rte_telemetry_init(rte_eal_get_runtime_dir(), + &internal_config.ctrl_cpuset, &error_str) + != 0) { + rte_eal_init_alert(error_str); + return -1; + } + if (error_str != NULL) + RTE_LOG(NOTICE, EAL, "%s\n", error_str); + } + + eal_mcfg_complete(); + + return fctret; +} + +static int +mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg __rte_unused) +{ + /* ms is const, so find this memseg */ + struct rte_memseg *found; + + if (msl->external) + return 0; + + found = rte_mem_virt2memseg(ms->addr, msl); + + found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; + + return 0; +} + +int +rte_eal_cleanup(void) +{ + /* if we're in a primary process, we need to mark hugepages as freeable + * so that finalization can release them back to the system. + */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_memseg_walk(mark_freeable, NULL); + rte_service_finalize(); + rte_mp_channel_cleanup(); + rte_trace_save(); + eal_trace_fini(); + eal_cleanup_config(&internal_config); + return 0; +} + +enum rte_proc_type_t +rte_eal_process_type(void) +{ + return rte_config.process_type; +} + +int rte_eal_has_hugepages(void) +{ + return ! internal_config.no_hugetlbfs; +} + +int rte_eal_has_pci(void) +{ + return !internal_config.no_pci; +} + +int rte_eal_create_uio_dev(void) +{ + return internal_config.create_uio_dev; +} + +enum rte_intr_mode +rte_eal_vfio_intr_mode(void) +{ + return internal_config.vfio_intr_mode; +} + +int +rte_eal_check_module(const char *module_name) +{ + char sysfs_mod_name[PATH_MAX]; + struct stat st; + int n; + + if (NULL == module_name) + return -1; + + /* Check if there is sysfs mounted */ + if (stat("/sys/module", &st) != 0) { + RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + /* A module might be built-in, therefore try sysfs */ + n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name); + if (n < 0 || n > PATH_MAX) { + RTE_LOG(DEBUG, EAL, "Could not format module path\n"); + return -1; + } + + if (stat(sysfs_mod_name, &st) != 0) { + RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n", + sysfs_mod_name, errno, strerror(errno)); + return 0; + } + + /* Module has been found */ + return 1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_alarm.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_alarm.c new file mode 100644 index 000000000..3252c6fa5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_alarm.c @@ -0,0 +1,248 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <stdio.h> +#include <stdint.h> +#include <signal.h> +#include <errno.h> +#include <string.h> +#include <sys/queue.h> +#include <sys/time.h> +#include <sys/timerfd.h> + +#include <rte_memory.h> +#include <rte_interrupts.h> +#include <rte_alarm.h> +#include <rte_common.h> +#include <rte_per_lcore.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_lcore.h> +#include <rte_errno.h> +#include <rte_spinlock.h> +#include <rte_eal_trace.h> + +#include <eal_private.h> + +#ifndef TFD_NONBLOCK +#include <fcntl.h> +#define TFD_NONBLOCK O_NONBLOCK +#endif + +#define NS_PER_US 1000 +#define US_PER_MS 1000 +#define MS_PER_S 1000 +#ifndef US_PER_S +#define US_PER_S (US_PER_MS * MS_PER_S) +#endif + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) next; + struct timeval time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static int handler_registered = 0; +static void eal_alarm_callback(void *arg); + +int +rte_eal_alarm_init(void) +{ + intr_handle.type = RTE_INTR_HANDLE_ALARM; + /* create a timerfd file descriptor */ + intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (intr_handle.fd == -1) + goto error; + + return 0; + +error: + rte_errno = errno; + return -1; +} + +static void +eal_alarm_callback(void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + while ((ap = LIST_FIRST(&alarm_list)) !=NULL && + clock_gettime(CLOCK_TYPE_ID, &now) == 0 && + (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec && + (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + free(ap); + } + + if (!LIST_EMPTY(&alarm_list)) { + struct itimerspec atime = { .it_interval = { 0, 0 } }; + + ap = LIST_FIRST(&alarm_list); + atime.it_value.tv_sec = ap->time.tv_sec; + atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US; + /* perform borrow for subtraction if necessary */ + if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US)) + atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US; + + atime.it_value.tv_sec -= now.tv_sec; + atime.it_value.tv_nsec -= now.tv_nsec; + timerfd_settime(intr_handle.fd, 0, &atime, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); +} + +int +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct timespec now; + int ret = 0; + struct alarm_entry *ap, *new_alarm; + + /* Check parameters, including that us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = calloc(1, sizeof(*new_alarm)); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S; + new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + if (!handler_registered) { + /* registration can fail, callback can be registered later */ + if (rte_intr_callback_register(&intr_handle, + eal_alarm_callback, NULL) == 0) + handler_registered = 1; + } + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (ap->time.tv_sec > new_alarm->time.tv_sec || + (ap->time.tv_sec == new_alarm->time.tv_sec && + ap->time.tv_usec > new_alarm->time.tv_usec)){ + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + if (LIST_FIRST(&alarm_list) == new_alarm) { + struct itimerspec alarm_time = { + .it_interval = {0, 0}, + .it_value = { + .tv_sec = us / US_PER_S, + .tv_nsec = (us % US_PER_S) * NS_PER_US, + }, + }; + ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); + + rte_eal_trace_alarm_set(us, cb_fn, cb_arg, ret); + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while ((ap = LIST_FIRST(&alarm_list)) != NULL && + cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + } else { + /* If calling from other context, mark that alarm is executing + * so loop can spin till it finish. Otherwise we are trying to + * cancel our self - mark it by EINPROGRESS */ + if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + rte_eal_trace_alarm_cancel(cb_fn, cb_arg, count); + return count; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_cpuflags.c new file mode 100644 index 000000000..d38296e1e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_cpuflags.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Red Hat, Inc. + */ + +#include <elf.h> +#include <fcntl.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 16) +#include <sys/auxv.h> +#define HAS_AUXV 1 +#endif +#endif + +#include <rte_cpuflags.h> + +#ifndef HAS_AUXV +static unsigned long +getauxval(unsigned long type __rte_unused) +{ + errno = ENOTSUP; + return 0; +} +#endif + +#ifdef RTE_ARCH_64 +typedef Elf64_auxv_t Internal_Elfx_auxv_t; +#else +typedef Elf32_auxv_t Internal_Elfx_auxv_t; +#endif + +/** + * Provides a method for retrieving values from the auxiliary vector and + * possibly running a string comparison. + * + * @return Always returns a result. When the result is 0, check errno + * to see if an error occurred during processing. + */ +static unsigned long +_rte_cpu_getauxval(unsigned long type, const char *str) +{ + unsigned long val; + + errno = 0; + val = getauxval(type); + + if (!val && (errno == ENOTSUP || errno == ENOENT)) { + int auxv_fd = open("/proc/self/auxv", O_RDONLY); + Internal_Elfx_auxv_t auxv; + + if (auxv_fd == -1) + return 0; + + errno = ENOENT; + while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { + if (auxv.a_type == type) { + errno = 0; + val = auxv.a_un.a_val; + if (str) + val = strcmp((const char *)val, str); + break; + } + } + close(auxv_fd); + } + + return val; +} + +unsigned long +rte_cpu_getauxval(unsigned long type) +{ + return _rte_cpu_getauxval(type, NULL); +} + +int +rte_cpu_strcmp_auxval(unsigned long type, const char *str) +{ + return _rte_cpu_getauxval(type, str); +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_debug.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_debug.c new file mode 100644 index 000000000..5d92500bf --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_debug.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifdef RTE_BACKTRACE +#include <execinfo.h> +#endif +#include <stdarg.h> +#include <signal.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> + +#include <rte_log.h> +#include <rte_debug.h> +#include <rte_common.h> +#include <rte_eal.h> + +#define BACKTRACE_SIZE 256 + +/* dump the stack of the calling core */ +void rte_dump_stack(void) +{ +#ifdef RTE_BACKTRACE + void *func[BACKTRACE_SIZE]; + char **symb = NULL; + int size; + + size = backtrace(func, BACKTRACE_SIZE); + symb = backtrace_symbols(func, size); + + if (symb == NULL) + return; + + while (size > 0) { + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, + "%d: [%s]\n", size, symb[size - 1]); + size --; + } + + free(symb); +#endif /* RTE_BACKTRACE */ +} + +/* not implemented in this environment */ +void rte_dump_registers(void) +{ + return; +} + +/* call abort(), it will generate a coredump if enabled */ +void __rte_panic(const char *funcname, const char *format, ...) +{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + rte_dump_stack(); + rte_dump_registers(); + abort(); +} + +/* + * Like rte_panic this terminates the application. However, no traceback is + * provided and no core-dump is generated. + */ +void +rte_exit(int exit_code, const char *format, ...) +{ + va_list ap; + + if (exit_code != 0) + RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" + " Cause: ", exit_code); + + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + +#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR + if (rte_eal_cleanup() != 0) + RTE_LOG(CRIT, EAL, + "EAL could not release all resources\n"); + exit(exit_code); +#else + rte_dump_stack(); + rte_dump_registers(); + abort(); +#endif +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_dev.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_dev.c new file mode 100644 index 000000000..83c9cd660 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_dev.c @@ -0,0 +1,396 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <signal.h> +#include <sys/socket.h> +#include <linux/netlink.h> + +#include <rte_string_fns.h> +#include <rte_log.h> +#include <rte_compat.h> +#include <rte_dev.h> +#include <rte_malloc.h> +#include <rte_interrupts.h> +#include <rte_alarm.h> +#include <rte_bus.h> +#include <rte_eal.h> +#include <rte_spinlock.h> +#include <rte_errno.h> + +#include "eal_private.h" + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static bool monitor_started; +static bool hotplug_handle; + +#define EAL_UEV_MSG_LEN 4096 +#define EAL_UEV_MSG_ELEM_LEN 128 + +/* + * spinlock for device hot-unplug failure handling. If it try to access bus or + * device, such as handle sigbus on bus or handle memory failure for device + * just need to use this lock. It could protect the bus and the device to avoid + * race condition. + */ +static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; + +static struct sigaction sigbus_action_old; + +static int sigbus_need_recover; + +static void dev_uev_handler(__rte_unused void *param); + +/* identify the system layer which reports this event. */ +enum eal_dev_event_subsystem { + EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */ + EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */ + EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */ + EAL_DEV_EVENT_SUBSYSTEM_MAX +}; + +static void +sigbus_action_recover(void) +{ + if (sigbus_need_recover) { + sigaction(SIGBUS, &sigbus_action_old, NULL); + sigbus_need_recover = 0; + } +} + +static void sigbus_handler(int signum, siginfo_t *info, + void *ctx __rte_unused) +{ + int ret; + + RTE_LOG(DEBUG, EAL, "Thread catch SIGBUS, fault address:%p\n", + info->si_addr); + + rte_spinlock_lock(&failure_handle_lock); + ret = rte_bus_sigbus_handler(info->si_addr); + rte_spinlock_unlock(&failure_handle_lock); + if (ret == -1) { + rte_exit(EXIT_FAILURE, + "Failed to handle SIGBUS for hot-unplug, " + "(rte_errno: %s)!", strerror(rte_errno)); + } else if (ret == 1) { + if (sigbus_action_old.sa_flags == SA_SIGINFO + && sigbus_action_old.sa_sigaction) { + (*(sigbus_action_old.sa_sigaction))(signum, + info, ctx); + } else if (sigbus_action_old.sa_flags != SA_SIGINFO + && sigbus_action_old.sa_handler) { + (*(sigbus_action_old.sa_handler))(signum); + } else { + rte_exit(EXIT_FAILURE, + "Failed to handle generic SIGBUS!"); + } + } + + RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n"); +} + +static int cmp_dev_name(const struct rte_device *dev, + const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + +static int +dev_uev_socket_fd_create(void) +{ + struct sockaddr_nl addr; + int ret; + + intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC | + SOCK_NONBLOCK, + NETLINK_KOBJECT_UEVENT); + if (intr_handle.fd < 0) { + RTE_LOG(ERR, EAL, "create uevent fd failed.\n"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + addr.nl_pid = 0; + addr.nl_groups = 0xffffffff; + + ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr)); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n"); + goto err; + } + + return 0; +err: + close(intr_handle.fd); + intr_handle.fd = -1; + return ret; +} + +static int +dev_uev_parse(const char *buf, struct rte_dev_event *event, int length) +{ + char action[EAL_UEV_MSG_ELEM_LEN]; + char subsystem[EAL_UEV_MSG_ELEM_LEN]; + char pci_slot_name[EAL_UEV_MSG_ELEM_LEN]; + int i = 0; + + memset(action, 0, EAL_UEV_MSG_ELEM_LEN); + memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN); + memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN); + + while (i < length) { + for (; i < length; i++) { + if (*buf) + break; + buf++; + } + /** + * check device uevent from kernel side, no need to check + * uevent from udev. + */ + if (!strncmp(buf, "libudev", 7)) { + buf += 7; + i += 7; + return -1; + } + if (!strncmp(buf, "ACTION=", 7)) { + buf += 7; + i += 7; + strlcpy(action, buf, sizeof(action)); + } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { + buf += 10; + i += 10; + strlcpy(subsystem, buf, sizeof(subsystem)); + } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { + buf += 14; + i += 14; + strlcpy(pci_slot_name, buf, sizeof(subsystem)); + event->devname = strdup(pci_slot_name); + } + for (; i < length; i++) { + if (*buf == '\0') + break; + buf++; + } + } + + /* parse the subsystem layer */ + if (!strncmp(subsystem, "uio", 3)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO; + else if (!strncmp(subsystem, "pci", 3)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI; + else if (!strncmp(subsystem, "vfio", 4)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO; + else + return -1; + + /* parse the action type */ + if (!strncmp(action, "add", 3)) + event->type = RTE_DEV_EVENT_ADD; + else if (!strncmp(action, "remove", 6)) + event->type = RTE_DEV_EVENT_REMOVE; + else + return -1; + return 0; +} + +static void +dev_delayed_unregister(void *param) +{ + rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param); + close(intr_handle.fd); + intr_handle.fd = -1; +} + +static void +dev_uev_handler(__rte_unused void *param) +{ + struct rte_dev_event uevent; + int ret; + char buf[EAL_UEV_MSG_LEN]; + struct rte_bus *bus; + struct rte_device *dev; + const char *busname = ""; + + memset(&uevent, 0, sizeof(struct rte_dev_event)); + memset(buf, 0, EAL_UEV_MSG_LEN); + + ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT); + if (ret < 0 && errno == EAGAIN) + return; + else if (ret <= 0) { + /* connection is closed or broken, can not up again. */ + RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n"); + rte_eal_alarm_set(1, dev_delayed_unregister, NULL); + return; + } + + ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN); + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "It is not an valid event " + "that need to be handle.\n"); + return; + } + + RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n", + uevent.devname, uevent.type, uevent.subsystem); + + switch (uevent.subsystem) { + case EAL_DEV_EVENT_SUBSYSTEM_PCI: + case EAL_DEV_EVENT_SUBSYSTEM_UIO: + busname = "pci"; + break; + default: + break; + } + + if (uevent.devname) { + if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) { + rte_spinlock_lock(&failure_handle_lock); + bus = rte_bus_find_by_name(busname); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", + busname); + goto failure_handle_err; + } + + dev = bus->find_device(NULL, cmp_dev_name, + uevent.devname); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find device (%s) on " + "bus (%s)\n", uevent.devname, busname); + goto failure_handle_err; + } + + ret = bus->hot_unplug_handler(dev); + if (ret) { + RTE_LOG(ERR, EAL, "Can not handle hot-unplug " + "for device (%s)\n", dev->name); + } + rte_spinlock_unlock(&failure_handle_lock); + } + rte_dev_event_callback_process(uevent.devname, uevent.type); + } + + return; + +failure_handle_err: + rte_spinlock_unlock(&failure_handle_lock); +} + +int +rte_dev_event_monitor_start(void) +{ + int ret; + + if (monitor_started) + return 0; + + ret = dev_uev_socket_fd_create(); + if (ret) { + RTE_LOG(ERR, EAL, "error create device event fd.\n"); + return -1; + } + + intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT; + ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL); + + if (ret) { + RTE_LOG(ERR, EAL, "fail to register uevent callback.\n"); + return -1; + } + + monitor_started = true; + + return 0; +} + +int +rte_dev_event_monitor_stop(void) +{ + int ret; + + if (!monitor_started) + return 0; + + ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler, + (void *)-1); + if (ret < 0) { + RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n"); + return ret; + } + + close(intr_handle.fd); + intr_handle.fd = -1; + monitor_started = false; + + return 0; +} + +int +dev_sigbus_handler_register(void) +{ + sigset_t mask; + struct sigaction action; + + rte_errno = 0; + + if (sigbus_need_recover) + return 0; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = SA_SIGINFO; + action.sa_mask = mask; + action.sa_sigaction = sigbus_handler; + sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old); + + return rte_errno; +} + +int +dev_sigbus_handler_unregister(void) +{ + rte_errno = 0; + + sigbus_action_recover(); + + return rte_errno; +} + +int +rte_dev_hotplug_handle_enable(void) +{ + int ret = 0; + + ret = dev_sigbus_handler_register(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to register sigbus handler for devices.\n"); + + hotplug_handle = true; + + return ret; +} + +int +rte_dev_hotplug_handle_disable(void) +{ + int ret = 0; + + ret = dev_sigbus_handler_unregister(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to unregister sigbus handler for devices.\n"); + + hotplug_handle = false; + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_hugepage_info.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_hugepage_info.c new file mode 100644 index 000000000..91a4fede7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_hugepage_info.c @@ -0,0 +1,547 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <string.h> +#include <sys/types.h> +#include <sys/file.h> +#include <dirent.h> +#include <fcntl.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <fnmatch.h> +#include <inttypes.h> +#include <stdarg.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <sys/stat.h> + +#include <linux/mman.h> /* for hugetlb-related flags */ + +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_debug.h> +#include <rte_log.h> +#include <rte_common.h> +#include "rte_string_fns.h" +#include "eal_internal_cfg.h" +#include "eal_hugepages.h" +#include "eal_filesystem.h" + +static const char sys_dir_path[] = "/sys/kernel/mm/hugepages"; +static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node"; + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +map_shared_memory(const char *filename, const size_t mem_size, int flags) +{ + void *retval; + int fd = open(filename, flags, 0600); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + close(fd); + return retval; +} + +static void * +open_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR); +} + +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); +} + +static int get_hp_sysfs_value(const char *subdir, const char *file, unsigned long *val) +{ + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, file); + return eal_parse_sysfs_value(path, val); +} + +/* this function is only called from eal_hugepage_info_init which itself + * is only called from a primary process */ +static uint32_t +get_num_hugepages(const char *subdir) +{ + unsigned long resv_pages, num_pages, over_pages, surplus_pages; + const char *nr_hp_file = "free_hugepages"; + const char *nr_rsvd_file = "resv_hugepages"; + const char *nr_over_file = "nr_overcommit_hugepages"; + const char *nr_splus_file = "surplus_hugepages"; + + /* first, check how many reserved pages kernel reports */ + if (get_hp_sysfs_value(subdir, nr_rsvd_file, &resv_pages) < 0) + return 0; + + if (get_hp_sysfs_value(subdir, nr_hp_file, &num_pages) < 0) + return 0; + + if (get_hp_sysfs_value(subdir, nr_over_file, &over_pages) < 0) + over_pages = 0; + + if (get_hp_sysfs_value(subdir, nr_splus_file, &surplus_pages) < 0) + surplus_pages = 0; + + /* adjust num_pages */ + if (num_pages >= resv_pages) + num_pages -= resv_pages; + else if (resv_pages) + num_pages = 0; + + if (over_pages >= surplus_pages) + over_pages -= surplus_pages; + else + over_pages = 0; + + if (num_pages == 0 && over_pages == 0) + RTE_LOG(WARNING, EAL, "No available hugepages reported in %s\n", + subdir); + + num_pages += over_pages; + if (num_pages < over_pages) /* overflow */ + num_pages = UINT32_MAX; + + /* we want to return a uint32_t and more than this looks suspicious + * anyway ... */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint32_t +get_num_hugepages_on_node(const char *subdir, unsigned int socket) +{ + char path[PATH_MAX], socketpath[PATH_MAX]; + DIR *socketdir; + unsigned long num_pages = 0; + const char *nr_hp_file = "free_hugepages"; + + snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages", + sys_pages_numa_dir_path, socket); + + socketdir = opendir(socketpath); + if (socketdir) { + /* Keep calm and carry on */ + closedir(socketdir); + } else { + /* Can't find socket dir, so ignore it */ + return 0; + } + + snprintf(path, sizeof(path), "%s/%s/%s", + socketpath, subdir, nr_hp_file); + if (eal_parse_sysfs_value(path, &num_pages) < 0) + return 0; + + if (num_pages == 0) + RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", + subdir); + + /* + * we want to return a uint32_t and more than this looks suspicious + * anyway ... + */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint64_t +get_default_hp_size(void) +{ + const char proc_meminfo[] = "/proc/meminfo"; + const char str_hugepagesz[] = "Hugepagesize:"; + unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1; + char buffer[256]; + unsigned long long size = 0; + + FILE *fd = fopen(proc_meminfo, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_meminfo); + while(fgets(buffer, sizeof(buffer), fd)){ + if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){ + size = rte_str_to_size(&buffer[hugepagesz_len]); + break; + } + } + fclose(fd); + if (size == 0) + rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo); + return size; +} + +static int +get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) +{ + enum proc_mount_fieldnames { + DEVICE = 0, + MOUNTPT, + FSTYPE, + OPTIONS, + _FIELDNAME_MAX + }; + static uint64_t default_size = 0; + const char proc_mounts[] = "/proc/mounts"; + const char hugetlbfs_str[] = "hugetlbfs"; + const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1; + const char pagesize_opt[] = "pagesize="; + const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1; + const char split_tok = ' '; + char *splitstr[_FIELDNAME_MAX]; + char buf[BUFSIZ]; + int retval = -1; + + FILE *fd = fopen(proc_mounts, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_mounts); + + if (default_size == 0) + default_size = get_default_hp_size(); + + while (fgets(buf, sizeof(buf), fd)){ + if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX, + split_tok) != _FIELDNAME_MAX) { + RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts); + break; /* return NULL */ + } + + /* we have a specified --huge-dir option, only examine that dir */ + if (internal_config.hugepage_dir != NULL && + strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0) + continue; + + if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){ + const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt); + + /* if no explicit page size, the default page size is compared */ + if (pagesz_str == NULL){ + if (hugepage_sz == default_size){ + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; + break; + } + } + /* there is an explicit page size, so check it */ + else { + uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); + if (pagesz == hugepage_sz) { + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; + break; + } + } + } /* end if strncmp hugetlbfs */ + } /* end while fgets */ + + fclose(fd); + return retval; +} + +/* + * Clear the hugepage directory of whatever hugepage files + * there are. Checks if the file is locked (i.e. + * if it's in use by another DPDK process). + */ +static int +clear_hugedir(const char * hugedir) +{ + DIR *dir; + struct dirent *dirent; + int dir_fd, fd, lck_result; + const char filter[] = "*map_*"; /* matches hugepage files */ + + /* open directory */ + dir = opendir(hugedir); + if (!dir) { + RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n", + hugedir); + goto error; + } + dir_fd = dirfd(dir); + + dirent = readdir(dir); + if (!dirent) { + RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n", + hugedir); + goto error; + } + + while(dirent != NULL){ + /* skip files that don't match the hugepage pattern */ + if (fnmatch(filter, dirent->d_name, 0) > 0) { + dirent = readdir(dir); + continue; + } + + /* try and lock the file */ + fd = openat(dir_fd, dirent->d_name, O_RDONLY); + + /* skip to next file */ + if (fd == -1) { + dirent = readdir(dir); + continue; + } + + /* non-blocking lock */ + lck_result = flock(fd, LOCK_EX | LOCK_NB); + + /* if lock succeeds, remove the file */ + if (lck_result != -1) + unlinkat(dir_fd, dirent->d_name, 0); + close (fd); + dirent = readdir(dir); + } + + closedir(dir); + return 0; + +error: + if (dir) + closedir(dir); + + RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n", + strerror(errno)); + + return -1; +} + +static int +compare_hpi(const void *a, const void *b) +{ + const struct hugepage_info *hpi_a = a; + const struct hugepage_info *hpi_b = b; + + return hpi_b->hugepage_sz - hpi_a->hugepage_sz; +} + +static void +calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent) +{ + uint64_t total_pages = 0; + unsigned int i; + + /* + * first, try to put all hugepages into relevant sockets, but + * if first attempts fails, fall back to collecting all pages + * in one socket and sorting them later + */ + total_pages = 0; + /* we also don't want to do this for legacy init */ + if (!internal_config.legacy_mem) + for (i = 0; i < rte_socket_count(); i++) { + int socket = rte_socket_id_by_idx(i); + unsigned int num_pages = + get_num_hugepages_on_node( + dirent->d_name, socket); + hpi->num_pages[socket] = num_pages; + total_pages += num_pages; + } + /* + * we failed to sort memory from the get go, so fall + * back to old way + */ + if (total_pages == 0) { + hpi->num_pages[0] = get_num_hugepages(dirent->d_name); + +#ifndef RTE_ARCH_64 + /* for 32-bit systems, limit number of hugepages to + * 1GB per page size */ + hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], + RTE_PGSIZE_1G / hpi->hugepage_sz); +#endif + } +} + +static int +hugepage_info_init(void) +{ const char dirent_start_text[] = "hugepages-"; + const size_t dirent_start_len = sizeof(dirent_start_text) - 1; + unsigned int i, num_sizes = 0; + DIR *dir; + struct dirent *dirent; + + dir = opendir(sys_dir_path); + if (dir == NULL) { + RTE_LOG(ERR, EAL, + "Cannot open directory %s to read system hugepage info\n", + sys_dir_path); + return -1; + } + + for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) { + struct hugepage_info *hpi; + + if (strncmp(dirent->d_name, dirent_start_text, + dirent_start_len) != 0) + continue; + + if (num_sizes >= MAX_HUGEPAGE_SIZES) + break; + + hpi = &internal_config.hugepage_info[num_sizes]; + hpi->hugepage_sz = + rte_str_to_size(&dirent->d_name[dirent_start_len]); + + /* first, check if we have a mountpoint */ + if (get_hugepage_dir(hpi->hugepage_sz, + hpi->hugedir, sizeof(hpi->hugedir)) < 0) { + uint32_t num_pages; + + num_pages = get_num_hugepages(dirent->d_name); + if (num_pages > 0) + RTE_LOG(NOTICE, EAL, + "%" PRIu32 " hugepages of size " + "%" PRIu64 " reserved, but no mounted " + "hugetlbfs found for that size\n", + num_pages, hpi->hugepage_sz); + /* if we have kernel support for reserving hugepages + * through mmap, and we're in in-memory mode, treat this + * page size as valid. we cannot be in legacy mode at + * this point because we've checked this earlier in the + * init process. + */ +#ifdef MAP_HUGE_SHIFT + if (internal_config.in_memory) { + RTE_LOG(DEBUG, EAL, "In-memory mode enabled, " + "hugepages of size %" PRIu64 " bytes " + "will be allocated anonymously\n", + hpi->hugepage_sz); + calc_num_pages(hpi, dirent); + num_sizes++; + } +#endif + continue; + } + + /* try to obtain a writelock */ + hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY); + + /* if blocking lock failed */ + if (flock(hpi->lock_descriptor, LOCK_EX) == -1) { + RTE_LOG(CRIT, EAL, + "Failed to lock hugepage directory!\n"); + break; + } + /* clear out the hugepages dir from unused pages */ + if (clear_hugedir(hpi->hugedir) == -1) + break; + + calc_num_pages(hpi, dirent); + + num_sizes++; + } + closedir(dir); + + /* something went wrong, and we broke from the for loop above */ + if (dirent != NULL) + return -1; + + internal_config.num_hugepage_sizes = num_sizes; + + /* sort the page directory entries by size, largest to smallest */ + qsort(&internal_config.hugepage_info[0], num_sizes, + sizeof(internal_config.hugepage_info[0]), compare_hpi); + + /* now we have all info, check we have at least one valid size */ + for (i = 0; i < num_sizes; i++) { + /* pages may no longer all be on socket 0, so check all */ + unsigned int j, num_pages = 0; + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + num_pages += hpi->num_pages[j]; + if (num_pages > 0) + return 0; + } + + /* no valid hugepage mounts available, return error */ + return -1; +} + +/* + * when we initialize the hugepage info, everything goes + * to socket 0 by default. it will later get sorted by memory + * initialization procedure. + */ +int +eal_hugepage_info_init(void) +{ + struct hugepage_info *hpi, *tmp_hpi; + unsigned int i; + + if (hugepage_info_init() < 0) + return -1; + + /* for no shared files mode, we're done */ + if (internal_config.no_shconf) + return 0; + + hpi = &internal_config.hugepage_info[0]; + + tmp_hpi = create_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + return -1; + } + + memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); + + /* we've copied file descriptors along with everything else, but they + * will be invalid in secondary process, so overwrite them + */ + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + struct hugepage_info *tmp = &tmp_hpi[i]; + tmp->lock_descriptor = -1; + } + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} + +int eal_hugepage_info_read(void) +{ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + + tmp_hpi = open_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); + return -1; + } + + memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_interrupts.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_interrupts.c new file mode 100644 index 000000000..16e7a7d51 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_interrupts.c @@ -0,0 +1,1521 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <pthread.h> +#include <sys/queue.h> +#include <stdarg.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <inttypes.h> +#include <sys/epoll.h> +#include <sys/signalfd.h> +#include <sys/ioctl.h> +#include <sys/eventfd.h> +#include <assert.h> +#include <stdbool.h> + +#include <rte_common.h> +#include <rte_interrupts.h> +#include <rte_memory.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_atomic.h> +#include <rte_branch_prediction.h> +#include <rte_debug.h> +#include <rte_log.h> +#include <rte_errno.h> +#include <rte_spinlock.h> +#include <rte_pause.h> +#include <rte_vfio.h> +#include <rte_eal_trace.h> + +#include "eal_private.h" +#include "eal_vfio.h" +#include "eal_thread.h" + +#define EAL_INTR_EPOLL_WAIT_FOREVER (-1) +#define NB_OTHER_INTR 1 + +static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */ + +/** + * union for pipe fds. + */ +union intr_pipefds{ + struct { + int pipefd[2]; + }; + struct { + int readfd; + int writefd; + }; +}; + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + int uio_intr_count; /* for uio device */ +#ifdef VFIO_PRESENT + uint64_t vfio_intr_count; /* for vfio device */ +#endif + uint64_t timerfd_num; /* for timerfd */ + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ + uint8_t pending_delete; /**< delete after callback is called */ + rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + +/* union buffer for pipe read/write */ +static union intr_pipefds intr_pipe; + +/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +/* VFIO interrupts */ +#ifdef VFIO_PRESENT + +#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int)) +/* irq set buffer length for queue interrupts and LSC interrupt */ +#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ + sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1)) + +/* enable legacy (INTx) interrupts */ +static int +vfio_enable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + /* enable INTx */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* unmask INTx after enabling */ + memset(irq_set, 0, len); + len = sizeof(struct vfio_irq_set); + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable legacy (INTx) interrupts */ +static int +vfio_disable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + /* mask interrupts before disabling */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* disable INTx*/ + memset(irq_set, 0, len); + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, + "Error disabling INTx interrupts for fd %d\n", intr_handle->fd); + return -1; + } + return 0; +} + +/* unmask/ack legacy (INTx) interrupts */ +static int +vfio_ack_intx(const struct rte_intr_handle *intr_handle) +{ + struct vfio_irq_set irq_set; + + /* unmask INTx */ + memset(&irq_set, 0, sizeof(irq_set)); + irq_set.argsz = sizeof(irq_set); + irq_set.count = 1; + irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set.index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set.start = 0; + + if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) { + RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* enable MSI interrupts */ +static int +vfio_enable_msi(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable MSI interrupts */ +static int +vfio_disable_msi(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI interrupts for fd %d\n", intr_handle->fd); + + return ret; +} + +/* enable MSI-X interrupts */ +static int +vfio_enable_msix(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */ + irq_set->count = intr_handle->max_intr ? + (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ? + RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + /* INTR vector offset 0 reserve for non-efds mapping */ + fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd; + memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds, + sizeof(*intr_handle->efds) * intr_handle->nb_efd); + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable MSI-X interrupts */ +static int +vfio_disable_msix(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd); + + return ret; +} + +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE +/* enable req notifier */ +static int +vfio_enable_req(const struct rte_intr_handle *intr_handle) +{ + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable req notifier */ +static int +vfio_disable_req(const struct rte_intr_handle *intr_handle) +{ + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n", + intr_handle->fd); + + return ret; +} +#endif +#endif + +static int +uio_intx_intr_disable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* disable interrupts */ + command_high |= 0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intx_intr_enable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* enable interrupts */ + command_high &= ~0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intr_disable(const struct rte_intr_handle *intr_handle) +{ + const int value = 0; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +static int +uio_intr_enable(const struct rte_intr_handle *intr_handle) +{ + const int value = 1; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +int +rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) +{ + int ret, wake_thread; + struct rte_intr_source *src; + struct rte_intr_callback *callback; + + wake_thread = 0; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + return -EINVAL; + } + + /* allocate a new interrupt callback entity */ + callback = calloc(1, sizeof(*callback)); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + return -ENOMEM; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + callback->pending_delete = 0; + callback->ucb_fn = NULL; + + rte_spinlock_lock(&intr_lock); + + /* check if there is at least one callback registered for the fd */ + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->intr_handle.fd == intr_handle->fd) { + /* we had no interrupts for this */ + if (TAILQ_EMPTY(&src->callbacks)) + wake_thread = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + ret = 0; + break; + } + } + + /* no existing callbacks for this - add new source */ + if (src == NULL) { + src = calloc(1, sizeof(*src)); + if (src == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + free(callback); + ret = -ENOMEM; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + wake_thread = 1; + ret = 0; + } + } + + rte_spinlock_unlock(&intr_lock); + + /** + * check if need to notify the pipe fd waited by epoll_wait to + * rebuild the wait list. + */ + if (wake_thread) + if (write(intr_pipe.writefd, "1", 1) < 0) + ret = -EPIPE; + + rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret); + return ret; +} + +int +rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg, + rte_intr_unregister_callback_fn ucb_fn) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* only usable if the source is active */ + } else if (src->active == 0) { + ret = -EAGAIN; + + } else { + ret = 0; + + /* walk through the callbacks and mark all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + cb->pending_delete = 1; + cb->ucb_fn = ucb_fn; + ret++; + } + } + } + + rte_spinlock_unlock(&intr_lock); + + return ret; +} + +int +rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + ret = 0; + + /*walk through the callbacks and remove all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } + + rte_spinlock_unlock(&intr_lock); + + /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ + if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { + ret = -EPIPE; + } + + rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg, + ret); + return ret; +} + +int +rte_intr_enable(const struct rte_intr_handle *intr_handle) +{ + int rc = 0; + + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) { + rc = 0; + goto out; + } + + if (!intr_handle || intr_handle->fd < 0 || + intr_handle->uio_cfg_fd < 0) { + rc = -1; + goto out; + } + + switch (intr_handle->type){ + /* write to the uio fd to enable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_enable(intr_handle)) + rc = -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_enable(intr_handle)) + rc = -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + rc = -1; + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_enable_msix(intr_handle)) + rc = -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_enable_msi(intr_handle)) + rc = -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_enable_intx(intr_handle)) + rc = -1; + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_enable_req(intr_handle)) + rc = -1; + break; +#endif +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + rc = -1; + break; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + rc = -1; + break; + } +out: + rte_eal_trace_intr_enable(intr_handle, rc); + return rc; +} + +/** + * PMD generally calls this function at the end of its IRQ callback. + * Internally, it unmasks the interrupt if possible. + * + * For INTx, unmasking is required as the interrupt is auto-masked prior to + * invoking callback. + * + * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not + * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI, + * this function is no-op. + */ +int +rte_intr_ack(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type) { + /* Both acking and enabling are same for UIO */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_enable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_enable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */ + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + return 0; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_ack_intx(intr_handle)) + return -1; + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + return -1; +#endif +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +int +rte_intr_disable(const struct rte_intr_handle *intr_handle) +{ + int rc = 0; + + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) { + rc = 0; + goto out; + } + + if (!intr_handle || intr_handle->fd < 0 || + intr_handle->uio_cfg_fd < 0) { + rc = -1; + goto out; + } + + switch (intr_handle->type){ + /* write to the uio fd to disable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_disable(intr_handle)) + rc = -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_disable(intr_handle)) + rc = -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + rc = -1; + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_disable_msix(intr_handle)) + rc = -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_disable_msi(intr_handle)) + rc = -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_disable_intx(intr_handle)) + rc = -1; + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_disable_req(intr_handle)) + rc = -1; + break; +#endif +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + rc = -1; + break; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + rc = -1; + break; + } +out: + rte_eal_trace_intr_disable(intr_handle, rc); + return rc; +} + +static int +eal_intr_process_interrupts(struct epoll_event *events, int nfds) +{ + bool call = false; + int n, bytes_read, rv; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + union rte_intr_read_buffer buf; + struct rte_intr_callback active_cb; + + for (n = 0; n < nfds; n++) { + + /** + * if the pipe fd is ready to read, return out to + * rebuild the wait list. + */ + if (events[n].data.fd == intr_pipe.readfd){ + int r = read(intr_pipe.readfd, buf.charbuf, + sizeof(buf.charbuf)); + RTE_SET_USED(r); + return -1; + } + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == + events[n].data.fd) + break; + if (src == NULL){ + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. */ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; + case RTE_INTR_HANDLE_ALARM: + bytes_read = sizeof(buf.timerfd_num); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + bytes_read = 0; + call = true; + break; +#endif +#endif + case RTE_INTR_HANDLE_VDEV: + case RTE_INTR_HANDLE_EXT: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_DEV_EVENT: + bytes_read = 0; + call = true; + break; + default: + bytes_read = 1; + break; + } + + if (bytes_read > 0) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(events[n].data.fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + events[n].data.fd, + strerror(errno)); + /* + * The device is unplugged or buggy, remove + * it as an interrupt source and return to + * force the wait list to be rebuilt. + */ + rte_spinlock_lock(&intr_lock); + TAILQ_REMOVE(&intr_sources, src, next); + rte_spinlock_unlock(&intr_lock); + + for (cb = TAILQ_FIRST(&src->callbacks); cb; + cb = next) { + next = TAILQ_NEXT(cb, next); + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + } + free(src); + return -1; + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", events[n].data.fd); + else + call = true; + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (call) { + + /* Finally, call all callbacks. */ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. */ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + /* we done with that interrupt source, release it. */ + src->active = 0; + + rv = 0; + + /* check if any callback are supposed to be removed */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->pending_delete) { + TAILQ_REMOVE(&src->callbacks, cb, next); + if (cb->ucb_fn) + cb->ucb_fn(&src->intr_handle, cb->cb_arg); + free(cb); + rv++; + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + + /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ + if (rv >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { + rte_spinlock_unlock(&intr_lock); + return -EPIPE; + } + + rte_spinlock_unlock(&intr_lock); + } + + return 0; +} + +/** + * It handles all the interrupts. + * + * @param pfd + * epoll file descriptor. + * @param totalfds + * The number of file descriptors added in epoll. + * + * @return + * void + */ +static void +eal_intr_handle_interrupts(int pfd, unsigned totalfds) +{ + struct epoll_event events[totalfds]; + int nfds = 0; + + for(;;) { + nfds = epoll_wait(pfd, events, totalfds, + EAL_INTR_EPOLL_WAIT_FOREVER); + /* epoll_wait fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "epoll_wait returns with fail\n"); + return; + } + /* epoll_wait timeout, will never happens here */ + else if (nfds == 0) + continue; + /* epoll_wait has at least one fd ready to read */ + if (eal_intr_process_interrupts(events, nfds) < 0) + return; + } +} + +/** + * It builds/rebuilds up the epoll file descriptor with all the + * file descriptors being waited on. Then handles the interrupts. + * + * @param arg + * pointer. (unused) + * + * @return + * never return; + */ +static __rte_noreturn void * +eal_intr_thread_main(__rte_unused void *arg) +{ + /* host thread, never break out */ + for (;;) { + /* build up the epoll fd with all descriptors we are to + * wait on then pass it to the handle_interrupts function + */ + static struct epoll_event pipe_event = { + .events = EPOLLIN | EPOLLPRI, + }; + struct rte_intr_source *src; + unsigned numfds = 0; + + /* create epoll fd */ + int pfd = epoll_create(1); + if (pfd < 0) + rte_panic("Cannot create epoll instance\n"); + + pipe_event.data.fd = intr_pipe.readfd; + /** + * add pipe fd into wait list, this pipe is used to + * rebuild the wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd, + &pipe_event) < 0) { + rte_panic("Error adding fd to %d epoll_ctl, %s\n", + intr_pipe.readfd, strerror(errno)); + } + numfds++; + + rte_spinlock_lock(&intr_lock); + + TAILQ_FOREACH(src, &intr_sources, next) { + struct epoll_event ev; + + if (src->callbacks.tqh_first == NULL) + continue; /* skip those with no callbacks */ + memset(&ev, 0, sizeof(ev)); + ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; + ev.data.fd = src->intr_handle.fd; + + /** + * add all the uio device file descriptor + * into wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, + src->intr_handle.fd, &ev) < 0){ + rte_panic("Error adding fd %d epoll_ctl, %s\n", + src->intr_handle.fd, strerror(errno)); + } + else + numfds++; + } + rte_spinlock_unlock(&intr_lock); + /* serve the interrupt */ + eal_intr_handle_interrupts(pfd, numfds); + + /** + * when we return, we need to rebuild the + * list of fds to monitor. + */ + close(pfd); + } +} + +int +rte_eal_intr_init(void) +{ + int ret = 0; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + /** + * create a pipe which will be waited by epoll and notified to + * rebuild the wait list of epoll. + */ + if (pipe(intr_pipe.pipefd) < 0) { + rte_errno = errno; + return -1; + } + + /* create the host thread to wait/handle the interrupt */ + ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + rte_errno = -ret; + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } + + return ret; +} + +static void +eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle) +{ + union rte_intr_read_buffer buf; + int bytes_read = 0; + int nbytes; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#endif + case RTE_INTR_HANDLE_VDEV: + bytes_read = intr_handle->efd_counter_size; + /* For vdev, number of bytes to read is set by driver */ + break; + case RTE_INTR_HANDLE_EXT: + return; + default: + bytes_read = 1; + RTE_LOG(INFO, EAL, "unexpected intr type\n"); + break; + } + + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + if (bytes_read == 0) + return; + do { + nbytes = read(fd, &buf, bytes_read); + if (nbytes < 0) { + if (errno == EINTR || errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + RTE_LOG(ERR, EAL, + "Error reading from fd %d: %s\n", + fd, strerror(errno)); + } else if (nbytes == 0) + RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd); + return; + } while (1); +} + +static int +eal_epoll_process_event(struct epoll_event *evs, unsigned int n, + struct rte_epoll_event *events) +{ + unsigned int i, count = 0; + struct rte_epoll_event *rev; + + for (i = 0; i < n; i++) { + rev = evs[i].data.ptr; + if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID, + RTE_EPOLL_EXEC)) + continue; + + events[count].status = RTE_EPOLL_VALID; + events[count].fd = rev->fd; + events[count].epfd = rev->epfd; + events[count].epdata.event = rev->epdata.event; + events[count].epdata.data = rev->epdata.data; + if (rev->epdata.cb_fun) + rev->epdata.cb_fun(rev->fd, + rev->epdata.cb_arg); + + rte_compiler_barrier(); + rev->status = RTE_EPOLL_VALID; + count++; + } + return count; +} + +static inline int +eal_init_tls_epfd(void) +{ + int pfd = epoll_create(255); + + if (pfd < 0) { + RTE_LOG(ERR, EAL, + "Cannot create epoll instance\n"); + return -1; + } + return pfd; +} + +int +rte_intr_tls_epfd(void) +{ + if (RTE_PER_LCORE(_epfd) == -1) + RTE_PER_LCORE(_epfd) = eal_init_tls_epfd(); + + return RTE_PER_LCORE(_epfd); +} + +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout) +{ + struct epoll_event evs[maxevents]; + int rc; + + if (!events) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + while (1) { + rc = epoll_wait(epfd, evs, maxevents, timeout); + if (likely(rc > 0)) { + /* epoll_wait has at least one fd ready to read */ + rc = eal_epoll_process_event(evs, rc, events); + break; + } else if (rc < 0) { + if (errno == EINTR) + continue; + /* epoll_wait fail */ + RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n", + strerror(errno)); + rc = -1; + break; + } else { + /* rc == 0, epoll_wait timed out */ + break; + } + } + + return rc; +} + +static inline void +eal_epoll_data_safe_free(struct rte_epoll_event *ev) +{ + while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID, + RTE_EPOLL_INVALID)) + while (ev->status != RTE_EPOLL_VALID) + rte_pause(); + memset(&ev->epdata, 0, sizeof(ev->epdata)); + ev->fd = -1; + ev->epfd = -1; +} + +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event) +{ + struct epoll_event ev; + + if (!event) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + if (op == EPOLL_CTL_ADD) { + event->status = RTE_EPOLL_VALID; + event->fd = fd; /* ignore fd in event */ + event->epfd = epfd; + ev.data.ptr = (void *)event; + } + + ev.events = event->epdata.event; + if (epoll_ctl(epfd, op, fd, &ev) < 0) { + RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n", + op, fd, strerror(errno)); + if (op == EPOLL_CTL_ADD) + /* rollback status when CTL_ADD fail */ + event->status = RTE_EPOLL_INVALID; + return -1; + } + + if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID) + eal_epoll_data_safe_free(event); + + return 0; +} + +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd, + int op, unsigned int vec, void *data) +{ + struct rte_epoll_event *rev; + struct rte_epoll_data *epdata; + int epfd_op; + unsigned int efd_idx; + int rc = 0; + + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? + (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + + if (!intr_handle || intr_handle->nb_efd == 0 || + efd_idx >= intr_handle->nb_efd) { + RTE_LOG(ERR, EAL, "Wrong intr vector number.\n"); + return -EPERM; + } + + switch (op) { + case RTE_INTR_EVENT_ADD: + epfd_op = EPOLL_CTL_ADD; + rev = &intr_handle->elist[efd_idx]; + if (rev->status != RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event already been added.\n"); + return -EEXIST; + } + + /* attach to intr vector fd */ + epdata = &rev->epdata; + epdata->event = EPOLLIN | EPOLLPRI | EPOLLET; + epdata->data = data; + epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr; + epdata->cb_arg = (void *)intr_handle; + rc = rte_epoll_ctl(epfd, epfd_op, + intr_handle->efds[efd_idx], rev); + if (!rc) + RTE_LOG(DEBUG, EAL, + "efd %d associated with vec %d added on epfd %d" + "\n", rev->fd, vec, epfd); + else + rc = -EPERM; + break; + case RTE_INTR_EVENT_DEL: + epfd_op = EPOLL_CTL_DEL; + rev = &intr_handle->elist[efd_idx]; + if (rev->status == RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event does not exist.\n"); + return -EPERM; + } + + rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev); + if (rc) + rc = -EPERM; + break; + default: + RTE_LOG(ERR, EAL, "event op type mismatch\n"); + rc = -EPERM; + } + + return rc; +} + +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + struct rte_epoll_event *rev; + + for (i = 0; i < intr_handle->nb_efd; i++) { + rev = &intr_handle->elist[i]; + if (rev->status == RTE_EPOLL_INVALID) + continue; + if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) { + /* force free if the entry valid */ + eal_epoll_data_safe_free(rev); + rev->status = RTE_EPOLL_INVALID; + } + } +} + +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) +{ + uint32_t i; + int fd; + uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + + assert(nb_efd != 0); + + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) { + for (i = 0; i < n; i++) { + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fd < 0) { + RTE_LOG(ERR, EAL, + "can't setup eventfd, error %i (%s)\n", + errno, strerror(errno)); + return -errno; + } + intr_handle->efds[i] = fd; + } + intr_handle->nb_efd = n; + intr_handle->max_intr = NB_OTHER_INTR + n; + } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) { + /* only check, initialization would be done in vdev driver.*/ + if (intr_handle->efd_counter_size > + sizeof(union rte_intr_read_buffer)) { + RTE_LOG(ERR, EAL, "the efd_counter_size is oversized"); + return -EINVAL; + } + } else { + intr_handle->efds[0] = intr_handle->fd; + intr_handle->nb_efd = RTE_MIN(nb_efd, 1U); + intr_handle->max_intr = NB_OTHER_INTR; + } + + return 0; +} + +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + + rte_intr_free_epoll_fd(intr_handle); + if (intr_handle->max_intr > intr_handle->nb_efd) { + for (i = 0; i < intr_handle->nb_efd; i++) + close(intr_handle->efds[i]); + } + intr_handle->nb_efd = 0; + intr_handle->max_intr = 0; +} + +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) +{ + return !(!intr_handle->nb_efd); +} + +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle) +{ + if (!rte_intr_dp_is_en(intr_handle)) + return 1; + else + return !!(intr_handle->max_intr - intr_handle->nb_efd); +} + +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) +{ + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) + return 1; + + if (intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 1; + + return 0; +} + +int rte_thread_is_intr(void) +{ + return pthread_equal(intr_thread, pthread_self()); +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_lcore.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_lcore.c new file mode 100644 index 000000000..bc8965844 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_lcore.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <unistd.h> +#include <limits.h> +#include <string.h> +#include <dirent.h> + +#include <rte_log.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_filesystem.h" +#include "eal_thread.h" + +#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u" +#define CORE_ID_FILE "topology/core_id" +#define NUMA_NODE_PATH "/sys/devices/system/node" + +/* Check if a cpu is present by the presence of the cpu information for it */ +int +eal_cpu_detected(unsigned lcore_id) +{ + char path[PATH_MAX]; + int len = snprintf(path, sizeof(path), SYS_CPU_DIR + "/"CORE_ID_FILE, lcore_id); + if (len <= 0 || (unsigned)len >= sizeof(path)) + return 0; + if (access(path, F_OK) != 0) + return 0; + + return 1; +} + +/* + * Get CPU socket id (NUMA node) for a logical core. + * + * This searches each nodeX directories in /sys for the symlink for the given + * lcore_id and returns the numa node where the lcore is found. If lcore is not + * found on any numa node, returns zero. + */ +unsigned +eal_cpu_socket_id(unsigned lcore_id) +{ + unsigned socket; + + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH, + socket, lcore_id); + if (access(path, F_OK) == 0) + return socket; + } + return 0; +} + +/* Get the cpu core id value from the /sys/.../cpuX core_id value */ +unsigned +eal_cpu_core_id(unsigned lcore_id) +{ + char path[PATH_MAX]; + unsigned long id; + + int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE); + if (len <= 0 || (unsigned)len >= sizeof(path)) + goto err; + if (eal_parse_sysfs_value(path, &id) != 0) + goto err; + return (unsigned)id; + +err: + RTE_LOG(ERR, EAL, "Error reading core id value from %s " + "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id); + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_log.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_log.c new file mode 100644 index 000000000..43c8460bf --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_log.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <string.h> +#include <stdio.h> +#include <stdint.h> +#include <sys/types.h> +#include <syslog.h> +#include <sys/queue.h> + +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_spinlock.h> +#include <rte_log.h> + +#include "eal_private.h" + +/* + * default log function + */ +static ssize_t +console_log_write(__rte_unused void *c, const char *buf, size_t size) +{ + ssize_t ret; + + /* write on stdout */ + ret = fwrite(buf, 1, size, stdout); + fflush(stdout); + + /* Syslog error levels are from 0 to 7, so subtract 1 to convert */ + syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf); + + return ret; +} + +static cookie_io_functions_t console_log_func = { + .write = console_log_write, +}; + +/* + * set the log to default function, called during eal init process, + * once memzones are available. + */ +int +rte_eal_log_init(const char *id, int facility) +{ + FILE *log_stream; + + log_stream = fopencookie(NULL, "w+", console_log_func); + if (log_stream == NULL) + return -1; + + openlog(id, LOG_NDELAY | LOG_PID, facility); + + eal_log_set_default(log_stream); + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_memalloc.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_memalloc.c new file mode 100644 index 000000000..2c717f8bd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_memalloc.c @@ -0,0 +1,1607 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include <errno.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/queue.h> +#include <sys/file.h> +#include <unistd.h> +#include <limits.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <signal.h> +#include <setjmp.h> +#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ +#include <linux/memfd.h> +#define MEMFD_SUPPORTED +#endif +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +#include <numa.h> +#include <numaif.h> +#endif +#include <linux/falloc.h> +#include <linux/mman.h> /* for hugetlb-related mmap flags */ + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_memory.h> +#include <rte_spinlock.h> + +#include "eal_filesystem.h" +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "eal_memcfg.h" +#include "eal_private.h" + +const int anonymous_hugepages_supported = +#ifdef MAP_HUGE_SHIFT + 1; +#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT +#else + 0; +#define RTE_MAP_HUGE_SHIFT 26 +#endif + +/* + * we've already checked memfd support at compile-time, but we also need to + * check if we can create hugepage files with memfd. + * + * also, this is not a constant, because while we may be *compiled* with memfd + * hugetlbfs support, we might not be *running* on a system that supports memfd + * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at + * runtime, and fall back to anonymous memory. + */ +static int memfd_create_supported = +#ifdef MFD_HUGETLB + 1; +#define RTE_MFD_HUGETLB MFD_HUGETLB +#else + 0; +#define RTE_MFD_HUGETLB 4U +#endif + +/* + * not all kernel version support fallocate on hugetlbfs, so fall back to + * ftruncate and disallow deallocation if fallocate is not supported. + */ +static int fallocate_supported = -1; /* unknown */ + +/* + * we have two modes - single file segments, and file-per-page mode. + * + * for single-file segments, we use memseg_list_fd to store the segment fd, + * while the fds[] will not be allocated, and len will be set to 0. + * + * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd' + * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's. + * + * we cannot know how many pages a system will have in advance, but we do know + * that they come in lists, and we know lengths of these lists. so, simply store + * a malloc'd array of fd's indexed by list and segment index. + * + * they will be initialized at startup, and filled as we allocate/deallocate + * segments. + */ +static struct { + int *fds; /**< dynamically allocated array of segment lock fd's */ + int memseg_list_fd; /**< memseg list fd */ + int len; /**< total length of the array */ + int count; /**< entries used in an array */ +} fd_list[RTE_MAX_MEMSEG_LISTS]; + +/** local copy of a memory map, used to synchronize memory hotplug in MP */ +static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; + +static sigjmp_buf huge_jmpenv; + +static void __rte_unused huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. + */ +static int __rte_unused huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void __rte_unused +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void __rte_unused +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +static bool +check_numa(void) +{ + bool ret = true; + /* Check if kernel supports NUMA. */ + if (numa_available() != 0) { + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); + ret = false; + } + return ret; +} + +static void +prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id) +{ + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + if (get_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + RTE_LOG(ERR, EAL, + "Failed to get current mempolicy: %s. " + "Assuming MPOL_DEFAULT.\n", strerror(errno)); + *oldpolicy = MPOL_DEFAULT; + } + RTE_LOG(DEBUG, EAL, + "Setting policy MPOL_PREFERRED for socket %d\n", + socket_id); + numa_set_preferred(socket_id); +} + +static void +restore_numa(int *oldpolicy, struct bitmask *oldmask) +{ + RTE_LOG(DEBUG, EAL, + "Restoring previous memory policy: %d\n", *oldpolicy); + if (*oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else if (set_mempolicy(*oldpolicy, oldmask->maskp, + oldmask->size + 1) < 0) { + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", + strerror(errno)); + numa_set_localalloc(); + } + numa_free_cpumask(oldmask); +} +#endif + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +get_file_size(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +static int +pagesz_flags(uint64_t page_sz) +{ + /* as per mmap() manpage, all page sizes are log2 of page size + * shifted by MAP_HUGE_SHIFT + */ + int log2 = rte_log2_u64(page_sz); + return log2 << RTE_MAP_HUGE_SHIFT; +} + +/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ +static int lock(int fd, int type) +{ + int ret; + + /* flock may be interrupted */ + do { + ret = flock(fd, type | LOCK_NB); + } while (ret && errno == EINTR); + + if (ret && errno == EWOULDBLOCK) { + /* couldn't lock */ + return 0; + } else if (ret) { + RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n", + __func__, strerror(errno)); + return -1; + } + /* lock was successful */ + return 1; +} + +static int +get_seg_memfd(struct hugepage_info *hi __rte_unused, + unsigned int list_idx __rte_unused, + unsigned int seg_idx __rte_unused) +{ +#ifdef MEMFD_SUPPORTED + int fd; + char segname[250]; /* as per manpage, limit is 249 bytes plus null */ + + int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); + + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + + if (fd < 0) { + snprintf(segname, sizeof(segname), "seg_%i", list_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].memseg_list_fd = fd; + } + } else { + fd = fd_list[list_idx].fds[seg_idx]; + + if (fd < 0) { + snprintf(segname, sizeof(segname), "seg_%i-%i", + list_idx, seg_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; + } + } + return fd; +#endif + return -1; +} + +static int +get_seg_fd(char *path, int buflen, struct hugepage_info *hi, + unsigned int list_idx, unsigned int seg_idx) +{ + int fd; + + /* for in-memory mode, we only make it here when we're sure we support + * memfd, and this is a special case. + */ + if (internal_config.in_memory) + return get_seg_memfd(hi, list_idx, seg_idx); + + if (internal_config.single_file_segments) { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); + + fd = fd_list[list_idx].memseg_list_fd; + + if (fd < 0) { + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock and keep it indefinitely */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + fd_list[list_idx].memseg_list_fd = fd; + } + } else { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + + fd = fd_list[list_idx].fds[seg_idx]; + + if (fd < 0) { + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; + } + } + return fd; +} + +static int +resize_hugefile_in_memory(int fd, uint64_t fa_offset, + uint64_t page_sz, bool grow) +{ + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret; + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + return 0; +} + +static int +resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz, + bool grow) +{ + bool again = false; + + do { + if (fallocate_supported == 0) { + /* we cannot deallocate memory if fallocate() is not + * supported, and hugepage file is already locked at + * creation, so no further synchronization needed. + */ + + if (!grow) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n", + __func__); + return -1; + } + uint64_t new_size = fa_offset + page_sz; + uint64_t cur_size = get_file_size(fd); + + /* fallocate isn't supported, fall back to ftruncate */ + if (new_size > cur_size && + ftruncate(fd, new_size) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + return -1; + } + } else { + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret; + + /* + * technically, it is perfectly safe for both primary + * and secondary to grow and shrink the page files: + * growing the file repeatedly has no effect because + * a page can only be allocated once, while mmap ensures + * that secondaries hold on to the page even after the + * page itself is removed from the filesystem. + * + * however, leaving growing/shrinking to the primary + * tends to expose bugs in fdlist page count handling, + * so leave this here just in case. + */ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + if (fallocate_supported == -1 && + errno == ENOTSUP) { + RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n", + __func__); + again = true; + fallocate_supported = 0; + } else { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + } else + fallocate_supported = 1; + } + } while (again); + + return 0; +} + +static void +close_hugefile(int fd, char *path, int list_idx) +{ + /* + * primary process must unlink the file, but only when not in in-memory + * mode (as in that case there is no file to unlink). + */ + if (!internal_config.in_memory && + rte_eal_process_type() == RTE_PROC_PRIMARY && + unlink(path)) + RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n", + __func__, path, strerror(errno)); + + close(fd); + fd_list[list_idx].memseg_list_fd = -1; +} + +static int +resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow) +{ + /* in-memory mode is a special case, because we can be sure that + * fallocate() is supported. + */ + if (internal_config.in_memory) + return resize_hugefile_in_memory(fd, fa_offset, + page_sz, grow); + + return resize_hugefile_in_filesystem(fd, fa_offset, page_sz, + grow); +} + +static int +alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, + struct hugepage_info *hi, unsigned int list_idx, + unsigned int seg_idx) +{ +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + int cur_socket_id = 0; +#endif + uint64_t map_offset; + rte_iova_t iova; + void *va; + char path[PATH_MAX]; + int ret = 0; + int fd; + size_t alloc_sz; + int flags; + void *new_addr; + + alloc_sz = hi->hugepage_sz; + + /* these are checked at init, but code analyzers don't know that */ + if (internal_config.in_memory && !anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n"); + return -1; + } + if (internal_config.in_memory && !memfd_create_supported && + internal_config.single_file_segments) { + RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n"); + return -1; + } + + /* in-memory without memfd is a special case */ + int mmap_flags; + + if (internal_config.in_memory && !memfd_create_supported) { + const int in_memory_flags = MAP_HUGETLB | MAP_FIXED | + MAP_PRIVATE | MAP_ANONYMOUS; + int pagesz_flag; + + pagesz_flag = pagesz_flags(alloc_sz); + fd = -1; + mmap_flags = in_memory_flags | pagesz_flag; + + /* single-file segments codepath will never be active + * here because in-memory mode is incompatible with the + * fallback path, and it's stopped at EAL initialization + * stage. + */ + map_offset = 0; + } else { + /* takes out a read lock on segment or segment list */ + fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n"); + return -1; + } + + if (internal_config.single_file_segments) { + map_offset = seg_idx * alloc_sz; + ret = resize_hugefile(fd, map_offset, alloc_sz, true); + if (ret < 0) + goto resized; + + fd_list[list_idx].count++; + } else { + map_offset = 0; + if (ftruncate(fd, alloc_sz) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + if (internal_config.hugepage_unlink && + !internal_config.in_memory) { + if (unlink(path)) { + RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + } + } + mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED; + } + + /* + * map the segment, and populate page tables, the kernel fills + * this segment with zeros if it's a new page. + */ + va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd, + map_offset); + + if (va == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, + strerror(errno)); + /* mmap failed, but the previous region might have been + * unmapped anyway. try to remap it + */ + goto unmapped; + } + if (va != addr) { + RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__); + munmap(va, alloc_sz); + goto resized; + } + + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n", + (unsigned int)(alloc_sz >> 20)); + goto mapped; + } + + /* we need to trigger a write to the page to enforce page fault and + * ensure that page is accessible to us, but we can't overwrite value + * that is already there, so read the old value, and write itback. + * kernel populates the page with zeroes initially. + */ + *(volatile int *)addr = *(volatile int *)addr; + + iova = rte_mem_virt2iova(addr); + if (iova == RTE_BAD_PHYS_ADDR) { + RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n", + __func__); + goto mapped; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + ret = get_mempolicy(&cur_socket_id, NULL, 0, addr, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n", + __func__, strerror(errno)); + goto mapped; + } else if (cur_socket_id != socket_id) { + RTE_LOG(DEBUG, EAL, + "%s(): allocation happened on wrong socket (wanted %d, got %d)\n", + __func__, socket_id, cur_socket_id); + goto mapped; + } +#else + if (rte_socket_count() > 1) + RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n", + __func__); +#endif + + ms->addr = addr; + ms->hugepage_sz = alloc_sz; + ms->len = alloc_sz; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + ms->iova = iova; + ms->socket_id = socket_id; + + return 0; + +mapped: + munmap(addr, alloc_sz); +unmapped: + flags = MAP_FIXED; + new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags); + if (new_addr != addr) { + if (new_addr != NULL) + munmap(new_addr, alloc_sz); + /* we're leaving a hole in our virtual address space. if + * somebody else maps this hole now, we could accidentally + * override it in the future. + */ + RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n"); + } + /* roll back the ref count */ + if (internal_config.single_file_segments) + fd_list[list_idx].count--; +resized: + /* some codepaths will return negative fd, so exit early */ + if (fd < 0) + return -1; + + if (internal_config.single_file_segments) { + resize_hugefile(fd, map_offset, alloc_sz, false); + /* ignore failure, can't make it any worse */ + + /* if refcount is at zero, close the file */ + if (fd_list[list_idx].count == 0) + close_hugefile(fd, path, list_idx); + } else { + /* only remove file if we can take out a write lock */ + if (internal_config.hugepage_unlink == 0 && + internal_config.in_memory == 0 && + lock(fd, LOCK_EX) == 1) + unlink(path); + close(fd); + fd_list[list_idx].fds[seg_idx] = -1; + } + return -1; +} + +static int +free_seg(struct rte_memseg *ms, struct hugepage_info *hi, + unsigned int list_idx, unsigned int seg_idx) +{ + uint64_t map_offset; + char path[PATH_MAX]; + int fd, ret = 0; + bool exit_early; + + /* erase page data */ + memset(ms->addr, 0, ms->len); + + if (mmap(ms->addr, ms->len, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == + MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "couldn't unmap page\n"); + return -1; + } + + if (madvise(ms->addr, ms->len, MADV_DONTDUMP) != 0) + RTE_LOG(DEBUG, EAL, "madvise failed: %s\n", strerror(errno)); + + exit_early = false; + + /* if we're using anonymous hugepages, nothing to be done */ + if (internal_config.in_memory && !memfd_create_supported) + exit_early = true; + + /* if we've already unlinked the page, nothing needs to be done */ + if (!internal_config.in_memory && internal_config.hugepage_unlink) + exit_early = true; + + if (exit_early) { + memset(ms, 0, sizeof(*ms)); + return 0; + } + + /* if we are not in single file segments mode, we're going to unmap the + * segment and thus drop the lock on original fd, but hugepage dir is + * now locked so we can take out another one without races. + */ + fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); + if (fd < 0) + return -1; + + if (internal_config.single_file_segments) { + map_offset = seg_idx * ms->len; + if (resize_hugefile(fd, map_offset, ms->len, false)) + return -1; + + if (--(fd_list[list_idx].count) == 0) + close_hugefile(fd, path, list_idx); + + ret = 0; + } else { + /* if we're able to take out a write lock, we're the last one + * holding onto this page. + */ + if (!internal_config.in_memory) { + ret = lock(fd, LOCK_EX); + if (ret >= 0) { + /* no one else is using this page */ + if (ret == 1) + unlink(path); + } + } + /* closing fd will drop the lock */ + close(fd); + fd_list[list_idx].fds[seg_idx] = -1; + } + + memset(ms, 0, sizeof(*ms)); + + return ret < 0 ? -1 : 0; +} + +struct alloc_walk_param { + struct hugepage_info *hi; + struct rte_memseg **ms; + size_t page_sz; + unsigned int segs_allocated; + unsigned int n_segs; + int socket; + bool exact; +}; +static int +alloc_seg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct alloc_walk_param *wa = arg; + struct rte_memseg_list *cur_msl; + size_t page_sz; + int cur_idx, start_idx, j, dir_fd = -1; + unsigned int msl_idx, need, i; + + if (msl->page_sz != wa->page_sz) + return 0; + if (msl->socket_id != wa->socket) + return 0; + + page_sz = (size_t)msl->page_sz; + + msl_idx = msl - mcfg->memsegs; + cur_msl = &mcfg->memsegs[msl_idx]; + + need = wa->n_segs; + + /* try finding space in memseg list */ + if (wa->exact) { + /* if we require exact number of pages in a list, find them */ + cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, + need); + if (cur_idx < 0) + return 0; + start_idx = cur_idx; + } else { + int cur_len; + + /* we don't require exact number of pages, so we're going to go + * for best-effort allocation. that means finding the biggest + * unused block, and going with that. + */ + cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr, + 0); + if (cur_idx < 0) + return 0; + start_idx = cur_idx; + /* adjust the size to possibly be smaller than original + * request, but do not allow it to be bigger. + */ + cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr, + cur_idx); + need = RTE_MIN(need, (unsigned int)cur_len); + } + + /* do not allow any page allocations during the time we're allocating, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + * + * during init, we already hold a write lock, so don't try to take out + * another one. + */ + if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { + dir_fd = open(wa->hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + } + + for (i = 0; i < need; i++, cur_idx++) { + struct rte_memseg *cur; + void *map_addr; + + cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx); + map_addr = RTE_PTR_ADD(cur_msl->base_va, + cur_idx * page_sz); + + if (alloc_seg(cur, map_addr, wa->socket, wa->hi, + msl_idx, cur_idx)) { + RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n", + need, i); + + /* if exact number wasn't requested, stop */ + if (!wa->exact) + goto out; + + /* clean up */ + for (j = start_idx; j < cur_idx; j++) { + struct rte_memseg *tmp; + struct rte_fbarray *arr = + &cur_msl->memseg_arr; + + tmp = rte_fbarray_get(arr, j); + rte_fbarray_set_free(arr, j); + + /* free_seg may attempt to create a file, which + * may fail. + */ + if (free_seg(tmp, wa->hi, msl_idx, j)) + RTE_LOG(DEBUG, EAL, "Cannot free page\n"); + } + /* clear the list */ + if (wa->ms) + memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs); + + if (dir_fd >= 0) + close(dir_fd); + return -1; + } + if (wa->ms) + wa->ms[i] = cur; + + rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx); + } +out: + wa->segs_allocated = i; + if (i > 0) + cur_msl->version++; + if (dir_fd >= 0) + close(dir_fd); + /* if we didn't allocate any segments, move on to the next list */ + return i > 0; +} + +struct free_walk_param { + struct hugepage_info *hi; + struct rte_memseg *ms; +}; +static int +free_seg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *found_msl; + struct free_walk_param *wa = arg; + uintptr_t start_addr, end_addr; + int msl_idx, seg_idx, ret, dir_fd = -1; + + start_addr = (uintptr_t) msl->base_va; + end_addr = start_addr + msl->len; + + if ((uintptr_t)wa->ms->addr < start_addr || + (uintptr_t)wa->ms->addr >= end_addr) + return 0; + + msl_idx = msl - mcfg->memsegs; + seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz; + + /* msl is const */ + found_msl = &mcfg->memsegs[msl_idx]; + + /* do not allow any page allocations during the time we're freeing, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + * + * during init, we already hold a write lock, so don't try to take out + * another one. + */ + if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { + dir_fd = open(wa->hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + } + + found_msl->version++; + + rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx); + + ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx); + + if (dir_fd >= 0) + close(dir_fd); + + if (ret < 0) + return -1; + + return 1; +} + +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, + int socket, bool exact) +{ + int i, ret = -1; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + bool have_numa = false; + int oldpolicy; + struct bitmask *oldmask; +#endif + struct alloc_walk_param wa; + struct hugepage_info *hi = NULL; + + memset(&wa, 0, sizeof(wa)); + + /* dynamic allocation not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) { + if (page_sz == + internal_config.hugepage_info[i].hugepage_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n", + __func__); + return -1; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (check_numa()) { + oldmask = numa_allocate_nodemask(); + prepare_numa(&oldpolicy, oldmask, socket); + have_numa = true; + } +#endif + + wa.exact = exact; + wa.hi = hi; + wa.ms = ms; + wa.n_segs = n_segs; + wa.page_sz = page_sz; + wa.socket = socket; + wa.segs_allocated = 0; + + /* memalloc is locked, so it's safe to use thread-unsafe version */ + ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); + if (ret == 0) { + RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n", + __func__); + ret = -1; + } else if (ret > 0) { + ret = (int)wa.segs_allocated; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (have_numa) + restore_numa(&oldpolicy, oldmask); +#endif + return ret; +} + +struct rte_memseg * +eal_memalloc_alloc_seg(size_t page_sz, int socket) +{ + struct rte_memseg *ms; + if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0) + return NULL; + /* return pointer to newly allocated memseg */ + return ms; +} + +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) +{ + int seg, ret = 0; + + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (seg = 0; seg < n_segs; seg++) { + struct rte_memseg *cur = ms[seg]; + struct hugepage_info *hi = NULL; + struct free_walk_param wa; + int i, walk_res; + + /* if this page is marked as unfreeable, fail */ + if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n"); + ret = -1; + continue; + } + + memset(&wa, 0, sizeof(wa)); + + for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info); + i++) { + hi = &internal_config.hugepage_info[i]; + if (cur->hugepage_sz == hi->hugepage_sz) + break; + } + if (i == (int)RTE_DIM(internal_config.hugepage_info)) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + ret = -1; + continue; + } + + wa.ms = cur; + wa.hi = hi; + + /* memalloc is locked, so it's safe to use thread-unsafe version + */ + walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk, + &wa); + if (walk_res == 1) + continue; + if (walk_res == 0) + RTE_LOG(ERR, EAL, "Couldn't find memseg list\n"); + ret = -1; + } + return ret; +} + +int +eal_memalloc_free_seg(struct rte_memseg *ms) +{ + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + return eal_memalloc_free_seg_bulk(&ms, 1); +} + +static int +sync_chunk(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used, int start, int end) +{ + struct rte_fbarray *l_arr, *p_arr; + int i, ret, chunk_len, diff_len; + + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + /* we need to aggregate allocations/deallocations into bigger chunks, + * as we don't want to spam the user with per-page callbacks. + * + * to avoid any potential issues, we also want to trigger + * deallocation callbacks *before* we actually deallocate + * memory, so that the user application could wrap up its use + * before it goes away. + */ + + chunk_len = end - start; + + /* find how many contiguous pages we can map/unmap for this chunk */ + diff_len = used ? + rte_fbarray_find_contig_free(l_arr, start) : + rte_fbarray_find_contig_used(l_arr, start); + + /* has to be at least one page */ + if (diff_len < 1) + return -1; + + diff_len = RTE_MIN(chunk_len, diff_len); + + /* if we are freeing memory, notify the application */ + if (!used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + start_va, len); + } + + for (i = 0; i < diff_len; i++) { + struct rte_memseg *p_ms, *l_ms; + int seg_idx = start + i; + + l_ms = rte_fbarray_get(l_arr, seg_idx); + p_ms = rte_fbarray_get(p_arr, seg_idx); + + if (l_ms == NULL || p_ms == NULL) + return -1; + + if (used) { + ret = alloc_seg(l_ms, p_ms->addr, + p_ms->socket_id, hi, + msl_idx, seg_idx); + if (ret < 0) + return -1; + rte_fbarray_set_used(l_arr, seg_idx); + } else { + ret = free_seg(l_ms, hi, msl_idx, seg_idx); + rte_fbarray_set_free(l_arr, seg_idx); + if (ret < 0) + return -1; + } + } + + /* if we just allocated memory, notify the application */ + if (used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + start_va, len); + } + + /* calculate how much we can advance until next chunk */ + diff_len = used ? + rte_fbarray_find_contig_used(l_arr, start) : + rte_fbarray_find_contig_free(l_arr, start); + ret = RTE_MIN(chunk_len, diff_len); + + return ret; +} + +static int +sync_status(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used) +{ + struct rte_fbarray *l_arr, *p_arr; + int p_idx, l_chunk_len, p_chunk_len, ret; + int start, end; + + /* this is a little bit tricky, but the basic idea is - walk both lists + * and spot any places where there are discrepancies. walking both lists + * and noting discrepancies in a single go is a hard problem, so we do + * it in two passes - first we spot any places where allocated segments + * mismatch (i.e. ensure that everything that's allocated in the primary + * is also allocated in the secondary), and then we do it by looking at + * free segments instead. + * + * we also need to aggregate changes into chunks, as we have to call + * callbacks per allocation, not per page. + */ + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + if (used) + p_idx = rte_fbarray_find_next_used(p_arr, 0); + else + p_idx = rte_fbarray_find_next_free(p_arr, 0); + + while (p_idx >= 0) { + int next_chunk_search_idx; + + if (used) { + p_chunk_len = rte_fbarray_find_contig_used(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_used(l_arr, + p_idx); + } else { + p_chunk_len = rte_fbarray_find_contig_free(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_free(l_arr, + p_idx); + } + /* best case scenario - no differences (or bigger, which will be + * fixed during next iteration), look for next chunk + */ + if (l_chunk_len >= p_chunk_len) { + next_chunk_search_idx = p_idx + p_chunk_len; + goto next_chunk; + } + + /* if both chunks start at the same point, skip parts we know + * are identical, and sync the rest. each call to sync_chunk + * will only sync contiguous segments, so we need to call this + * until we are sure there are no more differences in this + * chunk. + */ + start = p_idx + l_chunk_len; + end = p_idx + p_chunk_len; + do { + ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, + used, start, end); + start += ret; + } while (start < end && ret >= 0); + /* if ret is negative, something went wrong */ + if (ret < 0) + return -1; + + next_chunk_search_idx = p_idx + p_chunk_len; +next_chunk: + /* skip to end of this chunk */ + if (used) { + p_idx = rte_fbarray_find_next_used(p_arr, + next_chunk_search_idx); + } else { + p_idx = rte_fbarray_find_next_free(p_arr, + next_chunk_search_idx); + } + } + return 0; +} + +static int +sync_existing(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx) +{ + int ret, dir_fd; + + /* do not allow any page allocations during the time we're allocating, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + */ + dir_fd = open(hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + + /* ensure all allocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); + if (ret < 0) + goto fail; + + /* ensure all unallocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); + if (ret < 0) + goto fail; + + /* update version number */ + local_msl->version = primary_msl->version; + + close(dir_fd); + + return 0; +fail: + close(dir_fd); + return -1; +} + +static int +sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + struct hugepage_info *hi = NULL; + unsigned int i; + int msl_idx; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + uint64_t cur_sz = + internal_config.hugepage_info[i].hugepage_sz; + uint64_t msl_sz = primary_msl->page_sz; + if (msl_sz == cur_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + return -1; + } + + /* if versions don't match, synchronize everything */ + if (local_msl->version != primary_msl->version && + sync_existing(primary_msl, local_msl, hi, msl_idx)) + return -1; + return 0; +} + + +int +eal_memalloc_sync_with_primary(void) +{ + /* nothing to be done in primary */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + return 0; + + /* memalloc is locked, so it's safe to call thread-unsafe version */ + if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL)) + return -1; + return 0; +} + +static int +secondary_msl_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + char name[PATH_MAX]; + int msl_idx, ret; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + /* create distinct fbarrays for each secondary */ + snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i", + primary_msl->memseg_arr.name, getpid()); + + ret = rte_fbarray_init(&local_msl->memseg_arr, name, + primary_msl->memseg_arr.len, + primary_msl->memseg_arr.elt_sz); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n"); + return -1; + } + local_msl->base_va = primary_msl->base_va; + local_msl->len = primary_msl->len; + + return 0; +} + +static int +alloc_list(int list_idx, int len) +{ + int *data; + int i; + + /* single-file segments mode does not need fd list */ + if (!internal_config.single_file_segments) { + /* ensure we have space to store fd per each possible segment */ + data = malloc(sizeof(int) * len); + if (data == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n"); + return -1; + } + /* set all fd's as invalid */ + for (i = 0; i < len; i++) + data[i] = -1; + fd_list[list_idx].fds = data; + fd_list[list_idx].len = len; + } else { + fd_list[list_idx].fds = NULL; + fd_list[list_idx].len = 0; + } + + fd_list[list_idx].count = 0; + fd_list[list_idx].memseg_list_fd = -1; + + return 0; +} + +static int +fd_list_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int len; + int msl_idx; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + len = msl->memseg_arr.len; + + return alloc_list(msl_idx, len); +} + +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* single file segments mode doesn't support individual segment fd's */ + if (internal_config.single_file_segments) + return -ENOTSUP; + + /* if list is not allocated, allocate it */ + if (fd_list[list_idx].len == 0) { + int len = mcfg->memsegs[list_idx].memseg_arr.len; + + if (alloc_list(list_idx, len) < 0) + return -ENOMEM; + } + fd_list[list_idx].fds[seg_idx] = fd; + + return 0; +} + +int +eal_memalloc_set_seg_list_fd(int list_idx, int fd) +{ + /* non-single file segment mode doesn't support segment list fd's */ + if (!internal_config.single_file_segments) + return -ENOTSUP; + + fd_list[list_idx].memseg_list_fd = fd; + + return 0; +} + +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx) +{ + int fd; + + if (internal_config.in_memory || internal_config.no_hugetlbfs) { +#ifndef MEMFD_SUPPORTED + /* in in-memory or no-huge mode, we rely on memfd support */ + return -ENOTSUP; +#endif + /* memfd supported, but hugetlbfs memfd may not be */ + if (!internal_config.no_hugetlbfs && !memfd_create_supported) + return -ENOTSUP; + } + + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + } else if (fd_list[list_idx].len == 0) { + /* list not initialized */ + fd = -1; + } else { + fd = fd_list[list_idx].fds[seg_idx]; + } + if (fd < 0) + return -ENODEV; + return fd; +} + +static int +test_memfd_create(void) +{ +#ifdef MEMFD_SUPPORTED + unsigned int i; + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz; + int pagesz_flag = pagesz_flags(pagesz); + int flags; + + flags = pagesz_flag | RTE_MFD_HUGETLB; + int fd = memfd_create("test", flags); + if (fd < 0) { + /* we failed - let memalloc know this isn't working */ + if (errno == EINVAL) { + memfd_create_supported = 0; + return 0; /* not supported */ + } + + /* we got other error - something's wrong */ + return -1; /* error */ + } + close(fd); + return 1; /* supported */ + } +#endif + return 0; /* not supported */ +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + if (internal_config.in_memory || internal_config.no_hugetlbfs) { +#ifndef MEMFD_SUPPORTED + /* in in-memory or no-huge mode, we rely on memfd support */ + return -ENOTSUP; +#endif + /* memfd supported, but hugetlbfs memfd may not be */ + if (!internal_config.no_hugetlbfs && !memfd_create_supported) + return -ENOTSUP; + } + + if (internal_config.single_file_segments) { + size_t pgsz = mcfg->memsegs[list_idx].page_sz; + + /* segment not active? */ + if (fd_list[list_idx].memseg_list_fd < 0) + return -ENOENT; + *offset = pgsz * seg_idx; + } else { + /* fd_list not initialized? */ + if (fd_list[list_idx].len == 0) + return -ENODEV; + + /* segment not active? */ + if (fd_list[list_idx].fds[seg_idx] < 0) + return -ENOENT; + *offset = 0; + } + return 0; +} + +int +eal_memalloc_init(void) +{ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) + return -1; + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + internal_config.in_memory) { + int mfd_res = test_memfd_create(); + + if (mfd_res < 0) { + RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n"); + return -1; + } + if (mfd_res == 1) + RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); + else + RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n"); + + /* we only support single-file segments mode with in-memory mode + * if we support hugetlbfs with memfd_create. this code will + * test if we do. + */ + if (internal_config.single_file_segments && + mfd_res != 1) { + RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n"); + return -1; + } + /* this cannot ever happen but better safe than sorry */ + if (!anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n"); + return -1; + } + } + + /* initialize all of the fd lists */ + if (rte_memseg_list_walk(fd_list_create_walk, NULL)) + return -1; + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_memory.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_memory.c new file mode 100644 index 000000000..7a9c97ff8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_memory.c @@ -0,0 +1,2481 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2013 6WIND S.A. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/queue.h> +#include <sys/file.h> +#include <sys/resource.h> +#include <unistd.h> +#include <limits.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <signal.h> +#include <setjmp.h> +#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ +#include <linux/memfd.h> +#define MEMFD_SUPPORTED +#endif +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +#include <numa.h> +#include <numaif.h> +#endif + +#include <rte_errno.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> + +#include "eal_private.h" +#include "eal_memalloc.h" +#include "eal_memcfg.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_options.h" + +#define PFN_MASK_SIZE 8 + +/** + * @file + * Huge page mapping under linux + * + * To reserve a big contiguous amount of memory, we use the hugepage + * feature of linux. For that, we need to have hugetlbfs mounted. This + * code will create many files in this directory (one per page) and + * map them in virtual memory. For each page, we will retrieve its + * physical address and remap it in order to have a virtual contiguous + * zone as well as a physical contiguous zone. + */ + +static int phys_addrs_available = -1; + +#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" + +uint64_t eal_get_baseaddr(void) +{ + /* + * Linux kernel uses a really high address as starting address for + * serving mmaps calls. If there exists addressing limitations and IOVA + * mode is VA, this starting address is likely too high for those + * devices. However, it is possible to use a lower address in the + * process virtual address space as with 64 bits there is a lot of + * available space. + * + * Current known limitations are 39 or 40 bits. Setting the starting + * address at 4GB implies there are 508GB or 1020GB for mapping the + * available hugepages. This is likely enough for most systems, although + * a device with addressing limitations should call + * rte_mem_check_dma_mask for ensuring all memory is within supported + * range. + */ + return 0x100000000ULL; +} + +/* + * Get physical address of any mapped virtual address in the current process. + */ +phys_addr_t +rte_mem_virt2phy(const void *virtaddr) +{ + int fd, retval; + uint64_t page, physaddr; + unsigned long virt_pfn; + int page_size; + off_t offset; + + if (phys_addrs_available == 0) + return RTE_BAD_IOVA; + + /* standard page size */ + page_size = getpagesize(); + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) { + RTE_LOG(INFO, EAL, "%s(): cannot open /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_IOVA; + } + + virt_pfn = (unsigned long)virtaddr / page_size; + offset = sizeof(uint64_t) * virt_pfn; + if (lseek(fd, offset, SEEK_SET) == (off_t) -1) { + RTE_LOG(INFO, EAL, "%s(): seek error in /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + close(fd); + return RTE_BAD_IOVA; + } + + retval = read(fd, &page, PFN_MASK_SIZE); + close(fd); + if (retval < 0) { + RTE_LOG(INFO, EAL, "%s(): cannot read /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_IOVA; + } else if (retval != PFN_MASK_SIZE) { + RTE_LOG(INFO, EAL, "%s(): read %d bytes from /proc/self/pagemap " + "but expected %d:\n", + __func__, retval, PFN_MASK_SIZE); + return RTE_BAD_IOVA; + } + + /* + * the pfn (page frame number) are bits 0-54 (see + * pagemap.txt in linux Documentation) + */ + if ((page & 0x7fffffffffffffULL) == 0) + return RTE_BAD_IOVA; + + physaddr = ((page & 0x7fffffffffffffULL) * page_size) + + ((unsigned long)virtaddr % page_size); + + return physaddr; +} + +rte_iova_t +rte_mem_virt2iova(const void *virtaddr) +{ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + return (uintptr_t)virtaddr; + return rte_mem_virt2phy(virtaddr); +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value. We find + * it by browsing the /proc/self/pagemap special file. + */ +static int +find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va); + if (addr == RTE_BAD_PHYS_ADDR) + return -1; + hugepg_tbl[i].physaddr = addr; + } + return 0; +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value sequentially. + */ +static int +set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + static phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + hugepg_tbl[i].physaddr = addr; + addr += hugepg_tbl[i].size; + } + return 0; +} + +/* + * Check whether address-space layout randomization is enabled in + * the kernel. This is important for multi-process as it can prevent + * two processes mapping data to the same virtual address + * Returns: + * 0 - address space randomization disabled + * 1/2 - address space randomization enabled + * negative error code on error + */ +static int +aslr_enabled(void) +{ + char c; + int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY); + if (fd < 0) + return -errno; + retval = read(fd, &c, 1); + close(fd); + if (retval < 0) + return -errno; + if (retval == 0) + return -EIO; + switch (c) { + case '0' : return 0; + case '1' : return 1; + case '2' : return 2; + default: return -EINVAL; + } +} + +static sigjmp_buf huge_jmpenv; + +static void huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. + */ +static int huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +/* Callback for numa library. */ +void numa_error(char *where) +{ + RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno)); +} +#endif + +/* + * Mmap all hugepages of hugepage table: it first open a file in + * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the + * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored + * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to + * map contiguous physical blocks in contiguous virtual blocks. + */ +static unsigned +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, + uint64_t *essential_memory __rte_unused) +{ + int fd; + unsigned i; + void *virtaddr; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + int node_id = -1; + int essential_prev = 0; + int oldpolicy; + struct bitmask *oldmask = NULL; + bool have_numa = true; + unsigned long maxnode = 0; + + /* Check if kernel supports NUMA. */ + if (numa_available() != 0) { + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); + have_numa = false; + } + + if (have_numa) { + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + oldmask = numa_allocate_nodemask(); + if (get_mempolicy(&oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + RTE_LOG(ERR, EAL, + "Failed to get current mempolicy: %s. " + "Assuming MPOL_DEFAULT.\n", strerror(errno)); + oldpolicy = MPOL_DEFAULT; + } + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + if (internal_config.socket_mem[i]) + maxnode = i + 1; + } +#endif + + for (i = 0; i < hpi->num_pages[0]; i++) { + struct hugepage_file *hf = &hugepg_tbl[i]; + uint64_t hugepage_sz = hpi->hugepage_sz; + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) { + unsigned int j; + + for (j = 0; j < maxnode; j++) + if (essential_memory[j]) + break; + + if (j == maxnode) { + node_id = (node_id + 1) % maxnode; + while (!internal_config.socket_mem[node_id]) { + node_id++; + node_id %= maxnode; + } + essential_prev = 0; + } else { + node_id = j; + essential_prev = essential_memory[j]; + + if (essential_memory[j] < hugepage_sz) + essential_memory[j] = 0; + else + essential_memory[j] -= hugepage_sz; + } + + RTE_LOG(DEBUG, EAL, + "Setting policy MPOL_PREFERRED for socket %d\n", + node_id); + numa_set_preferred(node_id); + } +#endif + + hf->file_id = i; + hf->size = hugepage_sz; + eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath), + hpi->hugedir, hf->file_id); + hf->filepath[sizeof(hf->filepath) - 1] = '\0'; + + /* try to create hugepage file */ + fd = open(hf->filepath, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + goto out; + } + + /* map the segment, and populate page tables, + * the kernel fills this segment with zeros. we don't care where + * this gets mapped - we already have contiguous memory areas + * ready for us to map into. + */ + virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (virtaddr == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, + strerror(errno)); + close(fd); + goto out; + } + + hf->orig_va = virtaddr; + + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " + "hugepages of size %u MB\n", + (unsigned int)(hugepage_sz / 0x100000)); + munmap(virtaddr, hugepage_sz); + close(fd); + unlink(hugepg_tbl[i].filepath); +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) + essential_memory[node_id] = + essential_prev; +#endif + goto out; + } + *(int *)virtaddr = 0; + + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", + __func__, strerror(errno)); + close(fd); + goto out; + } + + close(fd); + } + +out: +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) { + RTE_LOG(DEBUG, EAL, + "Restoring previous memory policy: %d\n", oldpolicy); + if (oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else if (set_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1) < 0) { + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", + strerror(errno)); + numa_set_localalloc(); + } + } + if (oldmask != NULL) + numa_free_cpumask(oldmask); +#endif + return i; +} + +/* + * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge + * page. + */ +static int +find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + int socket_id; + char *end, *nodestr; + unsigned i, hp_count = 0; + uint64_t virt_addr; + char buf[BUFSIZ]; + char hugedir_str[PATH_MAX]; + FILE *f; + + f = fopen("/proc/self/numa_maps", "r"); + if (f == NULL) { + RTE_LOG(NOTICE, EAL, "NUMA support not available" + " consider that all memory is in socket_id 0\n"); + return 0; + } + + snprintf(hugedir_str, sizeof(hugedir_str), + "%s/%s", hpi->hugedir, eal_get_hugefile_prefix()); + + /* parse numa map */ + while (fgets(buf, sizeof(buf), f) != NULL) { + + /* ignore non huge page */ + if (strstr(buf, " huge ") == NULL && + strstr(buf, hugedir_str) == NULL) + continue; + + /* get zone addr */ + virt_addr = strtoull(buf, &end, 16); + if (virt_addr == 0 || end == buf) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* get node id (socket id) */ + nodestr = strstr(buf, " N"); + if (nodestr == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + nodestr += 2; + end = strstr(nodestr, "="); + if (end == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + end[0] = '\0'; + end = NULL; + + socket_id = strtoul(nodestr, &end, 0); + if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* if we find this page in our mappings, set socket_id */ + for (i = 0; i < hpi->num_pages[0]; i++) { + void *va = (void *)(unsigned long)virt_addr; + if (hugepg_tbl[i].orig_va == va) { + hugepg_tbl[i].socket_id = socket_id; + hp_count++; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + RTE_LOG(DEBUG, EAL, + "Hugepage %s is on socket %d\n", + hugepg_tbl[i].filepath, socket_id); +#endif + } + } + } + + if (hp_count < hpi->num_pages[0]) + goto error; + + fclose(f); + return 0; + +error: + fclose(f); + return -1; +} + +static int +cmp_physaddr(const void *a, const void *b) +{ +#ifndef RTE_ARCH_PPC_64 + const struct hugepage_file *p1 = a; + const struct hugepage_file *p2 = b; +#else + /* PowerPC needs memory sorted in reverse order from x86 */ + const struct hugepage_file *p1 = b; + const struct hugepage_file *p2 = a; +#endif + if (p1->physaddr < p2->physaddr) + return -1; + else if (p1->physaddr > p2->physaddr) + return 1; + else + return 0; +} + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + void *retval; + int fd; + + /* if no shared files mode is used, create anonymous memory instead */ + if (internal_config.no_shconf) { + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (retval == MAP_FAILED) + return NULL; + return retval; + } + + fd = open(filename, O_CREAT | O_RDWR, 0600); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (retval == MAP_FAILED) + return NULL; + return retval; +} + +/* + * this copies *active* hugepages from one hugepage table to another. + * destination is typically the shared memory. + */ +static int +copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, + const struct hugepage_file * src, int src_size) +{ + int src_pos, dst_pos = 0; + + for (src_pos = 0; src_pos < src_size; src_pos++) { + if (src[src_pos].orig_va != NULL) { + /* error on overflow attempt */ + if (dst_pos == dest_size) + return -1; + memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file)); + dst_pos++; + } + } + return 0; +} + +static int +unlink_hugepage_files(struct hugepage_file *hugepg_tbl, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += + internal_config.hugepage_info[size].num_pages[socket]; + + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + if (hp->orig_va != NULL && unlink(hp->filepath)) { + RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + } + } + return 0; +} + +/* + * unmaps hugepages that are not going to be used. since we originally allocate + * ALL hugepages (not just those we need), additional unmapping needs to be done. + */ +static int +unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, + struct hugepage_info *hpi, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += internal_config.hugepage_info[size].num_pages[socket]; + + for (size = 0; size < num_hp_info; size++) { + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + unsigned pages_found = 0; + + /* traverse until we have unmapped all the unused pages */ + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + /* find a page that matches the criteria */ + if ((hp->size == hpi[size].hugepage_sz) && + (hp->socket_id == (int) socket)) { + + /* if we skipped enough pages, unmap the rest */ + if (pages_found == hpi[size].num_pages[socket]) { + uint64_t unmap_len; + + unmap_len = hp->size; + + /* get start addr and len of the remaining segment */ + munmap(hp->orig_va, + (size_t)unmap_len); + + hp->orig_va = NULL; + if (unlink(hp->filepath) == -1) { + RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + return -1; + } + } else { + /* lock the page and skip */ + pages_found++; + } + + } /* match page */ + } /* foreach page */ + } /* foreach socket */ + } /* foreach pagesize */ + + return 0; +} + +static int +remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int cur_page, seg_len; + unsigned int msl_idx; + int ms_idx; + uint64_t page_sz; + size_t memseg_len; + int socket_id; + + page_sz = hugepages[seg_start].size; + socket_id = hugepages[seg_start].socket_id; + seg_len = seg_end - seg_start; + + RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20ULL, socket_id); + + /* find free space in memseg lists */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + bool empty; + msl = &mcfg->memsegs[msl_idx]; + arr = &msl->memseg_arr; + + if (msl->page_sz != page_sz) + continue; + if (msl->socket_id != socket_id) + continue; + + /* leave space for a hole if array is not empty */ + empty = arr->count == 0; + ms_idx = rte_fbarray_find_next_n_free(arr, 0, + seg_len + (empty ? 0 : 1)); + + /* memseg list is full? */ + if (ms_idx < 0) + continue; + + /* leave some space between memsegs, they are not IOVA + * contiguous, so they shouldn't be VA contiguous either. + */ + if (!empty) + ms_idx++; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), + RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); + return -1; + } + +#ifdef RTE_ARCH_PPC_64 + /* for PPC64 we go through the list backwards */ + for (cur_page = seg_end - 1; cur_page >= seg_start; + cur_page--, ms_idx++) { +#else + for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) { +#endif + struct hugepage_file *hfile = &hugepages[cur_page]; + struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx); + void *addr; + int fd; + + fd = open(hfile->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open '%s': %s\n", + hfile->filepath, strerror(errno)); + return -1; + } + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + memseg_len = (size_t)page_sz; + addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len); + + /* we know this address is already mmapped by memseg list, so + * using MAP_FIXED here is safe + */ + addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + + /* we have a new address, so unmap previous one */ +#ifndef RTE_ARCH_64 + /* in 32-bit legacy mode, we have already unmapped the page */ + if (!internal_config.legacy_mem) + munmap(hfile->orig_va, page_sz); +#else + munmap(hfile->orig_va, page_sz); +#endif + + hfile->orig_va = NULL; + hfile->final_va = addr; + + /* rewrite physical addresses in IOVA as VA mode */ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + hfile->physaddr = (uintptr_t)addr; + + /* set up memseg data */ + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->len = memseg_len; + ms->iova = hfile->physaddr; + ms->socket_id = hfile->socket_id; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + + rte_fbarray_set_used(arr, ms_idx); + + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); + } + RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20, socket_id); + return 0; +} + +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +static int +free_memseg_list(struct rte_memseg_list *msl) +{ + if (rte_fbarray_destroy(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n"); + return -1; + } + memset(msl, 0, sizeof(*msl)); + return 0; +} + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + int n_segs, int socket_id, int type_msl_idx) +{ + char name[RTE_FBARRAY_NAME_LEN]; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->base_va = NULL; + msl->heap = 1; /* mark it as a heap segment */ + + RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - " + "please use '--" OPT_BASE_VIRTADDR "' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + msl->len = mem_sz; + + return 0; +} + +/* + * Our VA space is not preallocated yet, so preallocate it here. We need to know + * how many segments there are in order to map all pages into one address space, + * and leave appropriate holes between segments so that rte_malloc does not + * concatenate them into one big segment. + * + * we also need to unmap original pages to free up address space. + */ +static int __rte_unused +prealloc_segments(struct hugepage_file *hugepages, int n_pages) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int cur_page, seg_start_page, end_seg, new_memseg; + unsigned int hpi_idx, socket, i; + int n_contig_segs, n_segs; + int msl_idx; + + /* before we preallocate segments, we need to free up our VA space. + * we're not removing files, and we already have information about + * PA-contiguousness, so it is safe to unmap everything. + */ + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *hpi = &hugepages[cur_page]; + munmap(hpi->orig_va, hpi->size); + hpi->orig_va = NULL; + } + + /* we cannot know how many page sizes and sockets we have discovered, so + * loop over all of them + */ + for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes; + hpi_idx++) { + uint64_t page_sz = + internal_config.hugepage_info[hpi_idx].hugepage_sz; + + for (i = 0; i < rte_socket_count(); i++) { + struct rte_memseg_list *msl; + + socket = rte_socket_id_by_idx(i); + n_contig_segs = 0; + n_segs = 0; + seg_start_page = -1; + + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + int prev_seg_start_page = -1; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : + &hugepages[cur_page - 1]; + + new_memseg = 0; + end_seg = 0; + + if (cur->size == 0) + end_seg = 1; + else if (cur->socket_id != (int) socket) + end_seg = 1; + else if (cur->size != page_sz) + end_seg = 1; + else if (cur_page == 0) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start + * from higher address to lower address. Here, + * physical addresses are in descending order. + */ + else if ((prev->physaddr - cur->physaddr) != + cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != + cur->size) + new_memseg = 1; +#endif + if (new_memseg) { + /* if we're already inside a segment, + * new segment means end of current one + */ + if (seg_start_page != -1) { + end_seg = 1; + prev_seg_start_page = + seg_start_page; + } + seg_start_page = cur_page; + } + + if (end_seg) { + if (prev_seg_start_page != -1) { + /* we've found a new segment */ + n_contig_segs++; + n_segs += cur_page - + prev_seg_start_page; + } else if (seg_start_page != -1) { + /* we didn't find new segment, + * but did end current one + */ + n_contig_segs++; + n_segs += cur_page - + seg_start_page; + seg_start_page = -1; + continue; + } else { + /* we're skipping this page */ + continue; + } + } + /* segment continues */ + } + /* check if we missed last segment */ + if (seg_start_page != -1) { + n_contig_segs++; + n_segs += cur_page - seg_start_page; + } + + /* if no segments were found, do not preallocate */ + if (n_segs == 0) + continue; + + /* we now have total number of pages that we will + * allocate for this segment list. add separator pages + * to the total count, and preallocate VA space. + */ + n_segs += n_contig_segs - 1; + + /* now, preallocate VA space for these segments */ + + /* first, find suitable memseg list for this */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; + msl_idx++) { + msl = &mcfg->memsegs[msl_idx]; + + if (msl->base_va != NULL) + continue; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + /* now, allocate fbarray itself */ + if (alloc_memseg_list(msl, page_sz, n_segs, socket, + msl_idx) < 0) + return -1; + + /* finally, allocate VA space */ + if (alloc_va_space(msl) < 0) + return -1; + } + } + return 0; +} + +/* + * We cannot reallocate memseg lists on the fly because PPC64 stores pages + * backwards, therefore we have to process the entire memseg first before + * remapping it into memseg list VA space. + */ +static int +remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages) +{ + int cur_page, seg_start_page, new_memseg, ret; + + seg_start_page = 0; + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + + new_memseg = 0; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1]; + + /* if size is zero, no more pages left */ + if (cur->size == 0) + break; + + if (cur_page == 0) + new_memseg = 1; + else if (cur->socket_id != prev->socket_id) + new_memseg = 1; + else if (cur->size != prev->size) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start from higher + * address to lower address. Here, physical addresses are in + * descending order. + */ + else if ((prev->physaddr - cur->physaddr) != cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != cur->size) + new_memseg = 1; +#endif + + if (new_memseg) { + /* if this isn't the first time, remap segment */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + /* remember where we started */ + seg_start_page = cur_page; + } + /* continuation of previous memseg */ + } + /* we were stopped, but we didn't remap the last segment, do it now */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + return 0; +} + +__rte_unused /* function is unused on 32-bit builds */ +static inline uint64_t +get_socket_mem_size(int socket) +{ + uint64_t size = 0; + unsigned i; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++){ + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + size += hpi->hugepage_sz * hpi->num_pages[socket]; + } + + return size; +} + +/* + * This function is a NUMA-aware equivalent of calc_num_pages. + * It takes in the list of hugepage sizes and the + * number of pages thereof, and calculates the best number of + * pages of each size to fulfill the request for <memory> ram + */ +static int +calc_num_pages_per_socket(uint64_t * memory, + struct hugepage_info *hp_info, + struct hugepage_info *hp_used, + unsigned num_hp_info) +{ + unsigned socket, j, i = 0; + unsigned requested, available; + int total_num_pages = 0; + uint64_t remaining_mem, cur_mem; + uint64_t total_mem = internal_config.memory; + + if (num_hp_info == 0) + return -1; + + /* if specific memory amounts per socket weren't requested */ + if (internal_config.force_sockets == 0) { + size_t total_size; +#ifdef RTE_ARCH_64 + int cpu_per_socket[RTE_MAX_NUMA_NODES]; + size_t default_size; + unsigned lcore_id; + + /* Compute number of cores per socket */ + memset(cpu_per_socket, 0, sizeof(cpu_per_socket)); + RTE_LCORE_FOREACH(lcore_id) { + cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++; + } + + /* + * Automatically spread requested memory amongst detected sockets according + * to number of cores from cpu mask present on each socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + + /* Set memory amount per socket */ + default_size = (internal_config.memory * cpu_per_socket[socket]) + / rte_lcore_count(); + + /* Limit to maximum available memory on socket */ + default_size = RTE_MIN(default_size, get_socket_mem_size(socket)); + + /* Update sizes */ + memory[socket] = default_size; + total_size -= default_size; + } + + /* + * If some memory is remaining, try to allocate it by getting all + * available memory from sockets, one after the other + */ + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + /* take whatever is available */ + default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket], + total_size); + + /* Update sizes */ + memory[socket] += default_size; + total_size -= default_size; + } +#else + /* in 32-bit mode, allocate all of the memory only on master + * lcore socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; + socket++) { + struct rte_config *cfg = rte_eal_get_configuration(); + unsigned int master_lcore_socket; + + master_lcore_socket = + rte_lcore_to_socket_id(cfg->master_lcore); + + if (master_lcore_socket != socket) + continue; + + /* Update sizes */ + memory[socket] = total_size; + break; + } +#endif + } + + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) { + /* skips if the memory on specific socket wasn't requested */ + for (i = 0; i < num_hp_info && memory[socket] != 0; i++){ + strlcpy(hp_used[i].hugedir, hp_info[i].hugedir, + sizeof(hp_used[i].hugedir)); + hp_used[i].num_pages[socket] = RTE_MIN( + memory[socket] / hp_info[i].hugepage_sz, + hp_info[i].num_pages[socket]); + + cur_mem = hp_used[i].num_pages[socket] * + hp_used[i].hugepage_sz; + + memory[socket] -= cur_mem; + total_mem -= cur_mem; + + total_num_pages += hp_used[i].num_pages[socket]; + + /* check if we have met all memory requests */ + if (memory[socket] == 0) + break; + + /* check if we have any more pages left at this size, if so + * move on to next size */ + if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket]) + continue; + /* At this point we know that there are more pages available that are + * bigger than the memory we want, so lets see if we can get enough + * from other page sizes. + */ + remaining_mem = 0; + for (j = i+1; j < num_hp_info; j++) + remaining_mem += hp_info[j].hugepage_sz * + hp_info[j].num_pages[socket]; + + /* is there enough other memory, if not allocate another page and quit */ + if (remaining_mem < memory[socket]){ + cur_mem = RTE_MIN(memory[socket], + hp_info[i].hugepage_sz); + memory[socket] -= cur_mem; + total_mem -= cur_mem; + hp_used[i].num_pages[socket]++; + total_num_pages++; + break; /* we are done with this socket*/ + } + } + /* if we didn't satisfy all memory requirements per socket */ + if (memory[socket] > 0 && + internal_config.socket_mem[socket] != 0) { + /* to prevent icc errors */ + requested = (unsigned) (internal_config.socket_mem[socket] / + 0x100000); + available = requested - + ((unsigned) (memory[socket] / 0x100000)); + RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! " + "Requested: %uMB, available: %uMB\n", socket, + requested, available); + return -1; + } + } + + /* if we didn't satisfy total memory requirements */ + if (total_mem > 0) { + requested = (unsigned) (internal_config.memory / 0x100000); + available = requested - (unsigned) (total_mem / 0x100000); + RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB," + " available: %uMB\n", requested, available); + return -1; + } + return total_num_pages; +} + +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + +/* + * Prepare physical memory mapping: fill configuration structure with + * these infos, return 0 on success. + * 1. map N huge pages in separate files in hugetlbfs + * 2. find associated physical addr + * 3. find associated NUMA socket ID + * 4. sort all huge pages by physical address + * 5. remap these N huge pages in the correct order + * 6. unmap the first mapping + * 7. fill memsegs in configuration with contiguous zones + */ +static int +eal_legacy_hugepage_init(void) +{ + struct rte_mem_config *mcfg; + struct hugepage_file *hugepage = NULL, *tmp_hp = NULL; + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + struct rte_fbarray *arr; + struct rte_memseg *ms; + + uint64_t memory[RTE_MAX_NUMA_NODES]; + + unsigned hp_offset; + int i, j; + int nr_hugefiles, nr_hugepages = 0; + void *addr; + + memset(used_hp, 0, sizeof(used_hp)); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + /* hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { + void *prealloc_addr; + size_t mem_sz; + struct rte_memseg_list *msl; + int n_segs, cur_seg, fd, flags; +#ifdef MEMFD_SUPPORTED + int memfd; +#endif + uint64_t page_sz; + + /* nohuge mode is legacy mode */ + internal_config.legacy_mem = 1; + + /* nohuge mode is single-file segments mode */ + internal_config.single_file_segments = 1; + + /* create a memseg list */ + msl = &mcfg->memsegs[0]; + + page_sz = RTE_PGSIZE_4K; + n_segs = internal_config.memory / page_sz; + + if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + /* set up parameters for anonymous mmap */ + fd = -1; + flags = MAP_PRIVATE | MAP_ANONYMOUS; + +#ifdef MEMFD_SUPPORTED + /* create a memfd and store it in the segment fd table */ + memfd = memfd_create("nohuge", 0); + if (memfd < 0) { + RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n", + strerror(errno)); + RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n"); + } else { + /* we got an fd - now resize it */ + if (ftruncate(memfd, internal_config.memory) < 0) { + RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n", + strerror(errno)); + RTE_LOG(ERR, EAL, "Falling back to anonymous map\n"); + close(memfd); + } else { + /* creating memfd-backed file was successful. + * we want changes to memfd to be visible to + * other processes (such as vhost backend), so + * map it as shared memory. + */ + RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); + fd = memfd; + flags = MAP_SHARED; + } + } +#endif + /* preallocate address space for the memory, so that it can be + * fit into the DMA mask. + */ + mem_sz = internal_config.memory; + prealloc_addr = eal_get_virtual_area( + NULL, &mem_sz, page_sz, 0, 0); + if (prealloc_addr == NULL) { + RTE_LOG(ERR, EAL, + "%s: reserving memory area failed: " + "%s\n", + __func__, strerror(errno)); + return -1; + } + addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE, + flags | MAP_FIXED, fd, 0); + if (addr == MAP_FAILED || addr != prealloc_addr) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + munmap(prealloc_addr, mem_sz); + return -1; + } + msl->base_va = addr; + msl->page_sz = page_sz; + msl->socket_id = 0; + msl->len = mem_sz; + msl->heap = 1; + + /* we're in single-file segments mode, so only the segment list + * fd needs to be set up. + */ + if (fd != -1) { + if (eal_memalloc_set_seg_list_fd(0, fd) < 0) { + RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n"); + /* not a serious error, proceed */ + } + } + + /* populate memsegs. each memseg is one page long */ + for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { + arr = &msl->memseg_arr; + + ms = rte_fbarray_get(arr, cur_seg); + if (rte_eal_iova_mode() == RTE_IOVA_VA) + ms->iova = (uintptr_t)addr; + else + ms->iova = RTE_BAD_IOVA; + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->socket_id = 0; + ms->len = page_sz; + + rte_fbarray_set_used(arr, cur_seg); + + addr = RTE_PTR_ADD(addr, (size_t)page_sz); + } + if (mcfg->dma_maskbits && + rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { + RTE_LOG(ERR, EAL, + "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n", + __func__); + if (rte_eal_iova_mode() == RTE_IOVA_VA && + rte_eal_using_phys_addrs()) + RTE_LOG(ERR, EAL, + "%s(): Please try initializing EAL with --iova-mode=pa parameter.\n", + __func__); + goto fail; + } + return 0; + } + + /* calculate total number of hugepages available. at this point we haven't + * yet started sorting them so they all are on socket 0 */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + /* meanwhile, also initialize used_hp hugepage sizes in used_hp */ + used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz; + + nr_hugepages += internal_config.hugepage_info[i].num_pages[0]; + } + + /* + * allocate a memory area for hugepage table. + * this isn't shared memory yet. due to the fact that we need some + * processing done on these pages, shared memory will be created + * at a later stage. + */ + tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file)); + if (tmp_hp == NULL) + goto fail; + + memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file)); + + hp_offset = 0; /* where we start the current page size entries */ + + huge_register_sigbus(); + + /* make a copy of socket_mem, needed for balanced allocation. */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* map all hugepages and sort them */ + for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ + unsigned pages_old, pages_new; + struct hugepage_info *hpi; + + /* + * we don't yet mark hugepages as used at this stage, so + * we just map all hugepages available to the system + * all hugepages are still located on socket 0 + */ + hpi = &internal_config.hugepage_info[i]; + + if (hpi->num_pages[0] == 0) + continue; + + /* map all hugepages available */ + pages_old = hpi->num_pages[0]; + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory); + if (pages_new < pages_old) { + RTE_LOG(DEBUG, EAL, + "%d not %d hugepages of size %u MB allocated\n", + pages_new, pages_old, + (unsigned)(hpi->hugepage_sz / 0x100000)); + + int pages = pages_old - pages_new; + + nr_hugepages -= pages; + hpi->num_pages[0] = pages_new; + if (pages_new == 0) + continue; + } + + if (rte_eal_using_phys_addrs() && + rte_eal_iova_mode() != RTE_IOVA_VA) { + /* find physical addresses for each hugepage */ + if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to find phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } else { + /* set physical addresses for each hugepage */ + if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to set phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } + + if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){ + RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + qsort(&tmp_hp[hp_offset], hpi->num_pages[0], + sizeof(struct hugepage_file), cmp_physaddr); + + /* we have processed a num of hugepages of this size, so inc offset */ + hp_offset += hpi->num_pages[0]; + } + + huge_recover_sigbus(); + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) + internal_config.memory = eal_get_hugepage_mem_size(); + + nr_hugefiles = nr_hugepages; + + + /* clean out the numbers of pages */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + internal_config.hugepage_info[i].num_pages[j] = 0; + + /* get hugepages for each socket */ + for (i = 0; i < nr_hugefiles; i++) { + int socket = tmp_hp[i].socket_id; + + /* find a hugepage info with right size and increment num_pages */ + const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES, + (int)internal_config.num_hugepage_sizes); + for (j = 0; j < nb_hpsizes; j++) { + if (tmp_hp[i].size == + internal_config.hugepage_info[j].hugepage_sz) { + internal_config.hugepage_info[j].num_pages[socket]++; + } + } + } + + /* make a copy of socket_mem, needed for number of pages calculation */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* calculate final number of pages */ + nr_hugepages = calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes); + + /* error if not enough memory available */ + if (nr_hugepages < 0) + goto fail; + + /* reporting in! */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + if (used_hp[i].num_pages[j] > 0) { + RTE_LOG(DEBUG, EAL, + "Requesting %u pages of size %uMB" + " from socket %i\n", + used_hp[i].num_pages[j], + (unsigned) + (used_hp[i].hugepage_sz / 0x100000), + j); + } + } + } + + /* create shared memory */ + hugepage = create_shared_memory(eal_hugepage_data_path(), + nr_hugefiles * sizeof(struct hugepage_file)); + + if (hugepage == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + goto fail; + } + memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file)); + + /* + * unmap pages that we won't need (looks at used_hp). + * also, sets final_va to NULL on pages that were unmapped. + */ + if (unmap_unneeded_hugepages(tmp_hp, used_hp, + internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n"); + goto fail; + } + + /* + * copy stuff from malloc'd hugepage* to the actual shared memory. + * this procedure only copies those hugepages that have orig_va + * not NULL. has overflow protection. + */ + if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, + tmp_hp, nr_hugefiles) < 0) { + RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n"); + goto fail; + } + +#ifndef RTE_ARCH_64 + /* for legacy 32-bit mode, we did not preallocate VA space, so do it */ + if (internal_config.legacy_mem && + prealloc_segments(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n"); + goto fail; + } +#endif + + /* remap all pages we do need into memseg list VA space, so that those + * pages become first-class citizens in DPDK memory subsystem + */ + if (remap_needed_hugepages(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n"); + goto fail; + } + + /* free the hugepage backing files */ + if (internal_config.hugepage_unlink && + unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n"); + goto fail; + } + + /* free the temporary hugepage table */ + free(tmp_hp); + tmp_hp = NULL; + + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + hugepage = NULL; + + /* we're not going to allocate more pages, so release VA space for + * unused memseg lists + */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + size_t mem_sz; + + /* skip inactive lists */ + if (msl->base_va == NULL) + continue; + /* skip lists where there is at least one page allocated */ + if (msl->memseg_arr.count > 0) + continue; + /* this is an unused list, deallocate it */ + mem_sz = msl->len; + munmap(msl->base_va, mem_sz); + msl->base_va = NULL; + msl->heap = 0; + + /* destroy backing fbarray */ + rte_fbarray_destroy(&msl->memseg_arr); + } + + if (mcfg->dma_maskbits && + rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { + RTE_LOG(ERR, EAL, + "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n", + __func__); + goto fail; + } + + return 0; + +fail: + huge_recover_sigbus(); + free(tmp_hp); + if (hugepage != NULL) + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + + return -1; +} + +static int __rte_unused +hugepage_count_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct hugepage_info *hpi = arg; + + if (msl->page_sz != hpi->hugepage_sz) + return 0; + + hpi->num_pages[msl->socket_id] += msl->memseg_arr.len; + return 0; +} + +static int +limits_callback(int socket_id, size_t cur_limit, size_t new_len) +{ + RTE_SET_USED(socket_id); + RTE_SET_USED(cur_limit); + RTE_SET_USED(new_len); + return -1; +} + +static int +eal_hugepage_init(void) +{ + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + uint64_t memory[RTE_MAX_NUMA_NODES]; + int hp_sz_idx, socket_id; + + memset(used_hp, 0, sizeof(used_hp)); + + for (hp_sz_idx = 0; + hp_sz_idx < (int) internal_config.num_hugepage_sizes; + hp_sz_idx++) { +#ifndef RTE_ARCH_64 + struct hugepage_info dummy; + unsigned int i; +#endif + /* also initialize used_hp hugepage sizes in used_hp */ + struct hugepage_info *hpi; + hpi = &internal_config.hugepage_info[hp_sz_idx]; + used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz; + +#ifndef RTE_ARCH_64 + /* for 32-bit, limit number of pages on socket to whatever we've + * preallocated, as we cannot allocate more. + */ + memset(&dummy, 0, sizeof(dummy)); + dummy.hugepage_sz = hpi->hugepage_sz; + if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0) + return -1; + + for (i = 0; i < RTE_DIM(dummy.num_pages); i++) { + hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i], + dummy.num_pages[i]); + } +#endif + } + + /* make a copy of socket_mem, needed for balanced allocation. */ + for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++) + memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx]; + + /* calculate final number of pages */ + if (calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes) < 0) + return -1; + + for (hp_sz_idx = 0; + hp_sz_idx < (int)internal_config.num_hugepage_sizes; + hp_sz_idx++) { + for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; + socket_id++) { + struct rte_memseg **pages; + struct hugepage_info *hpi = &used_hp[hp_sz_idx]; + unsigned int num_pages = hpi->num_pages[socket_id]; + unsigned int num_pages_alloc; + + if (num_pages == 0) + continue; + + RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n", + num_pages, hpi->hugepage_sz >> 20, socket_id); + + /* we may not be able to allocate all pages in one go, + * because we break up our memory map into multiple + * memseg lists. therefore, try allocating multiple + * times and see if we can get the desired number of + * pages from multiple allocations. + */ + + num_pages_alloc = 0; + do { + int i, cur_pages, needed; + + needed = num_pages - num_pages_alloc; + + pages = malloc(sizeof(*pages) * needed); + + /* do not request exact number of pages */ + cur_pages = eal_memalloc_alloc_seg_bulk(pages, + needed, hpi->hugepage_sz, + socket_id, false); + if (cur_pages <= 0) { + free(pages); + return -1; + } + + /* mark preallocated pages as unfreeable */ + for (i = 0; i < cur_pages; i++) { + struct rte_memseg *ms = pages[i]; + ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; + } + free(pages); + + num_pages_alloc += cur_pages; + } while (num_pages_alloc != num_pages); + } + } + /* if socket limits were specified, set them */ + if (internal_config.force_socket_limits) { + unsigned int i; + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { + uint64_t limit = internal_config.socket_limit[i]; + if (limit == 0) + continue; + if (rte_mem_alloc_validator_register("socket-limit", + limits_callback, i, limit)) + RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n"); + } + } + return 0; +} + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +getFileSize(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +/* + * This creates the memory mappings in the secondary process to match that of + * the server process. It goes through each memory segment in the DPDK runtime + * configuration and finds the hugepages which form that segment, mapping them + * in order to form a contiguous block in the virtual memory space + */ +static int +eal_legacy_hugepage_attach(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct hugepage_file *hp = NULL; + unsigned int num_hp = 0; + unsigned int i = 0; + unsigned int cur_seg; + off_t size = 0; + int fd, fd_hugepage = -1; + + if (aslr_enabled() > 0) { + RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization " + "(ASLR) is enabled in the kernel.\n"); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory " + "into secondary processes\n"); + } + + fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY); + if (fd_hugepage < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", + eal_hugepage_data_path()); + goto error; + } + + size = getFileSize(fd_hugepage); + hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); + if (hp == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not mmap %s\n", + eal_hugepage_data_path()); + goto error; + } + + num_hp = size / sizeof(struct hugepage_file); + RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp); + + /* map all segments into memory to make sure we get the addrs. the + * segments themselves are already in memseg list (which is shared and + * has its VA space already preallocated), so we just need to map + * everything into correct addresses. + */ + for (i = 0; i < num_hp; i++) { + struct hugepage_file *hf = &hp[i]; + size_t map_sz = hf->size; + void *map_addr = hf->final_va; + int msl_idx, ms_idx; + struct rte_memseg_list *msl; + struct rte_memseg *ms; + + /* if size is zero, no more pages left */ + if (map_sz == 0) + break; + + fd = open(hf->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s: %s\n", + hf->filepath, strerror(errno)); + goto error; + } + + map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (map_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not map %s: %s\n", + hf->filepath, strerror(errno)); + goto fd_error; + } + + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", + __func__, strerror(errno)); + goto mmap_error; + } + + /* find segment data */ + msl = rte_mem_virt2memseg_list(map_addr); + if (msl == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n", + __func__); + goto mmap_error; + } + ms = rte_mem_virt2memseg(map_addr, msl); + if (ms == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n", + __func__); + goto mmap_error; + } + + msl_idx = msl - mcfg->memsegs; + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n", + __func__); + goto mmap_error; + } + + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); + } + /* unmap the hugepage config file, since we are done using it */ + munmap(hp, size); + close(fd_hugepage); + return 0; + +mmap_error: + munmap(hp[i].final_va, hp[i].size); +fd_error: + close(fd); +error: + /* unwind mmap's done so far */ + for (cur_seg = 0; cur_seg < i; cur_seg++) + munmap(hp[cur_seg].final_va, hp[cur_seg].size); + + if (hp != NULL && hp != MAP_FAILED) + munmap(hp, size); + if (fd_hugepage >= 0) + close(fd_hugepage); + return -1; +} + +static int +eal_hugepage_attach(void) +{ + if (eal_memalloc_sync_with_primary()) { + RTE_LOG(ERR, EAL, "Could not map memory from primary process\n"); + if (aslr_enabled() > 0) + RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n"); + return -1; + } + return 0; +} + +int +rte_eal_hugepage_init(void) +{ + return internal_config.legacy_mem ? + eal_legacy_hugepage_init() : + eal_hugepage_init(); +} + +int +rte_eal_hugepage_attach(void) +{ + return internal_config.legacy_mem ? + eal_legacy_hugepage_attach() : + eal_hugepage_attach(); +} + +int +rte_eal_using_phys_addrs(void) +{ + if (phys_addrs_available == -1) { + uint64_t tmp = 0; + + if (rte_eal_has_hugepages() != 0 && + rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR) + phys_addrs_available = 1; + else + phys_addrs_available = 0; + } + return phys_addrs_available; +} + +static int __rte_unused +memseg_primary_init_32(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int active_sockets, hpi_idx, msl_idx = 0; + unsigned int socket_id, i; + struct rte_memseg_list *msl; + uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem; + uint64_t max_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* this is a giant hack, but desperate times call for desperate + * measures. in legacy 32-bit mode, we cannot preallocate VA space, + * because having upwards of 2 gigabytes of VA space already mapped will + * interfere with our ability to map and sort hugepages. + * + * therefore, in legacy 32-bit mode, we will be initializing memseg + * lists much later - in eal_memory.c, right after we unmap all the + * unneeded pages. this will not affect secondary processes, as those + * should be able to mmap the space without (too many) problems. + */ + if (internal_config.legacy_mem) + return 0; + + /* 32-bit mode is a very special case. we cannot know in advance where + * the user will want to allocate their memory, so we have to do some + * heuristics. + */ + active_sockets = 0; + total_requested_mem = 0; + if (internal_config.force_sockets) + for (i = 0; i < rte_socket_count(); i++) { + uint64_t mem; + + socket_id = rte_socket_id_by_idx(i); + mem = internal_config.socket_mem[socket_id]; + + if (mem == 0) + continue; + + active_sockets++; + total_requested_mem += mem; + } + else + total_requested_mem = internal_config.memory; + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + if (total_requested_mem > max_mem) { + RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n", + (unsigned int)(max_mem >> 20)); + return -1; + } + total_extra_mem = max_mem - total_requested_mem; + extra_mem_per_socket = active_sockets == 0 ? total_extra_mem : + total_extra_mem / active_sockets; + + /* the allocation logic is a little bit convoluted, but here's how it + * works, in a nutshell: + * - if user hasn't specified on which sockets to allocate memory via + * --socket-mem, we allocate all of our memory on master core socket. + * - if user has specified sockets to allocate memory on, there may be + * some "unused" memory left (e.g. if user has specified --socket-mem + * such that not all memory adds up to 2 gigabytes), so add it to all + * sockets that are in use equally. + * + * page sizes are sorted by size in descending order, so we can safely + * assume that we dispense with bigger page sizes first. + */ + + /* create memseg lists */ + for (i = 0; i < rte_socket_count(); i++) { + int hp_sizes = (int) internal_config.num_hugepage_sizes; + uint64_t max_socket_mem, cur_socket_mem; + unsigned int master_lcore_socket; + struct rte_config *cfg = rte_eal_get_configuration(); + bool skip; + + socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + /* we can still sort pages by socket in legacy mode */ + if (!internal_config.legacy_mem && socket_id > 0) + break; +#endif + + /* if we didn't specifically request memory on this socket */ + skip = active_sockets != 0 && + internal_config.socket_mem[socket_id] == 0; + /* ...or if we didn't specifically request memory on *any* + * socket, and this is not master lcore + */ + master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore); + skip |= active_sockets == 0 && socket_id != master_lcore_socket; + + if (skip) { + RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n", + socket_id); + continue; + } + + /* max amount of memory on this socket */ + max_socket_mem = (active_sockets != 0 ? + internal_config.socket_mem[socket_id] : + internal_config.memory) + + extra_mem_per_socket; + cur_socket_mem = 0; + + for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) { + uint64_t max_pagesz_mem, cur_pagesz_mem = 0; + uint64_t hugepage_sz; + struct hugepage_info *hpi; + int type_msl_idx, max_segs, total_segs = 0; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* check if pages are actually available */ + if (hpi->num_pages[socket_id] == 0) + continue; + + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + max_pagesz_mem = max_socket_mem - cur_socket_mem; + + /* make it multiple of page size */ + max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem, + hugepage_sz); + + RTE_LOG(DEBUG, EAL, "Attempting to preallocate " + "%" PRIu64 "M on socket %i\n", + max_pagesz_mem >> 20, socket_id); + + type_msl_idx = 0; + while (cur_pagesz_mem < max_pagesz_mem && + total_segs < max_segs) { + uint64_t cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx]; + + cur_mem = get_mem_amount(hugepage_sz, + max_pagesz_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + socket_id, type_msl_idx)) { + /* failing to allocate a memseg list is + * a serious error. + */ + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + if (alloc_va_space(msl)) { + /* if we couldn't allocate VA space, we + * can try with smaller page sizes. + */ + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n"); + /* deallocate memseg list */ + if (free_memseg_list(msl)) + return -1; + break; + } + + total_segs += msl->memseg_arr.len; + cur_pagesz_mem = total_segs * hugepage_sz; + type_msl_idx++; + msl_idx++; + } + cur_socket_mem += cur_pagesz_mem; + } + if (cur_socket_mem == 0) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n", + socket_id); + return -1; + } + } + + return 0; +} + +static int __rte_unused +memseg_primary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct memtype { + uint64_t page_sz; + int socket_id; + } *memtypes = NULL; + int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */ + struct rte_memseg_list *msl; + uint64_t max_mem, max_mem_per_type; + unsigned int max_seglists_per_type; + unsigned int n_memtypes, cur_type; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* + * figuring out amount of memory we're going to have is a long and very + * involved process. the basic element we're operating with is a memory + * type, defined as a combination of NUMA node ID and page size (so that + * e.g. 2 sockets with 2 page sizes yield 4 memory types in total). + * + * deciding amount of memory going towards each memory type is a + * balancing act between maximum segments per type, maximum memory per + * type, and number of detected NUMA nodes. the goal is to make sure + * each memory type gets at least one memseg list. + * + * the total amount of memory is limited by RTE_MAX_MEM_MB value. + * + * the total amount of memory per type is limited by either + * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number + * of detected NUMA nodes. additionally, maximum number of segments per + * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for + * smaller page sizes, it can take hundreds of thousands of segments to + * reach the above specified per-type memory limits. + * + * additionally, each type may have multiple memseg lists associated + * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger + * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones. + * + * the number of memseg lists per type is decided based on the above + * limits, and also taking number of detected NUMA nodes, to make sure + * that we don't run out of memseg lists before we populate all NUMA + * nodes with memory. + * + * we do this in three stages. first, we collect the number of types. + * then, we figure out memory constraints and populate the list of + * would-be memseg lists. then, we go ahead and allocate the memseg + * lists. + */ + + /* create space for mem types */ + n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count(); + memtypes = calloc(n_memtypes, sizeof(*memtypes)); + if (memtypes == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n"); + return -1; + } + + /* populate mem types */ + cur_type = 0; + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) { + int socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + /* we can still sort pages by socket in legacy mode */ + if (!internal_config.legacy_mem && socket_id > 0) + break; +#endif + memtypes[cur_type].page_sz = hugepage_sz; + memtypes[cur_type].socket_id = socket_id; + + RTE_LOG(DEBUG, EAL, "Detected memory type: " + "socket_id:%u hugepage_sz:%" PRIu64 "\n", + socket_id, hugepage_sz); + } + } + /* number of memtypes could have been lower due to no NUMA support */ + n_memtypes = cur_type; + + /* set up limits for types */ + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20, + max_mem / n_memtypes); + /* + * limit maximum number of segment lists per type to ensure there's + * space for memseg lists for all NUMA nodes with all page sizes + */ + max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes; + + if (max_seglists_per_type == 0) { + RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } + + /* go through all mem types and create segment lists */ + msl_idx = 0; + for (cur_type = 0; cur_type < n_memtypes; cur_type++) { + unsigned int cur_seglist, n_seglists, n_segs; + unsigned int max_segs_per_type, max_segs_per_list; + struct memtype *type = &memtypes[cur_type]; + uint64_t max_mem_per_list, pagesz; + int socket_id; + + pagesz = type->page_sz; + socket_id = type->socket_id; + + /* + * we need to create segment lists for this type. we must take + * into account the following things: + * + * 1. total amount of memory we can use for this memory type + * 2. total amount of memory per memseg list allowed + * 3. number of segments needed to fit the amount of memory + * 4. number of segments allowed per type + * 5. number of segments allowed per memseg list + * 6. number of memseg lists we are allowed to take up + */ + + /* calculate how much segments we will need in total */ + max_segs_per_type = max_mem_per_type / pagesz; + /* limit number of segments to maximum allowed per type */ + max_segs_per_type = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_TYPE); + /* limit number of segments to maximum allowed per list */ + max_segs_per_list = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_LIST); + + /* calculate how much memory we can have per segment list */ + max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz, + (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20); + + /* calculate how many segments each segment list will have */ + n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz); + + /* calculate how many segment lists we can have */ + n_seglists = RTE_MIN(max_segs_per_type / n_segs, + max_mem_per_type / max_mem_per_list); + + /* limit number of segment lists according to our maximum */ + n_seglists = RTE_MIN(n_seglists, max_seglists_per_type); + + RTE_LOG(DEBUG, EAL, "Creating %i segment lists: " + "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n", + n_seglists, n_segs, socket_id, pagesz); + + /* create all segment lists */ + for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) { + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } + msl = &mcfg->memsegs[msl_idx++]; + + if (alloc_memseg_list(msl, pagesz, n_segs, + socket_id, cur_seglist)) + goto out; + + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + goto out; + } + } + } + /* we're successful */ + ret = 0; +out: + free(memtypes); + return ret; +} + +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +int +rte_eal_memseg_init(void) +{ + /* increase rlimit to maximum */ + struct rlimit lim; + + if (getrlimit(RLIMIT_NOFILE, &lim) == 0) { + /* set limit to maximum */ + lim.rlim_cur = lim.rlim_max; + + if (setrlimit(RLIMIT_NOFILE, &lim) < 0) { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n", + strerror(errno)); + } else { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %" + PRIu64 "\n", + (uint64_t)lim.rlim_cur); + } + } else { + RTE_LOG(ERR, EAL, "Cannot get current resource limits\n"); + } +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (!internal_config.legacy_mem && rte_socket_count() > 1) { + RTE_LOG(WARNING, EAL, "DPDK is running on a NUMA system, but is compiled without NUMA support.\n"); + RTE_LOG(WARNING, EAL, "This will have adverse consequences for performance and usability.\n"); + RTE_LOG(WARNING, EAL, "Please use --"OPT_LEGACY_MEM" option, or recompile with NUMA support.\n"); + } +#endif + + return rte_eal_process_type() == RTE_PROC_PRIMARY ? +#ifndef RTE_ARCH_64 + memseg_primary_init_32() : +#else + memseg_primary_init() : +#endif + memseg_secondary_init(); +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_thread.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_thread.c new file mode 100644 index 000000000..cd9d6e0eb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_thread.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <pthread.h> +#include <sched.h> +#include <sys/queue.h> +#include <sys/syscall.h> + +#include <rte_debug.h> +#include <rte_atomic.h> +#include <rte_launch.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_per_lcore.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_eal_trace.h> + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. + */ +int +rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + int rc = -EBUSY; + + if (lcore_config[slave_id].state != WAIT) + goto finish; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + rc = 0; +finish: + rte_eal_trace_thread_remote_launch(f, arg, slave_id, rc); + return rc; +} + +/* set affinity for current EAL thread */ +static int +eal_thread_set_affinity(void) +{ + unsigned lcore_id = rte_lcore_id(); + + /* acquire system unique id */ + rte_gettid(); + + /* update EAL thread core affinity */ + return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); +} + +void eal_thread_init_master(unsigned lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); +} + +/* main loop of threads */ +__rte_noreturn void * +eal_thread_loop(__rte_unused void *arg) +{ + char c; + int n, ret; + unsigned lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = pthread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "..."); + + __rte_trace_mem_per_thread_alloc(); + rte_eal_trace_thread_lcore_ready(lcore_id, cpuset); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + + /* when a service core returns, it should go directly to WAIT + * state, because the application will not lcore_wait() for it. + */ + if (lcore_config[lcore_id].core_role == ROLE_SERVICE) + lcore_config[lcore_id].state = WAIT; + else + lcore_config[lcore_id].state = FINISHED; + } + + /* never reached */ + /* pthread_exit(NULL); */ + /* return NULL; */ +} + +/* require calling thread tid by gettid() */ +int rte_sys_gettid(void) +{ + return (int)syscall(SYS_gettid); +} + +int rte_thread_setname(pthread_t id, const char *name) +{ + int ret = ENOSYS; +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + ret = pthread_setname_np(id, name); +#endif +#endif + RTE_SET_USED(id); + RTE_SET_USED(name); + return -ret; +} + +int rte_thread_getname(pthread_t id, char *name, size_t len) +{ + int ret = ENOSYS; +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + ret = pthread_getname_np(id, name, len); +#endif +#endif + RTE_SET_USED(id); + RTE_SET_USED(name); + RTE_SET_USED(len); + return -ret; + +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_timer.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_timer.c new file mode 100644 index 000000000..6dc6b565d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_timer.c @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2012-2013 6WIND S.A. + */ + +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <fcntl.h> +#include <inttypes.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <pthread.h> +#include <errno.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_cycles.h> +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" + +enum timer_source eal_timer_source = EAL_TIMER_HPET; + +#ifdef RTE_LIBEAL_USE_HPET + +#define DEV_HPET "/dev/hpet" + +/* Maximum number of counters. */ +#define HPET_TIMER_NUM 3 + +/* General capabilities register */ +#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */ +#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */ + +/** + * HPET timer registers. From the Intel IA-PC HPET (High Precision Event + * Timers) Specification. + */ +struct eal_hpet_regs { + /* Memory-mapped, software visible registers */ + uint64_t capabilities; /**< RO General Capabilities Register. */ + uint64_t reserved0; /**< Reserved for future use. */ + uint64_t config; /**< RW General Configuration Register. */ + uint64_t reserved1; /**< Reserved for future use. */ + uint64_t isr; /**< RW Clear General Interrupt Status. */ + uint64_t reserved2[25]; /**< Reserved for future use. */ + union { + uint64_t counter; /**< RW Main Counter Value Register. */ + struct { + uint32_t counter_l; /**< RW Main Counter Low. */ + uint32_t counter_h; /**< RW Main Counter High. */ + }; + }; + uint64_t reserved3; /**< Reserved for future use. */ + struct { + uint64_t config; /**< RW Timer Config and Capability Reg. */ + uint64_t comp; /**< RW Timer Comparator Value Register. */ + uint64_t fsb; /**< RW FSB Interrupt Route Register. */ + uint64_t reserved4; /**< Reserved for future use. */ + } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */ +}; + +/* Mmap'd hpet registers */ +static volatile struct eal_hpet_regs *eal_hpet = NULL; + +/* Period at which the HPET counter increments in + * femtoseconds (10^-15 seconds). */ +static uint32_t eal_hpet_resolution_fs = 0; + +/* Frequency of the HPET counter in Hz */ +static uint64_t eal_hpet_resolution_hz = 0; + +/* Incremented 4 times during one 32bits hpet full count */ +static uint32_t eal_hpet_msb; + +static pthread_t msb_inc_thread_id; + +/* + * This function runs on a specific thread to update a global variable + * containing used to process MSB of the HPET (unfortunately, we need + * this because hpet is 32 bits by default under linux). + */ +static void * +hpet_msb_inc(__rte_unused void *arg) +{ + uint32_t t; + + while (1) { + t = (eal_hpet->counter_l >> 30); + if (t != (eal_hpet_msb & 3)) + eal_hpet_msb ++; + sleep(10); + } + return NULL; +} + +uint64_t +rte_get_hpet_hz(void) +{ + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + return eal_hpet_resolution_hz; +} + +uint64_t +rte_get_hpet_cycles(void) +{ + uint32_t t, msb; + uint64_t ret; + + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + t = eal_hpet->counter_l; + msb = eal_hpet_msb; + ret = (msb + 2 - (t >> 30)) / 4; + ret <<= 32; + ret += t; + return ret; +} + +#endif + +#ifdef RTE_LIBEAL_USE_HPET +/* + * Open and mmap /dev/hpet (high precision event timer) that will + * provide our time reference. + */ +int +rte_eal_hpet_init(int make_default) +{ + int fd, ret; + + if (internal_config.no_hpet) { + RTE_LOG(NOTICE, EAL, "HPET is disabled\n"); + return -1; + } + + fd = open(DEV_HPET, O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n", + strerror(errno)); + internal_config.no_hpet = 1; + return -1; + } + eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0); + if (eal_hpet == MAP_FAILED) { + RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n" + "Please enable CONFIG_HPET_MMAP in your kernel configuration " + "to allow HPET support.\n" + "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n " + "in your build configuration or use '--no-hpet' EAL flag.\n"); + close(fd); + internal_config.no_hpet = 1; + return -1; + } + close(fd); + + eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities & + CLK_PERIOD_MASK) >> + CLK_PERIOD_SHIFT); + + eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) / + (uint64_t)eal_hpet_resolution_fs; + + RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n", + eal_hpet_resolution_hz/1000); + + eal_hpet_msb = (eal_hpet->counter_l >> 30); + + /* create a thread that will increment a global variable for + * msb (hpet is 32 bits by default under linux) */ + ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, + hpet_msb_inc, NULL); + if (ret != 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); + internal_config.no_hpet = 1; + return -1; + } + + if (make_default) + eal_timer_source = EAL_TIMER_HPET; + return 0; +} +#endif + +uint64_t +get_tsc_freq(void) +{ +#ifdef CLOCK_MONOTONIC_RAW +#define NS_PER_SEC 1E9 +#define CYC_PER_10MHZ 1E7 + + struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */ + + struct timespec t_start, t_end; + uint64_t tsc_hz; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) { + uint64_t ns, end, start = rte_rdtsc(); + nanosleep(&sleeptime,NULL); + clock_gettime(CLOCK_MONOTONIC_RAW, &t_end); + end = rte_rdtsc(); + ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC); + ns += (t_end.tv_nsec - t_start.tv_nsec); + + double secs = (double)ns/NS_PER_SEC; + tsc_hz = (uint64_t)((end - start)/secs); + /* Round up to 10Mhz. 1E7 ~ 10Mhz */ + return RTE_ALIGN_MUL_NEAR(tsc_hz, CYC_PER_10MHZ); + } +#endif + return 0; +} + +int +rte_eal_timer_init(void) +{ + + eal_timer_source = EAL_TIMER_TSC; + + set_tsc_freq(); + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_vfio.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_vfio.c new file mode 100644 index 000000000..d26e1649a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_vfio.c @@ -0,0 +1,2186 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include <inttypes.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/ioctl.h> + +#include <rte_errno.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_eal_memconfig.h> +#include <rte_vfio.h> + +#include "eal_filesystem.h" +#include "eal_memcfg.h" +#include "eal_vfio.h" +#include "eal_private.h" + +#ifdef VFIO_PRESENT + +#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb" + +/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can + * recreate the mappings for DPDK segments, but we cannot do so for memory that + * was registered by the user themselves, so we need to store the user mappings + * somewhere, to recreate them later. + */ +#define VFIO_MAX_USER_MEM_MAPS 256 +struct user_mem_map { + uint64_t addr; + uint64_t iova; + uint64_t len; +}; + +struct user_mem_maps { + rte_spinlock_recursive_t lock; + int n_maps; + struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS]; +}; + +struct vfio_config { + int vfio_enabled; + int vfio_container_fd; + int vfio_active_groups; + const struct vfio_iommu_type *vfio_iommu_type; + struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; + struct user_mem_maps mem_maps; +}; + +/* per-process VFIO config */ +static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS]; +static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0]; + +static int vfio_type1_dma_map(int); +static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_spapr_dma_map(int); +static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_noiommu_dma_map(int); +static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, + uint64_t iova, uint64_t len, int do_map); + +/* IOMMU types we support */ +static const struct vfio_iommu_type iommu_types[] = { + /* x86 IOMMU, otherwise known as type 1 */ + { + .type_id = RTE_VFIO_TYPE1, + .name = "Type 1", + .dma_map_func = &vfio_type1_dma_map, + .dma_user_map_func = &vfio_type1_dma_mem_map + }, + /* ppc64 IOMMU, otherwise known as spapr */ + { + .type_id = RTE_VFIO_SPAPR, + .name = "sPAPR", + .dma_map_func = &vfio_spapr_dma_map, + .dma_user_map_func = &vfio_spapr_dma_mem_map + }, + /* IOMMU-less mode */ + { + .type_id = RTE_VFIO_NOIOMMU, + .name = "No-IOMMU", + .dma_map_func = &vfio_noiommu_dma_map, + .dma_user_map_func = &vfio_noiommu_dma_mem_map + }, +}; + +static int +is_null_map(const struct user_mem_map *map) +{ + return map->addr == 0 && map->iova == 0 && map->len == 0; +} + +/* we may need to merge user mem maps together in case of user mapping/unmapping + * chunks of memory, so we'll need a comparator function to sort segments. + */ +static int +user_mem_map_cmp(const void *a, const void *b) +{ + const struct user_mem_map *umm_a = a; + const struct user_mem_map *umm_b = b; + + /* move null entries to end */ + if (is_null_map(umm_a)) + return 1; + if (is_null_map(umm_b)) + return -1; + + /* sort by iova first */ + if (umm_a->iova < umm_b->iova) + return -1; + if (umm_a->iova > umm_b->iova) + return 1; + + if (umm_a->addr < umm_b->addr) + return -1; + if (umm_a->addr > umm_b->addr) + return 1; + + if (umm_a->len < umm_b->len) + return -1; + if (umm_a->len > umm_b->len) + return 1; + + return 0; +} + +/* adjust user map entry. this may result in shortening of existing map, or in + * splitting existing map in two pieces. + */ +static void +adjust_map(struct user_mem_map *src, struct user_mem_map *end, + uint64_t remove_va_start, uint64_t remove_len) +{ + /* if va start is same as start address, we're simply moving start */ + if (remove_va_start == src->addr) { + src->addr += remove_len; + src->iova += remove_len; + src->len -= remove_len; + } else if (remove_va_start + remove_len == src->addr + src->len) { + /* we're shrinking mapping from the end */ + src->len -= remove_len; + } else { + /* we're blowing a hole in the middle */ + struct user_mem_map tmp; + uint64_t total_len = src->len; + + /* adjust source segment length */ + src->len = remove_va_start - src->addr; + + /* create temporary segment in the middle */ + tmp.addr = src->addr + src->len; + tmp.iova = src->iova + src->len; + tmp.len = remove_len; + + /* populate end segment - this one we will be keeping */ + end->addr = tmp.addr + tmp.len; + end->iova = tmp.iova + tmp.len; + end->len = total_len - src->len - tmp.len; + } +} + +/* try merging two maps into one, return 1 if succeeded */ +static int +merge_map(struct user_mem_map *left, struct user_mem_map *right) +{ + if (left->addr + left->len != right->addr) + return 0; + if (left->iova + left->len != right->iova) + return 0; + + left->len += right->len; + + memset(right, 0, sizeof(*right)); + + return 1; +} + +static struct user_mem_map * +find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr, + uint64_t iova, uint64_t len) +{ + uint64_t va_end = addr + len; + uint64_t iova_end = iova + len; + int i; + + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = &user_mem_maps->maps[i]; + uint64_t map_va_end = map->addr + map->len; + uint64_t map_iova_end = map->iova + map->len; + + /* check start VA */ + if (addr < map->addr || addr >= map_va_end) + continue; + /* check if VA end is within boundaries */ + if (va_end <= map->addr || va_end > map_va_end) + continue; + + /* check start IOVA */ + if (iova < map->iova || iova >= map_iova_end) + continue; + /* check if IOVA end is within boundaries */ + if (iova_end <= map->iova || iova_end > map_iova_end) + continue; + + /* we've found our map */ + return map; + } + return NULL; +} + +/* this will sort all user maps, and merge/compact any adjacent maps */ +static void +compact_user_maps(struct user_mem_maps *user_mem_maps) +{ + int i, n_merged, cur_idx; + + qsort(user_mem_maps->maps, user_mem_maps->n_maps, + sizeof(user_mem_maps->maps[0]), user_mem_map_cmp); + + /* we'll go over the list backwards when merging */ + n_merged = 0; + for (i = user_mem_maps->n_maps - 2; i >= 0; i--) { + struct user_mem_map *l, *r; + + l = &user_mem_maps->maps[i]; + r = &user_mem_maps->maps[i + 1]; + + if (is_null_map(l) || is_null_map(r)) + continue; + + if (merge_map(l, r)) + n_merged++; + } + + /* the entries are still sorted, but now they have holes in them, so + * walk through the list and remove the holes + */ + if (n_merged > 0) { + cur_idx = 0; + for (i = 0; i < user_mem_maps->n_maps; i++) { + if (!is_null_map(&user_mem_maps->maps[i])) { + struct user_mem_map *src, *dst; + + src = &user_mem_maps->maps[i]; + dst = &user_mem_maps->maps[cur_idx++]; + + if (src != dst) { + memcpy(dst, src, sizeof(*src)); + memset(src, 0, sizeof(*src)); + } + } + } + user_mem_maps->n_maps = cur_idx; + } +} + +static int +vfio_open_group_fd(int iommu_group_num) +{ + int vfio_group_fd; + char filename[PATH_MAX]; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply = {0}; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + /* if primary, try to open the group */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try regular group format */ + snprintf(filename, sizeof(filename), + VFIO_GROUP_FMT, iommu_group_num); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + /* if file not found, it's not an error */ + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + + /* special case: try no-IOMMU path as well */ + snprintf(filename, sizeof(filename), + VFIO_NOIOMMU_GROUP_FMT, + iommu_group_num); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + return 0; + } + /* noiommu group found */ + } + + return vfio_group_fd; + } + /* if we're in a secondary process, request group fd from the primary + * process via mp channel. + */ + p->req = SOCKET_REQ_GROUP; + p->group_num = iommu_group_num; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_group_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_group_fd = mp_rep->fds[0]; + } else if (p->result == SOCKET_NO_FD) { + RTE_LOG(ERR, EAL, " bad VFIO group fd\n"); + vfio_group_fd = 0; + } + } + + free(mp_reply.msgs); + if (vfio_group_fd < 0) + RTE_LOG(ERR, EAL, " cannot request group fd\n"); + return vfio_group_fd; +} + +static struct vfio_config * +get_vfio_cfg_by_group_num(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + if (vfio_cfg->vfio_groups[j].group_num == + iommu_group_num) + return vfio_cfg; + } + } + + return NULL; +} + +static int +vfio_get_group_fd(struct vfio_config *vfio_cfg, + int iommu_group_num) +{ + int i; + int vfio_group_fd; + struct vfio_group *cur_grp; + + /* check if we already have the group descriptor open */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) + return vfio_cfg->vfio_groups[i].fd; + + /* Lets see first if there is room for a new group */ + if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + return -1; + } + + /* Now lets get an index for the new group */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == -1) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); + return -1; + } + + vfio_group_fd = vfio_open_group_fd(iommu_group_num); + if (vfio_group_fd <= 0) { + RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); + return -1; + } + + cur_grp->group_num = iommu_group_num; + cur_grp->fd = vfio_group_fd; + vfio_cfg->vfio_active_groups++; + + return vfio_group_fd; +} + +static struct vfio_config * +get_vfio_cfg_by_group_fd(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return vfio_cfg; + } + + return NULL; +} + +static struct vfio_config * +get_vfio_cfg_by_container_fd(int container_fd) +{ + int i; + + if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD) + return default_vfio_cfg; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == container_fd) + return &vfio_cfgs[i]; + } + + return NULL; +} + +int +rte_vfio_get_group_fd(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + + return vfio_get_group_fd(vfio_cfg, iommu_group_num); +} + +static int +get_vfio_group_idx(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return j; + } + + return -1; +} + +static void +vfio_group_device_get(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg->vfio_groups[i].devices++; +} + +static void +vfio_group_device_put(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg->vfio_groups[i].devices--; +} + +static int +vfio_group_device_count(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) { + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + return -1; + } + + return vfio_cfg->vfio_groups[i].devices; +} + +static void +vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, + void *arg __rte_unused) +{ + rte_iova_t iova_start, iova_expected; + struct rte_memseg_list *msl; + struct rte_memseg *ms; + size_t cur_len = 0; + uint64_t va_start; + + msl = rte_mem_virt2memseg_list(addr); + + /* for IOVA as VA mode, no need to care for IOVA addresses */ + if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { + uint64_t vfio_va = (uint64_t)(uintptr_t)addr; + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 1); + else + vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 0); + return; + } + +#ifdef RTE_ARCH_PPC_64 + ms = rte_mem_virt2memseg(addr, msl); + while (cur_len < len) { + int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + + rte_fbarray_set_free(&msl->memseg_arr, idx); + cur_len += ms->len; + ++ms; + } + cur_len = 0; +#endif + /* memsegs are contiguous in memory */ + ms = rte_mem_virt2memseg(addr, msl); + + /* + * This memory is not guaranteed to be contiguous, but it still could + * be, or it could have some small contiguous chunks. Since the number + * of VFIO mappings is limited, and VFIO appears to not concatenate + * adjacent mappings, we have to do this ourselves. + * + * So, find contiguous chunks, then map them. + */ + va_start = ms->addr_64; + iova_start = iova_expected = ms->iova; + while (cur_len < len) { + bool new_contig_area = ms->iova != iova_expected; + bool last_seg = (len - cur_len) == ms->len; + bool skip_last = false; + + /* only do mappings when current contiguous area ends */ + if (new_contig_area) { + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, va_start, + iova_start, + iova_expected - iova_start, 1); + else + vfio_dma_mem_map(default_vfio_cfg, va_start, + iova_start, + iova_expected - iova_start, 0); + va_start = ms->addr_64; + iova_start = ms->iova; + } + /* some memory segments may have invalid IOVA */ + if (ms->iova == RTE_BAD_IOVA) { + RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n", + ms->addr); + skip_last = true; + } + iova_expected = ms->iova + ms->len; + cur_len += ms->len; + ++ms; + + /* + * don't count previous segment, and don't attempt to + * dereference a potentially invalid pointer. + */ + if (skip_last && !last_seg) { + iova_expected = iova_start = ms->iova; + va_start = ms->addr_64; + } else if (!skip_last && last_seg) { + /* this is the last segment and we're not skipping */ + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, va_start, + iova_start, + iova_expected - iova_start, 1); + else + vfio_dma_mem_map(default_vfio_cfg, va_start, + iova_start, + iova_expected - iova_start, 0); + } + } +#ifdef RTE_ARCH_PPC_64 + cur_len = 0; + ms = rte_mem_virt2memseg(addr, msl); + while (cur_len < len) { + int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + + rte_fbarray_set_used(&msl->memseg_arr, idx); + cur_len += ms->len; + ++ms; + } +#endif +} + +static int +vfio_sync_default_container(void) +{ + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply = {0}; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + int iommu_type_id; + unsigned int i; + + /* cannot be called from primary */ + if (rte_eal_process_type() != RTE_PROC_SECONDARY) + return -1; + + /* default container fd should have been opened in rte_vfio_enable() */ + if (!default_vfio_cfg->vfio_enabled || + default_vfio_cfg->vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, "VFIO support is not initialized\n"); + return -1; + } + + /* find default container's IOMMU type */ + p->req = SOCKET_REQ_IOMMU_TYPE; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + iommu_type_id = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK) + iommu_type_id = p->iommu_type_id; + } + free(mp_reply.msgs); + if (iommu_type_id < 0) { + RTE_LOG(ERR, EAL, "Could not get IOMMU type for default container\n"); + return -1; + } + + /* we now have an fd for default container, as well as its IOMMU type. + * now, set up default VFIO container config to match. + */ + for (i = 0; i < RTE_DIM(iommu_types); i++) { + const struct vfio_iommu_type *t = &iommu_types[i]; + if (t->type_id != iommu_type_id) + continue; + + /* we found our IOMMU type */ + default_vfio_cfg->vfio_iommu_type = t; + + return 0; + } + RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n", + iommu_type_id); + return -1; +} + +int +rte_vfio_clear_group(int vfio_group_fd) +{ + int i; + struct vfio_config *vfio_cfg; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0) + return -1; + vfio_cfg->vfio_groups[i].group_num = -1; + vfio_cfg->vfio_groups[i].fd = -1; + vfio_cfg->vfio_groups[i].devices = 0; + vfio_cfg->vfio_active_groups--; + + return 0; +} + +int +rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info *device_info) +{ + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int vfio_container_fd; + int vfio_group_fd; + int iommu_group_num; + int i, ret; + + /* get group number */ + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); + if (ret == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* if negative, something failed */ + if (ret < 0) + return -1; + + /* get the actual group fd */ + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); + if (vfio_group_fd < 0) + return -1; + + /* if group_fd == 0, that means the device isn't managed by VFIO */ + if (vfio_group_fd == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* + * at this point, we know that this group is viable (meaning, all devices + * are either bound to VFIO or not bound to anything) + */ + + /* check if the group is viable */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get group status, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + RTE_LOG(ERR, EAL, " %s VFIO group is not viable! " + "Not all devices in IOMMU group bound to VFIO or unbound\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + vfio_container_fd = vfio_cfg->vfio_container_fd; + user_mem_maps = &vfio_cfg->mem_maps; + + /* check if group does not have a container yet */ + if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { + + /* add group to a container */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, + &vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* + * pick an IOMMU type and set up DMA mappings for container + * + * needs to be done only once, only when first group is + * assigned to a container and only in primary process. + * Note this can happen several times with the hotplug + * functionality. + */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + vfio_cfg->vfio_active_groups == 1 && + vfio_group_device_count(vfio_group_fd) == 0) { + const struct vfio_iommu_type *t; + + /* select an IOMMU type which we will be using */ + t = vfio_set_iommu_type(vfio_container_fd); + if (!t) { + RTE_LOG(ERR, EAL, + " %s failed to select IOMMU type\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + /* lock memory hotplug before mapping and release it + * after registering callback, to prevent races + */ + rte_mcfg_mem_read_lock(); + if (vfio_cfg == default_vfio_cfg) + ret = t->dma_map_func(vfio_container_fd); + else + ret = 0; + if (ret) { + RTE_LOG(ERR, EAL, + " %s DMA remapping failed, error %i (%s)\n", + dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + rte_mcfg_mem_read_unlock(); + return -1; + } + + vfio_cfg->vfio_iommu_type = t; + + /* re-map all user-mapped segments */ + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* this IOMMU type may not support DMA mapping, but + * if we have mappings in the list - that means we have + * previously mapped something successfully, so we can + * be sure that DMA mapping is supported. + */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map; + map = &user_mem_maps->maps[i]; + + ret = t->dma_user_map_func( + vfio_container_fd, + map->addr, map->iova, map->len, + 1); + if (ret) { + RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: " + "va: 0x%" PRIx64 " " + "iova: 0x%" PRIx64 " " + "len: 0x%" PRIu64 "\n", + map->addr, map->iova, + map->len); + rte_spinlock_recursive_unlock( + &user_mem_maps->lock); + rte_mcfg_mem_read_unlock(); + return -1; + } + } + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + + /* register callback for mem events */ + if (vfio_cfg == default_vfio_cfg) + ret = rte_mem_event_callback_register( + VFIO_MEM_EVENT_CLB_NAME, + vfio_mem_event_callback, NULL); + else + ret = 0; + /* unlock memory hotplug */ + rte_mcfg_mem_read_unlock(); + + if (ret && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n"); + return -1; + } + if (ret) + RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n"); + else + RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n"); + } + } else if (rte_eal_process_type() != RTE_PROC_PRIMARY && + vfio_cfg == default_vfio_cfg && + vfio_cfg->vfio_iommu_type == NULL) { + /* if we're not a primary process, we do not set up the VFIO + * container because it's already been set up by the primary + * process. instead, we simply ask the primary about VFIO type + * we are using, and set the VFIO config up appropriately. + */ + ret = vfio_sync_default_container(); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n"); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + /* we have successfully initialized VFIO, notify user */ + const struct vfio_iommu_type *t = + default_vfio_cfg->vfio_iommu_type; + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + } + + /* get a file descriptor for the device */ + *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); + if (*vfio_dev_fd < 0) { + /* if we cannot get a device fd, this implies a problem with + * the VFIO group or the container not having IOMMU configured. + */ + + RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* test and setup the device */ + ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get device info, " + "error %i (%s)\n", dev_addr, errno, + strerror(errno)); + close(*vfio_dev_fd); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + vfio_group_device_get(vfio_group_fd); + + return 0; +} + +int +rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, + int vfio_dev_fd) +{ + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + struct vfio_config *vfio_cfg; + int vfio_group_fd; + int iommu_group_num; + int ret; + + /* we don't want any DMA mapping messages to come while we're detaching + * VFIO device, because this might be the last device and we might need + * to unregister the callback. + */ + rte_mcfg_mem_read_lock(); + + /* get group number */ + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); + if (ret <= 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n", + dev_addr); + /* This is an error at this point. */ + ret = -1; + goto out; + } + + /* get the actual group fd */ + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); + if (vfio_group_fd <= 0) { + RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n", + dev_addr); + ret = -1; + goto out; + } + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + + /* At this point we got an active group. Closing it will make the + * container detachment. If this is the last active group, VFIO kernel + * code will unset the container and the IOMMU mappings. + */ + + /* Closing a device */ + if (close(vfio_dev_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n", + dev_addr); + ret = -1; + goto out; + } + + /* An VFIO group can have several devices attached. Just when there is + * no devices remaining should the group be closed. + */ + vfio_group_device_put(vfio_group_fd); + if (!vfio_group_device_count(vfio_group_fd)) { + + if (close(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n", + dev_addr); + ret = -1; + goto out; + } + + if (rte_vfio_clear_group(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when clearing group for %s\n", + dev_addr); + ret = -1; + goto out; + } + } + + /* if there are no active device groups, unregister the callback to + * avoid spurious attempts to map/unmap memory from VFIO. + */ + if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 && + rte_eal_process_type() != RTE_PROC_SECONDARY) + rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, + NULL); + + /* success */ + ret = 0; + +out: + rte_mcfg_mem_read_unlock(); + return ret; +} + +int +rte_vfio_enable(const char *modname) +{ + /* initialize group list */ + int i, j; + int vfio_available; + + rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfgs[i].vfio_container_fd = -1; + vfio_cfgs[i].vfio_active_groups = 0; + vfio_cfgs[i].vfio_iommu_type = NULL; + vfio_cfgs[i].mem_maps.lock = lock; + + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + vfio_cfgs[i].vfio_groups[j].fd = -1; + vfio_cfgs[i].vfio_groups[j].group_num = -1; + vfio_cfgs[i].vfio_groups[j].devices = 0; + } + } + + /* inform the user that we are probing for VFIO */ + RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); + + /* check if vfio module is loaded */ + vfio_available = rte_eal_check_module(modname); + + /* return error directly */ + if (vfio_available == -1) { + RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); + return -1; + } + + /* return 0 if VFIO modules not loaded */ + if (vfio_available == 0) { + RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " + "skipping VFIO support...\n"); + return 0; + } + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* open a new container */ + default_vfio_cfg->vfio_container_fd = + rte_vfio_get_container_fd(); + } else { + /* get the default container from the primary process */ + default_vfio_cfg->vfio_container_fd = + vfio_get_default_container_fd(); + } + + /* check if we have VFIO driver enabled */ + if (default_vfio_cfg->vfio_container_fd != -1) { + RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); + default_vfio_cfg->vfio_enabled = 1; + } else { + RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); + } + + return 0; +} + +int +rte_vfio_is_enabled(const char *modname) +{ + const int mod_available = rte_eal_check_module(modname) > 0; + return default_vfio_cfg->vfio_enabled && mod_available; +} + +int +vfio_get_default_container_fd(void) +{ + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply = {0}; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + int container_fd; + + if (default_vfio_cfg->vfio_enabled) + return default_vfio_cfg->vfio_container_fd; + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* if we were secondary process we would try requesting + * container fd from the primary, but we're the primary + * process so just exit here + */ + return -1; + } + + p->req = SOCKET_REQ_DEFAULT_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + container_fd = mp_rep->fds[0]; + free(mp_reply.msgs); + return container_fd; + } + } + + free(mp_reply.msgs); + RTE_LOG(ERR, EAL, " cannot request default container fd\n"); + return -1; +} + +int +vfio_get_iommu_type(void) +{ + if (default_vfio_cfg->vfio_iommu_type == NULL) + return -1; + + return default_vfio_cfg->vfio_iommu_type->type_id; +} + +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd) +{ + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + return t; + } + /* not an error, there may be more supported IOMMU types */ + RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " + "error %i (%s)\n", t->type_id, t->name, errno, + strerror(errno)); + } + /* if we didn't find a suitable IOMMU type, fail */ + return NULL; +} + +int +vfio_has_supported_extensions(int vfio_container_fd) +{ + int ret; + unsigned idx, n_extensions = 0; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, + t->type_id); + if (ret < 0) { + RTE_LOG(ERR, EAL, " could not get IOMMU type, " + "error %i (%s)\n", errno, + strerror(errno)); + close(vfio_container_fd); + return -1; + } else if (ret == 1) { + /* we found a supported extension */ + n_extensions++; + } + RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", + t->type_id, t->name, + ret ? "supported" : "not supported"); + } + + /* if we didn't find any supported IOMMU types, fail */ + if (!n_extensions) { + close(vfio_container_fd); + return -1; + } + + return 0; +} + +int +rte_vfio_get_container_fd(void) +{ + int ret, vfio_container_fd; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply = {0}; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + + /* if we're in a primary process, try to open the container */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot open VFIO container, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* check VFIO API version */ + ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + if (ret < 0) + RTE_LOG(ERR, EAL, " could not get VFIO API version, " + "error %i (%s)\n", errno, strerror(errno)); + else + RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); + close(vfio_container_fd); + return -1; + } + + ret = vfio_has_supported_extensions(vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " no supported IOMMU " + "extensions found!\n"); + return -1; + } + + return vfio_container_fd; + } + /* + * if we're in a secondary process, request container fd from the + * primary process via mp channel + */ + p->req = SOCKET_REQ_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_container_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_container_fd = mp_rep->fds[0]; + free(mp_reply.msgs); + return vfio_container_fd; + } + } + + free(mp_reply.msgs); + RTE_LOG(ERR, EAL, " cannot request container fd\n"); + return -1; +} + +int +rte_vfio_get_group_num(const char *sysfs_base, + const char *dev_addr, int *iommu_group_num) +{ + char linkname[PATH_MAX]; + char filename[PATH_MAX]; + char *tok[16], *group_tok, *end; + int ret; + + memset(linkname, 0, sizeof(linkname)); + memset(filename, 0, sizeof(filename)); + + /* try to find out IOMMU group for this device */ + snprintf(linkname, sizeof(linkname), + "%s/%s/iommu_group", sysfs_base, dev_addr); + + ret = readlink(linkname, filename, sizeof(filename)); + + /* if the link doesn't exist, no VFIO for us */ + if (ret < 0) + return 0; + + ret = rte_strsplit(filename, sizeof(filename), + tok, RTE_DIM(tok), '/'); + + if (ret <= 0) { + RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr); + return -1; + } + + /* IOMMU group is always the last token */ + errno = 0; + group_tok = tok[ret - 1]; + end = group_tok; + *iommu_group_num = strtol(group_tok, &end, 10); + if ((end != group_tok && *end != '\0') || errno != 0) { + RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr); + return -1; + } + + return 1; +} + +static int +type1_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + size_t len, void *arg) +{ + int *vfio_container_fd = arg; + + if (msl->external) + return 0; + + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + len, 1); +} + +static int +type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + int *vfio_container_fd = arg; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) + return 0; + + /* skip any segments with invalid IOVA addresses */ + if (ms->iova == RTE_BAD_IOVA) + return 0; + + /* if IOVA mode is VA, we've already mapped the internal segments */ + if (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA) + return 0; + + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +static int +vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + + if (do_map != 0) { + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + /** + * In case the mapping was already done EEXIST will be + * returned from kernel. + */ + if (errno == EEXIST) { + RTE_LOG(DEBUG, EAL, + " Memory segment is already mapped," + " skipping"); + } else { + RTE_LOG(ERR, EAL, + " cannot set up DMA remapping," + " error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + } else { + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_type1_dma_map(int vfio_container_fd) +{ + if (rte_eal_iova_mode() == RTE_IOVA_VA) { + /* with IOVA as VA mode, we can get away with mapping contiguous + * chunks rather than going page-by-page. + */ + int ret = rte_memseg_contig_walk(type1_map_contig, + &vfio_container_fd); + if (ret) + return ret; + /* we have to continue the walk because we've skipped the + * external segments during the config walk. + */ + } + return rte_memseg_walk(type1_map, &vfio_container_fd); +} + +static int +vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + reg.vaddr = (uintptr_t) vaddr; + reg.size = len; + + if (do_map != 0) { + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + /** + * In case the mapping was already done EBUSY will be + * returned from kernel. + */ + if (errno == EBUSY) { + RTE_LOG(DEBUG, EAL, + " Memory segment is already mapped," + " skipping"); + } else { + RTE_LOG(ERR, EAL, + " cannot set up DMA remapping," + " error %i (%s)\n", errno, + strerror(errno)); + return -1; + } + } + + } else { + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_spapr_map_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) +{ + int *vfio_container_fd = arg; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) + return 0; + + /* skip any segments with invalid IOVA addresses */ + if (ms->iova == RTE_BAD_IOVA) + return 0; + + return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +static int +vfio_spapr_unmap_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) +{ + int *vfio_container_fd = arg; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) + return 0; + + /* skip any segments with invalid IOVA addresses */ + if (ms->iova == RTE_BAD_IOVA) + return 0; + + return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 0); +} + +struct spapr_walk_param { + uint64_t window_size; + uint64_t hugepage_sz; +}; + +static int +vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) +{ + struct spapr_walk_param *param = arg; + uint64_t max = ms->iova + ms->len; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) + return 0; + + /* skip any segments with invalid IOVA addresses */ + if (ms->iova == RTE_BAD_IOVA) + return 0; + + if (max > param->window_size) { + param->hugepage_sz = ms->hugepage_sz; + param->window_size = max; + } + + return 0; +} + +static int +vfio_spapr_create_new_dma_window(int vfio_container_fd, + struct vfio_iommu_spapr_tce_create *create) { + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + }; + struct vfio_iommu_spapr_tce_info info = { + .argsz = sizeof(info), + }; + int ret; + + /* query spapr iommu info */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + RTE_LOG(ERR, EAL, " cannot get iommu info, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* remove default DMA of 32 bit window */ + remove.start_addr = info.dma32_window_start; + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + RTE_LOG(ERR, EAL, " cannot remove default DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* create new DMA window */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create); + if (ret) { +#ifdef VFIO_IOMMU_SPAPR_INFO_DDW + /* try possible page_shift and levels for workaround */ + uint32_t levels; + + for (levels = create->levels + 1; + ret && levels <= info.ddw.levels; levels++) { + create->levels = levels; + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_TCE_CREATE, create); + } +#endif + if (ret) { + RTE_LOG(ERR, EAL, " cannot create new DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + } + + if (create->start_addr != 0) { + RTE_LOG(ERR, EAL, " DMA window start address != 0\n"); + return -1; + } + + return 0; +} + +static int +vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct spapr_walk_param param; + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int i, ret = 0; + + vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid container fd!\n"); + return -1; + } + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* check if window size needs to be adjusted */ + memset(¶m, 0, sizeof(param)); + + /* we're inside a callback so use thread-unsafe version */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, + ¶m) < 0) { + RTE_LOG(ERR, EAL, "Could not get window size\n"); + ret = -1; + goto out; + } + + /* also check user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + uint64_t max = user_mem_maps->maps[i].iova + + user_mem_maps->maps[i].len; + param.window_size = RTE_MAX(param.window_size, max); + } + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (do_map) { + /* re-create window and remap the entire memory */ + if (iova + len > create.window_size) { + /* release all maps before recreating the window */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk, + &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not release DMA maps\n"); + ret = -1; + goto out; + } + /* release all user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = + &user_mem_maps->maps[i]; + if (vfio_spapr_dma_do_map(vfio_container_fd, + map->addr, map->iova, map->len, + 0)) { + RTE_LOG(ERR, EAL, "Could not release user DMA maps\n"); + ret = -1; + goto out; + } + } + create.window_size = rte_align64pow2(iova + len); + if (vfio_spapr_create_new_dma_window(vfio_container_fd, + &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + ret = -1; + goto out; + } + /* we're inside a callback, so use thread-unsafe version + */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk, + &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); + ret = -1; + goto out; + } + /* remap all user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = + &user_mem_maps->maps[i]; + if (vfio_spapr_dma_do_map(vfio_container_fd, + map->addr, map->iova, map->len, + 1)) { + RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n"); + ret = -1; + goto out; + } + } + } + if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) { + RTE_LOG(ERR, EAL, "Failed to map DMA\n"); + ret = -1; + goto out; + } + } else { + /* for unmap, check if iova within DMA window */ + if (iova > create.window_size) { + RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap"); + ret = -1; + goto out; + } + + vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0); + } +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +vfio_spapr_dma_map(int vfio_container_fd) +{ + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct spapr_walk_param param; + + memset(¶m, 0, sizeof(param)); + + /* create DMA window from 0 to max(phys_addr + len) */ + rte_memseg_walk(vfio_spapr_window_size_walk, ¶m); + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + return -1; + } + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) + return -1; + + return 0; +} + +static int +vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, + uint64_t __rte_unused vaddr, + uint64_t __rte_unused iova, uint64_t __rte_unused len, + int __rte_unused do_map) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type; + + if (!t) { + RTE_LOG(ERR, EAL, " VFIO support not initialized\n"); + rte_errno = ENODEV; + return -1; + } + + if (!t->dma_user_map_func) { + RTE_LOG(ERR, EAL, + " VFIO custom DMA region maping not supported by IOMMU %s\n", + t->name); + rte_errno = ENOTSUP; + return -1; + } + + return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova, + len, do_map); +} + +static int +container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *new_map; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "No more space for user mem maps\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + /* map the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) { + /* technically, this will fail if there are currently no devices + * plugged in, even if a device were added later, this mapping + * might have succeeded. however, since we cannot verify if this + * is a valid mapping without having a device attached, consider + * this to be unsupported, because we can't just store any old + * mapping and pollute list of active mappings willy-nilly. + */ + RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n"); + ret = -1; + goto out; + } + /* create new user mem map entry */ + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + new_map->addr = vaddr; + new_map->iova = iova; + new_map->len = len; + + compact_user_maps(user_mem_maps); +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *map, *new_map = NULL; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* find our mapping */ + map = find_user_mem_map(user_mem_maps, vaddr, iova, len); + if (!map) { + RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n"); + rte_errno = EINVAL; + ret = -1; + goto out; + } + if (map->addr != vaddr || map->iova != iova || map->len != len) { + /* we're partially unmapping a previously mapped region, so we + * need to split entry into two. + */ + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + } + + /* unmap the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) { + /* there may not be any devices plugged in, so unmapping will + * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't + * stop us from removing the mapping, as the assumption is we + * won't be needing this memory any more and thus will want to + * prevent it from being remapped again on hotplug. so, only + * fail if we indeed failed to unmap (e.g. if the mapping was + * within our mapped range but had invalid alignment). + */ + if (rte_errno != ENODEV && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n"); + ret = -1; + goto out; + } else { + RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n"); + } + } + /* remove map from the list of active mappings */ + if (new_map != NULL) { + adjust_map(map, new_map, vaddr, len); + + /* if we've created a new map by splitting, sort everything */ + if (!is_null_map(new_map)) { + compact_user_maps(user_mem_maps); + } else { + /* we've created a new mapping, but it was unused */ + user_mem_maps->n_maps--; + } + } else { + memset(map, 0, sizeof(*map)); + compact_user_maps(user_mem_maps); + user_mem_maps->n_maps--; + } + +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +int +rte_vfio_noiommu_is_enabled(void) +{ + int fd; + ssize_t cnt; + char c; + + fd = open(VFIO_NOIOMMU_MODE, O_RDONLY); + if (fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n", + errno, strerror(errno)); + return -1; + } + /* + * else the file does not exists + * i.e. noiommu is not enabled + */ + return 0; + } + + cnt = read(fd, &c, 1); + close(fd); + if (cnt != 1) { + RTE_LOG(ERR, EAL, " unable to read from vfio noiommu " + "file %i (%s)\n", errno, strerror(errno)); + return -1; + } + + return c == 'Y'; +} + +int +rte_vfio_container_create(void) +{ + int i; + + /* Find an empty slot to store new vfio config */ + for (i = 1; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == -1) + break; + } + + if (i == VFIO_MAX_CONTAINERS) { + RTE_LOG(ERR, EAL, "exceed max vfio container limit\n"); + return -1; + } + + vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd(); + if (vfio_cfgs[i].vfio_container_fd < 0) { + RTE_LOG(NOTICE, EAL, "fail to create a new container\n"); + return -1; + } + + return vfio_cfgs[i].vfio_container_fd; +} + +int +rte_vfio_container_destroy(int container_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num != -1) + rte_vfio_container_group_unbind(container_fd, + vfio_cfg->vfio_groups[i].group_num); + + close(container_fd); + vfio_cfg->vfio_container_fd = -1; + vfio_cfg->vfio_active_groups = 0; + vfio_cfg->vfio_iommu_type = NULL; + + return 0; +} + +int +rte_vfio_container_group_bind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return vfio_get_group_fd(vfio_cfg, iommu_group_num); +} + +int +rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + struct vfio_group *cur_grp = NULL; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + for (i = 0; i < VFIO_MAX_GROUPS; i++) { + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS || cur_grp == NULL) { + RTE_LOG(ERR, EAL, "Specified group number not found\n"); + return -1; + } + + if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) { + RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for" + " iommu_group_num %d\n", iommu_group_num); + return -1; + } + cur_grp->group_num = -1; + cur_grp->fd = -1; + cur_grp->devices = 0; + vfio_cfg->vfio_active_groups--; + + return 0; +} + +int +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_map(vfio_cfg, vaddr, iova, len); +} + +int +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_unmap(vfio_cfg, vaddr, iova, len); +} + +#else + +int +rte_vfio_setup_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *vfio_dev_fd, + __rte_unused struct vfio_device_info *device_info) +{ + return -1; +} + +int +rte_vfio_release_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, __rte_unused int fd) +{ + return -1; +} + +int +rte_vfio_enable(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_is_enabled(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_noiommu_is_enabled(void) +{ + return -1; +} + +int +rte_vfio_clear_group(__rte_unused int vfio_group_fd) +{ + return -1; +} + +int +rte_vfio_get_group_num(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *iommu_group_num) +{ + return -1; +} + +int +rte_vfio_get_container_fd(void) +{ + return -1; +} + +int +rte_vfio_get_group_fd(__rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_create(void) +{ + return -1; +} + +int +rte_vfio_container_destroy(__rte_unused int container_fd) +{ + return -1; +} + +int +rte_vfio_container_group_bind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_group_unbind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_dma_map(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_container_dma_unmap(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +#endif /* VFIO_PRESENT */ diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_vfio.h b/src/spdk/dpdk/lib/librte_eal/linux/eal_vfio.h new file mode 100644 index 000000000..cb2d35fb1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_vfio.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_VFIO_H_ +#define EAL_VFIO_H_ + +#include <rte_common.h> + +/* + * determine if VFIO is present on the system + */ +#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO) +#include <linux/version.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#define VFIO_PRESENT +#else +#pragma message("VFIO configured but not supported by this kernel, disabling.") +#endif /* kernel version >= 3.6.0 */ +#endif /* RTE_EAL_VFIO */ + +#ifdef VFIO_PRESENT + +#include <stdint.h> +#include <linux/vfio.h> + +#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU + +#ifndef VFIO_SPAPR_TCE_v2_IOMMU +#define RTE_VFIO_SPAPR 7 +#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) +#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) +#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) +#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) + +struct vfio_iommu_spapr_register_memory { + uint32_t argsz; + uint32_t flags; + uint64_t vaddr; + uint64_t size; +}; + +struct vfio_iommu_spapr_tce_create { + uint32_t argsz; + uint32_t flags; + /* in */ + uint32_t page_shift; + uint32_t __resv1; + uint64_t window_size; + uint32_t levels; + uint32_t __resv2; + /* out */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_remove { + uint32_t argsz; + uint32_t flags; + /* in */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_ddw_info { + uint64_t pgsizes; + uint32_t max_dynamic_windows_supported; + uint32_t levels; +}; + +/* SPAPR_v2 is not present, but SPAPR might be */ +#ifndef VFIO_SPAPR_TCE_IOMMU +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +struct vfio_iommu_spapr_tce_info { + uint32_t argsz; + uint32_t flags; + uint32_t dma32_window_start; + uint32_t dma32_window_size; + struct vfio_iommu_spapr_tce_ddw_info ddw; +}; +#endif /* VFIO_SPAPR_TCE_IOMMU */ + +#else /* VFIO_SPAPR_TCE_v2_IOMMU */ +#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU +#endif + +#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS +#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS + +/* + * we don't need to store device fd's anywhere since they can be obtained from + * the group fd via an ioctl() call. + */ +struct vfio_group { + int group_num; + int fd; + int devices; +}; + +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. + * */ +typedef int (*vfio_dma_func_t)(int); + +/* Custom memory region DMA mapping function prototype. + * Takes VFIO container fd, virtual address, phisical address, length and + * operation type (0 to unmap 1 for map) as a parameters. + * Returns 0 on success, -1 on error. + **/ +typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_user_func_t dma_user_map_func; + vfio_dma_func_t dma_map_func; +}; + +/* get the vfio container that devices are bound to by default */ +int vfio_get_default_container_fd(void); + +/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd); + +int +vfio_get_iommu_type(void); + +/* check if we have any supported extensions */ +int +vfio_has_supported_extensions(int vfio_container_fd); + +int vfio_mp_sync_setup(void); + +#define EAL_VFIO_MP "eal_vfio_mp_sync" + +#define SOCKET_REQ_CONTAINER 0x100 +#define SOCKET_REQ_GROUP 0x200 +#define SOCKET_REQ_DEFAULT_CONTAINER 0x400 +#define SOCKET_REQ_IOMMU_TYPE 0x800 +#define SOCKET_OK 0x0 +#define SOCKET_NO_FD 0x1 +#define SOCKET_ERR 0xFF + +struct vfio_mp_param { + int req; + int result; + RTE_STD_C11 + union { + int group_num; + int iommu_type_id; + }; +}; + +#endif /* VFIO_PRESENT */ + +#endif /* EAL_VFIO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/linux/eal_vfio_mp_sync.c b/src/spdk/dpdk/lib/librte_eal/linux/eal_vfio_mp_sync.c new file mode 100644 index 000000000..5f2a5fc1d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/eal_vfio_mp_sync.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include <unistd.h> +#include <string.h> + +#include <rte_compat.h> +#include <rte_errno.h> +#include <rte_log.h> +#include <rte_vfio.h> +#include <rte_eal.h> + +#include "eal_vfio.h" + +/** + * @file + * VFIO socket for communication between primary and secondary processes. + * + * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". + */ + +#ifdef VFIO_PRESENT + +static int +vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) +{ + int fd = -1; + int ret; + struct rte_mp_msg reply; + struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param; + const struct vfio_mp_param *m = + (const struct vfio_mp_param *)msg->param; + + if (msg->len_param != sizeof(*m)) { + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); + return -1; + } + + memset(&reply, 0, sizeof(reply)); + + switch (m->req) { + case SOCKET_REQ_GROUP: + r->req = SOCKET_REQ_GROUP; + r->group_num = m->group_num; + fd = rte_vfio_get_group_fd(m->group_num); + if (fd < 0) + r->result = SOCKET_ERR; + else if (fd == 0) + /* if VFIO group exists but isn't bound to VFIO driver */ + r->result = SOCKET_NO_FD; + else { + /* if group exists and is bound to VFIO driver */ + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_CONTAINER: + r->req = SOCKET_REQ_CONTAINER; + fd = rte_vfio_get_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_DEFAULT_CONTAINER: + r->req = SOCKET_REQ_DEFAULT_CONTAINER; + fd = vfio_get_default_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_IOMMU_TYPE: + { + int iommu_type_id; + + r->req = SOCKET_REQ_IOMMU_TYPE; + + iommu_type_id = vfio_get_iommu_type(); + + if (iommu_type_id < 0) + r->result = SOCKET_ERR; + else { + r->iommu_type_id = iommu_type_id; + r->result = SOCKET_OK; + } + break; + } + default: + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); + return -1; + } + + strcpy(reply.name, EAL_VFIO_MP); + reply.len_param = sizeof(*r); + + ret = rte_mp_reply(&reply, peer); + if (m->req == SOCKET_REQ_CONTAINER && fd >= 0) + close(fd); + return ret; +} + +int +vfio_mp_sync_setup(void) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + int ret = rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary); + if (ret && rte_errno != ENOTSUP) + return -1; + } + + return 0; +} + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/linux/include/meson.build b/src/spdk/dpdk/lib/librte_eal/linux/include/meson.build new file mode 100644 index 000000000..1241894b3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/include/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020 Mellanox Technologies, Ltd + +includes += include_directories('.') + +headers += files( + 'rte_kni_common.h', + 'rte_os.h', +) diff --git a/src/spdk/dpdk/lib/librte_eal/linux/include/rte_kni_common.h b/src/spdk/dpdk/lib/librte_eal/linux/include/rte_kni_common.h new file mode 100644 index 000000000..7313ef504 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/include/rte_kni_common.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */ +/* + * Copyright(c) 2007-2014 Intel Corporation. + */ + +#ifndef _RTE_KNI_COMMON_H_ +#define _RTE_KNI_COMMON_H_ + +#ifdef __KERNEL__ +#include <linux/if.h> +#include <asm/barrier.h> +#define RTE_STD_C11 +#else +#include <rte_common.h> +#include <rte_config.h> +#endif + +/* + * KNI name is part of memzone name. Must not exceed IFNAMSIZ. + */ +#define RTE_KNI_NAMESIZE 16 + +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/* + * Request id. + */ +enum rte_kni_req_id { + RTE_KNI_REQ_UNKNOWN = 0, + RTE_KNI_REQ_CHANGE_MTU, + RTE_KNI_REQ_CFG_NETWORK_IF, + RTE_KNI_REQ_CHANGE_MAC_ADDR, + RTE_KNI_REQ_CHANGE_PROMISC, + RTE_KNI_REQ_CHANGE_ALLMULTI, + RTE_KNI_REQ_MAX, +}; + +/* + * Structure for KNI request. + */ +struct rte_kni_request { + uint32_t req_id; /**< Request id */ + RTE_STD_C11 + union { + uint32_t new_mtu; /**< New MTU */ + uint8_t if_up; /**< 1: interface up, 0: interface down */ + uint8_t mac_addr[6]; /**< MAC address for interface */ + uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */ + uint8_t allmulti; /**< 1: all-multicast mode enable, 0: disable */ + }; + int32_t result; /**< Result for processing request */ +} __attribute__((__packed__)); + +/* + * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO + * Write and read should wrap around. Fifo is empty when write == read + * Writing should never overwrite the read position + */ +struct rte_kni_fifo { +#ifdef RTE_USE_C11_MEM_MODEL + unsigned write; /**< Next position to be written*/ + unsigned read; /**< Next position to be read */ +#else + volatile unsigned write; /**< Next position to be written*/ + volatile unsigned read; /**< Next position to be read */ +#endif + unsigned len; /**< Circular buffer length */ + unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ + void *volatile buffer[]; /**< The buffer contains mbuf pointers */ +}; + +/* + * The kernel image of the rte_mbuf struct, with only the relevant fields. + * Padding is necessary to assure the offsets of these fields + */ +struct rte_kni_mbuf { + void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); + uint64_t buf_physaddr; + uint16_t data_off; /**< Start address of data in segment buffer. */ + char pad1[2]; + uint16_t nb_segs; /**< Number of segments. */ + char pad4[2]; + uint64_t ol_flags; /**< Offload features. */ + char pad2[4]; + uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */ + uint16_t data_len; /**< Amount of data in segment buffer. */ + + /* fields on second cache line */ + char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE))); + void *pool; + void *next; /**< Physical address of next mbuf in kernel. */ +}; + +/* + * Struct used to create a KNI device. Passed to the kernel in IOCTL call + */ + +struct rte_kni_device_info { + char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */ + + phys_addr_t tx_phys; + phys_addr_t rx_phys; + phys_addr_t alloc_phys; + phys_addr_t free_phys; + + /* Used by Ethtool */ + phys_addr_t req_phys; + phys_addr_t resp_phys; + phys_addr_t sync_phys; + void * sync_va; + + /* mbuf mempool */ + void * mbuf_va; + phys_addr_t mbuf_phys; + + uint16_t group_id; /**< Group ID */ + uint32_t core_id; /**< core ID to bind for kernel thread */ + + __extension__ + uint8_t force_bind : 1; /**< Flag for kernel thread binding */ + + /* mbuf size */ + unsigned mbuf_size; + unsigned int mtu; + unsigned int min_mtu; + unsigned int max_mtu; + uint8_t mac_addr[6]; + uint8_t iova_mode; +}; + +#define KNI_DEVICE "kni" + +#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int) +#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info) +#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info) + +#endif /* _RTE_KNI_COMMON_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/linux/include/rte_os.h b/src/spdk/dpdk/lib/librte_eal/linux/include/rte_os.h new file mode 100644 index 000000000..218d4fa86 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/include/rte_os.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_OS_H_ +#define _RTE_OS_H_ + +/** + * This is header should contain any function/macro definition + * which are not supported natively or named differently in the + * linux OS. Functions will be added in future releases. + */ + +#include <sched.h> + +typedef cpu_set_t rte_cpuset_t; +#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2) +#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2) +#define RTE_CPU_FILL(set) do \ +{ \ + unsigned int i; \ + CPU_ZERO(set); \ + for (i = 0; i < CPU_SETSIZE; i++) \ + CPU_SET(i, set); \ +} while (0) +#define RTE_CPU_NOT(dst, src) do \ +{ \ + cpu_set_t tmp; \ + RTE_CPU_FILL(&tmp); \ + CPU_XOR(dst, &tmp, src); \ +} while (0) + +#endif /* _RTE_OS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/linux/meson.build b/src/spdk/dpdk/lib/librte_eal/linux/meson.build new file mode 100644 index 000000000..7742aa475 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linux/meson.build @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +subdir('include') + +sources += files( + 'eal.c', + 'eal_alarm.c', + 'eal_cpuflags.c', + 'eal_debug.c', + 'eal_dev.c', + 'eal_hugepage_info.c', + 'eal_interrupts.c', + 'eal_lcore.c', + 'eal_log.c', + 'eal_memalloc.c', + 'eal_memory.c', + 'eal_thread.c', + 'eal_timer.c', + 'eal_vfio.c', + 'eal_vfio_mp_sync.c', +) + +deps += ['kvargs', 'telemetry'] +if has_libnuma == 1 + dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true) +endif diff --git a/src/spdk/dpdk/lib/librte_eal/meson.build b/src/spdk/dpdk/lib/librte_eal/meson.build new file mode 100644 index 000000000..e301f4558 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/meson.build @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017-2019 Intel Corporation + +includes += global_inc +subdir('include') + +subdir('common') + +dpdk_conf.set('RTE_EXEC_ENV_' + exec_env.to_upper(), 1) +subdir(exec_env) + +subdir(arch_subdir) + +deps += ['kvargs'] +if not is_windows + deps += ['telemetry'] +endif +if dpdk_conf.has('RTE_USE_LIBBSD') + ext_deps += libbsd +endif +if cc.has_function('getentropy', prefix : '#include <unistd.h>') + cflags += '-DRTE_LIBEAL_USE_GETENTROPY' +endif +if cc.has_header('getopt.h') + cflags += ['-DHAVE_GETOPT_H', '-DHAVE_GETOPT', '-DHAVE_GETOPT_LONG'] +endif diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/meson.build b/src/spdk/dpdk/lib/librte_eal/ppc/include/meson.build new file mode 100644 index 000000000..ab4bd2809 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/meson.build @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi <bluca@debian.org> + +arch_headers = files( + 'rte_altivec.h', + 'rte_atomic.h', + 'rte_byteorder.h', + 'rte_cpuflags.h', + 'rte_cycles.h', + 'rte_io.h', + 'rte_memcpy.h', + 'rte_pause.h', + 'rte_prefetch.h', + 'rte_rwlock.h', + 'rte_spinlock.h', + 'rte_vect.h', +) +install_headers(arch_headers, subdir: get_option('include_subdir_arch')) diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_altivec.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_altivec.h new file mode 100644 index 000000000..1551a9454 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_altivec.h @@ -0,0 +1,22 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) Mellanox 2020. + */ + +#ifndef _RTE_ALTIVEC_H_ +#define _RTE_ALTIVEC_H_ + +/* To include altivec.h, GCC version must be >= 4.8 */ +#include <altivec.h> + +/* + * Compilation workaround for PPC64 when AltiVec is fully enabled, e.g. std=c11. + * Otherwise there would be a type conflict between stdbool and altivec. + */ +#if defined(__PPC64__) && !defined(__APPLE_ALTIVEC__) +#undef bool +/* redefine as in stdbool.h */ +#define bool _Bool +#endif + +#endif /* _RTE_ALTIVEC_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_atomic.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_atomic.h new file mode 100644 index 000000000..7e3e13118 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_atomic.h @@ -0,0 +1,413 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Inspired from FreeBSD src/sys/powerpc/include/atomic.h + * Copyright (c) 2008 Marcel Moolenaar + * Copyright (c) 2001 Benno Rice + * Copyright (c) 2001 David E. O'Brien + * Copyright (c) 1998 Doug Rabson + * All rights reserved. + */ + +#ifndef _RTE_ATOMIC_PPC_64_H_ +#define _RTE_ATOMIC_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include "generic/rte_atomic.h" + +#define rte_mb() asm volatile("sync" : : : "memory") + +#define rte_wmb() asm volatile("sync" : : : "memory") + +#define rte_rmb() asm volatile("sync" : : : "memory") + +#define rte_smp_mb() rte_mb() + +#define rte_smp_wmb() rte_wmb() + +#define rte_smp_rmb() rte_rmb() + +#define rte_io_mb() rte_mb() + +#define rte_io_wmb() rte_wmb() + +#define rte_io_rmb() rte_rmb() + +#define rte_cio_wmb() rte_wmb() + +#define rte_cio_rmb() rte_rmb() + +/*------------------------- 16 bit atomic operations -------------------------*/ +/* To be compatible with Power7, use GCC built-in functions for 16 bit + * operations */ + +#ifndef RTE_FORCE_INTRINSICS +static inline int +rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src) +{ + return __atomic_compare_exchange(dst, &exp, &src, 0, __ATOMIC_ACQUIRE, + __ATOMIC_ACQUIRE) ? 1 : 0; +} + +static inline int rte_atomic16_test_and_set(rte_atomic16_t *v) +{ + return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1); +} + +static inline void +rte_atomic16_inc(rte_atomic16_t *v) +{ + __atomic_add_fetch(&v->cnt, 1, __ATOMIC_ACQUIRE); +} + +static inline void +rte_atomic16_dec(rte_atomic16_t *v) +{ + __atomic_sub_fetch(&v->cnt, 1, __ATOMIC_ACQUIRE); +} + +static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v) +{ + return __atomic_add_fetch(&v->cnt, 1, __ATOMIC_ACQUIRE) == 0; +} + +static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v) +{ + return __atomic_sub_fetch(&v->cnt, 1, __ATOMIC_ACQUIRE) == 0; +} + +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) +{ + return __atomic_exchange_2(dst, val, __ATOMIC_SEQ_CST); +} + +/*------------------------- 32 bit atomic operations -------------------------*/ + +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) +{ + unsigned int ret = 0; + + asm volatile( + "\tlwsync\n" + "1:\tlwarx %[ret], 0, %[dst]\n" + "cmplw %[exp], %[ret]\n" + "bne 2f\n" + "stwcx. %[src], 0, %[dst]\n" + "bne- 1b\n" + "li %[ret], 1\n" + "b 3f\n" + "2:\n" + "stwcx. %[ret], 0, %[dst]\n" + "li %[ret], 0\n" + "3:\n" + "isync\n" + : [ret] "=&r" (ret), "=m" (*dst) + : [dst] "r" (dst), + [exp] "r" (exp), + [src] "r" (src), + "m" (*dst) + : "cc", "memory"); + + return ret; +} + +static inline int rte_atomic32_test_and_set(rte_atomic32_t *v) +{ + return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1); +} + +static inline void +rte_atomic32_inc(rte_atomic32_t *v) +{ + int t; + + asm volatile( + "1: lwarx %[t],0,%[cnt]\n" + "addic %[t],%[t],1\n" + "stwcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "=m" (v->cnt) + : [cnt] "r" (&v->cnt), "m" (v->cnt) + : "cc", "xer", "memory"); +} + +static inline void +rte_atomic32_dec(rte_atomic32_t *v) +{ + int t; + + asm volatile( + "1: lwarx %[t],0,%[cnt]\n" + "addic %[t],%[t],-1\n" + "stwcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "=m" (v->cnt) + : [cnt] "r" (&v->cnt), "m" (v->cnt) + : "cc", "xer", "memory"); +} + +static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v) +{ + int ret; + + asm volatile( + "\n\tlwsync\n" + "1: lwarx %[ret],0,%[cnt]\n" + "addic %[ret],%[ret],1\n" + "stwcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [cnt] "r" (&v->cnt) + : "cc", "xer", "memory"); + + return ret == 0; +} + +static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v) +{ + int ret; + + asm volatile( + "\n\tlwsync\n" + "1: lwarx %[ret],0,%[cnt]\n" + "addic %[ret],%[ret],-1\n" + "stwcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [cnt] "r" (&v->cnt) + : "cc", "xer", "memory"); + + return ret == 0; +} + +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) +{ + return __atomic_exchange_4(dst, val, __ATOMIC_SEQ_CST); +} + +/*------------------------- 64 bit atomic operations -------------------------*/ + +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) +{ + unsigned int ret = 0; + + asm volatile ( + "\tlwsync\n" + "1: ldarx %[ret], 0, %[dst]\n" + "cmpld %[exp], %[ret]\n" + "bne 2f\n" + "stdcx. %[src], 0, %[dst]\n" + "bne- 1b\n" + "li %[ret], 1\n" + "b 3f\n" + "2:\n" + "stdcx. %[ret], 0, %[dst]\n" + "li %[ret], 0\n" + "3:\n" + "isync\n" + : [ret] "=&r" (ret), "=m" (*dst) + : [dst] "r" (dst), + [exp] "r" (exp), + [src] "r" (src), + "m" (*dst) + : "cc", "memory"); + return ret; +} + +static inline void +rte_atomic64_init(rte_atomic64_t *v) +{ + v->cnt = 0; +} + +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v) +{ + long ret; + + asm volatile("ld%U1%X1 %[ret],%[cnt]" + : [ret] "=r"(ret) + : [cnt] "m"(v->cnt)); + + return ret; +} + +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value) +{ + asm volatile("std%U0%X0 %[new_value],%[cnt]" + : [cnt] "=m"(v->cnt) + : [new_value] "r"(new_value)); +} + +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc) +{ + long t; + + asm volatile( + "1: ldarx %[t],0,%[cnt]\n" + "add %[t],%[inc],%[t]\n" + "stdcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "=m" (v->cnt) + : [cnt] "r" (&v->cnt), [inc] "r" (inc), "m" (v->cnt) + : "cc", "memory"); +} + +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec) +{ + long t; + + asm volatile( + "1: ldarx %[t],0,%[cnt]\n" + "subf %[t],%[dec],%[t]\n" + "stdcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "+m" (v->cnt) + : [cnt] "r" (&v->cnt), [dec] "r" (dec), "m" (v->cnt) + : "cc", "memory"); +} + +static inline void +rte_atomic64_inc(rte_atomic64_t *v) +{ + long t; + + asm volatile( + "1: ldarx %[t],0,%[cnt]\n" + "addic %[t],%[t],1\n" + "stdcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "+m" (v->cnt) + : [cnt] "r" (&v->cnt), "m" (v->cnt) + : "cc", "xer", "memory"); +} + +static inline void +rte_atomic64_dec(rte_atomic64_t *v) +{ + long t; + + asm volatile( + "1: ldarx %[t],0,%[cnt]\n" + "addic %[t],%[t],-1\n" + "stdcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "+m" (v->cnt) + : [cnt] "r" (&v->cnt), "m" (v->cnt) + : "cc", "xer", "memory"); +} + +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc) +{ + long ret; + + asm volatile( + "\n\tlwsync\n" + "1: ldarx %[ret],0,%[cnt]\n" + "add %[ret],%[inc],%[ret]\n" + "stdcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [inc] "r" (inc), [cnt] "r" (&v->cnt) + : "cc", "memory"); + + return ret; +} + +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec) +{ + long ret; + + asm volatile( + "\n\tlwsync\n" + "1: ldarx %[ret],0,%[cnt]\n" + "subf %[ret],%[dec],%[ret]\n" + "stdcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [dec] "r" (dec), [cnt] "r" (&v->cnt) + : "cc", "memory"); + + return ret; +} + +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v) +{ + long ret; + + asm volatile( + "\n\tlwsync\n" + "1: ldarx %[ret],0,%[cnt]\n" + "addic %[ret],%[ret],1\n" + "stdcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [cnt] "r" (&v->cnt) + : "cc", "xer", "memory"); + + return ret == 0; +} + +static inline int rte_atomic64_dec_and_test(rte_atomic64_t *v) +{ + long ret; + + asm volatile( + "\n\tlwsync\n" + "1: ldarx %[ret],0,%[cnt]\n" + "addic %[ret],%[ret],-1\n" + "stdcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [cnt] "r" (&v->cnt) + : "cc", "xer", "memory"); + + return ret == 0; +} + +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v) +{ + return rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, 0, 1); +} +/** + * Atomically set a 64-bit counter to 0. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void rte_atomic64_clear(rte_atomic64_t *v) +{ + v->cnt = 0; +} + +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val) +{ + return __atomic_exchange_8(dst, val, __ATOMIC_SEQ_CST); +} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ATOMIC_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_byteorder.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_byteorder.h new file mode 100644 index 000000000..bfdded40f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_byteorder.h @@ -0,0 +1,120 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Inspired from FreeBSD src/sys/powerpc/include/endian.h + * Copyright (c) 1987, 1991, 1993 + * The Regents of the University of California. All rights reserved. + */ + +#ifndef _RTE_BYTEORDER_PPC_64_H_ +#define _RTE_BYTEORDER_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include "generic/rte_byteorder.h" + +/* + * An architecture-optimized byte swap for a 16-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap16(). + */ +static inline uint16_t rte_arch_bswap16(uint16_t _x) +{ + return (_x >> 8) | ((_x << 8) & 0xff00); +} + +/* + * An architecture-optimized byte swap for a 32-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap32(). + */ +static inline uint32_t rte_arch_bswap32(uint32_t _x) +{ + return (_x >> 24) | ((_x >> 8) & 0xff00) | ((_x << 8) & 0xff0000) | + ((_x << 24) & 0xff000000); +} + +/* + * An architecture-optimized byte swap for a 64-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap64(). + */ +/* 64-bit mode */ +static inline uint64_t rte_arch_bswap64(uint64_t _x) +{ + return (_x >> 56) | ((_x >> 40) & 0xff00) | ((_x >> 24) & 0xff0000) | + ((_x >> 8) & 0xff000000) | ((_x << 8) & (0xffULL << 32)) | + ((_x << 24) & (0xffULL << 40)) | + ((_x << 40) & (0xffULL << 48)) | ((_x << 56)); +} + +#ifndef RTE_FORCE_INTRINSICS +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) + +#define rte_bswap32(x) ((uint32_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap32(x) : \ + rte_arch_bswap32(x))) + +#define rte_bswap64(x) ((uint64_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap64(x) : \ + rte_arch_bswap64(x))) +#else +/* + * __builtin_bswap16 is only available gcc 4.8 and upwards + */ +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8) +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) +#endif +#endif + +/* Power 8 have both little endian and big endian mode + * Power 7 only support big endian + */ +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + +#define rte_cpu_to_le_16(x) (x) +#define rte_cpu_to_le_32(x) (x) +#define rte_cpu_to_le_64(x) (x) + +#define rte_cpu_to_be_16(x) rte_bswap16(x) +#define rte_cpu_to_be_32(x) rte_bswap32(x) +#define rte_cpu_to_be_64(x) rte_bswap64(x) + +#define rte_le_to_cpu_16(x) (x) +#define rte_le_to_cpu_32(x) (x) +#define rte_le_to_cpu_64(x) (x) + +#define rte_be_to_cpu_16(x) rte_bswap16(x) +#define rte_be_to_cpu_32(x) rte_bswap32(x) +#define rte_be_to_cpu_64(x) rte_bswap64(x) + +#else /* RTE_BIG_ENDIAN */ + +#define rte_cpu_to_le_16(x) rte_bswap16(x) +#define rte_cpu_to_le_32(x) rte_bswap32(x) +#define rte_cpu_to_le_64(x) rte_bswap64(x) + +#define rte_cpu_to_be_16(x) (x) +#define rte_cpu_to_be_32(x) (x) +#define rte_cpu_to_be_64(x) (x) + +#define rte_le_to_cpu_16(x) rte_bswap16(x) +#define rte_le_to_cpu_32(x) rte_bswap32(x) +#define rte_le_to_cpu_64(x) rte_bswap64(x) + +#define rte_be_to_cpu_16(x) (x) +#define rte_be_to_cpu_32(x) (x) +#define rte_be_to_cpu_64(x) (x) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BYTEORDER_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_cpuflags.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_cpuflags.h new file mode 100644 index 000000000..a88355d17 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_cpuflags.h @@ -0,0 +1,61 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) IBM Corporation 2014. + */ + +#ifndef _RTE_CPUFLAGS_PPC_64_H_ +#define _RTE_CPUFLAGS_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Enumeration of all CPU features supported + */ +enum rte_cpu_flag_t { + RTE_CPUFLAG_PPC_LE = 0, + RTE_CPUFLAG_TRUE_LE, + RTE_CPUFLAG_PSERIES_PERFMON_COMPAT, + RTE_CPUFLAG_VSX, + RTE_CPUFLAG_ARCH_2_06, + RTE_CPUFLAG_POWER6_EXT, + RTE_CPUFLAG_DFP, + RTE_CPUFLAG_PA6T, + RTE_CPUFLAG_ARCH_2_05, + RTE_CPUFLAG_ICACHE_SNOOP, + RTE_CPUFLAG_SMT, + RTE_CPUFLAG_BOOKE, + RTE_CPUFLAG_CELLBE, + RTE_CPUFLAG_POWER5_PLUS, + RTE_CPUFLAG_POWER5, + RTE_CPUFLAG_POWER4, + RTE_CPUFLAG_NOTB, + RTE_CPUFLAG_EFP_DOUBLE, + RTE_CPUFLAG_EFP_SINGLE, + RTE_CPUFLAG_SPE, + RTE_CPUFLAG_UNIFIED_CACHE, + RTE_CPUFLAG_4xxMAC, + RTE_CPUFLAG_MMU, + RTE_CPUFLAG_FPU, + RTE_CPUFLAG_ALTIVEC, + RTE_CPUFLAG_PPC601, + RTE_CPUFLAG_PPC64, + RTE_CPUFLAG_PPC32, + RTE_CPUFLAG_TAR, + RTE_CPUFLAG_LSEL, + RTE_CPUFLAG_EBB, + RTE_CPUFLAG_DSCR, + RTE_CPUFLAG_HTM, + RTE_CPUFLAG_ARCH_2_07, + /* The last item */ + RTE_CPUFLAG_NUMFLAGS,/**< This should always be the last! */ +}; + +#include "generic/rte_cpuflags.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CPUFLAGS_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_cycles.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_cycles.h new file mode 100644 index 000000000..5585f9273 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_cycles.h @@ -0,0 +1,46 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) IBM Corporation 2014. + */ + +#ifndef _RTE_CYCLES_PPC_64_H_ +#define _RTE_CYCLES_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/platform/ppc.h> + +#include "generic/rte_cycles.h" + +#include <rte_byteorder.h> +#include <rte_common.h> + +/** + * Read the time base register. + * + * @return + * The time base for this lcore. + */ +static inline uint64_t +rte_rdtsc(void) +{ + return __ppc_get_timebase(); +} + +static inline uint64_t +rte_rdtsc_precise(void) +{ + rte_mb(); + return rte_rdtsc(); +} + +static inline uint64_t +rte_get_tsc_cycles(void) { return rte_rdtsc(); } + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CYCLES_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_io.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_io.h new file mode 100644 index 000000000..01455065e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_io.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_PPC_64_H_ +#define _RTE_IO_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_io.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IO_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_mcslock.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_mcslock.h new file mode 100644 index 000000000..c58a6edc1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_mcslock.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_MCSLOCK_PPC_64_H_ +#define _RTE_MCSLOCK_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_mcslock.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MCSLOCK_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_memcpy.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_memcpy.h new file mode 100644 index 000000000..c2a1f356d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_memcpy.h @@ -0,0 +1,209 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) IBM Corporation 2014. + */ + +#ifndef _RTE_MEMCPY_PPC_64_H_ +#define _RTE_MEMCPY_PPC_64_H_ + +#include <stdint.h> +#include <string.h> + +#include "rte_altivec.h" +#include "rte_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_memcpy.h" + +#if (GCC_VERSION >= 90000 && GCC_VERSION < 90400) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); + vec_vsx_st(vec_vsx_ld(48, src), 48, dst); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); + vec_vsx_st(vec_vsx_ld(48, src), 48, dst); + vec_vsx_st(vec_vsx_ld(64, src), 64, dst); + vec_vsx_st(vec_vsx_ld(80, src), 80, dst); + vec_vsx_st(vec_vsx_ld(96, src), 96, dst); + vec_vsx_st(vec_vsx_ld(112, src), 112, dst); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + rte_mov128(dst, src); + rte_mov128(dst + 128, src + 128); +} + +#define rte_memcpy(dst, src, n) \ + __extension__ ({ \ + (__builtin_constant_p(n)) ? \ + memcpy((dst), (src), (n)) : \ + rte_memcpy_func((dst), (src), (n)); }) + +static inline void * +rte_memcpy_func(void *dst, const void *src, size_t n) +{ + void *ret = dst; + + /* We can't copy < 16 bytes using XMM registers so do it manually. */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dst = *(const uint8_t *)src; + dst = (uint8_t *)dst + 1; + src = (const uint8_t *)src + 1; + } + if (n & 0x02) { + *(uint16_t *)dst = *(const uint16_t *)src; + dst = (uint16_t *)dst + 1; + src = (const uint16_t *)src + 1; + } + if (n & 0x04) { + *(uint32_t *)dst = *(const uint32_t *)src; + dst = (uint32_t *)dst + 1; + src = (const uint32_t *)src + 1; + } + if (n & 0x08) + *(uint64_t *)dst = *(const uint64_t *)src; + return ret; + } + + /* Special fast cases for <= 128 bytes */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + + if (n <= 128) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + return ret; + } + + /* + * For large copies > 128 bytes. This combination of 256, 64 and 16 byte + * copies was found to be faster than doing 128 and 32 byte copies as + * well. + */ + for ( ; n >= 256; n -= 256) { + rte_mov256((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 256; + src = (const uint8_t *)src + 256; + } + + /* + * We split the remaining bytes (which will be less than 256) into + * 64byte (2^6) chunks. + * Using incrementing integers in the case labels of a switch statement + * encourages the compiler to use a jump table. To get incrementing + * integers, we shift the 2 relevant bits to the LSB position to first + * get decrementing integers, and then subtract. + */ + switch (3 - (n >> 6)) { + case 0x00: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x01: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x02: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + default: + ; + } + + /* + * We split the remaining bytes (which will be less than 64) into + * 16byte (2^4) chunks, using the same switch structure as above. + */ + switch (3 - (n >> 4)) { + case 0x00: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x01: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x02: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + default: + ; + } + + /* Copy any remaining bytes, without going beyond end of buffers */ + if (n != 0) + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; +} + +#if (GCC_VERSION >= 90000 && GCC_VERSION < 90400) +#pragma GCC diagnostic pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMCPY_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_pause.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_pause.h new file mode 100644 index 000000000..16e47ce22 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_pause.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_PPC64_H_ +#define _RTE_PAUSE_PPC64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "rte_atomic.h" + +#include "generic/rte_pause.h" + +static inline void rte_pause(void) +{ + /* Set hardware multi-threading low priority */ + asm volatile("or 1,1,1"); + /* Set hardware multi-threading medium priority */ + asm volatile("or 2,2,2"); + rte_compiler_barrier(); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_PPC64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_prefetch.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_prefetch.h new file mode 100644 index 000000000..9ba07c815 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_prefetch.h @@ -0,0 +1,41 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) IBM Corporation 2014. + */ + +#ifndef _RTE_PREFETCH_PPC_64_H_ +#define _RTE_PREFETCH_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include "generic/rte_prefetch.h" + +static inline void rte_prefetch0(const volatile void *p) +{ + asm volatile ("dcbt 0,%[p],0" : : [p] "r" (p)); +} + +static inline void rte_prefetch1(const volatile void *p) +{ + asm volatile ("dcbt 0,%[p],0" : : [p] "r" (p)); +} + +static inline void rte_prefetch2(const volatile void *p) +{ + asm volatile ("dcbt 0,%[p],0" : : [p] "r" (p)); +} + +static inline void rte_prefetch_non_temporal(const volatile void *p) +{ + /* non-temporal version not available, fallback to rte_prefetch0 */ + rte_prefetch0(p); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PREFETCH_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_rwlock.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_rwlock.h new file mode 100644 index 000000000..9fadc0407 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_rwlock.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + */ +#ifndef _RTE_RWLOCK_PPC_64_H_ +#define _RTE_RWLOCK_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_rwlock.h" + +static inline void +rte_rwlock_read_lock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_read_lock(rwl); +} + +static inline void +rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_read_unlock(rwl); +} + +static inline void +rte_rwlock_write_lock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_write_lock(rwl); +} + +static inline void +rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_write_unlock(rwl); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RWLOCK_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_spinlock.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_spinlock.h new file mode 100644 index 000000000..149ec245c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_spinlock.h @@ -0,0 +1,88 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) IBM Corporation 2014. + */ + +#ifndef _RTE_SPINLOCK_PPC_64_H_ +#define _RTE_SPINLOCK_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include <rte_pause.h> +#include "generic/rte_spinlock.h" + +/* Fixme: Use intrinsics to implement the spinlock on Power architecture */ + +#ifndef RTE_FORCE_INTRINSICS + +static inline void +rte_spinlock_lock(rte_spinlock_t *sl) +{ + while (__sync_lock_test_and_set(&sl->locked, 1)) + while (sl->locked) + rte_pause(); +} + +static inline void +rte_spinlock_unlock(rte_spinlock_t *sl) +{ + __sync_lock_release(&sl->locked); +} + +static inline int +rte_spinlock_trylock(rte_spinlock_t *sl) +{ + return __sync_lock_test_and_set(&sl->locked, 1) == 0; +} + +#endif + +static inline int rte_tm_supported(void) +{ + return 0; +} + +static inline void +rte_spinlock_lock_tm(rte_spinlock_t *sl) +{ + rte_spinlock_lock(sl); /* fall-back */ +} + +static inline int +rte_spinlock_trylock_tm(rte_spinlock_t *sl) +{ + return rte_spinlock_trylock(sl); +} + +static inline void +rte_spinlock_unlock_tm(rte_spinlock_t *sl) +{ + rte_spinlock_unlock(sl); +} + +static inline void +rte_spinlock_recursive_lock_tm(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_recursive_lock(slr); /* fall-back */ +} + +static inline void +rte_spinlock_recursive_unlock_tm(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_recursive_unlock(slr); +} + +static inline int +rte_spinlock_recursive_trylock_tm(rte_spinlock_recursive_t *slr) +{ + return rte_spinlock_recursive_trylock(slr); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_SPINLOCK_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_ticketlock.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_ticketlock.h new file mode 100644 index 000000000..c175e9eab --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_ticketlock.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_TICKETLOCK_PPC_64_H_ +#define _RTE_TICKETLOCK_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_ticketlock.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TICKETLOCK_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_vect.h b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_vect.h new file mode 100644 index 000000000..b0545c878 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/include/rte_vect.h @@ -0,0 +1,36 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) IBM Corporation 2016. + */ + +#ifndef _RTE_VECT_PPC_64_H_ +#define _RTE_VECT_PPC_64_H_ + +#include "rte_altivec.h" + +#include "generic/rte_vect.h" +#include "rte_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef vector signed int xmm_t; + +#define XMM_SIZE (sizeof(xmm_t)) +#define XMM_MASK (XMM_SIZE - 1) + +typedef union rte_xmm { + xmm_t x; + uint8_t u8[XMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[XMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[XMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[XMM_SIZE / sizeof(uint64_t)]; + double pd[XMM_SIZE / sizeof(double)]; +} __rte_aligned(16) rte_xmm_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_VECT_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/meson.build b/src/spdk/dpdk/lib/librte_eal/ppc/meson.build new file mode 100644 index 000000000..f4b6d95c4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/meson.build @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi <bluca@debian.org> + +subdir('include') + +sources += files( + 'rte_cpuflags.c', + 'rte_cycles.c', + 'rte_hypervisor.c', +) diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/rte_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/ppc/rte_cpuflags.c new file mode 100644 index 000000000..3bb7563ce --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/rte_cpuflags.c @@ -0,0 +1,110 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) IBM Corporation 2014. + */ + +#include "rte_cpuflags.h" + +#include <elf.h> +#include <fcntl.h> +#include <assert.h> +#include <unistd.h> + +/* Symbolic values for the entries in the auxiliary table */ +#define AT_HWCAP 16 +#define AT_HWCAP2 26 + +/* software based registers */ +enum cpu_register_t { + REG_NONE = 0, + REG_HWCAP, + REG_HWCAP2, + REG_MAX +}; + +typedef uint32_t hwcap_registers_t[REG_MAX]; + +struct feature_entry { + uint32_t reg; + uint32_t bit; +#define CPU_FLAG_NAME_MAX_LEN 64 + char name[CPU_FLAG_NAME_MAX_LEN]; +}; + +#define FEAT_DEF(name, reg, bit) \ + [RTE_CPUFLAG_##name] = {reg, bit, #name}, + +const struct feature_entry rte_cpu_feature_table[] = { + FEAT_DEF(PPC_LE, REG_HWCAP, 0) + FEAT_DEF(TRUE_LE, REG_HWCAP, 1) + FEAT_DEF(PSERIES_PERFMON_COMPAT, REG_HWCAP, 6) + FEAT_DEF(VSX, REG_HWCAP, 7) + FEAT_DEF(ARCH_2_06, REG_HWCAP, 8) + FEAT_DEF(POWER6_EXT, REG_HWCAP, 9) + FEAT_DEF(DFP, REG_HWCAP, 10) + FEAT_DEF(PA6T, REG_HWCAP, 11) + FEAT_DEF(ARCH_2_05, REG_HWCAP, 12) + FEAT_DEF(ICACHE_SNOOP, REG_HWCAP, 13) + FEAT_DEF(SMT, REG_HWCAP, 14) + FEAT_DEF(BOOKE, REG_HWCAP, 15) + FEAT_DEF(CELLBE, REG_HWCAP, 16) + FEAT_DEF(POWER5_PLUS, REG_HWCAP, 17) + FEAT_DEF(POWER5, REG_HWCAP, 18) + FEAT_DEF(POWER4, REG_HWCAP, 19) + FEAT_DEF(NOTB, REG_HWCAP, 20) + FEAT_DEF(EFP_DOUBLE, REG_HWCAP, 21) + FEAT_DEF(EFP_SINGLE, REG_HWCAP, 22) + FEAT_DEF(SPE, REG_HWCAP, 23) + FEAT_DEF(UNIFIED_CACHE, REG_HWCAP, 24) + FEAT_DEF(4xxMAC, REG_HWCAP, 25) + FEAT_DEF(MMU, REG_HWCAP, 26) + FEAT_DEF(FPU, REG_HWCAP, 27) + FEAT_DEF(ALTIVEC, REG_HWCAP, 28) + FEAT_DEF(PPC601, REG_HWCAP, 29) + FEAT_DEF(PPC64, REG_HWCAP, 30) + FEAT_DEF(PPC32, REG_HWCAP, 31) + FEAT_DEF(TAR, REG_HWCAP2, 26) + FEAT_DEF(LSEL, REG_HWCAP2, 27) + FEAT_DEF(EBB, REG_HWCAP2, 28) + FEAT_DEF(DSCR, REG_HWCAP2, 29) + FEAT_DEF(HTM, REG_HWCAP2, 30) + FEAT_DEF(ARCH_2_07, REG_HWCAP2, 31) +}; + +/* + * Read AUXV software register and get cpu features for Power + */ +static void +rte_cpu_get_features(hwcap_registers_t out) +{ + out[REG_HWCAP] = rte_cpu_getauxval(AT_HWCAP); + out[REG_HWCAP2] = rte_cpu_getauxval(AT_HWCAP2); +} + +/* + * Checks if a particular flag is available on current machine. + */ +int +rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature) +{ + const struct feature_entry *feat; + hwcap_registers_t regs = {0}; + + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return -ENOENT; + + feat = &rte_cpu_feature_table[feature]; + if (feat->reg == REG_NONE) + return -EFAULT; + + rte_cpu_get_features(regs); + return (regs[feat->reg] >> feat->bit) & 1; +} + +const char * +rte_cpu_get_flag_name(enum rte_cpu_flag_t feature) +{ + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return NULL; + return rte_cpu_feature_table[feature].name; +} diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/rte_cycles.c b/src/spdk/dpdk/lib/librte_eal/ppc/rte_cycles.c new file mode 100644 index 000000000..c96a2143b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/rte_cycles.c @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) IBM Corporation 2019. + */ + +#include "eal_private.h" + +uint64_t +get_tsc_freq_arch(void) +{ + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/ppc/rte_hypervisor.c b/src/spdk/dpdk/lib/librte_eal/ppc/rte_hypervisor.c new file mode 100644 index 000000000..08a1c97d1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/ppc/rte_hypervisor.c @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include "rte_hypervisor.h" + +enum rte_hypervisor +rte_hypervisor_get(void) +{ + return RTE_HYPERVISOR_UNKNOWN; +} diff --git a/src/spdk/dpdk/lib/librte_eal/rte_eal_exports.def b/src/spdk/dpdk/lib/librte_eal/rte_eal_exports.def new file mode 100644 index 000000000..12a6c79d6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/rte_eal_exports.def @@ -0,0 +1,9 @@ +EXPORTS + __rte_panic + rte_eal_get_configuration + rte_eal_init + rte_eal_mp_remote_launch + rte_eal_mp_wait_lcore + rte_eal_remote_launch + rte_log + rte_vlog diff --git a/src/spdk/dpdk/lib/librte_eal/rte_eal_version.map b/src/spdk/dpdk/lib/librte_eal/rte_eal_version.map new file mode 100644 index 000000000..d8038749a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/rte_eal_version.map @@ -0,0 +1,389 @@ +DPDK_20.0 { + global: + + __rte_panic; + eal_parse_sysfs_value; + eal_timer_source; + per_lcore__lcore_id; + per_lcore__rte_errno; + rte_bus_dump; + rte_bus_find; + rte_bus_find_by_device; + rte_bus_find_by_name; + rte_bus_get_iommu_class; + rte_bus_probe; + rte_bus_register; + rte_bus_scan; + rte_bus_unregister; + rte_calloc; + rte_calloc_socket; + rte_cpu_get_flag_enabled; + rte_cpu_get_flag_name; + rte_cpu_is_supported; + rte_ctrl_thread_create; + rte_cycles_vmware_tsc_map; + rte_delay_us; + rte_delay_us_block; + rte_delay_us_callback_register; + rte_dev_is_probed; + rte_dev_probe; + rte_dev_remove; + rte_devargs_add; + rte_devargs_dump; + rte_devargs_insert; + rte_devargs_next; + rte_devargs_parse; + rte_devargs_parsef; + rte_devargs_remove; + rte_devargs_type_count; + rte_dump_physmem_layout; + rte_dump_registers; + rte_dump_stack; + rte_dump_tailq; + rte_eal_alarm_cancel; + rte_eal_alarm_set; + rte_eal_cleanup; + rte_eal_create_uio_dev; + rte_eal_get_lcore_state; + rte_eal_get_physmem_size; + rte_eal_get_runtime_dir; + rte_eal_has_hugepages; + rte_eal_has_pci; + rte_eal_hotplug_add; + rte_eal_hotplug_remove; + rte_eal_hpet_init; + rte_eal_init; + rte_eal_iopl_init; + rte_eal_iova_mode; + rte_eal_lcore_role; + rte_eal_mbuf_user_pool_ops; + rte_eal_mp_remote_launch; + rte_eal_mp_wait_lcore; + rte_eal_primary_proc_alive; + rte_eal_process_type; + rte_eal_remote_launch; + rte_eal_tailq_lookup; + rte_eal_tailq_register; + rte_eal_using_phys_addrs; + rte_eal_vfio_intr_mode; + rte_eal_wait_lcore; + rte_epoll_ctl; + rte_epoll_wait; + rte_exit; + rte_free; + rte_get_hpet_cycles; + rte_get_hpet_hz; + rte_get_master_lcore; + rte_get_next_lcore; + rte_get_tsc_hz; + rte_hexdump; + rte_hypervisor_get; + rte_hypervisor_get_name; + rte_intr_allow_others; + rte_intr_callback_register; + rte_intr_callback_unregister; + rte_intr_cap_multiple; + rte_intr_disable; + rte_intr_dp_is_en; + rte_intr_efd_disable; + rte_intr_efd_enable; + rte_intr_enable; + rte_intr_free_epoll_fd; + rte_intr_rx_ctl; + rte_intr_tls_epfd; + rte_keepalive_create; + rte_keepalive_dispatch_pings; + rte_keepalive_mark_alive; + rte_keepalive_mark_sleep; + rte_keepalive_register_core; + rte_keepalive_register_relay_callback; + rte_lcore_count; + rte_lcore_has_role; + rte_lcore_index; + rte_lcore_is_enabled; + rte_lcore_to_socket_id; + rte_log; + rte_log_cur_msg_loglevel; + rte_log_cur_msg_logtype; + rte_log_dump; + rte_log_get_global_level; + rte_log_get_level; + rte_log_register; + rte_log_set_global_level; + rte_log_set_level; + rte_log_set_level_pattern; + rte_log_set_level_regexp; + rte_logs; + rte_malloc; + rte_malloc_dump_stats; + rte_malloc_get_socket_stats; + rte_malloc_set_limit; + rte_malloc_socket; + rte_malloc_validate; + rte_malloc_virt2iova; + rte_mcfg_mem_read_lock; + rte_mcfg_mem_read_unlock; + rte_mcfg_mem_write_lock; + rte_mcfg_mem_write_unlock; + rte_mcfg_mempool_read_lock; + rte_mcfg_mempool_read_unlock; + rte_mcfg_mempool_write_lock; + rte_mcfg_mempool_write_unlock; + rte_mcfg_tailq_read_lock; + rte_mcfg_tailq_read_unlock; + rte_mcfg_tailq_write_lock; + rte_mcfg_tailq_write_unlock; + rte_mem_lock_page; + rte_mem_virt2iova; + rte_mem_virt2phy; + rte_memdump; + rte_memory_get_nchannel; + rte_memory_get_nrank; + rte_memzone_dump; + rte_memzone_free; + rte_memzone_lookup; + rte_memzone_reserve; + rte_memzone_reserve_aligned; + rte_memzone_reserve_bounded; + rte_memzone_walk; + rte_openlog_stream; + rte_rand; + rte_realloc; + rte_reciprocal_value; + rte_reciprocal_value_u64; + rte_rtm_supported; + rte_service_attr_get; + rte_service_attr_reset_all; + rte_service_component_register; + rte_service_component_runstate_set; + rte_service_component_unregister; + rte_service_dump; + rte_service_finalize; + rte_service_get_by_id; + rte_service_get_by_name; + rte_service_get_count; + rte_service_get_name; + rte_service_lcore_add; + rte_service_lcore_attr_get; + rte_service_lcore_attr_reset_all; + rte_service_lcore_count; + rte_service_lcore_count_services; + rte_service_lcore_del; + rte_service_lcore_list; + rte_service_lcore_reset_all; + rte_service_lcore_start; + rte_service_lcore_stop; + rte_service_map_lcore_get; + rte_service_map_lcore_set; + rte_service_may_be_active; + rte_service_probe_capability; + rte_service_run_iter_on_app_lcore; + rte_service_runstate_get; + rte_service_runstate_set; + rte_service_set_runstate_mapped_check; + rte_service_set_stats_enable; + rte_service_start_with_defaults; + rte_set_application_usage_hook; + rte_socket_count; + rte_socket_id; + rte_socket_id_by_idx; + rte_srand; + rte_strerror; + rte_strscpy; + rte_strsplit; + rte_sys_gettid; + rte_thread_get_affinity; + rte_thread_set_affinity; + rte_thread_setname; + rte_uuid_compare; + rte_uuid_is_null; + rte_uuid_parse; + rte_uuid_unparse; + rte_vfio_clear_group; + rte_vfio_container_create; + rte_vfio_container_destroy; + rte_vfio_container_dma_map; + rte_vfio_container_dma_unmap; + rte_vfio_container_group_bind; + rte_vfio_container_group_unbind; + rte_vfio_enable; + rte_vfio_get_container_fd; + rte_vfio_get_group_fd; + rte_vfio_get_group_num; + rte_vfio_is_enabled; + rte_vfio_noiommu_is_enabled; + rte_vfio_release_device; + rte_vfio_setup_device; + rte_vlog; + rte_zmalloc; + rte_zmalloc_socket; + + local: *; +}; + +EXPERIMENTAL { + global: + + # added in 18.02 + rte_mp_action_register; + rte_mp_action_unregister; + rte_mp_reply; + rte_mp_sendmsg; + + # added in 18.05 + rte_dev_event_callback_register; + rte_dev_event_callback_unregister; + rte_dev_event_monitor_start; + rte_dev_event_monitor_stop; + rte_fbarray_attach; + rte_fbarray_destroy; + rte_fbarray_detach; + rte_fbarray_dump_metadata; + rte_fbarray_find_contig_free; + rte_fbarray_find_contig_used; + rte_fbarray_find_idx; + rte_fbarray_find_next_free; + rte_fbarray_find_next_n_free; + rte_fbarray_find_next_n_used; + rte_fbarray_find_next_used; + rte_fbarray_get; + rte_fbarray_init; + rte_fbarray_is_used; + rte_fbarray_set_free; + rte_fbarray_set_used; + rte_log_register_type_and_pick_level; + rte_malloc_dump_heaps; + rte_mem_alloc_validator_register; + rte_mem_alloc_validator_unregister; + rte_mem_check_dma_mask; + rte_mem_event_callback_register; + rte_mem_event_callback_unregister; + rte_mem_iova2virt; + rte_mem_virt2memseg; + rte_mem_virt2memseg_list; + rte_memseg_contig_walk; + rte_memseg_list_walk; + rte_memseg_walk; + rte_mp_request_async; + rte_mp_request_sync; + + # added in 18.08 + rte_class_find; + rte_class_find_by_name; + rte_class_register; + rte_class_unregister; + rte_dev_iterator_init; + rte_dev_iterator_next; + rte_fbarray_find_prev_free; + rte_fbarray_find_prev_n_free; + rte_fbarray_find_prev_n_used; + rte_fbarray_find_prev_used; + rte_fbarray_find_rev_contig_free; + rte_fbarray_find_rev_contig_used; + rte_memseg_contig_walk_thread_unsafe; + rte_memseg_list_walk_thread_unsafe; + rte_memseg_walk_thread_unsafe; + + # added in 18.11 + rte_delay_us_sleep; + rte_dev_event_callback_process; + rte_dev_hotplug_handle_disable; + rte_dev_hotplug_handle_enable; + rte_malloc_heap_create; + rte_malloc_heap_destroy; + rte_malloc_heap_get_socket; + rte_malloc_heap_memory_add; + rte_malloc_heap_memory_attach; + rte_malloc_heap_memory_detach; + rte_malloc_heap_memory_remove; + rte_malloc_heap_socket_is_external; + rte_mem_check_dma_mask_thread_unsafe; + rte_mem_set_dma_mask; + rte_memseg_get_fd; + rte_memseg_get_fd_offset; + rte_memseg_get_fd_offset_thread_unsafe; + rte_memseg_get_fd_thread_unsafe; + + # added in 19.02 + rte_extmem_attach; + rte_extmem_detach; + rte_extmem_register; + rte_extmem_unregister; + + # added in 19.05 + rte_dev_dma_map; + rte_dev_dma_unmap; + rte_fbarray_find_biggest_free; + rte_fbarray_find_biggest_used; + rte_fbarray_find_rev_biggest_free; + rte_fbarray_find_rev_biggest_used; + rte_intr_callback_unregister_pending; + rte_realloc_socket; + + # added in 19.08 + rte_intr_ack; + rte_lcore_cpuset; + rte_lcore_to_cpu_id; + rte_mcfg_timer_lock; + rte_mcfg_timer_unlock; + rte_rand_max; + + # added in 19.11 + rte_log_get_stream; + rte_mcfg_get_single_file_segments; + + # added in 20.02 + rte_thread_is_intr; + + # added in 20.05 + __rte_eal_trace_alarm_cancel; + __rte_eal_trace_alarm_set; + __rte_eal_trace_generic_double; + __rte_eal_trace_generic_float; + __rte_eal_trace_generic_func; + __rte_eal_trace_generic_i16; + __rte_eal_trace_generic_i32; + __rte_eal_trace_generic_i64; + __rte_eal_trace_generic_i8; + __rte_eal_trace_generic_int; + __rte_eal_trace_generic_long; + __rte_eal_trace_generic_ptr; + __rte_eal_trace_generic_str; + __rte_eal_trace_generic_u16; + __rte_eal_trace_generic_u32; + __rte_eal_trace_generic_u64; + __rte_eal_trace_generic_u8; + __rte_eal_trace_generic_void; + __rte_eal_trace_intr_callback_register; + __rte_eal_trace_intr_callback_unregister; + __rte_eal_trace_intr_enable; + __rte_eal_trace_intr_disable; + __rte_eal_trace_mem_free; + __rte_eal_trace_mem_malloc; + __rte_eal_trace_mem_realloc; + __rte_eal_trace_mem_zmalloc; + __rte_eal_trace_memzone_free; + __rte_eal_trace_memzone_lookup; + __rte_eal_trace_memzone_reserve; + __rte_eal_trace_thread_lcore_ready; + __rte_eal_trace_thread_remote_launch; + __rte_trace_mem_per_thread_alloc; + __rte_trace_point_emit_field; + __rte_trace_point_register; + per_lcore_trace_mem; + per_lcore_trace_point_sz; + rte_log_can_log; + rte_thread_getname; + rte_trace_dump; + rte_trace_is_enabled; + rte_trace_metadata_dump; + rte_trace_mode_get; + rte_trace_mode_set; + rte_trace_pattern; + rte_trace_point_disable; + rte_trace_point_enable; + rte_trace_point_is_enabled; + rte_trace_point_lookup; + rte_trace_regexp; + rte_trace_save; +}; diff --git a/src/spdk/dpdk/lib/librte_eal/windows/eal.c b/src/spdk/dpdk/lib/librte_eal/windows/eal.c new file mode 100644 index 000000000..d084606a6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/eal.c @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include <fcntl.h> +#include <io.h> +#include <share.h> +#include <sys/stat.h> + +#include <rte_debug.h> +#include <rte_eal.h> +#include <eal_memcfg.h> +#include <rte_errno.h> +#include <rte_lcore.h> +#include <eal_thread.h> +#include <eal_internal_cfg.h> +#include <eal_filesystem.h> +#include <eal_options.h> +#include <eal_private.h> + +#include "eal_windows.h" + + /* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc + */ +static int mem_cfg_fd = -1; + +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +const char * +rte_eal_get_runtime_dir(void) +{ + return runtime_dir; +} + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if we can open the file but not get a write-lock we are a secondary + * process. NOTE: if we get a file handle back, we keep that open + * and don't close it to prevent a race condition between multiple opens + */ + errno_t err = _sopen_s(&mem_cfg_fd, pathname, + _O_RDWR, _SH_DENYNO, _S_IREAD | _S_IWRITE); + if (err == 0) { + OVERLAPPED soverlapped = { 0 }; + soverlapped.Offset = sizeof(*rte_config.mem_config); + soverlapped.OffsetHigh = 0; + + HANDLE hwinfilehandle = (HANDLE)_get_osfhandle(mem_cfg_fd); + + if (!LockFileEx(hwinfilehandle, + LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, + sizeof(*rte_config.mem_config), 0, &soverlapped)) + ptype = RTE_PROC_SECONDARY; + } + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + /* Allow the application to print its usage message too + * if hook is set + */ + if (rte_application_usage_hook) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + + argvopt = argv; + + eal_reset_internal_config(&internal_config); + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? + eal_parse_common_option(opt, optarg, + &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + optind = 0; /* reset getopt lib */ +} + +/* Parse the argument given in the command line of the application */ +__attribute__((optnone)) static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + + argvopt = argv; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') { + eal_usage(prgname); + return -1; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + return -1; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on Windows\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on Windows\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on Windows\n", opt); + } + eal_usage(prgname); + return -1; + } + } + + if (eal_adjust_config(&internal_config) != 0) + return -1; + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + return -1; + } + + if (optind >= 0) + argv[optind - 1] = prgname; + ret = optind - 1; + optind = 0; /* reset getopt lib */ + return ret; +} + +static int +sync_func(void *arg __rte_unused) +{ + return 0; +} + +static void +rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + + /* Launch threads, called at application init(). */ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret; + + rte_eal_log_init(NULL, 0); + + eal_log_level_parse(argc, argv); + + /* create a map of all processors in the system */ + eal_create_cpu_map(); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + return -1; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) + exit(1); + + eal_thread_init_master(rte_config.master_lcore); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (_pipe(lcore_config[i].pipe_master2slave, + sizeof(char), _O_BINARY) < 0) + rte_panic("Cannot create pipe\n"); + if (_pipe(lcore_config[i].pipe_slave2master, + sizeof(char), _O_BINARY) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + if (eal_thread_create(&lcore_config[i].thread_id) != 0) + rte_panic("Cannot create thread\n"); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + return fctret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/windows/eal_debug.c b/src/spdk/dpdk/lib/librte_eal/windows/eal_debug.c new file mode 100644 index 000000000..669be6ff9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/eal_debug.c @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include <stdarg.h> +#include <rte_log.h> +#include <rte_debug.h> + + /* call abort(), it will generate a coredump if enabled */ +void +__rte_panic(const char *funcname, const char *format, ...) +{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + abort(); +} diff --git a/src/spdk/dpdk/lib/librte_eal/windows/eal_lcore.c b/src/spdk/dpdk/lib/librte_eal/windows/eal_lcore.c new file mode 100644 index 000000000..82ee45413 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/eal_lcore.c @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include <pthread.h> +#include <stdint.h> + +#include <rte_common.h> + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_windows.h" + +/* global data structure that contains the CPU map */ +static struct _wcpu_map { + unsigned int total_procs; + unsigned int proc_sockets; + unsigned int proc_cores; + unsigned int reserved; + struct _win_lcore_map { + uint8_t socket_id; + uint8_t core_id; + } wlcore_map[RTE_MAX_LCORE]; +} wcpu_map = { 0 }; + +/* + * Create a map of all processors and associated cores on the system + */ +void +eal_create_cpu_map() +{ + wcpu_map.total_procs = + GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); + + LOGICAL_PROCESSOR_RELATIONSHIP lprocRel; + DWORD lprocInfoSize = 0; + BOOL ht_enabled = FALSE; + + /* First get the processor package information */ + lprocRel = RelationProcessorPackage; + /* Determine the size of buffer we need (pass NULL) */ + GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize); + wcpu_map.proc_sockets = lprocInfoSize / 48; + + lprocInfoSize = 0; + /* Next get the processor core information */ + lprocRel = RelationProcessorCore; + GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize); + wcpu_map.proc_cores = lprocInfoSize / 48; + + if (wcpu_map.total_procs > wcpu_map.proc_cores) + ht_enabled = TRUE; + + /* Distribute the socket and core ids appropriately + * across the logical cores. For now, split the cores + * equally across the sockets. + */ + unsigned int lcore = 0; + for (unsigned int socket = 0; socket < + wcpu_map.proc_sockets; ++socket) { + for (unsigned int core = 0; + core < (wcpu_map.proc_cores / wcpu_map.proc_sockets); + ++core) { + wcpu_map.wlcore_map[lcore] + .socket_id = socket; + wcpu_map.wlcore_map[lcore] + .core_id = core; + lcore++; + if (ht_enabled) { + wcpu_map.wlcore_map[lcore] + .socket_id = socket; + wcpu_map.wlcore_map[lcore] + .core_id = core; + lcore++; + } + } + } +} + +/* + * Check if a cpu is present by the presence of the cpu information for it + */ +int +eal_cpu_detected(unsigned int lcore_id) +{ + return (lcore_id < wcpu_map.total_procs); +} + +/* + * Get CPU socket id for a logical core + */ +unsigned +eal_cpu_socket_id(unsigned int lcore_id) +{ + return wcpu_map.wlcore_map[lcore_id].socket_id; +} + +/* + * Get CPU socket id (NUMA node) for a logical core + */ +unsigned +eal_cpu_core_id(unsigned int lcore_id) +{ + return wcpu_map.wlcore_map[lcore_id].core_id; +} diff --git a/src/spdk/dpdk/lib/librte_eal/windows/eal_log.c b/src/spdk/dpdk/lib/librte_eal/windows/eal_log.c new file mode 100644 index 000000000..875981f13 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/eal_log.c @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include "eal_private.h" + +/* set the log to default function, called during eal init process. */ +int +rte_eal_log_init(__rte_unused const char *id, __rte_unused int facility) +{ + rte_openlog_stream(stderr); + + eal_log_set_default(stderr); + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/windows/eal_thread.c b/src/spdk/dpdk/lib/librte_eal/windows/eal_thread.c new file mode 100644 index 000000000..e149199a6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/eal_thread.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include <io.h> + +#include <rte_atomic.h> +#include <rte_debug.h> +#include <rte_launch.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_common.h> +#include <rte_memory.h> +#include <eal_thread.h> + +#include "eal_private.h" +#include "eal_windows.h" + +RTE_DEFINE_PER_LCORE(unsigned int, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned int, _socket_id) = (unsigned int)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. + */ +int +rte_eal_remote_launch(lcore_function_t *f, void *arg, unsigned int slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + + if (lcore_config[slave_id].state != WAIT) + return -EBUSY; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = _write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = _read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + return 0; +} + +void +eal_thread_init_master(unsigned int lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; +} + +static inline pthread_t +eal_thread_self(void) +{ + return GetCurrentThreadId(); +} + +/* main loop of threads */ +void * +eal_thread_loop(void *arg __rte_unused) +{ + char c; + int n, ret; + unsigned int lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = eal_thread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s])\n", + lcore_id, (uintptr_t)thread_id, cpuset); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = _read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = _write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + + /* when a service core returns, it should go directly to WAIT + * state, because the application will not lcore_wait() for it. + */ + if (lcore_config[lcore_id].core_role == ROLE_SERVICE) + lcore_config[lcore_id].state = WAIT; + else + lcore_config[lcore_id].state = FINISHED; + } +} + +/* function to create threads */ +int +eal_thread_create(pthread_t *thread) +{ + HANDLE th; + + th = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)eal_thread_loop, + NULL, 0, (LPDWORD)thread); + if (!th) + return -1; + + SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); + SetThreadPriority(th, THREAD_PRIORITY_TIME_CRITICAL); + + return 0; +} + +int +rte_thread_setname(__rte_unused pthread_t id, __rte_unused const char *name) +{ + /* TODO */ + /* This is a stub, not the expected result */ + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/windows/eal_windows.h b/src/spdk/dpdk/lib/librte_eal/windows/eal_windows.h new file mode 100644 index 000000000..fadd676b2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/eal_windows.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2020 Dmitry Kozlyuk + */ + +#ifndef _EAL_WINDOWS_H_ +#define _EAL_WINDOWS_H_ + +/** + * @file Facilities private to Windows EAL + */ + +#include <rte_windows.h> + +/** + * Create a map of processors and cores on the system. + */ +void eal_create_cpu_map(void); + +/** + * Create a thread. + * + * @param thread + * The location to store the thread id if successful. + * @return + * 0 for success, -1 if the thread is not created. + */ +int eal_thread_create(pthread_t *thread); + +#endif /* _EAL_WINDOWS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/fnmatch.c b/src/spdk/dpdk/lib/librte_eal/windows/fnmatch.c new file mode 100644 index 000000000..f622bf54c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/fnmatch.c @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 1989, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Guido van Rossum. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static const char sccsid[] = "@(#)fnmatch.c 8.2 (Berkeley) 4/16/94"; +#endif /* LIBC_SCCS and not lint */ + +/* + * Function fnmatch() as specified in POSIX 1003.2-1992, section B.6. + * Compares a filename or pathname to a pattern. + */ + +#include <ctype.h> +#include <string.h> +#include <stdio.h> + +#include "fnmatch.h" + +#define EOS '\0' + +static const char *rangematch(const char *, char, int); + +int +fnmatch(const char *pattern, const char *string, int flags) +{ + const char *stringstart; + char c, test; + + for (stringstart = string;;) + switch (c = *pattern++) { + case EOS: + if ((flags & FNM_LEADING_DIR) && *string == '/') + return (0); + return (*string == EOS ? 0 : FNM_NOMATCH); + case '?': + if (*string == EOS) + return (FNM_NOMATCH); + if (*string == '/' && (flags & FNM_PATHNAME)) + return (FNM_NOMATCH); + if (*string == '.' && (flags & FNM_PERIOD) && + (string == stringstart || + ((flags & FNM_PATHNAME) && *(string - 1) == '/'))) + return (FNM_NOMATCH); + ++string; + break; + case '*': + c = *pattern; + /* Collapse multiple stars. */ + while (c == '*') + c = *++pattern; + + if (*string == '.' && (flags & FNM_PERIOD) && + (string == stringstart || + ((flags & FNM_PATHNAME) && *(string - 1) == '/'))) + return (FNM_NOMATCH); + + /* Optimize for pattern with * at end or before /. */ + if (c == EOS) + if (flags & FNM_PATHNAME) + return ((flags & FNM_LEADING_DIR) || + strchr(string, '/') == NULL ? + 0 : FNM_NOMATCH); + else + return (0); + else if (c == '/' && flags & FNM_PATHNAME) { + string = strchr(string, '/'); + if (string == NULL) + return (FNM_NOMATCH); + break; + } + + /* General case, use recursion. */ + while ((test = *string) != EOS) { + if (!fnmatch(pattern, string, + flags & ~FNM_PERIOD)) + return (0); + if (test == '/' && flags & FNM_PATHNAME) + break; + ++string; + } + return (FNM_NOMATCH); + case '[': + if (*string == EOS) + return (FNM_NOMATCH); + if (*string == '/' && flags & FNM_PATHNAME) + return (FNM_NOMATCH); + pattern = rangematch(pattern, *string, flags); + if (pattern == NULL) + return (FNM_NOMATCH); + ++string; + break; + case '\\': + if (!(flags & FNM_NOESCAPE)) { + c = *pattern++; + if (c == EOS) { + c = '\\'; + --pattern; + } + } + /* FALLTHROUGH */ + default: + if (c == *string) + ; + else if ((flags & FNM_CASEFOLD) && + (tolower((unsigned char)c) == + tolower((unsigned char)*string))) + ; + else if ((flags & FNM_PREFIX_DIRS) && *string == EOS && + ((c == '/' && string != stringstart) || + (string == stringstart+1 && *stringstart == '/'))) + return (0); + else + return (FNM_NOMATCH); + string++; + break; + } + /* NOTREACHED */ +} + +static const char * +rangematch(const char *pattern, char test, int flags) +{ + int negate, ok; + char c, c2; + + /* + * A bracket expression starting with an unquoted circumflex + * character produces unspecified results (IEEE 1003.2-1992, + * 3.13.2). This implementation treats it like '!', for + * consistency with the regular expression syntax. + * J.T. Conklin (conklin@ngai.kaleida.com) + */ + negate = (*pattern == '!' || *pattern == '^'); + if (negate) + ++pattern; + + if (flags & FNM_CASEFOLD) + test = tolower((unsigned char)test); + + for (ok = 0; (c = *pattern++) != ']';) { + if (c == '\\' && !(flags & FNM_NOESCAPE)) + c = *pattern++; + if (c == EOS) + return (NULL); + + if (flags & FNM_CASEFOLD) + c = tolower((unsigned char)c); + + c2 = *(pattern + 1); + if (*pattern == '-' && c2 != EOS && c2 != ']') { + pattern += 2; + if (c2 == '\\' && !(flags & FNM_NOESCAPE)) + c2 = *pattern++; + if (c2 == EOS) + return (NULL); + + if (flags & FNM_CASEFOLD) + c2 = tolower((unsigned char)c2); + + if ((unsigned char)c <= (unsigned char)test && + (unsigned char)test <= (unsigned char)c2) + ok = 1; + } else if (c == test) + ok = 1; + } + return (ok == negate ? NULL : pattern); +} diff --git a/src/spdk/dpdk/lib/librte_eal/windows/getopt.c b/src/spdk/dpdk/lib/librte_eal/windows/getopt.c new file mode 100644 index 000000000..170c9b5e0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/getopt.c @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: ISC AND BSD-2-Clause + * Copyright (c) 2002 Todd C. Miller <Todd.Miller@courtesan.com> + * + * Sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F39502-99-1-0512. + */ +/* + * Copyright (c) 2000 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Dieter Baron and Thomas Klausner. + */ + +#include <getopt.h> + +#ifdef NEED_USUAL_GETOPT + +#include <string.h> +#include <stdlib.h> + +const char *optarg; /* argument associated with option */ +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ + +static void pass(void) {} +#define warnx(a, ...) pass() + +#define PRINT_ERROR ((opterr) && (*options != ':')) + +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ + +/* return values */ +#define BADCH ((int)'?') +#define BADARG ((*options == ':') ? (int)':' : (int)'?') +#define INORDER 1 + +#define EMSG "" + +static const char *place = EMSG; /* option letter processing */ + +/* XXX: set optreset to 1 rather than these two */ +static int nonopt_start = -1; /* first non option argument (for permute) */ +static int nonopt_end = -1; /* first option after non options (for permute) */ + +/* Error messages */ +static const char recargchar[] = "option requires an argument -- %c"; +static const char recargstring[] = "option requires an argument -- %s"; +static const char ambig[] = "ambiguous option -- %.*s"; +static const char noarg[] = "option doesn't take an argument -- %.*s"; +static const char illoptchar[] = "unknown option -- %c"; +static const char illoptstring[] = "unknown option -- %s"; + +/* + * Compute the greatest common divisor of a and b. + */ +static int +gcd(int a, int b) +{ + int c; + + c = a % b; + while (c != 0) { + a = b; + b = c; + c = a % b; + } + + return (b); +} + +/* + * Exchange the block from nonopt_start to nonopt_end with the block + * from nonopt_end to opt_end (keeping the same order of arguments + * in each block). + */ +static void +permute_args(int panonopt_start, int panonopt_end, int opt_end, + char **nargv) +{ + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char *swap; + + /* + * compute lengths of blocks and number and size of cycles + */ + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) { + cstart = panonopt_end+i; + pos = cstart; + for (j = 0; j < cyclelen; j++) { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char **) nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char **)nargv)[cstart] = swap; + } + } +} + +/* + * parse_long_options -- + * Parse long options in argc/argv argument vector. + * Returns -1 if short_too is set and the option does not match long_options. + */ +static int +parse_long_options(char **nargv, const char *options, + const struct option *long_options, int *idx, int short_too) +{ + const char *current_argv; + char *has_equal; + size_t current_argv_len; + int i, match; + + current_argv = place; + match = -1; + + optind++; + + has_equal = strchr(current_argv, '='); + if (has_equal != NULL) { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, + current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) { + /* exact match */ + match = i; + break; + } + /* + * If this is a known short option, don't allow + * a partial match of a single character. + */ + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int)current_argv_len, + current_argv); + optopt = 0; + return BADCH; + } + } + if (match != -1) { /* option found */ + if (long_options[match].has_arg == no_argument + && has_equal) { + if (PRINT_ERROR) + warnx(noarg, (int)current_argv_len, + current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return BADARG; + } + if (long_options[match].has_arg == required_argument || + long_options[match].has_arg == optional_argument) { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == + required_argument) { + /* + * optional argument doesn't use next nargv + */ + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) + && (optarg == NULL)) { + /* + * Missing argument; leading ':' indicates no error + * should be generated. + */ + if (PRINT_ERROR) + warnx(recargstring, + current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return BADARG; + } + } else { /* unknown option */ + if (short_too) { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return BADCH; + } + if (idx) + *idx = match; + if (long_options[match].flag) { + *long_options[match].flag = long_options[match].val; + return 0; + } else + return (long_options[match].val); +} + +/* + * getopt_internal -- + * Parse argc/argv argument vector. Called by user level routines. + */ +static int +getopt_internal(int nargc, char **nargv, const char *options, + const struct option *long_options, int *idx, int flags) +{ + char *oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; + char *buf; + size_t len; + int optreset = 0; + + if (options == NULL) + return (-1); + + /* + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + */ + if (posixly_correct == -1) + posixly_correct = _dupenv_s(&buf, &len, "POSIXLY_CORRECT"); + if (!posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + else if (*options == '-') + flags |= FLAG_ALLARGS; + if (*options == '+' || *options == '-') + options++; + if (!posixly_correct) + free(buf); + /* + * reset if requested + */ + if (optind == 0) + optind = optreset = 1; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; +start: + if (optreset || !*place) { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } else if (nonopt_start != -1) { + /* + * If we skipped non-options, set optind + * to the first of them. + */ + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + place = nargv[optind]; + if (*place != '-' || + (place[1] == '\0' && strchr(options, '-') == NULL)) { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) { + /* + * GNU extension: + * return non-option as argument to option 1 + */ + optarg = nargv[optind++]; + return INORDER; + } + if (!(flags & FLAG_PERMUTE)) { + /* + * If no permutation wanted, stop parsing + * at first non-option. + */ + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + nonopt_start = optind - + (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* + * If we have "-" do nothing, if "--" we are done. + */ + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') { + optind++; + place = EMSG; + /* + * We found an option (--), so if we skipped + * non-options, we have to permute. + */ + if (nonopt_end != -1) { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ + if (long_options != NULL && place != nargv[optind] && + (*place == '-' || (flags & FLAG_LONGONLY))) { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, + idx, short_too); + if (optchar != -1) { + place = EMSG; + return optchar; + } + } + + optchar = (int)*place++; + oli = strchr(options, optchar); + if (optchar == (int)':' || + (optchar == (int)'-' && *place != '\0') || + oli == NULL) { + /* + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). + */ + if (optchar == (int)'-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return BADCH; + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') { + /* -W long-option */ + if (*place) + ; + else if (++optind >= nargc) { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return BADARG; + } /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, + idx, 0); + place = EMSG; + return optchar; + } + if (*++oli != ':') { /* doesn't take argument */ + if (!*place) + ++optind; + } else { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') { /* arg not optional */ + if (++optind >= nargc) { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return BADARG; + } + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return optchar; +} + +/* + * getopt -- + * Parse argc/argv argument vector. + */ +int +getopt(int nargc, char *nargv[], const char *options) +{ + return getopt_internal(nargc, nargv, options, NULL, NULL, + FLAG_PERMUTE); +} + +/* + * getopt_long -- + * Parse argc/argv argument vector. + */ +int +getopt_long(int nargc, char *nargv[], const char *options, + const struct option *long_options, int *idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE)); +} + +/* + * getopt_long_only -- + * Parse argc/argv argument vector. + */ +int +getopt_long_only(int nargc, char *nargv[], const char *options, + const struct option *long_options, int *idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE|FLAG_LONGONLY)); +} + +#endif /* NEED_USUAL_GETOPT */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/dirent.h b/src/spdk/dpdk/lib/librte_eal/windows/include/dirent.h new file mode 100644 index 000000000..869a59837 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/dirent.h @@ -0,0 +1,664 @@ +/* SPDX-License-Identifier: MIT + * Dirent interface for Microsoft Visual Studio + * Version 1.21 + * Copyright (C) 2006-2012 Toni Ronkko + * https://github.com/tronkko/dirent + */ + +#ifndef DIRENT_H +#define DIRENT_H + +/* + * Include windows.h without Windows Sockets 1.1 to prevent conflicts with + * Windows Sockets 2.0. + */ +#ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +#endif + +#include <windows.h> + +#include <stdio.h> +#include <stdarg.h> +#include <wchar.h> +#include <string.h> +#include <stdlib.h> +#include <malloc.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <errno.h> + +/* Maximum length of file name */ +#if !defined(PATH_MAX) +# define PATH_MAX MAX_PATH +#endif + +/* File type flags for d_type */ +#define DT_UNKNOWN 0 +#define DT_REG S_IFREG +#define DT_DIR S_IFDIR +#define DT_CHR S_IFCHR + +/* + * File type macros. Note that block devices, sockets and links cannot be + * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are + * only defined for compatibility. These macros should always return false + * on Windows. + */ +#if !defined(S_ISDIR) +# define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) +#endif +#if !defined(S_ISREG) +# define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) +#endif + +/* Wide-character version */ +struct _wdirent { + /* Always zero */ + long d_ino; + + /* Structure size */ + unsigned short d_reclen; + + /* Length of name without \0 */ + size_t d_namlen; + + /* File type */ + int d_type; + + /* File name */ + wchar_t d_name[PATH_MAX]; +}; +typedef struct _wdirent _wdirent; + +struct _WDIR { + /* Current directory entry */ + struct _wdirent ent; + + /* Private file data */ + WIN32_FIND_DATAW data; + + /* True if data is valid */ + int cached; + + /* Win32 search handle */ + HANDLE handle; + + /* Initial directory name */ + wchar_t *patt; +}; +typedef struct _WDIR _WDIR; + +static _WDIR *_wopendir(const wchar_t *dirname); +static int _wclosedir(_WDIR *dirp); + +/* For compatibility with Symbian */ +#define wdirent _wdirent +#define WDIR _WDIR +#define wopendir _wopendir +#define wclosedir _wclosedir + +/* Multi-byte character versions */ +struct dirent { + /* Always zero */ + long d_ino; + + /* Structure size */ + unsigned short d_reclen; + + /* Length of name without \0 */ + size_t d_namlen; + + /* File type */ + int d_type; + + /* File name */ + char d_name[PATH_MAX]; +}; +typedef struct dirent dirent; + +struct DIR { + struct dirent ent; + struct _WDIR *wdirp; +}; +typedef struct DIR DIR; + +static DIR *opendir(const char *dirname); +static struct dirent *readdir(DIR *dirp); +static int closedir(DIR *dirp); + +/* Internal utility functions */ +static WIN32_FIND_DATAW *dirent_first(_WDIR *dirp); +static WIN32_FIND_DATAW *dirent_next(_WDIR *dirp); + +static int dirent_mbstowcs_s( + size_t *pReturnValue, + wchar_t *wcstr, + size_t sizeInWords, + const char *mbstr, + size_t count); + +static int dirent_wcstombs_s( + size_t *pReturnValue, + char *mbstr, + size_t sizeInBytes, + const wchar_t *wcstr, + size_t count); + +static void dirent_set_errno(int error); + +/* + * Open directory stream DIRNAME for read and return a pointer to the + * internal working area that is used to retrieve individual directory + * entries. + */ +static _WDIR* +_wopendir(const wchar_t *dirname) +{ + _WDIR *dirp = NULL; + int error; + + /* Must have directory name */ + if (dirname == NULL || dirname[0] == '\0') { + dirent_set_errno(ENOENT); + return NULL; + } + + /* Allocate new _WDIR structure */ + dirp = (_WDIR *)malloc(sizeof(struct _WDIR)); + if (dirp != NULL) { + DWORD n; + + /* Reset _WDIR structure */ + dirp->handle = INVALID_HANDLE_VALUE; + dirp->patt = NULL; + dirp->cached = 0; + + /* Compute the length of full path plus zero terminator + * + * Note that on WinRT there's no way to convert relative paths + * into absolute paths, so just assume its an absolute path. + */ + #if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP) + n = wcslen(dirname); + #else + n = GetFullPathNameW(dirname, 0, NULL, NULL); + #endif + + /* Allocate room for absolute directory name and search + * pattern + */ + dirp->patt = (wchar_t *)malloc(sizeof(wchar_t) * n + 16); + if (dirp->patt) { + /* Convert relative directory name to an + * absolute one. This allows rewinddir() to + * function correctly even when current working + * directory is changed between opendir() + * and rewinddir(). + * + * Note that on WinRT there's no way to convert + * relative paths into absolute paths, so just + * assume its an absolute path. + */ + #if defined(WINAPI_FAMILY) && \ + (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP) + wcsncpy_s(dirp->patt, n + 1, dirname, n); + #else + n = GetFullPathNameW(dirname, n, dirp->patt, NULL); + #endif + if (n > 0) { + wchar_t *p; + + /* Append search pattern \* to the directory + * name + */ + p = dirp->patt + n; + if (dirp->patt < p) { + switch (p[-1]) { + case '\\': + case '/': + case ':': + /* Directory ends in path separator, + * e.g.c:\temp\ + */ + /*NOP*/; + break; + + default: + /* Directory name doesn't end in path + * separator + */ + *p++ = '\\'; + } + } + *p++ = '*'; + *p = '\0'; + + /* Open directory stream and retrieve the first + * entry + */ + if (dirent_first(dirp)) { + /* Directory stream opened successfully */ + error = 0; + } else { + /* Cannot retrieve first entry */ + error = 1; + dirent_set_errno(ENOENT); + } + + } else { + /* Cannot retrieve full path name */ + dirent_set_errno(ENOENT); + error = 1; + } + + } else { + /* Cannot allocate memory for search pattern */ + error = 1; + } + + } else { + /* Cannot allocate _WDIR structure */ + error = 1; + } + + /* Clean up in case of error */ + if (error && dirp) { + _wclosedir(dirp); + dirp = NULL; + } + + return dirp; +} + +/* + * Close directory stream opened by opendir() function. + * This invalidates the DIR structure as well as any directory + * entry read previously by _wreaddir(). + */ +static int +_wclosedir(_WDIR *dirp) +{ + int ok; + if (dirp) { + + /* Release search handle */ + if (dirp->handle != INVALID_HANDLE_VALUE) { + FindClose(dirp->handle); + dirp->handle = INVALID_HANDLE_VALUE; + } + + /* Release search pattern */ + if (dirp->patt) { + free(dirp->patt); + dirp->patt = NULL; + } + + /* Release directory structure */ + free(dirp); + ok = /*success*/0; + + } else { + /* Invalid directory stream */ + dirent_set_errno(EBADF); + ok = /*failure*/-1; + } + return ok; +} + +/* Get first directory entry (internal) */ +static WIN32_FIND_DATAW* +dirent_first(_WDIR *dirp) +{ + WIN32_FIND_DATAW *datap; + + /* Open directory and retrieve the first entry */ + dirp->handle = FindFirstFileExW( + dirp->patt, FindExInfoStandard, &dirp->data, + FindExSearchNameMatch, NULL, 0); + if (dirp->handle != INVALID_HANDLE_VALUE) { + + /* a directory entry is now waiting in memory */ + datap = &dirp->data; + dirp->cached = 1; + + } else { + + /* Failed to re-open directory: no directory entry in memory */ + dirp->cached = 0; + datap = NULL; + + } + return datap; +} + +/* Get next directory entry (internal) */ +static WIN32_FIND_DATAW* +dirent_next(_WDIR *dirp) +{ + WIN32_FIND_DATAW *p; + + /* Get next directory entry */ + if (dirp->cached != 0) { + + /* A valid directory entry already in memory */ + p = &dirp->data; + dirp->cached = 0; + + } else if (dirp->handle != INVALID_HANDLE_VALUE) { + + /* Get the next directory entry from stream */ + if (FindNextFileW(dirp->handle, &dirp->data) != FALSE) { + /* Got a file */ + p = &dirp->data; + } else { + /* The very last entry has been processed + *or an error occurred + */ + FindClose(dirp->handle); + dirp->handle = INVALID_HANDLE_VALUE; + p = NULL; + } + + } else { + + /* End of directory stream reached */ + p = NULL; + + } + + return p; +} + +/* + * Open directory stream using plain old C-string. + */ +static DIR* +opendir(const char *dirname) +{ + struct DIR *dirp; + int error; + + /* Must have directory name */ + if (dirname == NULL || dirname[0] == '\0') { + dirent_set_errno(ENOENT); + return NULL; + } + + /* Allocate memory for DIR structure */ + dirp = (DIR *)malloc(sizeof(struct DIR)); + if (dirp) { + wchar_t wname[PATH_MAX]; + size_t n; + + /* Convert directory name to wide-character string */ + error = dirent_mbstowcs_s(&n, wname, PATH_MAX, + dirname, PATH_MAX); + if (!error) { + + /* Open directory stream using wide-character name */ + dirp->wdirp = _wopendir(wname); + if (dirp->wdirp) { + /* Directory stream opened */ + error = 0; + } else { + /* Failed to open directory stream */ + error = 1; + } + + } else { + /* + * Cannot convert file name to wide-character string. + * This occurs if the string contains invalid multi-byte + * sequences or the output buffer is too small to + * contain the resulting string. + */ + error = 1; + } + + } else { + /* Cannot allocate DIR structure */ + error = 1; + } + + /* Clean up in case of error */ + if (error && dirp) { + free(dirp); + dirp = NULL; + } + + return dirp; +} + +/* + * Read next directory entry. + * + * When working with text consoles, please note that file names + * returned by readdir() are represented in the default ANSI code + * page while any output toconsole is typically formatted on another + * code page. Thus, non-ASCII characters in file names will not usually + * display correctly on console. The problem can be fixed in two ways: + * (1) change the character set of console to 1252 using chcp utility + * and use Lucida Console font, or (2) use _cprintf function when + * writing to console. The _cprinf() will re-encode ANSI strings to the + * console code page so many non-ASCII characters will display correctly. + */ +static struct dirent* +readdir(DIR *dirp) +{ + WIN32_FIND_DATAW *datap; + struct dirent *entp; + + /* Read next directory entry */ + datap = dirent_next(dirp->wdirp); + if (datap) { + size_t n; + int error; + + /* Attempt to convert file name to multi-byte string */ + error = dirent_wcstombs_s(&n, dirp->ent.d_name, + PATH_MAX, datap->cFileName, PATH_MAX); + + /* + * If the file name cannot be represented by a multi-byte + * string, then attempt to use old 8+3 file name. + * This allows traditional Unix-code to access some file + * names despite of unicode characters, although file names + * may seem unfamiliar to the user. + * + * Be ware that the code below cannot come up with a short + * file name unless the file system provides one. At least + * VirtualBox shared folders fail to do this. + */ + if (error && datap->cAlternateFileName[0] != '\0') { + error = dirent_wcstombs_s( + &n, dirp->ent.d_name, PATH_MAX, + datap->cAlternateFileName, PATH_MAX); + } + + if (!error) { + DWORD attr; + + /* Initialize directory entry for return */ + entp = &dirp->ent; + + /* Length of file name excluding zero terminator */ + entp->d_namlen = n - 1; + + /* File attributes */ + attr = datap->dwFileAttributes; + if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) + entp->d_type = DT_CHR; + else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) + entp->d_type = DT_DIR; + else + entp->d_type = DT_REG; + + /* Reset dummy fields */ + entp->d_ino = 0; + entp->d_reclen = sizeof(struct dirent); + + } else { + /* + * Cannot convert file name to multi-byte string so + * construct an erroneous directory entry and return + * that. Note that we cannot return NULL as that would + * stop the processing of directory entries completely. + */ + entp = &dirp->ent; + entp->d_name[0] = '?'; + entp->d_name[1] = '\0'; + entp->d_namlen = 1; + entp->d_type = DT_UNKNOWN; + entp->d_ino = 0; + entp->d_reclen = 0; + } + + } else { + /* No more directory entries */ + entp = NULL; + } + + return entp; +} + +/* + * Close directory stream. + */ +static int +closedir(DIR *dirp) +{ + int ok; + if (dirp) { + + /* Close wide-character directory stream */ + ok = _wclosedir(dirp->wdirp); + dirp->wdirp = NULL; + + /* Release multi-byte character version */ + free(dirp); + + } else { + + /* Invalid directory stream */ + dirent_set_errno(EBADF); + ok = /*failure*/-1; + + } + return ok; +} + +/* Convert multi-byte string to wide character string */ +static int +dirent_mbstowcs_s( + size_t *pReturnValue, + wchar_t *wcstr, + size_t sizeInWords, + const char *mbstr, + size_t count) +{ + int error; + + #if defined(_MSC_VER) && _MSC_VER >= 1400 + /* Microsoft Visual Studio 2005 or later */ + error = mbstowcs_s(pReturnValue, wcstr, + sizeInWords, mbstr, count); + #else + + /* Older Visual Studio or non-Microsoft compiler */ + size_t n; + + /* Convert to wide-character string (or count characters) */ + n = mbstowcs(wcstr, mbstr, sizeInWords); + if (!wcstr || n < count) { + + /* Zero-terminate output buffer */ + if (wcstr && sizeInWords) { + if (n >= sizeInWords) + n = sizeInWords - 1; + wcstr[n] = 0; + } + + /* Length of resuting multi-byte string WITH zero + *terminator + */ + if (pReturnValue) + *pReturnValue = n + 1; + + /* Success */ + error = 0; + + } else { + + /* Could not convert string */ + error = 1; + + } + #endif + + return error; +} + +/* Convert wide-character string to multi-byte string */ +static int +dirent_wcstombs_s( + size_t *pReturnValue, + char *mbstr, + size_t sizeInBytes, /* max size of mbstr */ + const wchar_t *wcstr, + size_t count) +{ + int error; + + #if defined(_MSC_VER) && _MSC_VER >= 1400 + /* Microsoft Visual Studio 2005 or later */ + error = wcstombs_s(pReturnValue, mbstr, sizeInBytes, wcstr, count); + #else + /* Older Visual Studio or non-Microsoft compiler */ + size_t n; + + /* Convert to multi-byte string + * (or count the number of bytes needed) + */ + n = wcstombs(mbstr, wcstr, sizeInBytes); + if (!mbstr || n < count) { + /* Zero-terminate output buffer */ + if (mbstr && sizeInBytes) { + if (n >= sizeInBytes) + n = sizeInBytes - 1; + mbstr[n] = '\0'; + } + /* Length of resulting multi-bytes string WITH + *zero-terminator + */ + if (pReturnValue) + *pReturnValue = n + 1; + /* Success */ + error = 0; + } else { + /* Cannot convert string */ + error = 1; + } + #endif + + return error; +} + +/* Set errno variable */ +static void +dirent_set_errno(int error) +{ +#if defined(_MSC_VER) && _MSC_VER >= 1400 + /* Microsoft Visual Studio 2005 and later */ + _set_errno(error); +#else + + /* Non-Microsoft compiler or older Microsoft compiler */ + errno = error; +#endif +} + +#ifdef __cplusplus +} +#endif +#endif /*DIRENT_H*/ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/fnmatch.h b/src/spdk/dpdk/lib/librte_eal/windows/include/fnmatch.h new file mode 100644 index 000000000..142753c35 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/fnmatch.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _FNMATCH_H_ +#define _FNMATCH_H_ + +/** + * This file is required to support the common code in eal_common_log.c + * as Microsoft libc does not contain fnmatch.h. This may be removed in + * future releases. + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> + +#define FNM_NOMATCH 1 + +#define FNM_NOESCAPE 0x01 +#define FNM_PATHNAME 0x02 +#define FNM_PERIOD 0x04 +#define FNM_LEADING_DIR 0x08 +#define FNM_CASEFOLD 0x10 +#define FNM_PREFIX_DIRS 0x20 + +/** + * This function is used for searhing a given string source + * with the given regular expression pattern. + * + * @param pattern + * regular expression notation decribing the pattern to match + * + * @param string + * source string to searcg for the pattern + * + * @param flag + * containing information about the pattern + * + * @return + * if the pattern is found then return 0 or else FNM_NOMATCH + */ +int fnmatch(const char *pattern, const char *string, int flags); + +#ifdef __cplusplus +} +#endif + +#endif /* _FNMATCH_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/getopt.h b/src/spdk/dpdk/lib/librte_eal/windows/include/getopt.h new file mode 100644 index 000000000..6f57af454 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/getopt.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: BSD-2-Clause + * Copyright (c) 2000 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Dieter Baron and Thomas Klausner. + */ + +/** + * @file + * getopt compat. + * + * This module provides getopt() and getopt_long(). + */ + +#ifndef _USUAL_GETOPT_H_ +#define _USUAL_GETOPT_H_ + +#ifndef NEED_USUAL_GETOPT +#if !defined(HAVE_GETOPT_H) || !defined(HAVE_GETOPT) || \ + !defined(HAVE_GETOPT_LONG) +#define NEED_USUAL_GETOPT +#endif +#endif + +#ifndef NEED_USUAL_GETOPT + +/* Use system getopt */ +#ifdef RTE_TOOLCHAIN_GCC +#include_next <getopt.h> +#else +#include <getopt.h> +#endif + +#else /* NEED_USUAL_GETOPT */ + +/* avoid name collision */ +#define optarg usual_optarg +#define opterr usual_opterr +#define optind usual_optind +#define optopt usual_optopt +#define getopt(a, b, c) usual_getopt(a, b, c) +#define getopt_long(a, b, c, d, e) usual_getopt_long(a, b, c, d, e) + + +/** argument to current option, or NULL if it has none */ +extern const char *optarg; +/** Current position in arg string. Starts from 1. + * Setting to 0 resets state. + */ +extern int optind; +/** whether getopt() should print error messages on problems. Default: 1. */ +extern int opterr; +/** Option char which caused error */ +extern int optopt; + +/** long option takes no argument */ +#define no_argument 0 +/** long option requires argument */ +#define required_argument 1 +/** long option has optional argument */ +#define optional_argument 2 + +/** Long option description */ +struct option { + /** name of long option */ + const char *name; + + /** + * whether option takes an argument. + * One of no_argument, required_argument, and optional_argument. + */ + int has_arg; + + /** if not NULL, set *flag to val when option found */ + int *flag; + + /** if flag not NULL, value to set *flag to; else return value */ + int val; +}; + +/** Compat: getopt */ +int getopt(int argc, char *argv[], const char *options); + +/** Compat: getopt_long */ +int getopt_long(int argc, char *argv[], const char *options, + const struct option *longopts, int *longindex); + +/** Compat: getopt_long_only */ +int getopt_long_only(int nargc, char *argv[], const char *options, + const struct option *long_options, int *idx); + + +#endif /* NEED_USUAL_GETOPT */ + +#endif /* !_USUAL_GETOPT_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/meson.build b/src/spdk/dpdk/lib/librte_eal/windows/include/meson.build new file mode 100644 index 000000000..5fb1962ac --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020 Mellanox Technologies, Ltd + +includes += include_directories('.') + +headers += files( + 'rte_os.h', + 'rte_windows.h', +) diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/pthread.h b/src/spdk/dpdk/lib/librte_eal/windows/include/pthread.h new file mode 100644 index 000000000..0bbed5c3b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/pthread.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _PTHREAD_H_ +#define _PTHREAD_H_ + +#include <stdint.h> + +/** + * This file is required to support the common code in eal_common_proc.c, + * eal_common_thread.c and common\include\rte_per_lcore.h as Microsoft libc + * does not contain pthread.h. This may be removed in future releases. + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include <windows.h> +#include <rte_common.h> + +#define PTHREAD_BARRIER_SERIAL_THREAD TRUE + +/* defining pthread_t type on Windows since there is no in Microsoft libc*/ +typedef uintptr_t pthread_t; + +/* defining pthread_attr_t type on Windows since there is no in Microsoft libc*/ +typedef void *pthread_attr_t; + +typedef SYNCHRONIZATION_BARRIER pthread_barrier_t; + +#define pthread_barrier_init(barrier, attr, count) \ + InitializeSynchronizationBarrier(barrier, count, -1) +#define pthread_barrier_wait(barrier) EnterSynchronizationBarrier(barrier, \ + SYNCHRONIZATION_BARRIER_FLAGS_BLOCK_ONLY) +#define pthread_barrier_destroy(barrier) \ + DeleteSynchronizationBarrier(barrier) +#define pthread_cancel(thread) TerminateThread((HANDLE) thread, 0) + +/* pthread function overrides */ +#define pthread_self() \ + ((pthread_t)GetCurrentThreadId()) +#define pthread_setaffinity_np(thread, size, cpuset) \ + eal_set_thread_affinity_mask(thread, (unsigned long *) cpuset) +#define pthread_getaffinity_np(thread, size, cpuset) \ + eal_get_thread_affinity_mask(thread, (unsigned long *) cpuset) +#define pthread_create(threadid, threadattr, threadfunc, args) \ + eal_create_thread(threadid, threadfunc, args) + +static inline int +eal_set_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset) +{ + SetThreadAffinityMask((HANDLE) threadid, *cpuset); + return 0; +} + +static inline int +eal_get_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset) +{ + /* Workaround for the lack of a GetThreadAffinityMask() + *API in Windows + */ + /* obtain previous mask by setting dummy mask */ + DWORD dwprevaffinitymask = + SetThreadAffinityMask((HANDLE) threadid, 0x1); + /* set it back! */ + SetThreadAffinityMask((HANDLE) threadid, dwprevaffinitymask); + *cpuset = dwprevaffinitymask; + return 0; +} + +static inline int +eal_create_thread(void *threadid, void *threadfunc, void *args) +{ + HANDLE hThread; + hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)threadfunc, + args, 0, (LPDWORD)threadid); + if (hThread) { + SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); + SetThreadPriority(hThread, THREAD_PRIORITY_TIME_CRITICAL); + } + return ((hThread != NULL) ? 0 : E_FAIL); +} + +static inline int +pthread_join(__rte_unused pthread_t thread, + __rte_unused void **value_ptr) +{ + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _PTHREAD_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/regex.h b/src/spdk/dpdk/lib/librte_eal/windows/include/regex.h new file mode 100644 index 000000000..827f93841 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/regex.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _REGEX_H_ +#define _REGEX_H_ + +/** + * This file is required to support the common code in eal_common_log.c + * as Microsoft libc does not contain regex.h. This may be removed in + * future releases. + */ +#ifdef __cplusplus +extern "C" { +#endif + +#define REG_NOMATCH 1 +#define REG_ESPACE 12 + +#include <rte_common.h> + +/* defining regex_t for Windows */ +typedef void *regex_t; +/* defining regmatch_t for Windows */ +typedef void *regmatch_t; + +/** + * The regcomp() function will compile the regular expression + * contained in the string pointed to by the pattern argument + * and place the results in the structure pointed to by preg. + * The cflags argument is the bitwise inclusive OR of zero or + * more of the flags + */ +static inline int regcomp(__rte_unused regex_t *preg, + __rte_unused const char *regex, __rte_unused int cflags) +{ + /* TODO */ + /* This is a stub, not the expected result */ + return REG_ESPACE; +} + +/** + * The regexec() function compares the null-terminated string + * specified by string with the compiled regular expression + * preg initialised by a previous call to regcomp(). If it finds + * a match, regexec() returns 0; otherwise it returns non-zero + * indicating either no match or an error. The eflags argument + * is the bitwise inclusive OR of zero or more of the flags. + */ +static inline int regexec(__rte_unused const regex_t *preg, + __rte_unused const char *string, __rte_unused size_t nmatch, + __rte_unused regmatch_t pmatch[], __rte_unused int eflags) +{ + /* TODO */ + /* This is a stub, not the expected result */ + return REG_NOMATCH; +} + +/** + * The regerror() function provides a mapping from error codes + * returned by regcomp() and regexec() to unspecified printable strings. + */ +static inline size_t regerror(__rte_unused int errcode, + __rte_unused const regex_t *preg, char *errbuf, + __rte_unused size_t errbuf_size) +{ + /* TODO */ + /* This is a stub, not the expected result */ + if (errbuf) { + *errbuf = '\0'; + return 1; + } + return 0; +} + +/** + * The regfree() function frees any memory allocated by regcomp() + * associated with preg. + */ +static inline void regfree(__rte_unused regex_t *preg) +{ + /* TODO */ + /* This is a stub, not the expected result */ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _REGEX_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/rte_os.h b/src/spdk/dpdk/lib/librte_eal/windows/include/rte_os.h new file mode 100644 index 000000000..510e39e03 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/rte_os.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_OS_H_ +#define _RTE_OS_H_ + +/** + * This is header should contain any function/macro definition + * which are not supported natively or named differently in the + * Windows OS. It must not include Windows-specific headers. + */ + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* limits.h replacement, value as in <windows.h> */ +#ifndef PATH_MAX +#define PATH_MAX _MAX_PATH +#endif + +#define strerror_r(a, b, c) strerror_s(b, c, a) + +/* strdup is deprecated in Microsoft libc and _strdup is preferred */ +#define strdup(str) _strdup(str) + +#define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr) + +#define index(a, b) strchr(a, b) +#define rindex(a, b) strrchr(a, b) + +#define strncasecmp(s1, s2, count) _strnicmp(s1, s2, count) + +/* cpu_set macros implementation */ +#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2) +#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2) +#define RTE_CPU_FILL(set) CPU_FILL(set) +#define RTE_CPU_NOT(dst, src) CPU_NOT(dst, src) + +/* as in <windows.h> */ +typedef long long ssize_t; + +#ifndef RTE_TOOLCHAIN_GCC +static inline int +asprintf(char **buffer, const char *format, ...) +{ + int size, ret; + va_list arg; + + va_start(arg, format); + size = vsnprintf(NULL, 0, format, arg); + va_end(arg); + if (size < 0) + return -1; + size++; + + *buffer = malloc(size); + if (*buffer == NULL) + return -1; + + va_start(arg, format); + ret = vsnprintf(*buffer, size, format, arg); + va_end(arg); + if (ret != size - 1) { + free(*buffer); + return -1; + } + return ret; +} +#endif /* RTE_TOOLCHAIN_GCC */ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_OS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/rte_windows.h b/src/spdk/dpdk/lib/librte_eal/windows/include/rte_windows.h new file mode 100644 index 000000000..ed6e4c148 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/rte_windows.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2020 Dmitry Kozlyuk + */ + +#ifndef _RTE_WINDOWS_H_ +#define _RTE_WINDOWS_H_ + +/** + * @file Windows-specific facilities + * + * This file should be included by DPDK libraries and applications + * that need access to Windows API. It includes platform SDK headers + * in compatible order with proper options and defines error-handling macros. + */ + +/* Disable excessive libraries. */ +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif + +/* Must come first. */ +#include <windows.h> + +#include <basetsd.h> +#include <psapi.h> + +/* Have GUIDs defined. */ +#ifndef INITGUID +#define INITGUID +#endif +#include <initguid.h> + +/** + * Log GetLastError() with context, usually a Win32 API function and arguments. + */ +#define RTE_LOG_WIN32_ERR(...) \ + RTE_LOG(DEBUG, EAL, RTE_FMT("GetLastError()=%lu: " \ + RTE_FMT_HEAD(__VA_ARGS__,) "\n", GetLastError(), \ + RTE_FMT_TAIL(__VA_ARGS__,))) + +#endif /* _RTE_WINDOWS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/sched.h b/src/spdk/dpdk/lib/librte_eal/windows/include/sched.h new file mode 100644 index 000000000..fbe07f742 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/sched.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _SCHED_H_ +#define _SCHED_H_ + +/** + * This file is added to support the common code in eal_common_thread.c + * as Microsoft libc does not contain sched.h. This may be removed + * in future releases. + */ +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef CPU_SETSIZE +#define CPU_SETSIZE RTE_MAX_LCORE +#endif + +#define _BITS_PER_SET (sizeof(long long) * 8) +#define _BIT_SET_MASK (_BITS_PER_SET - 1) + +#define _NUM_SETS(b) (((b) + _BIT_SET_MASK) / _BITS_PER_SET) +#define _WHICH_SET(b) ((b) / _BITS_PER_SET) +#define _WHICH_BIT(b) ((b) & (_BITS_PER_SET - 1)) + +typedef struct _rte_cpuset_s { + long long _bits[_NUM_SETS(CPU_SETSIZE)]; +} rte_cpuset_t; + +#define CPU_SET(b, s) ((s)->_bits[_WHICH_SET(b)] |= (1LL << _WHICH_BIT(b))) + +#define CPU_ZERO(s) \ + do { \ + unsigned int _i; \ + \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (s)->_bits[_i] = 0LL; \ + } while (0) + +#define CPU_ISSET(b, s) (((s)->_bits[_WHICH_SET(b)] & \ + (1LL << _WHICH_BIT(b))) != 0LL) + +static inline int +count_cpu(rte_cpuset_t *s) +{ + unsigned int _i; + int count = 0; + + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) + if (CPU_ISSET(_i, s) != 0LL) + count++; + return count; +} +#define CPU_COUNT(s) count_cpu(s) + +#define CPU_AND(dst, src1, src2) \ +do { \ + unsigned int _i; \ + \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (dst)->_bits[_i] = (src1)->_bits[_i] & (src2)->_bits[_i]; \ +} while (0) + +#define CPU_OR(dst, src1, src2) \ +do { \ + unsigned int _i; \ + \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (dst)->_bits[_i] = (src1)->_bits[_i] | (src2)->_bits[_i]; \ +} while (0) + +#define CPU_FILL(s) \ +do { \ + unsigned int _i; \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (s)->_bits[_i] = -1LL; \ +} while (0) + +#define CPU_NOT(dst, src) \ +do { \ + unsigned int _i; \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (dst)->_bits[_i] = (src)->_bits[_i] ^ -1LL; \ +} while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* _SCHED_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/sys/queue.h b/src/spdk/dpdk/lib/librte_eal/windows/include/sys/queue.h new file mode 100644 index 000000000..a65949a78 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/sys/queue.h @@ -0,0 +1,302 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +/* + * This file defines tail queues. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * Below is a summary of implemented functions where: + * + means the macro is available + * - means the macro is not available + * s means the macro is available but is slow (runs in O(n) time) + * + * TAILQ + * _HEAD + + * _CLASS_HEAD + + * _HEAD_INITIALIZER + + * _ENTRY + + * _CLASS_ENTRY + + * _INIT + + * _EMPTY + + * _FIRST + + * _NEXT + + * _PREV + + * _LAST + + * _LAST_FAST + + * _FOREACH + + * _FOREACH_FROM + + * _FOREACH_SAFE + + * _FOREACH_FROM_SAFE + + * _FOREACH_REVERSE + + * _FOREACH_REVERSE_FROM + + * _FOREACH_REVERSE_SAFE + + * _FOREACH_REVERSE_FROM_SAFE + + * _INSERT_HEAD + + * _INSERT_BEFORE + + * _INSERT_AFTER + + * _INSERT_TAIL + + * _CONCAT + + * _REMOVE_AFTER - + * _REMOVE_HEAD - + * _REMOVE + + * _SWAP + + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * List definitions. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define TRACEBUF +#define TRACEBUF_INITIALIZER + +#define TRASHIT(x) +#define QMD_IS_TRASHED(x) 0 + +#define QMD_SAVELINK(name, link) + +#ifdef __cplusplus +/* + * In C++ there can be structure lists and class lists: + */ +#define QUEUE_TYPEOF(type) type +#else +#define QUEUE_TYPEOF(type) struct type +#endif + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_HEAD(name, type) \ +struct name { \ + class type *tqh_first; /* first element */ \ + class type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first, TRACEBUF_INITIALIZER } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_ENTRY(type) \ +struct { \ + class type *tqe_next; /* next element */ \ + class type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions. + */ +#define QMD_TAILQ_CHECK_HEAD(head, field) +#define QMD_TAILQ_CHECK_TAIL(head, headname) +#define QMD_TAILQ_CHECK_NEXT(elm, field) +#define QMD_TAILQ_CHECK_PREV(elm, field) + +#define TAILQ_CONCAT(head1, head2, field) do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head1); \ + QMD_TRACE_HEAD(head2); \ + } \ +} while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_FROM(var, head, headname, field) \ + for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE_FROM_SAFE(var, head, headname, field, tvar) \ + for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + QMD_TAILQ_CHECK_NEXT(listelm, field); \ + TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field); \ + if (TAILQ_NEXT((listelm), field) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_TAILQ_CHECK_PREV(listelm, field); \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + QMD_TAILQ_CHECK_HEAD(head, field); \ + TAILQ_NEXT((elm), field) = TAILQ_FIRST((head)); \ + if (TAILQ_FIRST((head)) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + QMD_TAILQ_CHECK_TAIL(head, field); \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +/* + * The FAST function is fast in that it causes no data access other + * then the access to the head. The standard LAST function above + * will cause a data access of both the element you want and + * the previous element. FAST is very useful for instances when + * you may want to prefetch the last data element. + */ +#define TAILQ_LAST_FAST(head, type, field) \ + (TAILQ_EMPTY(head) ? NULL : __containerof((head)->tqh_last, \ + QUEUE_TYPEOF(type), field.tqe_next)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \ + QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \ + QMD_TAILQ_CHECK_NEXT(elm, field); \ + QMD_TAILQ_CHECK_PREV(elm, field); \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else { \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + TRASHIT(*oldnext); \ + TRASHIT(*oldprev); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_SWAP(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) * swap_first = (head1)->tqh_first; \ + QUEUE_TYPEOF(type) * *swap_last = (head1)->tqh_last; \ + (head1)->tqh_first = (head2)->tqh_first; \ + (head1)->tqh_last = (head2)->tqh_last; \ + (head2)->tqh_first = swap_first; \ + (head2)->tqh_last = swap_last; \ + swap_first = (head1)->tqh_first; \ + if (swap_first != NULL) \ + swap_first->field.tqe_prev = &(head1)->tqh_first; \ + else \ + (head1)->tqh_last = &(head1)->tqh_first; \ + swap_first = (head2)->tqh_first; \ + if (swap_first != NULL) \ + swap_first->field.tqe_prev = &(head2)->tqh_first; \ + else \ + (head2)->tqh_last = &(head2)->tqh_first; \ +} while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_QUEUE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/include/unistd.h b/src/spdk/dpdk/lib/librte_eal/windows/include/unistd.h new file mode 100644 index 000000000..757b7f3c5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/include/unistd.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _UNISTD_H_ +#define _UNISTD_H_ +/** + * This file is added to support common code in eal_common_lcore.c + * as Microsoft libc does not contain unistd.h. This may be removed + * in future releases. + */ +#endif /* _UNISTD_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/windows/meson.build b/src/spdk/dpdk/lib/librte_eal/windows/meson.build new file mode 100644 index 000000000..adfc8b9b7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/windows/meson.build @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2019 Intel Corporation + +subdir('include') + +sources += files( + 'eal.c', + 'eal_debug.c', + 'eal_lcore.c', + 'eal_log.c', + 'eal_thread.c', + 'fnmatch.c', + 'getopt.c', +) diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/meson.build b/src/spdk/dpdk/lib/librte_eal/x86/include/meson.build new file mode 100644 index 000000000..f0e998c2f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/meson.build @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +arch_headers = files( + 'rte_atomic_32.h', + 'rte_atomic_64.h', + 'rte_atomic.h', + 'rte_byteorder_32.h', + 'rte_byteorder_64.h', + 'rte_byteorder.h', + 'rte_cpuflags.h', + 'rte_cycles.h', + 'rte_io.h', + 'rte_memcpy.h', + 'rte_prefetch.h', + 'rte_pause.h', + 'rte_rtm.h', + 'rte_rwlock.h', + 'rte_spinlock.h', + 'rte_vect.h', +) +install_headers(arch_headers, subdir: get_option('include_subdir_arch')) diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_atomic.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_atomic.h new file mode 100644 index 000000000..b9dcd30ab --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_atomic.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ATOMIC_X86_H_ +#define _RTE_ATOMIC_X86_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <rte_common.h> +#include <rte_config.h> +#include <emmintrin.h> +#include "generic/rte_atomic.h" + +#if RTE_MAX_LCORE == 1 +#define MPLOCKED /**< No need to insert MP lock prefix. */ +#else +#define MPLOCKED "lock ; " /**< Insert MP lock prefix. */ +#endif + +#define rte_mb() _mm_mfence() + +#define rte_wmb() _mm_sfence() + +#define rte_rmb() _mm_lfence() + +#define rte_smp_wmb() rte_compiler_barrier() + +#define rte_smp_rmb() rte_compiler_barrier() + +/* + * From Intel Software Development Manual; Vol 3; + * 8.2.2 Memory Ordering in P6 and More Recent Processor Families: + * ... + * . Reads are not reordered with other reads. + * . Writes are not reordered with older reads. + * . Writes to memory are not reordered with other writes, + * with the following exceptions: + * . streaming stores (writes) executed with the non-temporal move + * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and + * . string operations (see Section 8.2.4.1). + * ... + * . Reads may be reordered with older writes to different locations but not + * with older writes to the same location. + * . Reads or writes cannot be reordered with I/O instructions, + * locked instructions, or serializing instructions. + * . Reads cannot pass earlier LFENCE and MFENCE instructions. + * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions. + * . LFENCE instructions cannot pass earlier reads. + * . SFENCE instructions cannot pass earlier writes ... + * . MFENCE instructions cannot pass earlier reads, writes ... + * + * As pointed by Java guys, that makes possible to use lock-prefixed + * instructions to get the same effect as mfence and on most modern HW + * that gives a better performance then using mfence: + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ + * Basic idea is to use lock prefixed add with some dummy memory location + * as the destination. From their experiments 128B(2 cache lines) below + * current stack pointer looks like a good candidate. + * So below we use that techinque for rte_smp_mb() implementation. + */ + +static __rte_always_inline void +rte_smp_mb(void) +{ +#ifdef RTE_ARCH_I686 + asm volatile("lock addl $0, -128(%%esp); " ::: "memory"); +#else + asm volatile("lock addl $0, -128(%%rsp); " ::: "memory"); +#endif +} + +#define rte_io_mb() rte_mb() + +#define rte_io_wmb() rte_compiler_barrier() + +#define rte_io_rmb() rte_compiler_barrier() + +#define rte_cio_wmb() rte_compiler_barrier() + +#define rte_cio_rmb() rte_compiler_barrier() + +/*------------------------- 16 bit atomic operations -------------------------*/ + +#ifndef RTE_FORCE_INTRINSICS +static inline int +rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src) +{ + uint8_t res; + + asm volatile( + MPLOCKED + "cmpxchgw %[src], %[dst];" + "sete %[res];" + : [res] "=a" (res), /* output */ + [dst] "=m" (*dst) + : [src] "r" (src), /* input */ + "a" (exp), + "m" (*dst) + : "memory"); /* no-clobber list */ + return res; +} + +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) +{ + asm volatile( + MPLOCKED + "xchgw %0, %1;" + : "=r" (val), "=m" (*dst) + : "0" (val), "m" (*dst) + : "memory"); /* no-clobber list */ + return val; +} + +static inline int rte_atomic16_test_and_set(rte_atomic16_t *v) +{ + return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1); +} + +static inline void +rte_atomic16_inc(rte_atomic16_t *v) +{ + asm volatile( + MPLOCKED + "incw %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline void +rte_atomic16_dec(rte_atomic16_t *v) +{ + asm volatile( + MPLOCKED + "decw %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v) +{ + uint8_t ret; + + asm volatile( + MPLOCKED + "incw %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} + +static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v) +{ + uint8_t ret; + + asm volatile(MPLOCKED + "decw %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} + +/*------------------------- 32 bit atomic operations -------------------------*/ + +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) +{ + uint8_t res; + + asm volatile( + MPLOCKED + "cmpxchgl %[src], %[dst];" + "sete %[res];" + : [res] "=a" (res), /* output */ + [dst] "=m" (*dst) + : [src] "r" (src), /* input */ + "a" (exp), + "m" (*dst) + : "memory"); /* no-clobber list */ + return res; +} + +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) +{ + asm volatile( + MPLOCKED + "xchgl %0, %1;" + : "=r" (val), "=m" (*dst) + : "0" (val), "m" (*dst) + : "memory"); /* no-clobber list */ + return val; +} + +static inline int rte_atomic32_test_and_set(rte_atomic32_t *v) +{ + return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1); +} + +static inline void +rte_atomic32_inc(rte_atomic32_t *v) +{ + asm volatile( + MPLOCKED + "incl %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline void +rte_atomic32_dec(rte_atomic32_t *v) +{ + asm volatile( + MPLOCKED + "decl %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v) +{ + uint8_t ret; + + asm volatile( + MPLOCKED + "incl %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} + +static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v) +{ + uint8_t ret; + + asm volatile(MPLOCKED + "decl %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} +#endif + +#ifdef RTE_ARCH_I686 +#include "rte_atomic_32.h" +#else +#include "rte_atomic_64.h" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ATOMIC_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_atomic_32.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_atomic_32.h new file mode 100644 index 000000000..f63b7fa27 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_atomic_32.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + */ + +/* + * Inspired from FreeBSD src/sys/i386/include/atomic.h + * Copyright (c) 1998 Doug Rabson + * All rights reserved. + */ + +#ifndef _RTE_ATOMIC_X86_H_ +#error do not include this file directly, use <rte_atomic.h> instead +#endif + +#ifndef _RTE_ATOMIC_I686_H_ +#define _RTE_ATOMIC_I686_H_ + +#include <stdint.h> +#include <rte_common.h> +#include <rte_atomic.h> + +/*------------------------- 64 bit atomic operations -------------------------*/ + +#ifndef RTE_FORCE_INTRINSICS +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) +{ + uint8_t res; + RTE_STD_C11 + union { + struct { + uint32_t l32; + uint32_t h32; + }; + uint64_t u64; + } _exp, _src; + + _exp.u64 = exp; + _src.u64 = src; + +#ifndef __PIC__ + asm volatile ( + MPLOCKED + "cmpxchg8b (%[dst]);" + "setz %[res];" + : [res] "=a" (res) /* result in eax */ + : [dst] "S" (dst), /* esi */ + "b" (_src.l32), /* ebx */ + "c" (_src.h32), /* ecx */ + "a" (_exp.l32), /* eax */ + "d" (_exp.h32) /* edx */ + : "memory" ); /* no-clobber list */ +#else + asm volatile ( + "xchgl %%ebx, %%edi;\n" + MPLOCKED + "cmpxchg8b (%[dst]);" + "setz %[res];" + "xchgl %%ebx, %%edi;\n" + : [res] "=a" (res) /* result in eax */ + : [dst] "S" (dst), /* esi */ + "D" (_src.l32), /* ebx */ + "c" (_src.h32), /* ecx */ + "a" (_exp.l32), /* eax */ + "d" (_exp.h32) /* edx */ + : "memory" ); /* no-clobber list */ +#endif + + return res; +} + +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dest, uint64_t val) +{ + uint64_t old; + + do { + old = *dest; + } while (rte_atomic64_cmpset(dest, old, val) == 0); + + return old; +} + +static inline void +rte_atomic64_init(rte_atomic64_t *v) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, 0); + } +} + +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + /* replace the value by itself */ + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp); + } + return tmp; +} + +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, new_value); + } +} + +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp + inc); + } +} + +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp - dec); + } +} + +static inline void +rte_atomic64_inc(rte_atomic64_t *v) +{ + rte_atomic64_add(v, 1); +} + +static inline void +rte_atomic64_dec(rte_atomic64_t *v) +{ + rte_atomic64_sub(v, 1); +} + +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp + inc); + } + + return tmp + inc; +} + +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp - dec); + } + + return tmp - dec; +} + +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v) +{ + return rte_atomic64_add_return(v, 1) == 0; +} + +static inline int rte_atomic64_dec_and_test(rte_atomic64_t *v) +{ + return rte_atomic64_sub_return(v, 1) == 0; +} + +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v) +{ + return rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, 0, 1); +} + +static inline void rte_atomic64_clear(rte_atomic64_t *v) +{ + rte_atomic64_set(v, 0); +} +#endif + +#endif /* _RTE_ATOMIC_I686_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_atomic_64.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_atomic_64.h new file mode 100644 index 000000000..cfe7067dd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_atomic_64.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + */ + +/* + * Inspired from FreeBSD src/sys/amd64/include/atomic.h + * Copyright (c) 1998 Doug Rabson + * Copyright (c) 2019 Intel Corporation + * All rights reserved. + */ + +#ifndef _RTE_ATOMIC_X86_H_ +#error do not include this file directly, use <rte_atomic.h> instead +#endif + +#ifndef _RTE_ATOMIC_X86_64_H_ +#define _RTE_ATOMIC_X86_64_H_ + +#include <stdint.h> +#include <rte_common.h> +#include <rte_compat.h> +#include <rte_atomic.h> + +/*------------------------- 64 bit atomic operations -------------------------*/ + +#ifndef RTE_FORCE_INTRINSICS +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) +{ + uint8_t res; + + + asm volatile( + MPLOCKED + "cmpxchgq %[src], %[dst];" + "sete %[res];" + : [res] "=a" (res), /* output */ + [dst] "=m" (*dst) + : [src] "r" (src), /* input */ + "a" (exp), + "m" (*dst) + : "memory"); /* no-clobber list */ + + return res; +} + +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val) +{ + asm volatile( + MPLOCKED + "xchgq %0, %1;" + : "=r" (val), "=m" (*dst) + : "0" (val), "m" (*dst) + : "memory"); /* no-clobber list */ + return val; +} + +static inline void +rte_atomic64_init(rte_atomic64_t *v) +{ + v->cnt = 0; +} + +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v) +{ + return v->cnt; +} + +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value) +{ + v->cnt = new_value; +} + +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc) +{ + asm volatile( + MPLOCKED + "addq %[inc], %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : [inc] "ir" (inc), /* input */ + "m" (v->cnt) + ); +} + +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec) +{ + asm volatile( + MPLOCKED + "subq %[dec], %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : [dec] "ir" (dec), /* input */ + "m" (v->cnt) + ); +} + +static inline void +rte_atomic64_inc(rte_atomic64_t *v) +{ + asm volatile( + MPLOCKED + "incq %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline void +rte_atomic64_dec(rte_atomic64_t *v) +{ + asm volatile( + MPLOCKED + "decq %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc) +{ + int64_t prev = inc; + + asm volatile( + MPLOCKED + "xaddq %[prev], %[cnt]" + : [prev] "+r" (prev), /* output */ + [cnt] "=m" (v->cnt) + : "m" (v->cnt) /* input */ + ); + return prev + inc; +} + +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec) +{ + return rte_atomic64_add_return(v, -dec); +} + +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v) +{ + uint8_t ret; + + asm volatile( + MPLOCKED + "incq %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + + return ret != 0; +} + +static inline int rte_atomic64_dec_and_test(rte_atomic64_t *v) +{ + uint8_t ret; + + asm volatile( + MPLOCKED + "decq %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} + +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v) +{ + return rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, 0, 1); +} + +static inline void rte_atomic64_clear(rte_atomic64_t *v) +{ + v->cnt = 0; +} +#endif + +/*------------------------ 128 bit atomic operations -------------------------*/ + +__rte_experimental +static inline int +rte_atomic128_cmp_exchange(rte_int128_t *dst, + rte_int128_t *exp, + const rte_int128_t *src, + unsigned int weak, + int success, + int failure) +{ + RTE_SET_USED(weak); + RTE_SET_USED(success); + RTE_SET_USED(failure); + uint8_t res; + + asm volatile ( + MPLOCKED + "cmpxchg16b %[dst];" + " sete %[res]" + : [dst] "=m" (dst->val[0]), + "=a" (exp->val[0]), + "=d" (exp->val[1]), + [res] "=r" (res) + : "b" (src->val[0]), + "c" (src->val[1]), + "a" (exp->val[0]), + "d" (exp->val[1]), + "m" (dst->val[0]) + : "memory"); + + return res; +} + +#endif /* _RTE_ATOMIC_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_byteorder.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_byteorder.h new file mode 100644 index 000000000..a2dfecc1f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_byteorder.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_BYTEORDER_X86_H_ +#define _RTE_BYTEORDER_X86_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <rte_common.h> +#include <rte_config.h> +#include "generic/rte_byteorder.h" + +#ifndef RTE_BYTE_ORDER +#define RTE_BYTE_ORDER RTE_LITTLE_ENDIAN +#endif + +/* + * An architecture-optimized byte swap for a 16-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap16(). + */ +static inline uint16_t rte_arch_bswap16(uint16_t _x) +{ + uint16_t x = _x; + asm volatile ("xchgb %b[x1],%h[x2]" + : [x1] "=Q" (x) + : [x2] "0" (x) + ); + return x; +} + +/* + * An architecture-optimized byte swap for a 32-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap32(). + */ +static inline uint32_t rte_arch_bswap32(uint32_t _x) +{ + uint32_t x = _x; + asm volatile ("bswap %[x]" + : [x] "+r" (x) + ); + return x; +} + +#ifndef RTE_FORCE_INTRINSICS +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) + +#define rte_bswap32(x) ((uint32_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap32(x) : \ + rte_arch_bswap32(x))) + +#define rte_bswap64(x) ((uint64_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap64(x) : \ + rte_arch_bswap64(x))) +#else +/* + * __builtin_bswap16 is only available gcc 4.8 and upwards + */ +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8) +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) +#endif +#endif + +#define rte_cpu_to_le_16(x) (x) +#define rte_cpu_to_le_32(x) (x) +#define rte_cpu_to_le_64(x) (x) + +#define rte_cpu_to_be_16(x) rte_bswap16(x) +#define rte_cpu_to_be_32(x) rte_bswap32(x) +#define rte_cpu_to_be_64(x) rte_bswap64(x) + +#define rte_le_to_cpu_16(x) (x) +#define rte_le_to_cpu_32(x) (x) +#define rte_le_to_cpu_64(x) (x) + +#define rte_be_to_cpu_16(x) rte_bswap16(x) +#define rte_be_to_cpu_32(x) rte_bswap32(x) +#define rte_be_to_cpu_64(x) rte_bswap64(x) + +#ifdef RTE_ARCH_I686 +#include "rte_byteorder_32.h" +#else +#include "rte_byteorder_64.h" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BYTEORDER_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_byteorder_32.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_byteorder_32.h new file mode 100644 index 000000000..d5a768e52 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_byteorder_32.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_BYTEORDER_X86_H_ +#error do not include this file directly, use <rte_byteorder.h> instead +#endif + +#ifndef _RTE_BYTEORDER_I686_H_ +#define _RTE_BYTEORDER_I686_H_ + +#include <stdint.h> +#include <rte_byteorder.h> + +/* + * An architecture-optimized byte swap for a 64-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap64(). + */ +/* Compat./Leg. mode */ +static inline uint64_t rte_arch_bswap64(uint64_t x) +{ + uint64_t ret = 0; + ret |= ((uint64_t)rte_arch_bswap32(x & 0xffffffffUL) << 32); + ret |= ((uint64_t)rte_arch_bswap32((x >> 32) & 0xffffffffUL)); + return ret; +} + +#endif /* _RTE_BYTEORDER_I686_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_byteorder_64.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_byteorder_64.h new file mode 100644 index 000000000..8c6cf285b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_byteorder_64.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_BYTEORDER_X86_H_ +#error do not include this file directly, use <rte_byteorder.h> instead +#endif + +#ifndef _RTE_BYTEORDER_X86_64_H_ +#define _RTE_BYTEORDER_X86_64_H_ + +#include <stdint.h> +#include <rte_common.h> + +/* + * An architecture-optimized byte swap for a 64-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap64(). + */ +/* 64-bit mode */ +static inline uint64_t rte_arch_bswap64(uint64_t _x) +{ + uint64_t x = _x; + asm volatile ("bswap %[x]" + : [x] "+r" (x) + ); + return x; +} + +#endif /* _RTE_BYTEORDER_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_cpuflags.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_cpuflags.h new file mode 100644 index 000000000..c1d20364d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_cpuflags.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_CPUFLAGS_X86_64_H_ +#define _RTE_CPUFLAGS_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +enum rte_cpu_flag_t { + /* (EAX 01h) ECX features*/ + RTE_CPUFLAG_SSE3 = 0, /**< SSE3 */ + RTE_CPUFLAG_PCLMULQDQ, /**< PCLMULQDQ */ + RTE_CPUFLAG_DTES64, /**< DTES64 */ + RTE_CPUFLAG_MONITOR, /**< MONITOR */ + RTE_CPUFLAG_DS_CPL, /**< DS_CPL */ + RTE_CPUFLAG_VMX, /**< VMX */ + RTE_CPUFLAG_SMX, /**< SMX */ + RTE_CPUFLAG_EIST, /**< EIST */ + RTE_CPUFLAG_TM2, /**< TM2 */ + RTE_CPUFLAG_SSSE3, /**< SSSE3 */ + RTE_CPUFLAG_CNXT_ID, /**< CNXT_ID */ + RTE_CPUFLAG_FMA, /**< FMA */ + RTE_CPUFLAG_CMPXCHG16B, /**< CMPXCHG16B */ + RTE_CPUFLAG_XTPR, /**< XTPR */ + RTE_CPUFLAG_PDCM, /**< PDCM */ + RTE_CPUFLAG_PCID, /**< PCID */ + RTE_CPUFLAG_DCA, /**< DCA */ + RTE_CPUFLAG_SSE4_1, /**< SSE4_1 */ + RTE_CPUFLAG_SSE4_2, /**< SSE4_2 */ + RTE_CPUFLAG_X2APIC, /**< X2APIC */ + RTE_CPUFLAG_MOVBE, /**< MOVBE */ + RTE_CPUFLAG_POPCNT, /**< POPCNT */ + RTE_CPUFLAG_TSC_DEADLINE, /**< TSC_DEADLINE */ + RTE_CPUFLAG_AES, /**< AES */ + RTE_CPUFLAG_XSAVE, /**< XSAVE */ + RTE_CPUFLAG_OSXSAVE, /**< OSXSAVE */ + RTE_CPUFLAG_AVX, /**< AVX */ + RTE_CPUFLAG_F16C, /**< F16C */ + RTE_CPUFLAG_RDRAND, /**< RDRAND */ + RTE_CPUFLAG_HYPERVISOR, /**< Running in a VM */ + + /* (EAX 01h) EDX features */ + RTE_CPUFLAG_FPU, /**< FPU */ + RTE_CPUFLAG_VME, /**< VME */ + RTE_CPUFLAG_DE, /**< DE */ + RTE_CPUFLAG_PSE, /**< PSE */ + RTE_CPUFLAG_TSC, /**< TSC */ + RTE_CPUFLAG_MSR, /**< MSR */ + RTE_CPUFLAG_PAE, /**< PAE */ + RTE_CPUFLAG_MCE, /**< MCE */ + RTE_CPUFLAG_CX8, /**< CX8 */ + RTE_CPUFLAG_APIC, /**< APIC */ + RTE_CPUFLAG_SEP, /**< SEP */ + RTE_CPUFLAG_MTRR, /**< MTRR */ + RTE_CPUFLAG_PGE, /**< PGE */ + RTE_CPUFLAG_MCA, /**< MCA */ + RTE_CPUFLAG_CMOV, /**< CMOV */ + RTE_CPUFLAG_PAT, /**< PAT */ + RTE_CPUFLAG_PSE36, /**< PSE36 */ + RTE_CPUFLAG_PSN, /**< PSN */ + RTE_CPUFLAG_CLFSH, /**< CLFSH */ + RTE_CPUFLAG_DS, /**< DS */ + RTE_CPUFLAG_ACPI, /**< ACPI */ + RTE_CPUFLAG_MMX, /**< MMX */ + RTE_CPUFLAG_FXSR, /**< FXSR */ + RTE_CPUFLAG_SSE, /**< SSE */ + RTE_CPUFLAG_SSE2, /**< SSE2 */ + RTE_CPUFLAG_SS, /**< SS */ + RTE_CPUFLAG_HTT, /**< HTT */ + RTE_CPUFLAG_TM, /**< TM */ + RTE_CPUFLAG_PBE, /**< PBE */ + + /* (EAX 06h) EAX features */ + RTE_CPUFLAG_DIGTEMP, /**< DIGTEMP */ + RTE_CPUFLAG_TRBOBST, /**< TRBOBST */ + RTE_CPUFLAG_ARAT, /**< ARAT */ + RTE_CPUFLAG_PLN, /**< PLN */ + RTE_CPUFLAG_ECMD, /**< ECMD */ + RTE_CPUFLAG_PTM, /**< PTM */ + + /* (EAX 06h) ECX features */ + RTE_CPUFLAG_MPERF_APERF_MSR, /**< MPERF_APERF_MSR */ + RTE_CPUFLAG_ACNT2, /**< ACNT2 */ + RTE_CPUFLAG_ENERGY_EFF, /**< ENERGY_EFF */ + + /* (EAX 07h, ECX 0h) EBX features */ + RTE_CPUFLAG_FSGSBASE, /**< FSGSBASE */ + RTE_CPUFLAG_BMI1, /**< BMI1 */ + RTE_CPUFLAG_HLE, /**< Hardware Lock elision */ + RTE_CPUFLAG_AVX2, /**< AVX2 */ + RTE_CPUFLAG_SMEP, /**< SMEP */ + RTE_CPUFLAG_BMI2, /**< BMI2 */ + RTE_CPUFLAG_ERMS, /**< ERMS */ + RTE_CPUFLAG_INVPCID, /**< INVPCID */ + RTE_CPUFLAG_RTM, /**< Transactional memory */ + RTE_CPUFLAG_AVX512F, /**< AVX512F */ + RTE_CPUFLAG_RDSEED, /**< RDSEED instruction */ + + /* (EAX 80000001h) ECX features */ + RTE_CPUFLAG_LAHF_SAHF, /**< LAHF_SAHF */ + RTE_CPUFLAG_LZCNT, /**< LZCNT */ + + /* (EAX 80000001h) EDX features */ + RTE_CPUFLAG_SYSCALL, /**< SYSCALL */ + RTE_CPUFLAG_XD, /**< XD */ + RTE_CPUFLAG_1GB_PG, /**< 1GB_PG */ + RTE_CPUFLAG_RDTSCP, /**< RDTSCP */ + RTE_CPUFLAG_EM64T, /**< EM64T */ + + /* (EAX 80000007h) EDX features */ + RTE_CPUFLAG_INVTSC, /**< INVTSC */ + + RTE_CPUFLAG_AVX512DQ, /**< AVX512 Doubleword and Quadword */ + RTE_CPUFLAG_AVX512IFMA, /**< AVX512 Integer Fused Multiply-Add */ + RTE_CPUFLAG_AVX512CD, /**< AVX512 Conflict Detection*/ + RTE_CPUFLAG_AVX512BW, /**< AVX512 Byte and Word */ + RTE_CPUFLAG_AVX512VL, /**< AVX512 Vector Length */ + RTE_CPUFLAG_AVX512VBMI, /**< AVX512 Vector Bit Manipulation */ + RTE_CPUFLAG_AVX512VBMI2, /**< AVX512 Vector Bit Manipulation 2 */ + RTE_CPUFLAG_GFNI, /**< Galois Field New Instructions */ + RTE_CPUFLAG_VAES, /**< Vector AES */ + RTE_CPUFLAG_VPCLMULQDQ, /**< Vector Carry-less Multiply */ + RTE_CPUFLAG_AVX512VNNI, + /**< AVX512 Vector Neural Network Instructions */ + RTE_CPUFLAG_AVX512BITALG, /**< AVX512 Bit Algorithms */ + RTE_CPUFLAG_AVX512VPOPCNTDQ, /**< AVX512 Vector Popcount */ + RTE_CPUFLAG_CLDEMOTE, /**< Cache Line Demote */ + RTE_CPUFLAG_MOVDIRI, /**< Direct Store Instructions */ + RTE_CPUFLAG_MOVDIR64B, /**< Direct Store Instructions 64B */ + RTE_CPUFLAG_AVX512VP2INTERSECT, /**< AVX512 Two Register Intersection */ + + /* The last item */ + RTE_CPUFLAG_NUMFLAGS, /**< This should always be the last! */ +}; + +#include "generic/rte_cpuflags.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CPUFLAGS_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_cycles.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_cycles.h new file mode 100644 index 000000000..a461a4d73 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_cycles.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2013 6WIND S.A. + */ + +#ifndef _RTE_CYCLES_X86_64_H_ +#define _RTE_CYCLES_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_cycles.h" + +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT +/* Global switch to use VMWARE mapping of TSC instead of RDTSC */ +extern int rte_cycles_vmware_tsc_map; +#include <rte_branch_prediction.h> +#endif +#include <rte_common.h> +#include <rte_config.h> + +static inline uint64_t +rte_rdtsc(void) +{ + union { + uint64_t tsc_64; + RTE_STD_C11 + struct { + uint32_t lo_32; + uint32_t hi_32; + }; + } tsc; + +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + if (unlikely(rte_cycles_vmware_tsc_map)) { + /* ecx = 0x10000 corresponds to the physical TSC for VMware */ + asm volatile("rdpmc" : + "=a" (tsc.lo_32), + "=d" (tsc.hi_32) : + "c"(0x10000)); + return tsc.tsc_64; + } +#endif + + asm volatile("rdtsc" : + "=a" (tsc.lo_32), + "=d" (tsc.hi_32)); + return tsc.tsc_64; +} + +static inline uint64_t +rte_rdtsc_precise(void) +{ + rte_mb(); + return rte_rdtsc(); +} + +static inline uint64_t +rte_get_tsc_cycles(void) { return rte_rdtsc(); } + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CYCLES_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_io.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_io.h new file mode 100644 index 000000000..2db71b1b0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_io.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_X86_H_ +#define _RTE_IO_X86_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_io.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IO_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_mcslock.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_mcslock.h new file mode 100644 index 000000000..a8f041a72 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_mcslock.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_MCSLOCK_X86_64_H_ +#define _RTE_MCSLOCK_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_mcslock.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MCSLOCK_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_memcpy.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_memcpy.h new file mode 100644 index 000000000..9c67232df --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_memcpy.h @@ -0,0 +1,885 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MEMCPY_X86_64_H_ +#define _RTE_MEMCPY_X86_64_H_ + +/** + * @file + * + * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy(). + */ + +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <rte_vect.h> +#include <rte_common.h> +#include <rte_config.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif + +/** + * Copy bytes from one location to another. The locations must not overlap. + * + * @note This is implemented as a macro, so it's address should not be taken + * and care is needed as parameter expressions may be evaluated multiple times. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + * @param n + * Number of bytes to copy. + * @return + * Pointer to the destination data. + */ +static __rte_always_inline void * +rte_memcpy(void *dst, const void *src, size_t n); + +#ifdef RTE_MACHINE_CPUFLAG_AVX512F + +#define ALIGNMENT_MASK 0x3F + +/** + * AVX512 implementation below + */ + +/** + * Copy 16 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + __m128i xmm0; + + xmm0 = _mm_loadu_si128((const __m128i *)src); + _mm_storeu_si128((__m128i *)dst, xmm0); +} + +/** + * Copy 32 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + __m256i ymm0; + + ymm0 = _mm256_loadu_si256((const __m256i *)src); + _mm256_storeu_si256((__m256i *)dst, ymm0); +} + +/** + * Copy 64 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + __m512i zmm0; + + zmm0 = _mm512_loadu_si512((const void *)src); + _mm512_storeu_si512((void *)dst, zmm0); +} + +/** + * Copy 128 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + rte_mov64(dst + 0 * 64, src + 0 * 64); + rte_mov64(dst + 1 * 64, src + 1 * 64); +} + +/** + * Copy 256 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + rte_mov64(dst + 0 * 64, src + 0 * 64); + rte_mov64(dst + 1 * 64, src + 1 * 64); + rte_mov64(dst + 2 * 64, src + 2 * 64); + rte_mov64(dst + 3 * 64, src + 3 * 64); +} + +/** + * Copy 128-byte blocks from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) +{ + __m512i zmm0, zmm1; + + while (n >= 128) { + zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64)); + n -= 128; + zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64)); + src = src + 128; + _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0); + _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1); + dst = dst + 128; + } +} + +/** + * Copy 512-byte blocks from one location to another, + * locations should not overlap. + */ +static inline void +rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) +{ + __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7; + + while (n >= 512) { + zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64)); + n -= 512; + zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64)); + zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64)); + zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64)); + zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64)); + zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64)); + zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64)); + zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64)); + src = src + 512; + _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0); + _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1); + _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2); + _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3); + _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4); + _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5); + _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6); + _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7); + dst = dst + 512; + } +} + +static __rte_always_inline void * +rte_memcpy_generic(void *dst, const void *src, size_t n) +{ + uintptr_t dstu = (uintptr_t)dst; + uintptr_t srcu = (uintptr_t)src; + void *ret = dst; + size_t dstofss; + size_t bits; + + /** + * Copy less than 16 bytes + */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dstu = *(const uint8_t *)srcu; + srcu = (uintptr_t)((const uint8_t *)srcu + 1); + dstu = (uintptr_t)((uint8_t *)dstu + 1); + } + if (n & 0x02) { + *(uint16_t *)dstu = *(const uint16_t *)srcu; + srcu = (uintptr_t)((const uint16_t *)srcu + 1); + dstu = (uintptr_t)((uint16_t *)dstu + 1); + } + if (n & 0x04) { + *(uint32_t *)dstu = *(const uint32_t *)srcu; + srcu = (uintptr_t)((const uint32_t *)srcu + 1); + dstu = (uintptr_t)((uint32_t *)dstu + 1); + } + if (n & 0x08) + *(uint64_t *)dstu = *(const uint64_t *)srcu; + return ret; + } + + /** + * Fast way when copy size doesn't exceed 512 bytes + */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + if (n <= 512) { + if (n >= 256) { + n -= 256; + rte_mov256((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 256; + dst = (uint8_t *)dst + 256; + } + if (n >= 128) { + n -= 128; + rte_mov128((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 128; + dst = (uint8_t *)dst + 128; + } +COPY_BLOCK_128_BACK63: + if (n > 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + return ret; + } + if (n > 0) + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + return ret; + } + + /** + * Make store aligned when copy size exceeds 512 bytes + */ + dstofss = ((uintptr_t)dst & 0x3F); + if (dstofss > 0) { + dstofss = 64 - dstofss; + n -= dstofss; + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + dstofss; + dst = (uint8_t *)dst + dstofss; + } + + /** + * Copy 512-byte blocks. + * Use copy block function for better instruction order control, + * which is important when load is unaligned. + */ + rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n); + bits = n; + n = n & 511; + bits -= n; + src = (const uint8_t *)src + bits; + dst = (uint8_t *)dst + bits; + + /** + * Copy 128-byte blocks. + * Use copy block function for better instruction order control, + * which is important when load is unaligned. + */ + if (n >= 128) { + rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); + bits = n; + n = n & 127; + bits -= n; + src = (const uint8_t *)src + bits; + dst = (uint8_t *)dst + bits; + } + + /** + * Copy whatever left + */ + goto COPY_BLOCK_128_BACK63; +} + +#elif defined RTE_MACHINE_CPUFLAG_AVX2 + +#define ALIGNMENT_MASK 0x1F + +/** + * AVX2 implementation below + */ + +/** + * Copy 16 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + __m128i xmm0; + + xmm0 = _mm_loadu_si128((const __m128i *)src); + _mm_storeu_si128((__m128i *)dst, xmm0); +} + +/** + * Copy 32 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + __m256i ymm0; + + ymm0 = _mm256_loadu_si256((const __m256i *)src); + _mm256_storeu_si256((__m256i *)dst, ymm0); +} + +/** + * Copy 64 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +} + +/** + * Copy 128 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); + rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); + rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); +} + +/** + * Copy 128-byte blocks from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) +{ + __m256i ymm0, ymm1, ymm2, ymm3; + + while (n >= 128) { + ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); + n -= 128; + ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); + ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32)); + ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32)); + src = (const uint8_t *)src + 128; + _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); + _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); + _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2); + _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3); + dst = (uint8_t *)dst + 128; + } +} + +static __rte_always_inline void * +rte_memcpy_generic(void *dst, const void *src, size_t n) +{ + uintptr_t dstu = (uintptr_t)dst; + uintptr_t srcu = (uintptr_t)src; + void *ret = dst; + size_t dstofss; + size_t bits; + + /** + * Copy less than 16 bytes + */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dstu = *(const uint8_t *)srcu; + srcu = (uintptr_t)((const uint8_t *)srcu + 1); + dstu = (uintptr_t)((uint8_t *)dstu + 1); + } + if (n & 0x02) { + *(uint16_t *)dstu = *(const uint16_t *)srcu; + srcu = (uintptr_t)((const uint16_t *)srcu + 1); + dstu = (uintptr_t)((uint16_t *)dstu + 1); + } + if (n & 0x04) { + *(uint32_t *)dstu = *(const uint32_t *)srcu; + srcu = (uintptr_t)((const uint32_t *)srcu + 1); + dstu = (uintptr_t)((uint32_t *)dstu + 1); + } + if (n & 0x08) { + *(uint64_t *)dstu = *(const uint64_t *)srcu; + } + return ret; + } + + /** + * Fast way when copy size doesn't exceed 256 bytes + */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 48) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + if (n <= 256) { + if (n >= 128) { + n -= 128; + rte_mov128((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 128; + dst = (uint8_t *)dst + 128; + } +COPY_BLOCK_128_BACK31: + if (n >= 64) { + n -= 64; + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 64; + dst = (uint8_t *)dst + 64; + } + if (n > 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + if (n > 0) { + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + } + return ret; + } + + /** + * Make store aligned when copy size exceeds 256 bytes + */ + dstofss = (uintptr_t)dst & 0x1F; + if (dstofss > 0) { + dstofss = 32 - dstofss; + n -= dstofss; + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + dstofss; + dst = (uint8_t *)dst + dstofss; + } + + /** + * Copy 128-byte blocks + */ + rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); + bits = n; + n = n & 127; + bits -= n; + src = (const uint8_t *)src + bits; + dst = (uint8_t *)dst + bits; + + /** + * Copy whatever left + */ + goto COPY_BLOCK_128_BACK31; +} + +#else /* RTE_MACHINE_CPUFLAG */ + +#define ALIGNMENT_MASK 0x0F + +/** + * SSE & AVX implementation below + */ + +/** + * Copy 16 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + __m128i xmm0; + + xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src); + _mm_storeu_si128((__m128i *)dst, xmm0); +} + +/** + * Copy 32 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +} + +/** + * Copy 64 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); + rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); +} + +/** + * Copy 128 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); + rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); + rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); + rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); + rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); + rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); +} + +/** + * Copy 256 bytes from one location to another, + * locations should not overlap. + */ +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); + rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); + rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); + rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); + rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); + rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); + rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); + rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); + rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); + rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); + rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); + rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); + rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); + rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); +} + +/** + * Macro for copying unaligned block from one location to another with constant load offset, + * 47 bytes leftover maximum, + * locations should not overlap. + * Requirements: + * - Store is aligned + * - Load offset is <offset>, which must be immediate value within [1, 15] + * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading + * - <dst>, <src>, <len> must be variables + * - __m128i <xmm0> ~ <xmm8> must be pre-defined + */ +#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \ +__extension__ ({ \ + size_t tmp; \ + while (len >= 128 + 16 - offset) { \ + xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \ + len -= 128; \ + xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \ + xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \ + xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \ + xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \ + xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \ + xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \ + xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \ + xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \ + src = (const uint8_t *)src + 128; \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ + dst = (uint8_t *)dst + 128; \ + } \ + tmp = len; \ + len = ((len - 16 + offset) & 127) + 16 - offset; \ + tmp -= len; \ + src = (const uint8_t *)src + tmp; \ + dst = (uint8_t *)dst + tmp; \ + if (len >= 32 + 16 - offset) { \ + while (len >= 32 + 16 - offset) { \ + xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \ + len -= 32; \ + xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \ + xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \ + src = (const uint8_t *)src + 32; \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ + dst = (uint8_t *)dst + 32; \ + } \ + tmp = len; \ + len = ((len - 16 + offset) & 31) + 16 - offset; \ + tmp -= len; \ + src = (const uint8_t *)src + tmp; \ + dst = (uint8_t *)dst + tmp; \ + } \ +}) + +/** + * Macro for copying unaligned block from one location to another, + * 47 bytes leftover maximum, + * locations should not overlap. + * Use switch here because the aligning instruction requires immediate value for shift count. + * Requirements: + * - Store is aligned + * - Load offset is <offset>, which must be within [1, 15] + * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading + * - <dst>, <src>, <len> must be variables + * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined + */ +#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \ +__extension__ ({ \ + switch (offset) { \ + case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ + case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ + case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ + case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \ + case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \ + case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \ + case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \ + case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ + case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ + case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ + case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \ + case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \ + case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \ + case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \ + case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ + default:; \ + } \ +}) + +static __rte_always_inline void * +rte_memcpy_generic(void *dst, const void *src, size_t n) +{ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + uintptr_t dstu = (uintptr_t)dst; + uintptr_t srcu = (uintptr_t)src; + void *ret = dst; + size_t dstofss; + size_t srcofs; + + /** + * Copy less than 16 bytes + */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dstu = *(const uint8_t *)srcu; + srcu = (uintptr_t)((const uint8_t *)srcu + 1); + dstu = (uintptr_t)((uint8_t *)dstu + 1); + } + if (n & 0x02) { + *(uint16_t *)dstu = *(const uint16_t *)srcu; + srcu = (uintptr_t)((const uint16_t *)srcu + 1); + dstu = (uintptr_t)((uint16_t *)dstu + 1); + } + if (n & 0x04) { + *(uint32_t *)dstu = *(const uint32_t *)srcu; + srcu = (uintptr_t)((const uint32_t *)srcu + 1); + dstu = (uintptr_t)((uint32_t *)dstu + 1); + } + if (n & 0x08) { + *(uint64_t *)dstu = *(const uint64_t *)srcu; + } + return ret; + } + + /** + * Fast way when copy size doesn't exceed 512 bytes + */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 48) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 128) { + goto COPY_BLOCK_128_BACK15; + } + if (n <= 512) { + if (n >= 256) { + n -= 256; + rte_mov128((uint8_t *)dst, (const uint8_t *)src); + rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128); + src = (const uint8_t *)src + 256; + dst = (uint8_t *)dst + 256; + } +COPY_BLOCK_255_BACK15: + if (n >= 128) { + n -= 128; + rte_mov128((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 128; + dst = (uint8_t *)dst + 128; + } +COPY_BLOCK_128_BACK15: + if (n >= 64) { + n -= 64; + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 64; + dst = (uint8_t *)dst + 64; + } +COPY_BLOCK_64_BACK15: + if (n >= 32) { + n -= 32; + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 32; + dst = (uint8_t *)dst + 32; + } + if (n > 16) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + return ret; + } + if (n > 0) { + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + } + return ret; + } + + /** + * Make store aligned when copy size exceeds 512 bytes, + * and make sure the first 15 bytes are copied, because + * unaligned copy functions require up to 15 bytes + * backwards access. + */ + dstofss = (uintptr_t)dst & 0x0F; + if (dstofss > 0) { + dstofss = 16 - dstofss + 16; + n -= dstofss; + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + dstofss; + dst = (uint8_t *)dst + dstofss; + } + srcofs = ((uintptr_t)src & 0x0F); + + /** + * For aligned copy + */ + if (srcofs == 0) { + /** + * Copy 256-byte blocks + */ + for (; n >= 256; n -= 256) { + rte_mov256((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 256; + src = (const uint8_t *)src + 256; + } + + /** + * Copy whatever left + */ + goto COPY_BLOCK_255_BACK15; + } + + /** + * For copy with unaligned load + */ + MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); + + /** + * Copy whatever left + */ + goto COPY_BLOCK_64_BACK15; +} + +#endif /* RTE_MACHINE_CPUFLAG */ + +static __rte_always_inline void * +rte_memcpy_aligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + + /* Copy size <= 16 bytes */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dst = *(const uint8_t *)src; + src = (const uint8_t *)src + 1; + dst = (uint8_t *)dst + 1; + } + if (n & 0x02) { + *(uint16_t *)dst = *(const uint16_t *)src; + src = (const uint16_t *)src + 1; + dst = (uint16_t *)dst + 1; + } + if (n & 0x04) { + *(uint32_t *)dst = *(const uint32_t *)src; + src = (const uint32_t *)src + 1; + dst = (uint32_t *)dst + 1; + } + if (n & 0x08) + *(uint64_t *)dst = *(const uint64_t *)src; + + return ret; + } + + /* Copy 16 <= size <= 32 bytes */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + + return ret; + } + + /* Copy 32 < size <= 64 bytes */ + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + + return ret; + } + + /* Copy 64 bytes blocks */ + for (; n >= 64; n -= 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; + } + + /* Copy whatever left */ + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + + return ret; +} + +static __rte_always_inline void * +rte_memcpy(void *dst, const void *src, size_t n) +{ + if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK)) + return rte_memcpy_aligned(dst, src, n); + else + return rte_memcpy_generic(dst, src, n); +} + +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000) +#pragma GCC diagnostic pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMCPY_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_pause.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_pause.h new file mode 100644 index 000000000..b4cf1df1d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_pause.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_X86_H_ +#define _RTE_PAUSE_X86_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_pause.h" + +#include <emmintrin.h> +static inline void rte_pause(void) +{ + _mm_pause(); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_prefetch.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_prefetch.h new file mode 100644 index 000000000..384c6b3ef --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_prefetch.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef _RTE_PREFETCH_X86_64_H_ +#define _RTE_PREFETCH_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_common.h> +#include "generic/rte_prefetch.h" + +static inline void rte_prefetch0(const volatile void *p) +{ + asm volatile ("prefetcht0 %[p]" : : [p] "m" (*(const volatile char *)p)); +} + +static inline void rte_prefetch1(const volatile void *p) +{ + asm volatile ("prefetcht1 %[p]" : : [p] "m" (*(const volatile char *)p)); +} + +static inline void rte_prefetch2(const volatile void *p) +{ + asm volatile ("prefetcht2 %[p]" : : [p] "m" (*(const volatile char *)p)); +} + +static inline void rte_prefetch_non_temporal(const volatile void *p) +{ + asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char *)p)); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PREFETCH_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_rtm.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_rtm.h new file mode 100644 index 000000000..36bf49846 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_rtm.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2012,2013 Intel Corporation + */ + +#ifndef _RTE_RTM_H_ +#define _RTE_RTM_H_ 1 + + +/* Official RTM intrinsics interface matching gcc/icc, but works + on older gcc compatible compilers and binutils. */ + +#include <rte_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +#define RTE_XBEGIN_STARTED (~0u) +#define RTE_XABORT_EXPLICIT (1 << 0) +#define RTE_XABORT_RETRY (1 << 1) +#define RTE_XABORT_CONFLICT (1 << 2) +#define RTE_XABORT_CAPACITY (1 << 3) +#define RTE_XABORT_DEBUG (1 << 4) +#define RTE_XABORT_NESTED (1 << 5) +#define RTE_XABORT_CODE(x) (((x) >> 24) & 0xff) + +static __rte_always_inline +unsigned int rte_xbegin(void) +{ + unsigned int ret = RTE_XBEGIN_STARTED; + + asm volatile(".byte 0xc7,0xf8 ; .long 0" : "+a" (ret) :: "memory"); + return ret; +} + +static __rte_always_inline +void rte_xend(void) +{ + asm volatile(".byte 0x0f,0x01,0xd5" ::: "memory"); +} + +/* not an inline function to workaround a clang bug with -O0 */ +#define rte_xabort(status) do { \ + asm volatile(".byte 0xc6,0xf8,%P0" :: "i" (status) : "memory"); \ +} while (0) + +static __rte_always_inline +int rte_xtest(void) +{ + unsigned char out; + + asm volatile(".byte 0x0f,0x01,0xd6 ; setnz %0" : + "=r" (out) :: "memory"); + return out; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RTM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_rwlock.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_rwlock.h new file mode 100644 index 000000000..eec4c7123 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_rwlock.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Intel Corporation + */ + +#ifndef _RTE_RWLOCK_X86_64_H_ +#define _RTE_RWLOCK_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_rwlock.h" +#include "rte_spinlock.h" + +static inline void +rte_rwlock_read_lock_tm(rte_rwlock_t *rwl) +{ + if (likely(rte_try_tm(&rwl->cnt))) + return; + rte_rwlock_read_lock(rwl); +} + +static inline void +rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl) +{ + if (unlikely(rwl->cnt)) + rte_rwlock_read_unlock(rwl); + else + rte_xend(); +} + +static inline void +rte_rwlock_write_lock_tm(rte_rwlock_t *rwl) +{ + if (likely(rte_try_tm(&rwl->cnt))) + return; + rte_rwlock_write_lock(rwl); +} + +static inline void +rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl) +{ + if (unlikely(rwl->cnt)) + rte_rwlock_write_unlock(rwl); + else + rte_xend(); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RWLOCK_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_spinlock.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_spinlock.h new file mode 100644 index 000000000..e2e2b2643 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_spinlock.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_SPINLOCK_X86_64_H_ +#define _RTE_SPINLOCK_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_spinlock.h" +#include "rte_rtm.h" +#include "rte_cpuflags.h" +#include "rte_branch_prediction.h" +#include "rte_common.h" +#include "rte_pause.h" +#include "rte_cycles.h" + +#define RTE_RTM_MAX_RETRIES (20) +#define RTE_XABORT_LOCK_BUSY (0xff) + +#ifndef RTE_FORCE_INTRINSICS +static inline void +rte_spinlock_lock(rte_spinlock_t *sl) +{ + int lock_val = 1; + asm volatile ( + "1:\n" + "xchg %[locked], %[lv]\n" + "test %[lv], %[lv]\n" + "jz 3f\n" + "2:\n" + "pause\n" + "cmpl $0, %[locked]\n" + "jnz 2b\n" + "jmp 1b\n" + "3:\n" + : [locked] "=m" (sl->locked), [lv] "=q" (lock_val) + : "[lv]" (lock_val) + : "memory"); +} + +static inline void +rte_spinlock_unlock (rte_spinlock_t *sl) +{ + int unlock_val = 0; + asm volatile ( + "xchg %[locked], %[ulv]\n" + : [locked] "=m" (sl->locked), [ulv] "=q" (unlock_val) + : "[ulv]" (unlock_val) + : "memory"); +} + +static inline int +rte_spinlock_trylock (rte_spinlock_t *sl) +{ + int lockval = 1; + + asm volatile ( + "xchg %[locked], %[lockval]" + : [locked] "=m" (sl->locked), [lockval] "=q" (lockval) + : "[lockval]" (lockval) + : "memory"); + + return lockval == 0; +} +#endif + +extern uint8_t rte_rtm_supported; + +static inline int rte_tm_supported(void) +{ + return rte_rtm_supported; +} + +static inline int +rte_try_tm(volatile int *lock) +{ + int i, retries; + + if (!rte_rtm_supported) + return 0; + + retries = RTE_RTM_MAX_RETRIES; + + while (likely(retries--)) { + + unsigned int status = rte_xbegin(); + + if (likely(RTE_XBEGIN_STARTED == status)) { + if (unlikely(*lock)) + rte_xabort(RTE_XABORT_LOCK_BUSY); + else + return 1; + } + while (*lock) + rte_pause(); + + if ((status & RTE_XABORT_CONFLICT) || + ((status & RTE_XABORT_EXPLICIT) && + (RTE_XABORT_CODE(status) == RTE_XABORT_LOCK_BUSY))) { + /* add a small delay before retrying, basing the + * delay on the number of times we've already tried, + * to give a back-off type of behaviour. We + * randomize trycount by taking bits from the tsc count + */ + int try_count = RTE_RTM_MAX_RETRIES - retries; + int pause_count = (rte_rdtsc() & 0x7) | 1; + pause_count <<= try_count; + for (i = 0; i < pause_count; i++) + rte_pause(); + continue; + } + + if ((status & RTE_XABORT_RETRY) == 0) /* do not retry */ + break; + } + return 0; +} + +static inline void +rte_spinlock_lock_tm(rte_spinlock_t *sl) +{ + if (likely(rte_try_tm(&sl->locked))) + return; + + rte_spinlock_lock(sl); /* fall-back */ +} + +static inline int +rte_spinlock_trylock_tm(rte_spinlock_t *sl) +{ + if (likely(rte_try_tm(&sl->locked))) + return 1; + + return rte_spinlock_trylock(sl); +} + +static inline void +rte_spinlock_unlock_tm(rte_spinlock_t *sl) +{ + if (unlikely(sl->locked)) + rte_spinlock_unlock(sl); + else + rte_xend(); +} + +static inline void +rte_spinlock_recursive_lock_tm(rte_spinlock_recursive_t *slr) +{ + if (likely(rte_try_tm(&slr->sl.locked))) + return; + + rte_spinlock_recursive_lock(slr); /* fall-back */ +} + +static inline void +rte_spinlock_recursive_unlock_tm(rte_spinlock_recursive_t *slr) +{ + if (unlikely(slr->sl.locked)) + rte_spinlock_recursive_unlock(slr); + else + rte_xend(); +} + +static inline int +rte_spinlock_recursive_trylock_tm(rte_spinlock_recursive_t *slr) +{ + if (likely(rte_try_tm(&slr->sl.locked))) + return 1; + + return rte_spinlock_recursive_trylock(slr); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_SPINLOCK_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_ticketlock.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_ticketlock.h new file mode 100644 index 000000000..0cc01f6b9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_ticketlock.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Arm Limited + */ + +#ifndef _RTE_TICKETLOCK_X86_64_H_ +#define _RTE_TICKETLOCK_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_ticketlock.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TICKETLOCK_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/include/rte_vect.h b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_vect.h new file mode 100644 index 000000000..df5a60762 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/include/rte_vect.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef _RTE_VECT_X86_H_ +#define _RTE_VECT_X86_H_ + +/** + * @file + * + * RTE SSE/AVX related header. + */ + +#include <stdint.h> +#include <rte_config.h> +#include "generic/rte_vect.h" + +#if (defined(__ICC) || \ + (defined(_WIN64)) || \ + (__GNUC__ == 4 && __GNUC_MINOR__ < 4)) + +#include <smmintrin.h> /* SSE4 */ + +#if defined(__AVX__) +#include <immintrin.h> +#endif + +#else + +#include <x86intrin.h> + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef __m128i xmm_t; + +#define XMM_SIZE (sizeof(xmm_t)) +#define XMM_MASK (XMM_SIZE - 1) + +typedef union rte_xmm { + xmm_t x; + uint8_t u8[XMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[XMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[XMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[XMM_SIZE / sizeof(uint64_t)]; + double pd[XMM_SIZE / sizeof(double)]; +} rte_xmm_t; + +#ifdef __AVX__ + +typedef __m256i ymm_t; + +#define YMM_SIZE (sizeof(ymm_t)) +#define YMM_MASK (YMM_SIZE - 1) + +typedef union rte_ymm { + ymm_t y; + xmm_t x[YMM_SIZE / sizeof(xmm_t)]; + uint8_t u8[YMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[YMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[YMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[YMM_SIZE / sizeof(uint64_t)]; + double pd[YMM_SIZE / sizeof(double)]; +} rte_ymm_t; + +#endif /* __AVX__ */ + +#ifdef RTE_ARCH_I686 +#define _mm_cvtsi128_si64(a) \ +__extension__ ({ \ + rte_xmm_t m; \ + m.x = (a); \ + (m.u64[0]); \ +}) +#endif + +/* + * Prior to version 12.1 icc doesn't support _mm_set_epi64x. + */ +#if (defined(__ICC) && __ICC < 1210) +#define _mm_set_epi64x(a, b) \ +__extension__ ({ \ + rte_xmm_t m; \ + m.u64[0] = b; \ + m.u64[1] = a; \ + (m.x); \ +}) +#endif /* (defined(__ICC) && __ICC < 1210) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_VECT_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/meson.build b/src/spdk/dpdk/lib/librte_eal/x86/meson.build new file mode 100644 index 000000000..e78f29002 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/meson.build @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +subdir('include') + +sources += files( + 'rte_cpuflags.c', + 'rte_cycles.c', + 'rte_hypervisor.c', + 'rte_spinlock.c', +) diff --git a/src/spdk/dpdk/lib/librte_eal/x86/rte_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/x86/rte_cpuflags.c new file mode 100644 index 000000000..30439e795 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/rte_cpuflags.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#include "rte_cpuflags.h" + +#include <stdio.h> +#include <errno.h> +#include <stdint.h> + +#include "rte_cpuid.h" + +/** + * Struct to hold a processor feature entry + */ +struct feature_entry { + uint32_t leaf; /**< cpuid leaf */ + uint32_t subleaf; /**< cpuid subleaf */ + uint32_t reg; /**< cpuid register */ + uint32_t bit; /**< cpuid register bit */ +#define CPU_FLAG_NAME_MAX_LEN 64 + char name[CPU_FLAG_NAME_MAX_LEN]; /**< String for printing */ +}; + +#define FEAT_DEF(name, leaf, subleaf, reg, bit) \ + [RTE_CPUFLAG_##name] = {leaf, subleaf, reg, bit, #name }, + +const struct feature_entry rte_cpu_feature_table[] = { + FEAT_DEF(SSE3, 0x00000001, 0, RTE_REG_ECX, 0) + FEAT_DEF(PCLMULQDQ, 0x00000001, 0, RTE_REG_ECX, 1) + FEAT_DEF(DTES64, 0x00000001, 0, RTE_REG_ECX, 2) + FEAT_DEF(MONITOR, 0x00000001, 0, RTE_REG_ECX, 3) + FEAT_DEF(DS_CPL, 0x00000001, 0, RTE_REG_ECX, 4) + FEAT_DEF(VMX, 0x00000001, 0, RTE_REG_ECX, 5) + FEAT_DEF(SMX, 0x00000001, 0, RTE_REG_ECX, 6) + FEAT_DEF(EIST, 0x00000001, 0, RTE_REG_ECX, 7) + FEAT_DEF(TM2, 0x00000001, 0, RTE_REG_ECX, 8) + FEAT_DEF(SSSE3, 0x00000001, 0, RTE_REG_ECX, 9) + FEAT_DEF(CNXT_ID, 0x00000001, 0, RTE_REG_ECX, 10) + FEAT_DEF(FMA, 0x00000001, 0, RTE_REG_ECX, 12) + FEAT_DEF(CMPXCHG16B, 0x00000001, 0, RTE_REG_ECX, 13) + FEAT_DEF(XTPR, 0x00000001, 0, RTE_REG_ECX, 14) + FEAT_DEF(PDCM, 0x00000001, 0, RTE_REG_ECX, 15) + FEAT_DEF(PCID, 0x00000001, 0, RTE_REG_ECX, 17) + FEAT_DEF(DCA, 0x00000001, 0, RTE_REG_ECX, 18) + FEAT_DEF(SSE4_1, 0x00000001, 0, RTE_REG_ECX, 19) + FEAT_DEF(SSE4_2, 0x00000001, 0, RTE_REG_ECX, 20) + FEAT_DEF(X2APIC, 0x00000001, 0, RTE_REG_ECX, 21) + FEAT_DEF(MOVBE, 0x00000001, 0, RTE_REG_ECX, 22) + FEAT_DEF(POPCNT, 0x00000001, 0, RTE_REG_ECX, 23) + FEAT_DEF(TSC_DEADLINE, 0x00000001, 0, RTE_REG_ECX, 24) + FEAT_DEF(AES, 0x00000001, 0, RTE_REG_ECX, 25) + FEAT_DEF(XSAVE, 0x00000001, 0, RTE_REG_ECX, 26) + FEAT_DEF(OSXSAVE, 0x00000001, 0, RTE_REG_ECX, 27) + FEAT_DEF(AVX, 0x00000001, 0, RTE_REG_ECX, 28) + FEAT_DEF(F16C, 0x00000001, 0, RTE_REG_ECX, 29) + FEAT_DEF(RDRAND, 0x00000001, 0, RTE_REG_ECX, 30) + FEAT_DEF(HYPERVISOR, 0x00000001, 0, RTE_REG_ECX, 31) + + FEAT_DEF(FPU, 0x00000001, 0, RTE_REG_EDX, 0) + FEAT_DEF(VME, 0x00000001, 0, RTE_REG_EDX, 1) + FEAT_DEF(DE, 0x00000001, 0, RTE_REG_EDX, 2) + FEAT_DEF(PSE, 0x00000001, 0, RTE_REG_EDX, 3) + FEAT_DEF(TSC, 0x00000001, 0, RTE_REG_EDX, 4) + FEAT_DEF(MSR, 0x00000001, 0, RTE_REG_EDX, 5) + FEAT_DEF(PAE, 0x00000001, 0, RTE_REG_EDX, 6) + FEAT_DEF(MCE, 0x00000001, 0, RTE_REG_EDX, 7) + FEAT_DEF(CX8, 0x00000001, 0, RTE_REG_EDX, 8) + FEAT_DEF(APIC, 0x00000001, 0, RTE_REG_EDX, 9) + FEAT_DEF(SEP, 0x00000001, 0, RTE_REG_EDX, 11) + FEAT_DEF(MTRR, 0x00000001, 0, RTE_REG_EDX, 12) + FEAT_DEF(PGE, 0x00000001, 0, RTE_REG_EDX, 13) + FEAT_DEF(MCA, 0x00000001, 0, RTE_REG_EDX, 14) + FEAT_DEF(CMOV, 0x00000001, 0, RTE_REG_EDX, 15) + FEAT_DEF(PAT, 0x00000001, 0, RTE_REG_EDX, 16) + FEAT_DEF(PSE36, 0x00000001, 0, RTE_REG_EDX, 17) + FEAT_DEF(PSN, 0x00000001, 0, RTE_REG_EDX, 18) + FEAT_DEF(CLFSH, 0x00000001, 0, RTE_REG_EDX, 19) + FEAT_DEF(DS, 0x00000001, 0, RTE_REG_EDX, 21) + FEAT_DEF(ACPI, 0x00000001, 0, RTE_REG_EDX, 22) + FEAT_DEF(MMX, 0x00000001, 0, RTE_REG_EDX, 23) + FEAT_DEF(FXSR, 0x00000001, 0, RTE_REG_EDX, 24) + FEAT_DEF(SSE, 0x00000001, 0, RTE_REG_EDX, 25) + FEAT_DEF(SSE2, 0x00000001, 0, RTE_REG_EDX, 26) + FEAT_DEF(SS, 0x00000001, 0, RTE_REG_EDX, 27) + FEAT_DEF(HTT, 0x00000001, 0, RTE_REG_EDX, 28) + FEAT_DEF(TM, 0x00000001, 0, RTE_REG_EDX, 29) + FEAT_DEF(PBE, 0x00000001, 0, RTE_REG_EDX, 31) + + FEAT_DEF(DIGTEMP, 0x00000006, 0, RTE_REG_EAX, 0) + FEAT_DEF(TRBOBST, 0x00000006, 0, RTE_REG_EAX, 1) + FEAT_DEF(ARAT, 0x00000006, 0, RTE_REG_EAX, 2) + FEAT_DEF(PLN, 0x00000006, 0, RTE_REG_EAX, 4) + FEAT_DEF(ECMD, 0x00000006, 0, RTE_REG_EAX, 5) + FEAT_DEF(PTM, 0x00000006, 0, RTE_REG_EAX, 6) + + FEAT_DEF(MPERF_APERF_MSR, 0x00000006, 0, RTE_REG_ECX, 0) + FEAT_DEF(ACNT2, 0x00000006, 0, RTE_REG_ECX, 1) + FEAT_DEF(ENERGY_EFF, 0x00000006, 0, RTE_REG_ECX, 3) + + FEAT_DEF(FSGSBASE, 0x00000007, 0, RTE_REG_EBX, 0) + FEAT_DEF(BMI1, 0x00000007, 0, RTE_REG_EBX, 2) + FEAT_DEF(HLE, 0x00000007, 0, RTE_REG_EBX, 4) + FEAT_DEF(AVX2, 0x00000007, 0, RTE_REG_EBX, 5) + FEAT_DEF(SMEP, 0x00000007, 0, RTE_REG_EBX, 6) + FEAT_DEF(BMI2, 0x00000007, 0, RTE_REG_EBX, 7) + FEAT_DEF(ERMS, 0x00000007, 0, RTE_REG_EBX, 8) + FEAT_DEF(INVPCID, 0x00000007, 0, RTE_REG_EBX, 10) + FEAT_DEF(RTM, 0x00000007, 0, RTE_REG_EBX, 11) + FEAT_DEF(AVX512F, 0x00000007, 0, RTE_REG_EBX, 16) + FEAT_DEF(RDSEED, 0x00000007, 0, RTE_REG_EBX, 18) + + FEAT_DEF(LAHF_SAHF, 0x80000001, 0, RTE_REG_ECX, 0) + FEAT_DEF(LZCNT, 0x80000001, 0, RTE_REG_ECX, 4) + + FEAT_DEF(SYSCALL, 0x80000001, 0, RTE_REG_EDX, 11) + FEAT_DEF(XD, 0x80000001, 0, RTE_REG_EDX, 20) + FEAT_DEF(1GB_PG, 0x80000001, 0, RTE_REG_EDX, 26) + FEAT_DEF(RDTSCP, 0x80000001, 0, RTE_REG_EDX, 27) + FEAT_DEF(EM64T, 0x80000001, 0, RTE_REG_EDX, 29) + + FEAT_DEF(INVTSC, 0x80000007, 0, RTE_REG_EDX, 8) + + FEAT_DEF(AVX512DQ, 0x00000007, 0, RTE_REG_EBX, 17) + FEAT_DEF(AVX512IFMA, 0x00000007, 0, RTE_REG_EBX, 21) + FEAT_DEF(AVX512CD, 0x00000007, 0, RTE_REG_EBX, 28) + FEAT_DEF(AVX512BW, 0x00000007, 0, RTE_REG_EBX, 30) + FEAT_DEF(AVX512VL, 0x00000007, 0, RTE_REG_EBX, 31) + FEAT_DEF(AVX512VBMI, 0x00000007, 0, RTE_REG_ECX, 1) + FEAT_DEF(AVX512VBMI2, 0x00000007, 0, RTE_REG_ECX, 6) + FEAT_DEF(GFNI, 0x00000007, 0, RTE_REG_ECX, 8) + FEAT_DEF(VAES, 0x00000007, 0, RTE_REG_ECX, 9) + FEAT_DEF(VPCLMULQDQ, 0x00000007, 0, RTE_REG_ECX, 10) + FEAT_DEF(AVX512VNNI, 0x00000007, 0, RTE_REG_ECX, 11) + FEAT_DEF(AVX512BITALG, 0x00000007, 0, RTE_REG_ECX, 12) + FEAT_DEF(AVX512VPOPCNTDQ, 0x00000007, 0, RTE_REG_ECX, 14) + FEAT_DEF(CLDEMOTE, 0x00000007, 0, RTE_REG_ECX, 25) + FEAT_DEF(MOVDIRI, 0x00000007, 0, RTE_REG_ECX, 27) + FEAT_DEF(MOVDIR64B, 0x00000007, 0, RTE_REG_ECX, 28) + FEAT_DEF(AVX512VP2INTERSECT, 0x00000007, 0, RTE_REG_EDX, 8) +}; + +int +rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature) +{ + const struct feature_entry *feat; + cpuid_registers_t regs; + unsigned int maxleaf; + + if (feature >= RTE_CPUFLAG_NUMFLAGS) + /* Flag does not match anything in the feature tables */ + return -ENOENT; + + feat = &rte_cpu_feature_table[feature]; + + if (!feat->leaf) + /* This entry in the table wasn't filled out! */ + return -EFAULT; + + maxleaf = __get_cpuid_max(feat->leaf & 0x80000000, NULL); + + if (maxleaf < feat->leaf) + return 0; + + __cpuid_count(feat->leaf, feat->subleaf, + regs[RTE_REG_EAX], regs[RTE_REG_EBX], + regs[RTE_REG_ECX], regs[RTE_REG_EDX]); + + /* check if the feature is enabled */ + return (regs[feat->reg] >> feat->bit) & 1; +} + +const char * +rte_cpu_get_flag_name(enum rte_cpu_flag_t feature) +{ + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return NULL; + return rte_cpu_feature_table[feature].name; +} diff --git a/src/spdk/dpdk/lib/librte_eal/x86/rte_cpuid.h b/src/spdk/dpdk/lib/librte_eal/x86/rte_cpuid.h new file mode 100644 index 000000000..b773ad931 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/rte_cpuid.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef RTE_CPUID_H +#define RTE_CPUID_H + +#include <cpuid.h> + +enum cpu_register_t { + RTE_REG_EAX = 0, + RTE_REG_EBX, + RTE_REG_ECX, + RTE_REG_EDX, +}; + +typedef uint32_t cpuid_registers_t[4]; + +#endif /* RTE_CPUID_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/x86/rte_cycles.c b/src/spdk/dpdk/lib/librte_eal/x86/rte_cycles.c new file mode 100644 index 000000000..edd9621ab --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/rte_cycles.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include <fcntl.h> +#include <unistd.h> +#include <cpuid.h> + +#include <rte_common.h> + +#include "eal_private.h" + +static unsigned int +rte_cpu_get_model(uint32_t fam_mod_step) +{ + uint32_t family, model, ext_model; + + family = (fam_mod_step >> 8) & 0xf; + model = (fam_mod_step >> 4) & 0xf; + + if (family == 6 || family == 15) { + ext_model = (fam_mod_step >> 16) & 0xf; + model += (ext_model << 4); + } + + return model; +} + +static int32_t +rdmsr(int msr, uint64_t *val) +{ +#ifdef RTE_EXEC_ENV_LINUX + int fd; + int ret; + + fd = open("/dev/cpu/0/msr", O_RDONLY); + if (fd < 0) + return fd; + + ret = pread(fd, val, sizeof(uint64_t), msr); + + close(fd); + + return ret; +#else + RTE_SET_USED(msr); + RTE_SET_USED(val); + + return -1; +#endif +} + +static uint32_t +check_model_wsm_nhm(uint8_t model) +{ + switch (model) { + /* Westmere */ + case 0x25: + case 0x2C: + case 0x2F: + /* Nehalem */ + case 0x1E: + case 0x1F: + case 0x1A: + case 0x2E: + return 1; + } + + return 0; +} + +static uint32_t +check_model_gdm_dnv(uint8_t model) +{ + switch (model) { + /* Goldmont */ + case 0x5C: + /* Denverton */ + case 0x5F: + return 1; + } + + return 0; +} + +uint64_t +get_tsc_freq_arch(void) +{ + uint64_t tsc_hz = 0; + uint32_t a, b, c, d, maxleaf; + uint8_t mult, model; + int32_t ret; + + /* + * Time Stamp Counter and Nominal Core Crystal Clock + * Information Leaf + */ + maxleaf = __get_cpuid_max(0, NULL); + + if (maxleaf >= 0x15) { + __cpuid(0x15, a, b, c, d); + + /* EBX : TSC/Crystal ratio, ECX : Crystal Hz */ + if (b && c) + return c * (b / a); + } + + __cpuid(0x1, a, b, c, d); + model = rte_cpu_get_model(a); + + if (check_model_wsm_nhm(model)) + mult = 133; + else if ((c & bit_AVX) || check_model_gdm_dnv(model)) + mult = 100; + else + return 0; + + ret = rdmsr(0xCE, &tsc_hz); + if (ret < 0) + return 0; + + return ((tsc_hz >> 8) & 0xff) * mult * 1E6; +} diff --git a/src/spdk/dpdk/lib/librte_eal/x86/rte_hypervisor.c b/src/spdk/dpdk/lib/librte_eal/x86/rte_hypervisor.c new file mode 100644 index 000000000..c38cfc09d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/rte_hypervisor.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include "rte_hypervisor.h" + +#include <stdint.h> +#include <string.h> + +#include "rte_cpuflags.h" +#include "rte_cpuid.h" + +/* See http://lwn.net/Articles/301888/ */ +#define HYPERVISOR_INFO_LEAF 0x40000000 + +enum rte_hypervisor +rte_hypervisor_get(void) +{ + cpuid_registers_t regs; + int reg; + char name[13]; + + if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_HYPERVISOR)) + return RTE_HYPERVISOR_NONE; + + __cpuid(HYPERVISOR_INFO_LEAF, + regs[RTE_REG_EAX], regs[RTE_REG_EBX], + regs[RTE_REG_ECX], regs[RTE_REG_EDX]); + for (reg = 1; reg < 4; reg++) + memcpy(name + (reg - 1) * 4, ®s[reg], 4); + name[12] = '\0'; + + if (strcmp("KVMKVMKVM", name) == 0) + return RTE_HYPERVISOR_KVM; + if (strcmp("Microsoft Hv", name) == 0) + return RTE_HYPERVISOR_HYPERV; + if (strcmp("VMwareVMware", name) == 0) + return RTE_HYPERVISOR_VMWARE; + return RTE_HYPERVISOR_UNKNOWN; +} diff --git a/src/spdk/dpdk/lib/librte_eal/x86/rte_spinlock.c b/src/spdk/dpdk/lib/librte_eal/x86/rte_spinlock.c new file mode 100644 index 000000000..34890ea87 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/x86/rte_spinlock.c @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdint.h> + +#include "rte_cpuflags.h" + +uint8_t rte_rtm_supported; /* cache the flag to avoid the overhead + of the rte_cpu_get_flag_enabled function */ + +RTE_INIT(rte_rtm_init) +{ + rte_rtm_supported = rte_cpu_get_flag_enabled(RTE_CPUFLAG_RTM); +} |