path: root/src/spdk/lib/env_dpdk
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
commit e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree 64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/lib/env_dpdk
parent Initial commit. (diff)
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/env_dpdk')
-rw-r--r-- src/spdk/lib/env_dpdk/Makefile 47
-rw-r--r-- src/spdk/lib/env_dpdk/env.c 451
-rw-r--r-- src/spdk/lib/env_dpdk/env.mk 176
-rw-r--r-- src/spdk/lib/env_dpdk/env_internal.h 98
-rw-r--r-- src/spdk/lib/env_dpdk/init.c 604
-rw-r--r-- src/spdk/lib/env_dpdk/memory.c 1442
-rw-r--r-- src/spdk/lib/env_dpdk/pci.c 1063
-rw-r--r-- src/spdk/lib/env_dpdk/pci_idxd.c 50
-rw-r--r-- src/spdk/lib/env_dpdk/pci_ioat.c 98
-rw-r--r-- src/spdk/lib/env_dpdk/pci_virtio.c 53
-rw-r--r-- src/spdk/lib/env_dpdk/pci_vmd.c 50
-rw-r--r-- src/spdk/lib/env_dpdk/spdk_env_dpdk.map 114
-rw-r--r-- src/spdk/lib/env_dpdk/threads.c 108
13 files changed, 4354 insertions, 0 deletions
diff --git a/src/spdk/lib/env_dpdk/Makefile b/src/spdk/lib/env_dpdk/Makefile
new file mode 100644
index 000000000..11433fe86
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/Makefile
@@ -0,0 +1,47 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 5
+SO_MINOR := 0
+
+CFLAGS += $(ENV_CFLAGS)
+C_SRCS = env.c memory.c pci.c init.c threads.c
+C_SRCS += pci_ioat.c pci_virtio.c pci_vmd.c pci_idxd.c
+LIBNAME = env_dpdk
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_env_dpdk.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/env_dpdk/env.c b/src/spdk/lib/env_dpdk/env.c
new file mode 100644
index 000000000..94b709de9
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/env.c
@@ -0,0 +1,451 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/util.h"
+#include "spdk/env_dpdk.h"
+
+#include "env_internal.h"
+
+#include <rte_config.h>
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_mempool.h>
+#include <rte_memzone.h>
+#include <rte_version.h>
+
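+/* Resolve a virtual address to its physical (IOVA) address. Try the
+ * rte_malloc heap first and fall back to SPDK's vtophys map for memory
+ * that was not allocated through rte_malloc.
+ */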
+static uint64_t
+virt_to_phys(void *vaddr)
+{
+ uint64_t ret;
+
+ ret = rte_malloc_virt2iova(vaddr);
+ if (ret != RTE_BAD_IOVA) {
+ return ret;
+ }
+
+ return spdk_vtophys(vaddr, NULL);
+}
+
+void *
+spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags)
+{
+ void *buf;
+
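+	/* The caller must request at least one of the SPDK_MALLOC_* flags. */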
+ if (flags == 0) {
+ return NULL;
+ }
+
+ align = spdk_max(align, RTE_CACHE_LINE_SIZE);
+ buf = rte_malloc_socket(NULL, size, align, socket_id);
+ if (buf && phys_addr) {
+#ifdef DEBUG
+ fprintf(stderr, "phys_addr param in spdk_*malloc() is deprecated\n");
+#endif
+ *phys_addr = virt_to_phys(buf);
+ }
+ return buf;
+}
+
+void *
+spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags)
+{
+ void *buf = spdk_malloc(size, align, phys_addr, socket_id, flags);
+ if (buf) {
+ memset(buf, 0, size);
+ }
+ return buf;
+}
+
+void *
+spdk_realloc(void *buf, size_t size, size_t align)
+{
+ align = spdk_max(align, RTE_CACHE_LINE_SIZE);
+ return rte_realloc(buf, size, align);
+}
+
+void
+spdk_free(void *buf)
+{
+ rte_free(buf);
+}
+
+void *
+spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id)
+{
+ return spdk_malloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE));
+}
+
+void *
+spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id)
+{
+ return spdk_zmalloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE));
+}
+
+void *
+spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr)
+{
+ return spdk_dma_malloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY);
+}
+
+void *
+spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr)
+{
+ return spdk_dma_zmalloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY);
+}
+
+void *
+spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr)
+{
+ void *new_buf;
+
+ align = spdk_max(align, RTE_CACHE_LINE_SIZE);
+ new_buf = rte_realloc(buf, size, align);
+ if (new_buf && phys_addr) {
+ *phys_addr = virt_to_phys(new_buf);
+ }
+ return new_buf;
+}
+
+void
+spdk_dma_free(void *buf)
+{
+ spdk_free(buf);
+}
+
+void *
+spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id,
+ unsigned flags, unsigned align)
+{
+ const struct rte_memzone *mz;
+ unsigned dpdk_flags = 0;
+
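+	/* Memzones are IOVA-contiguous unless the caller passed SPDK_MEMZONE_NO_IOVA_CONTIG. */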
+ if ((flags & SPDK_MEMZONE_NO_IOVA_CONTIG) == 0) {
+ dpdk_flags |= RTE_MEMZONE_IOVA_CONTIG;
+ }
+
+ if (socket_id == SPDK_ENV_SOCKET_ID_ANY) {
+ socket_id = SOCKET_ID_ANY;
+ }
+
+ mz = rte_memzone_reserve_aligned(name, len, socket_id, dpdk_flags, align);
+
+ if (mz != NULL) {
+ memset(mz->addr, 0, len);
+ return mz->addr;
+ } else {
+ return NULL;
+ }
+}
+
+void *
+spdk_memzone_reserve(const char *name, size_t len, int socket_id, unsigned flags)
+{
+ return spdk_memzone_reserve_aligned(name, len, socket_id, flags,
+ RTE_CACHE_LINE_SIZE);
+}
+
+void *
+spdk_memzone_lookup(const char *name)
+{
+ const struct rte_memzone *mz = rte_memzone_lookup(name);
+
+ if (mz != NULL) {
+ return mz->addr;
+ } else {
+ return NULL;
+ }
+}
+
+int
+spdk_memzone_free(const char *name)
+{
+ const struct rte_memzone *mz = rte_memzone_lookup(name);
+
+ if (mz != NULL) {
+ return rte_memzone_free(mz);
+ }
+
+ return -1;
+}
+
+void
+spdk_memzone_dump(FILE *f)
+{
+ rte_memzone_dump(f);
+}
+
+struct spdk_mempool *
+spdk_mempool_create_ctor(const char *name, size_t count,
+ size_t ele_size, size_t cache_size, int socket_id,
+ spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg)
+{
+ struct rte_mempool *mp;
+ size_t tmp;
+
+ if (socket_id == SPDK_ENV_SOCKET_ID_ANY) {
+ socket_id = SOCKET_ID_ANY;
+ }
+
+ /* No more than half of all elements can be in cache */
+ tmp = (count / 2) / rte_lcore_count();
+ if (cache_size > tmp) {
+ cache_size = tmp;
+ }
+
+ if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+ cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
+ }
+
+ mp = rte_mempool_create(name, count, ele_size, cache_size,
+ 0, NULL, NULL, (rte_mempool_obj_cb_t *)obj_init, obj_init_arg,
+ socket_id, MEMPOOL_F_NO_PHYS_CONTIG);
+
+ return (struct spdk_mempool *)mp;
+}
+
+
+struct spdk_mempool *
+spdk_mempool_create(const char *name, size_t count,
+ size_t ele_size, size_t cache_size, int socket_id)
+{
+ return spdk_mempool_create_ctor(name, count, ele_size, cache_size, socket_id,
+ NULL, NULL);
+}
+
+char *
+spdk_mempool_get_name(struct spdk_mempool *mp)
+{
+ return ((struct rte_mempool *)mp)->name;
+}
+
+void
+spdk_mempool_free(struct spdk_mempool *mp)
+{
+ rte_mempool_free((struct rte_mempool *)mp);
+}
+
+void *
+spdk_mempool_get(struct spdk_mempool *mp)
+{
+ void *ele = NULL;
+ int rc;
+
+ rc = rte_mempool_get((struct rte_mempool *)mp, &ele);
+ if (rc != 0) {
+ return NULL;
+ }
+ return ele;
+}
+
+int
+spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count)
+{
+ return rte_mempool_get_bulk((struct rte_mempool *)mp, ele_arr, count);
+}
+
+void
+spdk_mempool_put(struct spdk_mempool *mp, void *ele)
+{
+ rte_mempool_put((struct rte_mempool *)mp, ele);
+}
+
+void
+spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count)
+{
+ rte_mempool_put_bulk((struct rte_mempool *)mp, ele_arr, count);
+}
+
+size_t
+spdk_mempool_count(const struct spdk_mempool *pool)
+{
+ return rte_mempool_avail_count((struct rte_mempool *)pool);
+}
+
+uint32_t
+spdk_mempool_obj_iter(struct spdk_mempool *mp, spdk_mempool_obj_cb_t obj_cb,
+ void *obj_cb_arg)
+{
+ return rte_mempool_obj_iter((struct rte_mempool *)mp, (rte_mempool_obj_cb_t *)obj_cb,
+ obj_cb_arg);
+}
+
+struct spdk_mempool *
+spdk_mempool_lookup(const char *name)
+{
+ return (struct spdk_mempool *)rte_mempool_lookup(name);
+}
+
+bool
+spdk_process_is_primary(void)
+{
+ return (rte_eal_process_type() == RTE_PROC_PRIMARY);
+}
+
+uint64_t spdk_get_ticks(void)
+{
+ return rte_get_timer_cycles();
+}
+
+uint64_t spdk_get_ticks_hz(void)
+{
+ return rte_get_timer_hz();
+}
+
+void spdk_delay_us(unsigned int us)
+{
+ rte_delay_us(us);
+}
+
+void spdk_pause(void)
+{
+ rte_pause();
+}
+
+void
+spdk_unaffinitize_thread(void)
+{
+ rte_cpuset_t new_cpuset, orig_cpuset;
+ long num_cores, i, orig_num_cores;
+
+ CPU_ZERO(&new_cpuset);
+
+ num_cores = sysconf(_SC_NPROCESSORS_CONF);
+
+ /* Create a mask containing all CPUs */
+ for (i = 0; i < num_cores; i++) {
+ CPU_SET(i, &new_cpuset);
+ }
+
+ rte_thread_get_affinity(&orig_cpuset);
+ orig_num_cores = CPU_COUNT(&orig_cpuset);
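+	/* If the thread was pinned to a strict subset of all cores, clear
+	 * those cores from the new mask so the thread moves off of them.
+	 */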
+ if (orig_num_cores < num_cores) {
+		for (i = 0; i < num_cores; i++) {
+ if (CPU_ISSET(i, &orig_cpuset)) {
+ CPU_CLR(i, &new_cpuset);
+ }
+ }
+ }
+
+ rte_thread_set_affinity(&new_cpuset);
+}
+
+void *
+spdk_call_unaffinitized(void *cb(void *arg), void *arg)
+{
+ rte_cpuset_t orig_cpuset;
+ void *ret;
+
+ if (cb == NULL) {
+ return NULL;
+ }
+
+ rte_thread_get_affinity(&orig_cpuset);
+
+ spdk_unaffinitize_thread();
+
+ ret = cb(arg);
+
+ rte_thread_set_affinity(&orig_cpuset);
+
+ return ret;
+}
+
+struct spdk_ring *
+spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id)
+{
+ char ring_name[64];
+ static uint32_t ring_num = 0;
+ unsigned flags = RING_F_EXACT_SZ;
+
+ switch (type) {
+ case SPDK_RING_TYPE_SP_SC:
+ flags |= RING_F_SP_ENQ | RING_F_SC_DEQ;
+ break;
+ case SPDK_RING_TYPE_MP_SC:
+ flags |= RING_F_SC_DEQ;
+ break;
+ case SPDK_RING_TYPE_MP_MC:
+ flags |= 0;
+ break;
+ default:
+ return NULL;
+ }
+
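+	/* Generate a unique ring name from a global counter and the PID. */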
+ snprintf(ring_name, sizeof(ring_name), "ring_%u_%d",
+ __atomic_fetch_add(&ring_num, 1, __ATOMIC_RELAXED), getpid());
+
+ return (struct spdk_ring *)rte_ring_create(ring_name, count, socket_id, flags);
+}
+
+void
+spdk_ring_free(struct spdk_ring *ring)
+{
+ rte_ring_free((struct rte_ring *)ring);
+}
+
+size_t
+spdk_ring_count(struct spdk_ring *ring)
+{
+ return rte_ring_count((struct rte_ring *)ring);
+}
+
+size_t
+spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count,
+ size_t *free_space)
+{
+ return rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count,
+ (unsigned int *)free_space);
+}
+
+size_t
+spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count)
+{
+ return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count, NULL);
+}
+
+void
+spdk_env_dpdk_dump_mem_stats(FILE *file)
+{
+ fprintf(file, "DPDK memory size %lu\n", rte_eal_get_physmem_size());
+ fprintf(file, "DPDK memory layout\n");
+ rte_dump_physmem_layout(file);
+ fprintf(file, "DPDK memzones.\n");
+ rte_memzone_dump(file);
+ fprintf(file, "DPDK mempools.\n");
+ rte_mempool_list_dump(file);
+ fprintf(file, "DPDK malloc stats.\n");
+ rte_malloc_dump_stats(file, NULL);
+ fprintf(file, "DPDK malloc heaps.\n");
+ rte_malloc_dump_heaps(file);
+}
diff --git a/src/spdk/lib/env_dpdk/env.mk b/src/spdk/lib/env_dpdk/env.mk
new file mode 100644
index 000000000..c2bfb0d19
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/env.mk
@@ -0,0 +1,176 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+# This makefile snippet must define the following flags:
+# ENV_CFLAGS
+# ENV_CXXFLAGS
+# ENV_LIBS
+# ENV_LINKER_ARGS
+
+DPDK_DIR = $(CONFIG_DPDK_DIR)
+
+export DPDK_ABS_DIR = $(abspath $(DPDK_DIR))
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/include/rte_config.h))
+DPDK_INC_DIR := $(DPDK_ABS_DIR)/include
+else
+DPDK_INC_DIR := $(DPDK_ABS_DIR)/include/dpdk
+endif
+DPDK_INC := -I$(DPDK_INC_DIR)
+
+ifeq ($(CONFIG_SHARED),y)
+DPDK_LIB_EXT = .so
+else
+DPDK_LIB_EXT = .a
+endif
+
+DPDK_LIB_LIST = rte_eal rte_mempool rte_ring rte_mbuf
+
+# librte_mempool_ring was newly added in DPDK 17.05. Link this library to use
+# the ring-based mempool management API.
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_mempool_ring.*))
+DPDK_LIB_LIST += rte_mempool_ring
+endif
+
+# librte_malloc was removed after DPDK 2.1. Link this library conditionally based on its
+# existence to maintain backward compatibility.
+ifneq ($(wildcard $(DPDK_ABS_DIR)/lib/librte_malloc.*),)
+DPDK_LIB_LIST += rte_malloc
+endif
+
+# librte_pci and librte_bus_pci were added in DPDK 17.11. Link these libraries conditionally
+# based on their existence to maintain backward compatibility.
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_pci.*))
+DPDK_LIB_LIST += rte_pci
+endif
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_bus_pci.*))
+DPDK_LIB_LIST += rte_bus_pci
+endif
+
+# DPDK 20.05 eal dependency
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_telemetry.*))
+DPDK_LIB_LIST += rte_telemetry
+endif
+
+# There are some complex dependencies when using crypto, reduce, or both, so
+# here we add the feature-specific ones and set a flag to add the common
+# ones after that.
+DPDK_FRAMEWORK=n
+ifeq ($(CONFIG_CRYPTO),y)
+DPDK_FRAMEWORK=y
+DPDK_LIB_LIST += rte_pmd_aesni_mb rte_reorder
+endif
+
+ifeq ($(CONFIG_REDUCE),y)
+DPDK_FRAMEWORK=y
+DPDK_LIB_LIST += rte_pmd_isal
+endif
+
+ifeq ($(DPDK_FRAMEWORK),y)
+DPDK_LIB_LIST += rte_cryptodev rte_compressdev rte_bus_vdev rte_pmd_qat
+endif
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_kvargs.*))
+DPDK_LIB_LIST += rte_kvargs
+endif
+
+LINK_HASH=n
+
+ifeq ($(CONFIG_VHOST),y)
+ifneq ($(CONFIG_VHOST_INTERNAL_LIB),y)
+DPDK_LIB_LIST += rte_vhost rte_net
+LINK_HASH=y
+ifneq ($(DPDK_FRAMEWORK),y)
+DPDK_LIB_LIST += rte_cryptodev
+endif
+endif
+endif
+
+ifeq ($(CONFIG_RAID5),y)
+LINK_HASH=y
+endif
+
+ifeq ($(LINK_HASH),y)
+DPDK_LIB_LIST += rte_hash
+endif
+
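+# Expand a list of DPDK library names (e.g. rte_eal) into the full paths
+# of the corresponding static or shared library files.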
+define dpdk_lib_list_to_libs
+$(1:%=$(DPDK_ABS_DIR)/lib/lib%$(DPDK_LIB_EXT))
+endef
+
+define dpdk_env_linker_args
+$(ENV_DPDK_FILE) -Wl,--whole-archive,--no-as-needed $(call dpdk_lib_list_to_libs,$1) -Wl,--no-whole-archive
+endef
+
+DPDK_LIB = $(call dpdk_lib_list_to_libs,$(DPDK_LIB_LIST))
+
+# SPDK memory registration requires experimental (deprecated) rte_memory API for DPDK 18.05
+ENV_CFLAGS = $(DPDK_INC) -Wno-deprecated-declarations
+ENV_CXXFLAGS = $(ENV_CFLAGS)
+ifeq ($(CONFIG_SHARED),y)
+ENV_DPDK_FILE = $(call spdk_lib_list_to_shared_libs,env_dpdk)
+else
+ENV_DPDK_FILE = $(call spdk_lib_list_to_static_libs,env_dpdk)
+endif
+ENV_LIBS = $(ENV_DPDK_FILE) $(DPDK_LIB)
+ENV_LINKER_ARGS = -Wl,-rpath-link $(DPDK_ABS_DIR)/lib
+ENV_LINKER_ARGS += $(call dpdk_env_linker_args,$(DPDK_LIB_LIST))
+
+ifeq ($(CONFIG_IPSEC_MB),y)
+ENV_LINKER_ARGS += -lIPSec_MB -L$(IPSEC_MB_DIR)
+endif
+
+ifeq ($(CONFIG_REDUCE),y)
+ENV_LINKER_ARGS += -lisal -L$(ISAL_DIR)/.libs
+endif
+
+ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_config.h))
+ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_config.h))
+ENV_LINKER_ARGS += -lnuma
+endif
+endif
+
+# DPDK built with meson puts those defines elsewhere
+ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_build_config.h))
+ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_build_config.h))
+ENV_LINKER_ARGS += -lnuma
+endif
+endif
+
+ifeq ($(OS),Linux)
+ENV_LINKER_ARGS += -ldl
+endif
+ifeq ($(OS),FreeBSD)
+ENV_LINKER_ARGS += -lexecinfo
+endif
diff --git a/src/spdk/lib/env_dpdk/env_internal.h b/src/spdk/lib/env_dpdk/env_internal.h
new file mode 100644
index 000000000..c7900d9d3
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/env_internal.h
@@ -0,0 +1,98 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ENV_INTERNAL_H
+#define SPDK_ENV_INTERNAL_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+
+#include <rte_config.h>
+#include <rte_version.h>
+#include <rte_eal.h>
+#include <rte_bus.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_dev.h>
+
+#if RTE_VERSION < RTE_VERSION_NUM(18, 11, 0, 0)
+#error RTE_VERSION is too old! Minimum 18.11 is required.
+#endif
+
+/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47],
+ * which is enough to cover 256 TB.
+ */
+#define SHIFT_256TB 48 /* (1 << 48) == 256 TB */
+#define MASK_256TB ((1ULL << SHIFT_256TB) - 1)
+
+#define SHIFT_1GB 30 /* (1 << 30) == 1 GB */
+#define MASK_1GB ((1ULL << SHIFT_1GB) - 1)
+
+#define SPDK_PCI_DRIVER_MAX_NAME_LEN 32
+struct spdk_pci_driver {
+ struct rte_pci_driver driver;
+
+ const char *name;
+ const struct spdk_pci_id *id_table;
+ uint32_t drv_flags;
+
+ spdk_pci_enum_cb cb_fn;
+ void *cb_arg;
+ TAILQ_ENTRY(spdk_pci_driver) tailq;
+};
+
+int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
+int pci_device_fini(struct rte_pci_device *device);
+
+void pci_env_init(void);
+void pci_env_reinit(void);
+void pci_env_fini(void);
+int mem_map_init(bool legacy_mem);
+int vtophys_init(void);
+
+/**
+ * Report a DMA-capable PCI device to the vtophys translation code.
+ * Increases the refcount of active DMA-capable devices managed by SPDK.
+ * This must be called after a `rte_pci_device` is created.
+ */
+void vtophys_pci_device_added(struct rte_pci_device *pci_device);
+
+/**
+ * Report the removal of a DMA-capable PCI device to the vtophys translation code.
+ * Decreases the refcount of active DMA-capable devices managed by SPDK.
+ * This must be called before a `rte_pci_device` is destroyed.
+ */
+void vtophys_pci_device_removed(struct rte_pci_device *pci_device);
+
+#endif
diff --git a/src/spdk/lib/env_dpdk/init.c b/src/spdk/lib/env_dpdk/init.c
new file mode 100644
index 000000000..0376dbe7b
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/init.c
@@ -0,0 +1,604 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "env_internal.h"
+
+#include "spdk/version.h"
+#include "spdk/env_dpdk.h"
+
+#include <rte_config.h>
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_vfio.h>
+
+#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk"
+#define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1
+#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1
+#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1
+#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1
+#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1"
+#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000
+
+static char **g_eal_cmdline;
+static int g_eal_cmdline_argcount;
+static bool g_external_init = true;
+
+static char *
+_sprintf_alloc(const char *format, ...)
+{
+ va_list args;
+ va_list args_copy;
+ char *buf;
+ size_t bufsize;
+ int rc;
+
+ va_start(args, format);
+
+ /* Try with a small buffer first. */
+ bufsize = 32;
+
+ /* Limit maximum buffer size to something reasonable so we don't loop forever. */
+ while (bufsize <= 1024 * 1024) {
+ buf = malloc(bufsize);
+ if (buf == NULL) {
+ va_end(args);
+ return NULL;
+ }
+
+ va_copy(args_copy, args);
+ rc = vsnprintf(buf, bufsize, format, args_copy);
+ va_end(args_copy);
+
+ /*
+ * If vsnprintf() returned a count within our current buffer size, we are done.
+ * The count does not include the \0 terminator, so rc == bufsize is not OK.
+ */
+ if (rc >= 0 && (size_t)rc < bufsize) {
+ va_end(args);
+ return buf;
+ }
+
+ /*
+ * vsnprintf() should return the required space, but some libc versions do not
+ * implement this correctly, so just double the buffer size and try again.
+ *
+ * We don't need the data in buf, so rather than realloc(), use free() and malloc()
+ * again to avoid a copy.
+ */
+ free(buf);
+ bufsize *= 2;
+ }
+
+ va_end(args);
+ return NULL;
+}
+
+void
+spdk_env_opts_init(struct spdk_env_opts *opts)
+{
+ if (!opts) {
+ return;
+ }
+
+ memset(opts, 0, sizeof(*opts));
+
+ opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
+ opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
+ opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
+ opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
+ opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE;
+ opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
+ opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
+}
+
+static void
+free_args(char **args, int argcount)
+{
+ int i;
+
+ if (args == NULL) {
+ return;
+ }
+
+ for (i = 0; i < argcount; i++) {
+ free(args[i]);
+ }
+
+ if (argcount) {
+ free(args);
+ }
+}
+
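+/* Append one argument to the argv-style array, growing it by one slot.
+ * On any failure, the whole array (including the new argument) is freed
+ * and NULL is returned.
+ */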
+static char **
+push_arg(char *args[], int *argcount, char *arg)
+{
+ char **tmp;
+
+ if (arg == NULL) {
+ fprintf(stderr, "%s: NULL arg supplied\n", __func__);
+ free_args(args, *argcount);
+ return NULL;
+ }
+
+ tmp = realloc(args, sizeof(char *) * (*argcount + 1));
+ if (tmp == NULL) {
+ free(arg);
+ free_args(args, *argcount);
+ return NULL;
+ }
+
+ tmp[*argcount] = arg;
+ (*argcount)++;
+
+ return tmp;
+}
+
+#if defined(__linux__) && defined(__x86_64__)
+
+/* TODO: Can likely get this value from rlimits in the future */
+#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
+#define VTD_CAP_MGAW_SHIFT 16
+#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)
+
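+/* Scan the Intel IOMMU (DMAR) units in sysfs and return the smallest
+ * guest address width (MGAW) found, 0 if no units are present, or
+ * -EINVAL if the sysfs directory cannot be opened.
+ */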
+static int
+get_iommu_width(void)
+{
+ DIR *dir;
+ FILE *file;
+ struct dirent *entry;
+ char mgaw_path[64];
+ char buf[64];
+ char *end;
+ long long int val;
+ int width, tmp;
+
+ dir = opendir("/sys/devices/virtual/iommu/");
+ if (dir == NULL) {
+ return -EINVAL;
+ }
+
+ width = 0;
+
+ while ((entry = readdir(dir)) != NULL) {
+ /* Find directories named "dmar0", "dmar1", etc */
+ if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) {
+ continue;
+ }
+
+ tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
+ entry->d_name);
+ if ((unsigned)tmp >= sizeof(mgaw_path)) {
+ continue;
+ }
+
+ file = fopen(mgaw_path, "r");
+ if (file == NULL) {
+ continue;
+ }
+
+ if (fgets(buf, sizeof(buf), file) == NULL) {
+ fclose(file);
+ continue;
+ }
+
+ val = strtoll(buf, &end, 16);
+ if (val == LLONG_MIN || val == LLONG_MAX) {
+ fclose(file);
+ continue;
+ }
+
+ tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
+ if (width == 0 || tmp < width) {
+ width = tmp;
+ }
+
+ fclose(file);
+ }
+
+ closedir(dir);
+
+ return width;
+}
+
+#endif
+
+static int
+build_eal_cmdline(const struct spdk_env_opts *opts)
+{
+ int argcount = 0;
+ char **args;
+
+ args = NULL;
+
+ /* set the program name */
+ args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
+ if (opts->shm_id < 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+	/* set the coremask; if it starts with '[' and ends with ']' it is
+	 * treated as a core list
+	 */
+ if (opts->core_mask[0] == '[') {
+ char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
+
+ if (l_arg != NULL) {
+ int len = strlen(l_arg);
+
+ if (l_arg[len - 1] == ']') {
+ l_arg[len - 1] = '\0';
+ }
+ }
+ args = push_arg(args, &argcount, l_arg);
+ } else {
+ args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
+ }
+
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* set the memory channel number */
+ if (opts->mem_channel > 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set the memory size */
+ if (opts->mem_size >= 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set the master core */
+ if (opts->master_core > 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
+ opts->master_core));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set no pci if enabled */
+ if (opts->no_pci) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* create just one hugetlbfs file */
+ if (opts->hugepage_single_segments) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* unlink hugepages after initialization */
+ if (opts->unlink_hugepage) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* use a specific hugetlbfs mount */
+ if (opts->hugedir) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ if (opts->num_pci_addr) {
+ size_t i;
+ char bdf[32];
+ struct spdk_pci_addr *pci_addr =
+ opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist;
+
+ for (i = 0; i < opts->num_pci_addr; i++) {
+ spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
+ args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
+ (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
+ bdf));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+ }
+
+ /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
+ * This can be overridden by specifying the same option in opts->env_context
+ */
+ args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
+ * This can be overridden by specifying the same option in opts->env_context
+ */
+ args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* `user1` log type is used by rte_vhost, which prints an INFO log for each received
+	 * vhost user message. We don't want that. The same log type is also used by a couple
+	 * of other DPDK libs, none of which we make use of right now. If necessary, this can
+ * be overridden via opts->env_context.
+ */
+ args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
+ if (args == NULL) {
+ return -1;
+ }
+
+ if (opts->env_context) {
+ args = push_arg(args, &argcount, strdup(opts->env_context));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+#ifdef __linux__
+
+ if (opts->iova_mode) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
+ if (args == NULL) {
+ return -1;
+ }
+ } else {
+ /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
+ * but DPDK guesses it should be iova-mode=va. Add a check and force
+ * iova-mode=pa here. */
+ if (rte_vfio_noiommu_is_enabled()) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+#if defined(__x86_64__)
+ /* DPDK by default guesses that it should be using iova-mode=va so that it can
+ * support running as an unprivileged user. However, some systems (especially
+ * virtual machines) don't have an IOMMU capable of handling the full virtual
+ * address space and DPDK doesn't currently catch that. Add a check in SPDK
+ * and force iova-mode=pa here. */
+ if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+#elif defined(__PPC64__)
+ /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
+ * auto-detect at the moment, so we'll just force it here. */
+ args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
+ if (args == NULL) {
+ return -1;
+ }
+#endif
+ }
+
+
+ /* Set the base virtual address - it must be an address that is not in the
+ * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
+ * mmap hint.
+ *
+ * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
+ */
+ args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
+ if (args == NULL) {
+ return -1;
+ }
+
+	/* --match-allocations prevents DPDK from merging or splitting system memory allocations under the hood.
+	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
+	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
+	 * the memory for a buffer across two allocations, meaning the buffer will span two memory regions.
+ */
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
+ if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
+ args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+#endif
+
+ if (opts->shm_id < 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
+ getpid()));
+ if (args == NULL) {
+ return -1;
+ }
+ } else {
+ args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
+ opts->shm_id));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* set the process type */
+ args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+#endif
+
+ g_eal_cmdline = args;
+ g_eal_cmdline_argcount = argcount;
+ return argcount;
+}
+
+int
+spdk_env_dpdk_post_init(bool legacy_mem)
+{
+ int rc;
+
+ pci_env_init();
+
+ rc = mem_map_init(legacy_mem);
+ if (rc < 0) {
+ fprintf(stderr, "Failed to allocate mem_map\n");
+ return rc;
+ }
+
+ rc = vtophys_init();
+ if (rc < 0) {
+ fprintf(stderr, "Failed to initialize vtophys\n");
+ return rc;
+ }
+
+ return 0;
+}
+
+void
+spdk_env_dpdk_post_fini(void)
+{
+ pci_env_fini();
+
+ free_args(g_eal_cmdline, g_eal_cmdline_argcount);
+ g_eal_cmdline = NULL;
+ g_eal_cmdline_argcount = 0;
+}
+
+int
+spdk_env_init(const struct spdk_env_opts *opts)
+{
+ char **dpdk_args = NULL;
+ int i, rc;
+ int orig_optind;
+ bool legacy_mem;
+
+	/* If the SPDK env has been initialized before, then only the PCI env
+	 * requires reinitialization.
+ */
+ if (g_external_init == false) {
+ if (opts != NULL) {
+ fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n");
+ return -EINVAL;
+ }
+
+ printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version());
+ pci_env_reinit();
+
+ return 0;
+ }
+
+ if (opts == NULL) {
+ fprintf(stderr, "NULL arguments to initialize DPDK\n");
+ return -EINVAL;
+ }
+
+ rc = build_eal_cmdline(opts);
+ if (rc < 0) {
+ fprintf(stderr, "Invalid arguments to initialize DPDK\n");
+ return -EINVAL;
+ }
+
+ printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
+ printf("[ DPDK EAL parameters: ");
+ for (i = 0; i < g_eal_cmdline_argcount; i++) {
+ printf("%s ", g_eal_cmdline[i]);
+ }
+ printf("]\n");
+
+ /* DPDK rearranges the array we pass to it, so make a copy
+ * before passing so we can still free the individual strings
+ * correctly.
+ */
+ dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
+ if (dpdk_args == NULL) {
+ fprintf(stderr, "Failed to allocate dpdk_args\n");
+ return -ENOMEM;
+ }
+ memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
+
+ fflush(stdout);
+ orig_optind = optind;
+ optind = 1;
+ rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
+ optind = orig_optind;
+
+ free(dpdk_args);
+
+ if (rc < 0) {
+ if (rte_errno == EALREADY) {
+ fprintf(stderr, "DPDK already initialized\n");
+ } else {
+ fprintf(stderr, "Failed to initialize DPDK\n");
+ }
+ return -rte_errno;
+ }
+
+ legacy_mem = false;
+ if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
+ legacy_mem = true;
+ }
+
+ rc = spdk_env_dpdk_post_init(legacy_mem);
+ if (rc == 0) {
+ g_external_init = false;
+ }
+
+ return rc;
+}
+
+void
+spdk_env_fini(void)
+{
+ spdk_env_dpdk_post_fini();
+}
+
+bool
+spdk_env_dpdk_external_init(void)
+{
+ return g_external_init;
+}
diff --git a/src/spdk/lib/env_dpdk/memory.c b/src/spdk/lib/env_dpdk/memory.c
new file mode 100644
index 000000000..4c2205a46
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/memory.c
@@ -0,0 +1,1442 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "env_internal.h"
+
+#include <rte_config.h>
+#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
+
+#include "spdk_internal/assert.h"
+
+#include "spdk/assert.h"
+#include "spdk/likely.h"
+#include "spdk/queue.h"
+#include "spdk/util.h"
+#include "spdk/memory.h"
+#include "spdk/env_dpdk.h"
+
+#ifdef __FreeBSD__
+#define VFIO_ENABLED 0
+#else
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
+#define VFIO_ENABLED 1
+#include <linux/vfio.h>
+#include <rte_vfio.h>
+
+struct spdk_vfio_dma_map {
+ struct vfio_iommu_type1_dma_map map;
+ struct vfio_iommu_type1_dma_unmap unmap;
+ TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
+};
+
+struct vfio_cfg {
+ int fd;
+ bool enabled;
+ bool noiommu_enabled;
+ unsigned device_ref;
+ TAILQ_HEAD(, spdk_vfio_dma_map) maps;
+ pthread_mutex_t mutex;
+};
+
+static struct vfio_cfg g_vfio = {
+ .fd = -1,
+ .enabled = false,
+ .noiommu_enabled = false,
+ .device_ref = 0,
+ .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
+ .mutex = PTHREAD_MUTEX_INITIALIZER
+};
+
+#else
+#define VFIO_ENABLED 0
+#endif
+#endif
+
+#if DEBUG
+#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define DEBUG_PRINT(...)
+#endif
+
+#define FN_2MB_TO_4KB(fn)	((fn) << (SHIFT_2MB - SHIFT_4KB))
+#define FN_4KB_TO_2MB(fn)	((fn) >> (SHIFT_2MB - SHIFT_4KB))
+
+#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
+#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
+
+/* Page is registered */
+#define REG_MAP_REGISTERED (1ULL << 62)
+
+/* A notification region barrier. The 2MB translation entry that's marked
+ * with this flag must be unregistered separately. This allows contiguous
+ * regions to be unregistered in the same chunks they were registered.
+ */
+#define REG_MAP_NOTIFY_START (1ULL << 63)
+
+/* Translation of a single 2MB page. */
+struct map_2mb {
+ uint64_t translation_2mb;
+};
+
+/* Second-level map table indexed by bits [21..29] of the virtual address.
+ * Each entry contains the address translation or error for entries that haven't
+ * been retrieved yet.
+ */
+struct map_1gb {
+ struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
+};
+
+/* Top-level map table indexed by bits [30..47] of the virtual address.
+ * Each entry points to a second-level map table or NULL.
+ */
+struct map_256tb {
+ struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
+};
+
+/* Page-granularity memory address translation */
+struct spdk_mem_map {
+ struct map_256tb map_256tb;
+ pthread_mutex_t mutex;
+ uint64_t default_translation;
+ struct spdk_mem_map_ops ops;
+ void *cb_ctx;
+ TAILQ_ENTRY(spdk_mem_map) tailq;
+};
+
+/* Registrations map. The 64 bit translations are bit fields with the
+ * following layout (starting with the low bits):
+ * 0 - 61 : reserved
+ * 62 - 63 : flags
+ */
+static struct spdk_mem_map *g_mem_reg_map;
+static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
+ TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
+static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static bool g_legacy_mem;
+
+/*
+ * Walk the currently registered memory via the main memory registration map
+ * and call the new map's notify callback for each virtually contiguous region.
+ */
+static int
+mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
+{
+ size_t idx_256tb;
+ uint64_t idx_1gb;
+ uint64_t contig_start = UINT64_MAX;
+ uint64_t contig_end = UINT64_MAX;
+ struct map_1gb *map_1gb;
+ int rc;
+
+ if (!g_mem_reg_map) {
+ return -EINVAL;
+ }
+
+ /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
+ pthread_mutex_lock(&g_mem_reg_map->mutex);
+
+ for (idx_256tb = 0;
+ idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
+ idx_256tb++) {
+ map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
+
+ if (!map_1gb) {
+ if (contig_start != UINT64_MAX) {
+				/* End of a virtually contiguous range */
+ rc = map->ops.notify_cb(map->cb_ctx, map, action,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ /* Don't bother handling unregister failures. It can't be any worse */
+ if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
+ goto err_unregister;
+ }
+ }
+ contig_start = UINT64_MAX;
+ continue;
+ }
+
+ for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
+ if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
+ (contig_start == UINT64_MAX ||
+ (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
+ /* Rebuild the virtual address from the indexes */
+ uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
+
+ if (contig_start == UINT64_MAX) {
+ contig_start = vaddr;
+ }
+
+ contig_end = vaddr;
+ } else {
+ if (contig_start != UINT64_MAX) {
+					/* End of a virtually contiguous range */
+ rc = map->ops.notify_cb(map->cb_ctx, map, action,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ /* Don't bother handling unregister failures. It can't be any worse */
+ if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
+ goto err_unregister;
+ }
+
+					/* This page might be part of a neighbouring region, so process
+ * it again. The idx_1gb will be incremented immediately.
+ */
+ idx_1gb--;
+ }
+ contig_start = UINT64_MAX;
+ }
+ }
+ }
+
+ pthread_mutex_unlock(&g_mem_reg_map->mutex);
+ return 0;
+
+err_unregister:
+ /* Unwind to the first empty translation so we don't unregister
+ * a region that just failed to register.
+ */
+ idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
+ idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
+ contig_start = UINT64_MAX;
+ contig_end = UINT64_MAX;
+
+ /* Unregister any memory we managed to register before the failure */
+ for (; idx_256tb < SIZE_MAX; idx_256tb--) {
+ map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
+
+ if (!map_1gb) {
+ if (contig_end != UINT64_MAX) {
+				/* End of a virtually contiguous range */
+ map->ops.notify_cb(map->cb_ctx, map,
+ SPDK_MEM_MAP_NOTIFY_UNREGISTER,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ }
+ contig_end = UINT64_MAX;
+ continue;
+ }
+
+ for (; idx_1gb < UINT64_MAX; idx_1gb--) {
+ if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
+ (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
+ /* Rebuild the virtual address from the indexes */
+ uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
+
+ if (contig_end == UINT64_MAX) {
+ contig_end = vaddr;
+ }
+ contig_start = vaddr;
+ } else {
+ if (contig_end != UINT64_MAX) {
+					/* End of a virtually contiguous range */
+ map->ops.notify_cb(map->cb_ctx, map,
+ SPDK_MEM_MAP_NOTIFY_UNREGISTER,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ idx_1gb++;
+ }
+ contig_end = UINT64_MAX;
+ }
+ }
+ idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
+ }
+
+ pthread_mutex_unlock(&g_mem_reg_map->mutex);
+ return rc;
+}
+
+struct spdk_mem_map *
+spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
+{
+ struct spdk_mem_map *map;
+ int rc;
+
+ map = calloc(1, sizeof(*map));
+ if (map == NULL) {
+ return NULL;
+ }
+
+ if (pthread_mutex_init(&map->mutex, NULL)) {
+ free(map);
+ return NULL;
+ }
+
+ map->default_translation = default_translation;
+ map->cb_ctx = cb_ctx;
+ if (ops) {
+ map->ops = *ops;
+ }
+
+ if (ops && ops->notify_cb) {
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+ rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ DEBUG_PRINT("Initial mem_map notify failed\n");
+ pthread_mutex_destroy(&map->mutex);
+ free(map);
+ return NULL;
+ }
+ TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ }
+
+ return map;
+}
+
+void
+spdk_mem_map_free(struct spdk_mem_map **pmap)
+{
+ struct spdk_mem_map *map;
+ size_t i;
+
+ if (!pmap) {
+ return;
+ }
+
+ map = *pmap;
+
+ if (!map) {
+ return;
+ }
+
+ if (map->ops.notify_cb) {
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+ mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
+ TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ }
+
+ for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
+ free(map->map_256tb.map[i]);
+ }
+
+ pthread_mutex_destroy(&map->mutex);
+
+ free(map);
+ *pmap = NULL;
+}
+
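+/* Register a 2 MB-aligned region with the registration map and notify
+ * all memory maps. The first 2 MB page is tagged REG_MAP_NOTIFY_START,
+ * so the region can later be unregistered in the same chunk it was
+ * registered in.
+ */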
+int
+spdk_mem_register(void *vaddr, size_t len)
+{
+ struct spdk_mem_map *map;
+ int rc;
+ void *seg_vaddr;
+ size_t seg_len;
+ uint64_t reg;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+ __func__, vaddr, len);
+ return -EINVAL;
+ }
+
+ if (len == 0) {
+ return 0;
+ }
+
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+
+ seg_vaddr = vaddr;
+ seg_len = len;
+ while (seg_len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ if (reg & REG_MAP_REGISTERED) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -EBUSY;
+ }
+ seg_vaddr += VALUE_2MB;
+ seg_len -= VALUE_2MB;
+ }
+
+ seg_vaddr = vaddr;
+ seg_len = 0;
+ while (len > 0) {
+ spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
+ seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
+ seg_len += VALUE_2MB;
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+
+ TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
+ rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return rc;
+ }
+ }
+
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return 0;
+}
+
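+/* Unregister a range of previously registered regions. The range must
+ * begin and end on region boundaries (REG_MAP_NOTIFY_START), so only
+ * whole regions are ever unregistered.
+ */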
+int
+spdk_mem_unregister(void *vaddr, size_t len)
+{
+ struct spdk_mem_map *map;
+ int rc;
+ void *seg_vaddr;
+ size_t seg_len;
+ uint64_t reg, newreg;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+ __func__, vaddr, len);
+ return -EINVAL;
+ }
+
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+
+ /* The first page must be a start of a region. Also check if it's
+ * registered to make sure we don't return -ERANGE for non-registered
+ * regions.
+ */
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
+ if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -ERANGE;
+ }
+
+ seg_vaddr = vaddr;
+ seg_len = len;
+ while (seg_len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ if ((reg & REG_MAP_REGISTERED) == 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -EINVAL;
+ }
+ seg_vaddr += VALUE_2MB;
+ seg_len -= VALUE_2MB;
+ }
+
+ newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ /* If the next page is registered, it must be a start of a region as well,
+ * otherwise we'd be unregistering only a part of a region.
+ */
+ if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -ERANGE;
+ }
+ seg_vaddr = vaddr;
+ seg_len = 0;
+
+ while (len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
+ spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
+
+ if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
+ TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
+ rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return rc;
+ }
+ }
+
+ seg_vaddr = vaddr;
+ seg_len = VALUE_2MB;
+ } else {
+ seg_len += VALUE_2MB;
+ }
+
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+
+ if (seg_len > 0) {
+ TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
+ rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return rc;
+ }
+ }
+ }
+
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return 0;
+}
+
+int
+spdk_mem_reserve(void *vaddr, size_t len)
+{
+ struct spdk_mem_map *map;
+ void *seg_vaddr;
+ size_t seg_len;
+ uint64_t reg;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+ __func__, vaddr, len);
+ return -EINVAL;
+ }
+
+ if (len == 0) {
+ return 0;
+ }
+
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+
+ /* Check if any part of this range is already registered */
+ seg_vaddr = vaddr;
+ seg_len = len;
+ while (seg_len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ if (reg & REG_MAP_REGISTERED) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -EBUSY;
+ }
+ seg_vaddr += VALUE_2MB;
+ seg_len -= VALUE_2MB;
+ }
+
+ /* Simply set the translation to the memory map's default. This allocates the space in the
+ * map but does not provide a valid translation. */
+ spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
+ g_mem_reg_map->default_translation);
+
+ TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
+ spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
+ }
+
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return 0;
+}
+
+static struct map_1gb *
+mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
+{
+ struct map_1gb *map_1gb;
+ uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
+ size_t i;
+
+ if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
+ return NULL;
+ }
+
+ map_1gb = map->map_256tb.map[idx_256tb];
+
+ if (!map_1gb) {
+ pthread_mutex_lock(&map->mutex);
+
+ /* Recheck to make sure nobody else got the mutex first. */
+ map_1gb = map->map_256tb.map[idx_256tb];
+ if (!map_1gb) {
+ map_1gb = malloc(sizeof(struct map_1gb));
+ if (map_1gb) {
+ /* initialize all entries to default translation */
+ for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
+ map_1gb->map[i].translation_2mb = map->default_translation;
+ }
+ map->map_256tb.map[idx_256tb] = map_1gb;
+ }
+ }
+
+ pthread_mutex_unlock(&map->mutex);
+
+ if (!map_1gb) {
+ DEBUG_PRINT("allocation failed\n");
+ return NULL;
+ }
+ }
+
+ return map_1gb;
+}
+
+int
+spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
+ uint64_t translation)
+{
+ uint64_t vfn_2mb;
+ struct map_1gb *map_1gb;
+ uint64_t idx_1gb;
+ struct map_2mb *map_2mb;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
+ return -EINVAL;
+ }
+
+ /* For now, only 2 MB-aligned registrations are supported */
+ if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
+ __func__, vaddr, size);
+ return -EINVAL;
+ }
+
+ vfn_2mb = vaddr >> SHIFT_2MB;
+
+ while (size) {
+ map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
+ if (!map_1gb) {
+ DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
+ return -ENOMEM;
+ }
+
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+ map_2mb = &map_1gb->map[idx_1gb];
+ map_2mb->translation_2mb = translation;
+
+ size -= VALUE_2MB;
+ vfn_2mb++;
+ }
+
+ return 0;
+}
+
+int
+spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
+{
+ return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
+}
+
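+/* Look up the translation for vaddr. If size is non-NULL, *size is
+ * clamped to the extent of the contiguous translation starting at vaddr
+ * (a single 2 MB page when no are_contiguous callback is defined).
+ */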
+inline uint64_t
+spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
+{
+ const struct map_1gb *map_1gb;
+ const struct map_2mb *map_2mb;
+ uint64_t idx_256tb;
+ uint64_t idx_1gb;
+ uint64_t vfn_2mb;
+ uint64_t cur_size;
+ uint64_t prev_translation;
+ uint64_t orig_translation;
+
+ if (spdk_unlikely(vaddr & ~MASK_256TB)) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
+ return map->default_translation;
+ }
+
+ vfn_2mb = vaddr >> SHIFT_2MB;
+ idx_256tb = MAP_256TB_IDX(vfn_2mb);
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+
+ map_1gb = map->map_256tb.map[idx_256tb];
+ if (spdk_unlikely(!map_1gb)) {
+ return map->default_translation;
+ }
+
+ cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
+ map_2mb = &map_1gb->map[idx_1gb];
+ if (size == NULL || map->ops.are_contiguous == NULL ||
+ map_2mb->translation_2mb == map->default_translation) {
+ if (size != NULL) {
+ *size = spdk_min(*size, cur_size);
+ }
+ return map_2mb->translation_2mb;
+ }
+
+ orig_translation = map_2mb->translation_2mb;
+ prev_translation = orig_translation;
+ while (cur_size < *size) {
+ vfn_2mb++;
+ idx_256tb = MAP_256TB_IDX(vfn_2mb);
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+
+ map_1gb = map->map_256tb.map[idx_256tb];
+ if (spdk_unlikely(!map_1gb)) {
+ break;
+ }
+
+ map_2mb = &map_1gb->map[idx_1gb];
+ if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
+ break;
+ }
+
+ cur_size += VALUE_2MB;
+ prev_translation = map_2mb->translation_2mb;
+ }
+
+ *size = spdk_min(*size, cur_size);
+ return orig_translation;
+}
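+
+/*
+ * Illustrative usage sketch, not part of the original source: *size is an
+ * in/out parameter - pass in how much you want, and on return it is clamped
+ * to the length that translates contiguously per the map's are_contiguous()
+ * callback.
+ *
+ *	uint64_t size = 8 * VALUE_2MB;
+ *	uint64_t translation = spdk_mem_map_translate(map, vaddr, &size);
+ *	// size is now <= 8 * VALUE_2MB: the contiguous run starting at vaddr
+ */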
+
+static void
+memory_hotplug_cb(enum rte_mem_event event_type,
+ const void *addr, size_t len, void *arg)
+{
+ if (event_type == RTE_MEM_EVENT_ALLOC) {
+ spdk_mem_register((void *)addr, len);
+
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
+ if (!spdk_env_dpdk_external_init()) {
+ return;
+ }
+#endif
+
+ /* Prior to DPDK 19.02, we have to worry about DPDK
+ * freeing memory in different units than it was allocated.
+ * That doesn't work with things like RDMA MRs. So for
+ * those versions of DPDK, mark each segment so that DPDK
+ * won't later free it. That ensures we don't have to deal
+ * with that scenario.
+ *
+ * DPDK 19.02 added the --match-allocations RTE flag to
+ * avoid this condition.
+ *
+ * Note: if the user initialized DPDK separately, we can't
+ * be sure that --match-allocations was specified, so need
+ * to still mark the segments so they aren't freed.
+ */
+ while (len > 0) {
+ struct rte_memseg *seg;
+
+ seg = rte_mem_virt2memseg(addr, NULL);
+ assert(seg != NULL);
+ seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
+ addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
+ len -= seg->hugepage_sz;
+ }
+ } else if (event_type == RTE_MEM_EVENT_FREE) {
+ spdk_mem_unregister((void *)addr, len);
+ }
+}
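+
+/*
+ * Note, not part of the original source: the RTE_MEMSEG_FLAG_DO_NOT_FREE
+ * marking above is only skipped when SPDK initialized DPDK itself, because
+ * only then can SPDK guarantee that the --match-allocations EAL flag
+ * (available since DPDK 19.02) was actually passed.
+ */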
+
+static int
+memory_iter_cb(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg)
+{
+ return spdk_mem_register(ms->addr, len);
+}
+
+int
+mem_map_init(bool legacy_mem)
+{
+ g_legacy_mem = legacy_mem;
+
+ g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
+ if (g_mem_reg_map == NULL) {
+ DEBUG_PRINT("memory registration map allocation failed\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Walk all DPDK memory segments and register them
+ * with the master memory map
+ */
+ rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
+ rte_memseg_contig_walk(memory_iter_cb, NULL);
+ return 0;
+}
+
+bool
+spdk_iommu_is_enabled(void)
+{
+#if VFIO_ENABLED
+ return g_vfio.enabled && !g_vfio.noiommu_enabled;
+#else
+ return false;
+#endif
+}
+
+struct spdk_vtophys_pci_device {
+ struct rte_pci_device *pci_device;
+ TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
+};
+
+static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
+static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
+ TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
+
+static struct spdk_mem_map *g_vtophys_map;
+static struct spdk_mem_map *g_phys_ref_map;
+
+#if VFIO_ENABLED
+static int
+vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
+{
+ struct spdk_vfio_dma_map *dma_map;
+ uint64_t refcount;
+ int ret;
+
+ refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
+ assert(refcount < UINT64_MAX);
+ if (refcount > 0) {
+ spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
+ return 0;
+ }
+
+ dma_map = calloc(1, sizeof(*dma_map));
+ if (dma_map == NULL) {
+ return -ENOMEM;
+ }
+
+ dma_map->map.argsz = sizeof(dma_map->map);
+ dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ dma_map->map.vaddr = vaddr;
+ dma_map->map.iova = iova;
+ dma_map->map.size = size;
+
+ dma_map->unmap.argsz = sizeof(dma_map->unmap);
+ dma_map->unmap.flags = 0;
+ dma_map->unmap.iova = iova;
+ dma_map->unmap.size = size;
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ if (g_vfio.device_ref == 0) {
+ /* VFIO requires at least one device (IOMMU group) to be added to
+ * a VFIO container before it is possible to perform any IOMMU
+ * operations on that container. This memory will be mapped once
+ * the first device (IOMMU group) is hotplugged.
+ *
+ * Since the vfio container is managed internally by DPDK, it is
+ * also possible that some device is already in that container, but
+	 * it's not managed by SPDK - e.g. a NIC used internally by
+	 * DPDK. We could map the memory straight away in such a
+	 * scenario, but there's no need to. DPDK devices clearly
+ * don't need our mappings and hence we defer the mapping
+ * unconditionally until the first SPDK-managed device is
+ * hotplugged.
+ */
+ goto out_insert;
+ }
+
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+ if (ret) {
+ DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ free(dma_map);
+ return ret;
+ }
+
+out_insert:
+ TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
+ return 0;
+}
+
+static int
+vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
+{
+ struct spdk_vfio_dma_map *dma_map;
+ uint64_t refcount;
+ int ret;
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+ if (dma_map->map.iova == iova) {
+ break;
+ }
+ }
+
+ if (dma_map == NULL) {
+ DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return -ENXIO;
+ }
+
+ refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
+ assert(refcount < UINT64_MAX);
+ if (refcount > 0) {
+ spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
+ }
+
+	/* If there are still outstanding references, don't clear the mapping. */
+ if (refcount > 1) {
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return 0;
+ }
+
+	/* Partial or multi-page unmap is not supported for now. */
+ assert(dma_map->map.size == size);
+
+ if (g_vfio.device_ref == 0) {
+		/* Memory is not mapped anymore, just remove its references */
+ goto out_remove;
+ }
+
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+ if (ret) {
+ DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return ret;
+ }
+
+out_remove:
+ TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ free(dma_map);
+ return 0;
+}
+#endif
+
+static uint64_t
+vtophys_get_paddr_memseg(uint64_t vaddr)
+{
+ uintptr_t paddr;
+ struct rte_memseg *seg;
+
+ seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
+ if (seg != NULL) {
+ paddr = seg->phys_addr;
+ if (paddr == RTE_BAD_IOVA) {
+ return SPDK_VTOPHYS_ERROR;
+ }
+ paddr += (vaddr - (uintptr_t)seg->addr);
+ return paddr;
+ }
+
+ return SPDK_VTOPHYS_ERROR;
+}
+
+/* Try to get the paddr from /proc/self/pagemap */
+static uint64_t
+vtophys_get_paddr_pagemap(uint64_t vaddr)
+{
+ uintptr_t paddr;
+
+ /* Silence static analyzers */
+ assert(vaddr != 0);
+ paddr = rte_mem_virt2iova((void *)vaddr);
+ if (paddr == RTE_BAD_IOVA) {
+ /*
+ * The vaddr may be valid but doesn't have a backing page
+ * assigned yet. Touch the page to ensure a backing page
+ * gets assigned, then try to translate again.
+ */
+ rte_atomic64_read((rte_atomic64_t *)vaddr);
+ paddr = rte_mem_virt2iova((void *)vaddr);
+ }
+ if (paddr == RTE_BAD_IOVA) {
+ /* Unable to get to the physical address. */
+ return SPDK_VTOPHYS_ERROR;
+ }
+
+ return paddr;
+}
+
+/* Try to get the paddr from pci devices */
+static uint64_t
+vtophys_get_paddr_pci(uint64_t vaddr)
+{
+ struct spdk_vtophys_pci_device *vtophys_dev;
+ uintptr_t paddr;
+ struct rte_pci_device *dev;
+ struct rte_mem_resource *res;
+ unsigned r;
+
+ pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+ TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+ dev = vtophys_dev->pci_device;
+
+ for (r = 0; r < PCI_MAX_RESOURCE; r++) {
+ res = &dev->mem_resource[r];
+ if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
+ vaddr < (uint64_t)res->addr + res->len) {
+ paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
+ DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
+ (void *)paddr);
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+ return paddr;
+ }
+ }
+ }
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+ return SPDK_VTOPHYS_ERROR;
+}
+
+static int
+vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
+ enum spdk_mem_map_notify_action action,
+ void *vaddr, size_t len)
+{
+ int rc = 0, pci_phys = 0;
+ uint64_t paddr;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
+ vaddr, len);
+ return -EINVAL;
+ }
+
+ /* Get the physical address from the DPDK memsegs */
+ paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
+
+ switch (action) {
+ case SPDK_MEM_MAP_NOTIFY_REGISTER:
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ /* This is not an address that DPDK is managing. */
+#if VFIO_ENABLED
+ enum rte_iova_mode iova_mode;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
+ iova_mode = rte_eal_iova_mode();
+#else
+ iova_mode = rte_eal_get_configuration()->iova_mode;
+#endif
+
+ if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
+ /* We'll use the virtual address as the iova to match DPDK. */
+ paddr = (uint64_t)vaddr;
+ rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
+ if (rc) {
+ return -EFAULT;
+ }
+ while (len > 0) {
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
+ if (rc != 0) {
+ return rc;
+ }
+ vaddr += VALUE_2MB;
+ paddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+ } else
+#endif
+ {
+ /* Get the physical address from /proc/self/pagemap. */
+ paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ /* Get the physical address from PCI devices */
+ paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
+ return -EFAULT;
+ }
+ /* The beginning of this address range points to a PCI resource,
+ * so the rest must point to a PCI resource as well.
+ */
+ pci_phys = 1;
+ }
+
+ /* Get paddr for each 2MB chunk in this address range */
+ while (len > 0) {
+ /* Get the physical address from /proc/self/pagemap. */
+ if (pci_phys) {
+ paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
+ } else {
+ paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
+ }
+
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
+ return -EFAULT;
+ }
+
+				/* PCI BAR physical addresses need not be 2 MB aligned, so skip this check for them. */
+ if (!pci_phys && (paddr & MASK_2MB)) {
+ DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
+ return -EINVAL;
+ }
+#if VFIO_ENABLED
+ /* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
+ * with the IOMMU using the physical address to match. */
+ if (spdk_iommu_is_enabled()) {
+ rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
+ if (rc) {
+ DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
+ return -EFAULT;
+ }
+ }
+#endif
+
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
+ if (rc != 0) {
+ return rc;
+ }
+
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+ }
+ } else {
+ /* This is an address managed by DPDK. Just setup the translations. */
+ while (len > 0) {
+ paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
+ return -EFAULT;
+ }
+
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
+ if (rc != 0) {
+ return rc;
+ }
+
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+ }
+
+ break;
+ case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
+#if VFIO_ENABLED
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ /*
+ * This is not an address that DPDK is managing. If vfio is enabled,
+ * we need to unmap the range from the IOMMU
+ */
+ if (spdk_iommu_is_enabled()) {
+ uint64_t buffer_len = len;
+ uint8_t *va = vaddr;
+ enum rte_iova_mode iova_mode;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
+ iova_mode = rte_eal_iova_mode();
+#else
+ iova_mode = rte_eal_get_configuration()->iova_mode;
+#endif
+ /*
+ * In virtual address mode, the region is contiguous and can be done in
+ * one unmap.
+ */
+ if (iova_mode == RTE_IOVA_VA) {
+ paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
+ if (buffer_len != len || paddr != (uintptr_t)va) {
+ DEBUG_PRINT("Unmapping %p with length %lu failed because "
+ "translation had address 0x%" PRIx64 " and length %lu\n",
+ va, len, paddr, buffer_len);
+ return -EINVAL;
+ }
+ rc = vtophys_iommu_unmap_dma(paddr, len);
+ if (rc) {
+ DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
+ return -EFAULT;
+ }
+ } else if (iova_mode == RTE_IOVA_PA) {
+ /* Get paddr for each 2MB chunk in this address range */
+ while (buffer_len > 0) {
+ paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
+
+ if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
+ DEBUG_PRINT("could not get phys addr for %p\n", va);
+ return -EFAULT;
+ }
+
+ rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
+ if (rc) {
+ DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
+ return -EFAULT;
+ }
+
+ va += VALUE_2MB;
+ buffer_len -= VALUE_2MB;
+ }
+ }
+ }
+ }
+#endif
+ while (len > 0) {
+ rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
+ if (rc != 0) {
+ return rc;
+ }
+
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ }
+
+ return rc;
+}
+
+static int
+vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
+{
+ /* This function is always called with paddrs for two subsequent
+ * 2MB chunks in virtual address space, so those chunks will be only
+ * physically contiguous if the physical addresses are 2MB apart
+ * from each other as well.
+ */
+ return (paddr2 - paddr1 == VALUE_2MB);
+}
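+
+/*
+ * Worked example, not part of the original source: chunks at vaddr and
+ * vaddr + VALUE_2MB translating to paddr1 and paddr2 merge into one run only
+ * when paddr2 - paddr1 == VALUE_2MB. So 0x200000/0x400000 merge, while
+ * 0x200000/0x800000 (same order, but with a hole) do not.
+ */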
+
+#if VFIO_ENABLED
+
+static bool
+vfio_enabled(void)
+{
+ return rte_vfio_is_enabled("vfio_pci");
+}
+
+/* Check if IOMMU is enabled on the system */
+static bool
+has_iommu_groups(void)
+{
+ struct dirent *d;
+ int count = 0;
+ DIR *dir = opendir("/sys/kernel/iommu_groups");
+
+ if (dir == NULL) {
+ return false;
+ }
+
+ while (count < 3 && (d = readdir(dir)) != NULL) {
+ count++;
+ }
+
+ closedir(dir);
+ /* there will always be ./ and ../ entries */
+ return count > 2;
+}
+
+static bool
+vfio_noiommu_enabled(void)
+{
+ return rte_vfio_noiommu_is_enabled();
+}
+
+static void
+vtophys_iommu_init(void)
+{
+ char proc_fd_path[PATH_MAX + 1];
+ char link_path[PATH_MAX + 1];
+ const char vfio_path[] = "/dev/vfio/vfio";
+ DIR *dir;
+ struct dirent *d;
+
+ if (!vfio_enabled()) {
+ return;
+ }
+
+ if (vfio_noiommu_enabled()) {
+ g_vfio.noiommu_enabled = true;
+ } else if (!has_iommu_groups()) {
+ return;
+ }
+
+ dir = opendir("/proc/self/fd");
+ if (!dir) {
+ DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
+ return;
+ }
+
+ while ((d = readdir(dir)) != NULL) {
+ if (d->d_type != DT_LNK) {
+ continue;
+ }
+
+ snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
+ if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
+ continue;
+ }
+
+ if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
+ sscanf(d->d_name, "%d", &g_vfio.fd);
+ break;
+ }
+ }
+
+ closedir(dir);
+
+ if (g_vfio.fd < 0) {
+ DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
+ return;
+ }
+
+ g_vfio.enabled = true;
+
+ return;
+}
+#endif
+
+void
+vtophys_pci_device_added(struct rte_pci_device *pci_device)
+{
+ struct spdk_vtophys_pci_device *vtophys_dev;
+
+ pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+
+ vtophys_dev = calloc(1, sizeof(*vtophys_dev));
+ if (vtophys_dev) {
+ vtophys_dev->pci_device = pci_device;
+ TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
+ } else {
+ DEBUG_PRINT("Memory allocation error\n");
+ }
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if VFIO_ENABLED
+ struct spdk_vfio_dma_map *dma_map;
+ int ret;
+
+ if (!g_vfio.enabled) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ g_vfio.device_ref++;
+ if (g_vfio.device_ref > 1) {
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return;
+ }
+
+	/* This is the first SPDK device using DPDK vfio. This means that the first
+	 * IOMMU group might have just been added to the DPDK vfio container.
+	 * From this point on, it is certain that the memory can be mapped.
+ */
+ TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+ if (ret) {
+ DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+void
+vtophys_pci_device_removed(struct rte_pci_device *pci_device)
+{
+ struct spdk_vtophys_pci_device *vtophys_dev;
+
+ pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+ TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+ if (vtophys_dev->pci_device == pci_device) {
+ TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
+ free(vtophys_dev);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if VFIO_ENABLED
+ struct spdk_vfio_dma_map *dma_map;
+ int ret;
+
+ if (!g_vfio.enabled) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ assert(g_vfio.device_ref > 0);
+ g_vfio.device_ref--;
+ if (g_vfio.device_ref > 0) {
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return;
+ }
+
+ /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
+	 * any additional devices using its vfio container, all the mappings
+ * will be automatically removed by the Linux vfio driver. We unmap
+ * the memory manually to be able to easily re-map it later regardless
+ * of other, external factors.
+ */
+ TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+ if (ret) {
+ DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+int
+vtophys_init(void)
+{
+ const struct spdk_mem_map_ops vtophys_map_ops = {
+ .notify_cb = vtophys_notify,
+ .are_contiguous = vtophys_check_contiguous_entries,
+ };
+
+ const struct spdk_mem_map_ops phys_ref_map_ops = {
+ .notify_cb = NULL,
+ .are_contiguous = NULL,
+ };
+
+#if VFIO_ENABLED
+ vtophys_iommu_init();
+#endif
+
+ g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
+ if (g_phys_ref_map == NULL) {
+ DEBUG_PRINT("phys_ref map allocation failed.\n");
+ return -ENOMEM;
+ }
+
+ g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
+ if (g_vtophys_map == NULL) {
+ DEBUG_PRINT("vtophys map allocation failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+uint64_t
+spdk_vtophys(void *buf, uint64_t *size)
+{
+ uint64_t vaddr, paddr_2mb;
+
+ vaddr = (uint64_t)buf;
+ paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
+
+	/*
+	 * SPDK_VTOPHYS_ERROR has all bits set. Bitwise-ORing the buffer offset
+	 * into the translation used to leave an error value unchanged, but PCI
+	 * translations may be unaligned, so the offset is added instead and the
+	 * return value must therefore be checked before the addition.
+	 */
+ SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
+ if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
+ return SPDK_VTOPHYS_ERROR;
+ } else {
+ return paddr_2mb + (vaddr & MASK_2MB);
+ }
+}
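+
+/*
+ * Illustrative usage sketch, not part of the original source: translating a
+ * DMA-capable buffer. spdk_dma_zmalloc() returns memory that is already
+ * registered with the vtophys map.
+ *
+ *	uint64_t len = 4096;
+ *	void *buf = spdk_dma_zmalloc(4096, 0x1000, NULL);
+ *	uint64_t paddr = spdk_vtophys(buf, &len);
+ *
+ *	if (paddr == SPDK_VTOPHYS_ERROR) {
+ *		// buf does not point to registered, DMA-safe memory
+ *	}
+ *	// len now holds the physically contiguous length starting at buf
+ */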
diff --git a/src/spdk/lib/env_dpdk/pci.c b/src/spdk/lib/env_dpdk/pci.c
new file mode 100644
index 000000000..5fd1b4abd
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci.c
@@ -0,0 +1,1063 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include <rte_alarm.h>
+#include <rte_devargs.h>
+#include "spdk/env.h"
+
+#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers"
+
+#define PCI_CFG_SIZE 256
+#define PCI_EXT_CAP_ID_SN 0x03
+
+/* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
+ * might cause the internal IPC to misbehave. Just retry in such case.
+ */
+#define DPDK_HOTPLUG_RETRY_COUNT 4
+
+/* DPDK alarm/interrupt thread */
+static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
+static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
+/* devices hotplugged on a dpdk thread */
+static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
+ TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
+static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
+
+static int
+map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
+ void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
+{
+ struct rte_pci_device *dev = device->dev_handle;
+
+ *mapped_addr = dev->mem_resource[bar].addr;
+ *phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
+ *size = (uint64_t)dev->mem_resource[bar].len;
+
+ return 0;
+}
+
+static int
+unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
+{
+ return 0;
+}
+
+static int
+cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ int rc;
+
+ rc = rte_pci_read_config(dev->dev_handle, value, len, offset);
+
+ return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
+}
+
+static int
+cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ int rc;
+
+ rc = rte_pci_write_config(dev->dev_handle, value, len, offset);
+
+#ifdef __FreeBSD__
+ /* DPDK returns 0 on success and -1 on failure */
+ return rc;
+#endif
+ return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
+}
+
+static void
+remove_rte_dev(struct rte_pci_device *rte_dev)
+{
+ char bdf[32];
+ int i = 0, rc;
+
+ snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name);
+ do {
+ rc = rte_eal_hotplug_remove("pci", bdf);
+ } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
+}
+
+static void
+detach_rte_cb(void *_dev)
+{
+ remove_rte_dev(_dev);
+}
+
+static void
+detach_rte(struct spdk_pci_device *dev)
+{
+ struct rte_pci_device *rte_dev = dev->dev_handle;
+ int i;
+ bool removed;
+
+ if (!spdk_process_is_primary()) {
+ remove_rte_dev(rte_dev);
+ return;
+ }
+
+ pthread_mutex_lock(&g_pci_mutex);
+ dev->internal.attached = false;
+ /* prevent the hotremove notification from removing this device */
+ dev->internal.pending_removal = true;
+ pthread_mutex_unlock(&g_pci_mutex);
+
+ rte_eal_alarm_set(1, detach_rte_cb, rte_dev);
+
+ /* wait up to 2s for the cb to execute */
+	for (i = 2000; i > 0; i--) {
+ spdk_delay_us(1000);
+ pthread_mutex_lock(&g_pci_mutex);
+ removed = dev->internal.removed;
+ pthread_mutex_unlock(&g_pci_mutex);
+
+ if (removed) {
+ break;
+ }
+ }
+
+ /* besides checking the removed flag, we also need to wait
+ * for the dpdk detach function to unwind, as it's doing some
+ * operations even after calling our detach callback. Simply
+ * cancel the alarm - if it started executing already, this
+ * call will block and wait for it to finish.
+ */
+ rte_eal_alarm_cancel(detach_rte_cb, rte_dev);
+
+ /* the device could have been finally removed, so just check
+ * it again.
+ */
+ pthread_mutex_lock(&g_pci_mutex);
+ removed = dev->internal.removed;
+ pthread_mutex_unlock(&g_pci_mutex);
+ if (!removed) {
+ fprintf(stderr, "Timeout waiting for DPDK to remove PCI device %s.\n",
+ rte_dev->name);
+		/* If we reach this state, the device could not be removed and a
+		 * subsequent hot add of a device with the same BDF will most likely fail. */
+ }
+}
+
+void
+spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
+{
+ struct spdk_pci_driver *driver;
+
+ driver = calloc(1, sizeof(*driver));
+ if (!driver) {
+		/* we can't do any better than bailing out at the moment */
+ return;
+ }
+
+ driver->name = name;
+ driver->id_table = id_table;
+ driver->drv_flags = flags;
+ TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
+}
+
+struct spdk_pci_driver *
+spdk_pci_nvme_get_driver(void)
+{
+ return spdk_pci_get_driver("nvme");
+}
+
+struct spdk_pci_driver *
+spdk_pci_get_driver(const char *name)
+{
+ struct spdk_pci_driver *driver;
+
+ TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
+ if (strcmp(driver->name, name) == 0) {
+ return driver;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+pci_device_rte_hotremove(const char *device_name,
+ enum rte_dev_event_type event,
+ void *cb_arg)
+{
+ struct spdk_pci_device *dev;
+ bool can_detach = false;
+
+ if (event != RTE_DEV_EVENT_REMOVE) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_pci_mutex);
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ struct rte_pci_device *rte_dev = dev->dev_handle;
+
+ if (strcmp(rte_dev->name, device_name) == 0 &&
+ !dev->internal.pending_removal) {
+ can_detach = !dev->internal.attached;
+ /* prevent any further attaches */
+ dev->internal.pending_removal = true;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_pci_mutex);
+
+ if (dev != NULL && can_detach) {
+ /* if device is not attached we can remove it right away.
+ * Otherwise it will be removed at detach.
+ */
+ remove_rte_dev(dev->dev_handle);
+ }
+}
+
+static void
+cleanup_pci_devices(void)
+{
+ struct spdk_pci_device *dev, *tmp;
+
+ pthread_mutex_lock(&g_pci_mutex);
+ /* cleanup removed devices */
+ TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
+ if (!dev->internal.removed) {
+ continue;
+ }
+
+ vtophys_pci_device_removed(dev->dev_handle);
+ TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
+ free(dev);
+ }
+
+ /* add newly-attached devices */
+ TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
+ TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
+ TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
+ vtophys_pci_device_added(dev->dev_handle);
+ }
+ pthread_mutex_unlock(&g_pci_mutex);
+}
+
+static int scan_pci_bus(bool delay_init);
+
+/* translate spdk_pci_driver to an rte_pci_driver and register it with DPDK */
+static int
+register_rte_driver(struct spdk_pci_driver *driver)
+{
+ unsigned pci_id_count = 0;
+ struct rte_pci_id *rte_id_table;
+ char *rte_name;
+ size_t rte_name_len;
+ uint32_t rte_flags;
+
+ assert(driver->id_table);
+ while (driver->id_table[pci_id_count].vendor_id) {
+ pci_id_count++;
+ }
+ assert(pci_id_count > 0);
+
+ rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table));
+ if (!rte_id_table) {
+ return -ENOMEM;
+ }
+
+ while (pci_id_count > 0) {
+ struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1];
+ const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1];
+
+ rte_id->class_id = spdk_id->class_id;
+ rte_id->vendor_id = spdk_id->vendor_id;
+ rte_id->device_id = spdk_id->device_id;
+ rte_id->subsystem_vendor_id = spdk_id->subvendor_id;
+ rte_id->subsystem_device_id = spdk_id->subdevice_id;
+ pci_id_count--;
+ }
+
+ assert(driver->name);
+ rte_name_len = strlen(driver->name) + strlen("spdk_") + 1;
+ rte_name = calloc(rte_name_len, 1);
+ if (!rte_name) {
+ free(rte_id_table);
+ return -ENOMEM;
+ }
+
+ snprintf(rte_name, rte_name_len, "spdk_%s", driver->name);
+ driver->driver.driver.name = rte_name;
+ driver->driver.id_table = rte_id_table;
+
+ rte_flags = 0;
+ if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) {
+ rte_flags |= RTE_PCI_DRV_NEED_MAPPING;
+ }
+ if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) {
+ rte_flags |= RTE_PCI_DRV_WC_ACTIVATE;
+ }
+ driver->driver.drv_flags = rte_flags;
+
+ driver->driver.probe = pci_device_init;
+ driver->driver.remove = pci_device_fini;
+
+ rte_pci_register(&driver->driver);
+ return 0;
+}
+
+static inline void
+_pci_env_init(void)
+{
+ /* We assume devices were present on the bus for more than 2 seconds
+ * before initializing SPDK and there's no need to wait more. We scan
+ * the bus, but we don't blacklist any devices.
+ */
+ scan_pci_bus(false);
+
+ /* Register a single hotremove callback for all devices. */
+ if (spdk_process_is_primary()) {
+ rte_dev_event_callback_register(NULL, pci_device_rte_hotremove, NULL);
+ }
+}
+
+void
+pci_env_init(void)
+{
+ struct spdk_pci_driver *driver;
+
+ TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
+ register_rte_driver(driver);
+ }
+
+ _pci_env_init();
+}
+
+void
+pci_env_reinit(void)
+{
+ /* There is no need to register pci drivers again, since they were
+ * already pre-registered in pci_env_init.
+ */
+
+ _pci_env_init();
+}
+
+void
+pci_env_fini(void)
+{
+ struct spdk_pci_device *dev;
+ char bdf[32];
+
+ cleanup_pci_devices();
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (dev->internal.attached) {
+ spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
+ fprintf(stderr, "Device %s is still attached at shutdown!\n", bdf);
+ }
+ }
+
+ if (spdk_process_is_primary()) {
+ rte_dev_event_callback_unregister(NULL, pci_device_rte_hotremove, NULL);
+ }
+}
+
+int
+pci_device_init(struct rte_pci_driver *_drv,
+ struct rte_pci_device *_dev)
+{
+ struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
+ struct spdk_pci_device *dev;
+ int rc;
+
+ dev = calloc(1, sizeof(*dev));
+ if (dev == NULL) {
+ return -1;
+ }
+
+ dev->dev_handle = _dev;
+
+ dev->addr.domain = _dev->addr.domain;
+ dev->addr.bus = _dev->addr.bus;
+ dev->addr.dev = _dev->addr.devid;
+ dev->addr.func = _dev->addr.function;
+ dev->id.class_id = _dev->id.class_id;
+ dev->id.vendor_id = _dev->id.vendor_id;
+ dev->id.device_id = _dev->id.device_id;
+ dev->id.subvendor_id = _dev->id.subsystem_vendor_id;
+ dev->id.subdevice_id = _dev->id.subsystem_device_id;
+ dev->socket_id = _dev->device.numa_node;
+ dev->type = "pci";
+
+ dev->map_bar = map_bar_rte;
+ dev->unmap_bar = unmap_bar_rte;
+ dev->cfg_read = cfg_read_rte;
+ dev->cfg_write = cfg_write_rte;
+
+ dev->internal.driver = driver;
+ dev->internal.claim_fd = -1;
+
+ if (driver->cb_fn != NULL) {
+ rc = driver->cb_fn(driver->cb_arg, dev);
+ if (rc != 0) {
+ free(dev);
+ return rc;
+ }
+ dev->internal.attached = true;
+ }
+
+ pthread_mutex_lock(&g_pci_mutex);
+ TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
+ pthread_mutex_unlock(&g_pci_mutex);
+ return 0;
+}
+
+int
+pci_device_fini(struct rte_pci_device *_dev)
+{
+ struct spdk_pci_device *dev;
+
+ pthread_mutex_lock(&g_pci_mutex);
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (dev->dev_handle == _dev) {
+ break;
+ }
+ }
+
+ if (dev == NULL || dev->internal.attached) {
+		/* The device might still be referenced somewhere in SPDK. */
+ pthread_mutex_unlock(&g_pci_mutex);
+ return -1;
+ }
+
+ /* remove our whitelist_at option */
+ if (_dev->device.devargs) {
+ _dev->device.devargs->data = NULL;
+ }
+
+ assert(!dev->internal.removed);
+ dev->internal.removed = true;
+ pthread_mutex_unlock(&g_pci_mutex);
+	return 0;
+}
+
+void
+spdk_pci_device_detach(struct spdk_pci_device *dev)
+{
+ assert(dev->internal.attached);
+
+ if (dev->internal.claim_fd >= 0) {
+ spdk_pci_device_unclaim(dev);
+ }
+
+ if (strcmp(dev->type, "pci") == 0) {
+		/* if it's a physical device, we need to coordinate with DPDK,
+		 * possibly across processes, and we can't just unset one flag
+		 * here. We also want to stop using any device resources
+ * so that the device isn't "in use" by the userspace driver
+ * once we detach it. This would allow attaching the device
+ * to a different process, or to a kernel driver like nvme.
+ */
+ detach_rte(dev);
+ } else {
+ dev->internal.attached = false;
+ }
+
+ cleanup_pci_devices();
+}
+
+static int
+scan_pci_bus(bool delay_init)
+{
+ struct spdk_pci_driver *driver;
+ struct rte_pci_device *rte_dev;
+ uint64_t now;
+
+ rte_bus_scan();
+ now = spdk_get_ticks();
+
+ driver = TAILQ_FIRST(&g_pci_drivers);
+ if (!driver) {
+ return 0;
+ }
+
+ TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
+ struct rte_devargs *da;
+
+ da = rte_dev->device.devargs;
+ if (!da) {
+ char devargs_str[128];
+
+ /* the device was never blacklisted or whitelisted */
+ da = calloc(1, sizeof(*da));
+ if (!da) {
+ return -1;
+ }
+
+ snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
+ if (rte_devargs_parse(da, devargs_str) != 0) {
+ free(da);
+ return -1;
+ }
+
+ rte_devargs_insert(&da);
+ rte_dev->device.devargs = da;
+ }
+
+ if (da->data) {
+ uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data;
+
+ /* this device was seen by spdk before... */
+ if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) {
+ da->policy = RTE_DEV_WHITELISTED;
+ }
+ } else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST &&
+ da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) {
+ /* override the policy only if not permanently blacklisted */
+
+ if (delay_init) {
+ da->policy = RTE_DEV_BLACKLISTED;
+				da->data = (void *)(uintptr_t)(now + 2 * spdk_get_ticks_hz());
+ } else {
+ da->policy = RTE_DEV_WHITELISTED;
+ da->data = (void *)(uintptr_t)now;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_pci_device_attach(struct spdk_pci_driver *driver,
+ spdk_pci_enum_cb enum_cb,
+ void *enum_ctx, struct spdk_pci_addr *pci_address)
+{
+ struct spdk_pci_device *dev;
+ struct rte_pci_device *rte_dev;
+ struct rte_devargs *da;
+ int rc;
+ char bdf[32];
+
+ spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address);
+
+ cleanup_pci_devices();
+
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
+ break;
+ }
+ }
+
+ if (dev != NULL && dev->internal.driver == driver) {
+ pthread_mutex_lock(&g_pci_mutex);
+ if (dev->internal.attached || dev->internal.pending_removal) {
+ pthread_mutex_unlock(&g_pci_mutex);
+ return -1;
+ }
+
+ rc = enum_cb(enum_ctx, dev);
+ if (rc == 0) {
+ dev->internal.attached = true;
+ }
+ pthread_mutex_unlock(&g_pci_mutex);
+ return rc;
+ }
+
+ driver->cb_fn = enum_cb;
+ driver->cb_arg = enum_ctx;
+
+ int i = 0;
+
+ do {
+ rc = rte_eal_hotplug_add("pci", bdf, "");
+ } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
+
+ if (i > 1 && rc == -EEXIST) {
+ /* Even though the previous request timed out, the device
+ * was attached successfully.
+ */
+ rc = 0;
+ }
+
+ driver->cb_arg = NULL;
+ driver->cb_fn = NULL;
+
+ cleanup_pci_devices();
+
+ if (rc != 0) {
+ return -1;
+ }
+
+ /* explicit attach ignores the whitelist, so if we blacklisted this
+	 * device before, let's enable it now - just for clarity.
+ */
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
+ break;
+ }
+ }
+ assert(dev != NULL);
+
+ rte_dev = dev->dev_handle;
+ da = rte_dev->device.devargs;
+ if (da && da->data) {
+ da->data = (void *)(uintptr_t)spdk_get_ticks();
+ da->policy = RTE_DEV_WHITELISTED;
+ }
+
+ return 0;
+}
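+
+/*
+ * Illustrative usage sketch, not part of the original source: attaching one
+ * NVMe device by BDF. The callback and context names are hypothetical.
+ *
+ *	struct spdk_pci_addr addr;
+ *
+ *	if (spdk_pci_addr_parse(&addr, "0000:5e:00.0") == 0) {
+ *		rc = spdk_pci_device_attach(spdk_pci_nvme_get_driver(),
+ *					    my_enum_cb, my_ctx, &addr);
+ *	}
+ */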
+
+/* Note: spdk_pci_enumerate can safely be called from more than one thread
+ * simultaneously, but it cannot be called at the same time as
+ * rte_eal_pci_probe.
+ */
+int
+spdk_pci_enumerate(struct spdk_pci_driver *driver,
+ spdk_pci_enum_cb enum_cb,
+ void *enum_ctx)
+{
+ struct spdk_pci_device *dev;
+ int rc;
+
+ cleanup_pci_devices();
+
+ pthread_mutex_lock(&g_pci_mutex);
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (dev->internal.attached ||
+ dev->internal.driver != driver ||
+ dev->internal.pending_removal) {
+ continue;
+ }
+
+ rc = enum_cb(enum_ctx, dev);
+ if (rc == 0) {
+ dev->internal.attached = true;
+ } else if (rc < 0) {
+ pthread_mutex_unlock(&g_pci_mutex);
+ return -1;
+ }
+ }
+ pthread_mutex_unlock(&g_pci_mutex);
+
+ if (scan_pci_bus(true) != 0) {
+ return -1;
+ }
+
+ driver->cb_fn = enum_cb;
+ driver->cb_arg = enum_ctx;
+
+ if (rte_bus_probe() != 0) {
+ driver->cb_arg = NULL;
+ driver->cb_fn = NULL;
+ return -1;
+ }
+
+ driver->cb_arg = NULL;
+ driver->cb_fn = NULL;
+
+ cleanup_pci_devices();
+ return 0;
+}
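+
+/*
+ * Illustrative callback sketch, not part of the original source. As the code
+ * above shows, an enum_cb returns 0 to attach the device, a positive value
+ * to skip it and keep enumerating, or a negative value to abort. Names are
+ * hypothetical.
+ *
+ *	static int
+ *	my_enum_cb(void *ctx, struct spdk_pci_device *dev)
+ *	{
+ *		if (spdk_pci_device_get_vendor_id(dev) != SPDK_PCI_VID_INTEL) {
+ *			return 1;	// skip, keep enumerating
+ *		}
+ *		return 0;		// attach this device
+ *	}
+ */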
+
+struct spdk_pci_device *
+spdk_pci_get_first_device(void)
+{
+ return TAILQ_FIRST(&g_pci_devices);
+}
+
+struct spdk_pci_device *
+spdk_pci_get_next_device(struct spdk_pci_device *prev)
+{
+ return TAILQ_NEXT(prev, internal.tailq);
+}
+
+int
+spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
+ void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
+{
+ return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
+}
+
+int
+spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
+{
+ return dev->unmap_bar(dev, bar, addr);
+}
+
+uint32_t
+spdk_pci_device_get_domain(struct spdk_pci_device *dev)
+{
+ return dev->addr.domain;
+}
+
+uint8_t
+spdk_pci_device_get_bus(struct spdk_pci_device *dev)
+{
+ return dev->addr.bus;
+}
+
+uint8_t
+spdk_pci_device_get_dev(struct spdk_pci_device *dev)
+{
+ return dev->addr.dev;
+}
+
+uint8_t
+spdk_pci_device_get_func(struct spdk_pci_device *dev)
+{
+ return dev->addr.func;
+}
+
+uint16_t
+spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
+{
+ return dev->id.vendor_id;
+}
+
+uint16_t
+spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
+{
+ return dev->id.device_id;
+}
+
+uint16_t
+spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
+{
+ return dev->id.subvendor_id;
+}
+
+uint16_t
+spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
+{
+ return dev->id.subdevice_id;
+}
+
+struct spdk_pci_id
+spdk_pci_device_get_id(struct spdk_pci_device *dev)
+{
+ return dev->id;
+}
+
+int
+spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
+{
+ return dev->socket_id;
+}
+
+int
+spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ return dev->cfg_read(dev, value, len, offset);
+}
+
+int
+spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ return dev->cfg_write(dev, value, len, offset);
+}
+
+int
+spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_read(dev, value, 1, offset);
+}
+
+int
+spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_write(dev, &value, 1, offset);
+}
+
+int
+spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_read(dev, value, 2, offset);
+}
+
+int
+spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_write(dev, &value, 2, offset);
+}
+
+int
+spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_read(dev, value, 4, offset);
+}
+
+int
+spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_write(dev, &value, 4, offset);
+}
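+
+/*
+ * Illustrative usage sketch, not part of the original source: reading the
+ * vendor and device IDs straight from config space with the width-specific
+ * wrappers above (offsets 0x0 and 0x2 of the standard PCI header).
+ *
+ *	uint16_t vid, did;
+ *
+ *	spdk_pci_device_cfg_read16(dev, &vid, 0x0);
+ *	spdk_pci_device_cfg_read16(dev, &did, 0x2);
+ */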
+
+int
+spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
+{
+ int err;
+ uint32_t pos, header = 0;
+ uint32_t i, buf[2];
+
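+	/* The serial number is 16 hex characters plus the terminating NUL. */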
+ if (len < 17) {
+ return -1;
+ }
+
+ err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
+ if (err || !header) {
+ return -1;
+ }
+
+ pos = PCI_CFG_SIZE;
+ while (1) {
+ if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
+ if (pos) {
+ /* skip the header */
+ pos += 4;
+ for (i = 0; i < 2; i++) {
+ err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
+ if (err) {
+ return -1;
+ }
+ }
+ snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
+ return 0;
+ }
+ }
+ pos = (header >> 20) & 0xffc;
+ /* 0 if no other items exist */
+ if (pos < PCI_CFG_SIZE) {
+ return -1;
+ }
+ err = spdk_pci_device_cfg_read32(dev, &header, pos);
+ if (err) {
+ return -1;
+ }
+ }
+ return -1;
+}
+
+struct spdk_pci_addr
+spdk_pci_device_get_addr(struct spdk_pci_device *dev)
+{
+ return dev->addr;
+}
+
+bool
+spdk_pci_device_is_removed(struct spdk_pci_device *dev)
+{
+ return dev->internal.pending_removal;
+}
+
+int
+spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
+{
+ if (a1->domain > a2->domain) {
+ return 1;
+ } else if (a1->domain < a2->domain) {
+ return -1;
+ } else if (a1->bus > a2->bus) {
+ return 1;
+ } else if (a1->bus < a2->bus) {
+ return -1;
+ } else if (a1->dev > a2->dev) {
+ return 1;
+ } else if (a1->dev < a2->dev) {
+ return -1;
+ } else if (a1->func > a2->func) {
+ return 1;
+ } else if (a1->func < a2->func) {
+ return -1;
+ }
+
+ return 0;
+}
+
+#ifdef __linux__
+int
+spdk_pci_device_claim(struct spdk_pci_device *dev)
+{
+ int dev_fd;
+ char dev_name[64];
+ int pid;
+ void *dev_map;
+ struct flock pcidev_lock = {
+ .l_type = F_WRLCK,
+ .l_whence = SEEK_SET,
+ .l_start = 0,
+ .l_len = 0,
+ };
+
+ snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
+ dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
+
+ dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+ if (dev_fd == -1) {
+ fprintf(stderr, "could not open %s\n", dev_name);
+ return -errno;
+ }
+
+ if (ftruncate(dev_fd, sizeof(int)) != 0) {
+ fprintf(stderr, "could not truncate %s\n", dev_name);
+ close(dev_fd);
+ return -errno;
+ }
+
+ dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
+ MAP_SHARED, dev_fd, 0);
+ if (dev_map == MAP_FAILED) {
+ fprintf(stderr, "could not mmap dev %s (%d)\n", dev_name, errno);
+ close(dev_fd);
+ return -errno;
+ }
+
+ if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
+ pid = *(int *)dev_map;
+ fprintf(stderr, "Cannot create lock on device %s, probably"
+ " process %d has claimed it\n", dev_name, pid);
+ munmap(dev_map, sizeof(int));
+ close(dev_fd);
+ /* F_SETLK returns unspecified errnos, normalize them */
+ return -EACCES;
+ }
+
+ *(int *)dev_map = (int)getpid();
+ munmap(dev_map, sizeof(int));
+ dev->internal.claim_fd = dev_fd;
+ /* Keep dev_fd open to maintain the lock. */
+ return 0;
+}
+
+void
+spdk_pci_device_unclaim(struct spdk_pci_device *dev)
+{
+ char dev_name[64];
+
+ snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
+ dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
+
+ close(dev->internal.claim_fd);
+ dev->internal.claim_fd = -1;
+ unlink(dev_name);
+}
+#endif /* __linux__ */
+
+#ifdef __FreeBSD__
+int
+spdk_pci_device_claim(struct spdk_pci_device *dev)
+{
+ /* TODO */
+ return 0;
+}
+
+void
+spdk_pci_device_unclaim(struct spdk_pci_device *dev)
+{
+ /* TODO */
+}
+#endif /* __FreeBSD__ */
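+
+/*
+ * Illustrative usage sketch, not part of the original source: cooperative
+ * single-owner locking across processes. On Linux the claim is backed by an
+ * fcntl() write lock on a /tmp/spdk_pci_lock_* file and fails with -EACCES
+ * if another process already holds it.
+ *
+ *	if (spdk_pci_device_claim(dev) == 0) {
+ *		// ... exclusive use of the device ...
+ *		spdk_pci_device_unclaim(dev);
+ *	}
+ */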
+
+int
+spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
+{
+ unsigned domain, bus, dev, func;
+
+ if (addr == NULL || bdf == NULL) {
+ return -EINVAL;
+ }
+
+ if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
+ (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
+ /* Matched a full address - all variables are initialized */
+ } else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
+ func = 0;
+ } else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
+ (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
+ domain = 0;
+ } else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
+ (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
+ domain = 0;
+ func = 0;
+ } else {
+ return -EINVAL;
+ }
+
+ if (bus > 0xFF || dev > 0x1F || func > 7) {
+ return -EINVAL;
+ }
+
+ addr->domain = domain;
+ addr->bus = bus;
+ addr->dev = dev;
+ addr->func = func;
+
+ return 0;
+}
+
+int
+spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
+{
+ int rc;
+
+ rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
+ addr->domain, addr->bus,
+ addr->dev, addr->func);
+
+ if (rc > 0 && (size_t)rc < sz) {
+ return 0;
+ }
+
+ return -1;
+}
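+
+/*
+ * Illustrative round trip, not part of the original source: "0000:01:02.3",
+ * "01:02.3" and "1.2.3" all parse; a missing domain or function defaults to
+ * 0, and formatting always yields the canonical zero-padded form.
+ *
+ *	struct spdk_pci_addr a;
+ *	char bdf[32];
+ *
+ *	spdk_pci_addr_parse(&a, "1:2.3");
+ *	spdk_pci_addr_fmt(bdf, sizeof(bdf), &a);	// "0000:01:02.3"
+ */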
+
+void
+spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
+{
+ assert(dev->map_bar != NULL);
+ assert(dev->unmap_bar != NULL);
+ assert(dev->cfg_read != NULL);
+ assert(dev->cfg_write != NULL);
+ dev->internal.driver = drv;
+ TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
+}
+
+void
+spdk_pci_unhook_device(struct spdk_pci_device *dev)
+{
+ assert(!dev->internal.attached);
+ TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
+}
+
+const char *
+spdk_pci_device_get_type(const struct spdk_pci_device *dev)
+{
+ return dev->type;
+}
diff --git a/src/spdk/lib/env_dpdk/pci_idxd.c b/src/spdk/lib/env_dpdk/pci_idxd.c
new file mode 100644
index 000000000..eddbfa4af
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_idxd.c
@@ -0,0 +1,50 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+#define SPDK_IDXD_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID)
+static struct spdk_pci_id idxd_driver_id[] = {
+ {SPDK_IDXD_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IDXD)},
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+struct spdk_pci_driver *
+spdk_pci_idxd_get_driver(void)
+{
+ return spdk_pci_get_driver("idxd");
+}
+
+SPDK_PCI_DRIVER_REGISTER("idxd", idxd_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING);
diff --git a/src/spdk/lib/env_dpdk/pci_ioat.c b/src/spdk/lib/env_dpdk/pci_ioat.c
new file mode 100644
index 000000000..28b7bdb44
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_ioat.c
@@ -0,0 +1,98 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID)
+static struct spdk_pci_id ioat_driver_id[] = {
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB9)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW9)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX9)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SKX)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_ICX)},
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+struct spdk_pci_driver *
+spdk_pci_ioat_get_driver(void)
+{
+ return spdk_pci_get_driver("ioat");
+}
+
+SPDK_PCI_DRIVER_REGISTER("ioat", ioat_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING);
diff --git a/src/spdk/lib/env_dpdk/pci_virtio.c b/src/spdk/lib/env_dpdk/pci_virtio.c
new file mode 100644
index 000000000..e525a4a8e
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_virtio.c
@@ -0,0 +1,53 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+static struct spdk_pci_id virtio_pci_driver_id[] = {
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) },
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) },
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_LEGACY) },
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_LEGACY) },
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+struct spdk_pci_driver *
+spdk_pci_virtio_get_driver(void)
+{
+ return spdk_pci_get_driver("virtio");
+}
+
+SPDK_PCI_DRIVER_REGISTER("virtio", virtio_pci_driver_id,
+ SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE);
diff --git a/src/spdk/lib/env_dpdk/pci_vmd.c b/src/spdk/lib/env_dpdk/pci_vmd.c
new file mode 100644
index 000000000..fb6860873
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_vmd.c
@@ -0,0 +1,50 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+static struct spdk_pci_id vmd_pci_driver_id[] = {
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, PCI_DEVICE_ID_INTEL_VMD) },
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+struct spdk_pci_driver *
+spdk_pci_vmd_get_driver(void)
+{
+ return spdk_pci_get_driver("vmd");
+}
+
+SPDK_PCI_DRIVER_REGISTER("vmd", vmd_pci_driver_id,
+ SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE);
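
Note the flag difference across the three drivers: ioat registers with SPDK_PCI_DRIVER_NEED_MAPPING alone, while virtio and vmd also pass SPDK_PCI_DRIVER_WC_ACTIVATE, asking the env layer to map BARs write-combined where the platform supports it. A hedged sketch enumerating everything that matches the vmd ID table, rather than attaching one known address; enum_cb is an illustrative name:

#include <stdio.h>

#include "spdk/env.h"

static int
enum_cb(void *ctx, struct spdk_pci_device *dev)
{
	printf("vmd device %04x:%04x on socket %d\n",
	       spdk_pci_device_get_vendor_id(dev),
	       spdk_pci_device_get_device_id(dev),
	       spdk_pci_device_get_socket_id(dev));
	return 0;
}

static void
list_vmd_devices(void)
{
	/* Walks all PCI devices matching vmd_pci_driver_id, calling enum_cb for each. */
	spdk_pci_enumerate(spdk_pci_vmd_get_driver(), enum_cb, NULL);
}
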
diff --git a/src/spdk/lib/env_dpdk/spdk_env_dpdk.map b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map
new file mode 100644
index 000000000..a465f0938
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map
@@ -0,0 +1,114 @@
+{
+ global:
+
+ # Public functions in env.h
+ spdk_malloc;
+ spdk_zmalloc;
+ spdk_realloc;
+ spdk_free;
+ spdk_env_opts_init;
+ spdk_env_init;
+ spdk_env_fini;
+ spdk_dma_malloc;
+ spdk_dma_malloc_socket;
+ spdk_dma_zmalloc;
+ spdk_dma_zmalloc_socket;
+ spdk_dma_realloc;
+ spdk_dma_free;
+ spdk_memzone_reserve;
+ spdk_memzone_reserve_aligned;
+ spdk_memzone_lookup;
+ spdk_memzone_free;
+ spdk_memzone_dump;
+ spdk_mempool_create;
+ spdk_mempool_create_ctor;
+ spdk_mempool_get_name;
+ spdk_mempool_free;
+ spdk_mempool_get;
+ spdk_mempool_get_bulk;
+ spdk_mempool_put;
+ spdk_mempool_put_bulk;
+ spdk_mempool_count;
+ spdk_mempool_obj_iter;
+ spdk_mempool_lookup;
+ spdk_env_get_core_count;
+ spdk_env_get_current_core;
+ spdk_env_get_first_core;
+ spdk_env_get_last_core;
+ spdk_env_get_next_core;
+ spdk_env_get_socket_id;
+ spdk_env_thread_launch_pinned;
+ spdk_env_thread_wait_all;
+ spdk_process_is_primary;
+ spdk_get_ticks;
+ spdk_get_ticks_hz;
+ spdk_delay_us;
+ spdk_pause;
+ spdk_ring_create;
+ spdk_ring_free;
+ spdk_ring_count;
+ spdk_ring_enqueue;
+ spdk_ring_dequeue;
+ spdk_iommu_is_enabled;
+ spdk_vtophys;
+ spdk_pci_get_driver;
+ spdk_pci_driver_register;
+ spdk_pci_nvme_get_driver;
+ spdk_pci_vmd_get_driver;
+ spdk_pci_idxd_get_driver;
+ spdk_pci_ioat_get_driver;
+ spdk_pci_virtio_get_driver;
+ spdk_pci_enumerate;
+ spdk_pci_get_first_device;
+ spdk_pci_get_next_device;
+ spdk_pci_device_map_bar;
+ spdk_pci_device_unmap_bar;
+ spdk_pci_device_get_domain;
+ spdk_pci_device_get_bus;
+ spdk_pci_device_get_dev;
+ spdk_pci_device_get_func;
+ spdk_pci_device_get_addr;
+ spdk_pci_device_get_vendor_id;
+ spdk_pci_device_get_device_id;
+ spdk_pci_device_get_subvendor_id;
+ spdk_pci_device_get_subdevice_id;
+ spdk_pci_device_get_id;
+ spdk_pci_device_get_socket_id;
+ spdk_pci_device_get_serial_number;
+ spdk_pci_device_claim;
+ spdk_pci_device_unclaim;
+ spdk_pci_device_detach;
+ spdk_pci_device_attach;
+ spdk_pci_device_cfg_read;
+ spdk_pci_device_cfg_write;
+ spdk_pci_device_cfg_read8;
+ spdk_pci_device_cfg_write8;
+ spdk_pci_device_cfg_read16;
+ spdk_pci_device_cfg_write16;
+ spdk_pci_device_cfg_read32;
+ spdk_pci_device_cfg_write32;
+ spdk_pci_device_is_removed;
+ spdk_pci_addr_compare;
+ spdk_pci_addr_parse;
+ spdk_pci_addr_fmt;
+ spdk_pci_hook_device;
+ spdk_pci_unhook_device;
+ spdk_pci_device_get_type;
+ spdk_unaffinitize_thread;
+ spdk_call_unaffinitized;
+ spdk_mem_map_alloc;
+ spdk_mem_map_free;
+ spdk_mem_map_set_translation;
+ spdk_mem_map_clear_translation;
+ spdk_mem_map_translate;
+ spdk_mem_register;
+ spdk_mem_unregister;
+
+ # Public functions in env_dpdk.h
+ spdk_env_dpdk_post_init;
+ spdk_env_dpdk_post_fini;
+ spdk_env_dpdk_external_init;
+ spdk_env_dpdk_dump_mem_stats;
+
+ local: *;
+};
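
The map file is a linker version script: names under global: stay in libspdk_env_dpdk.so's dynamic symbol table, and local: *; hides everything else, so the env_internal.h helpers never become part of the ABI. A speculative way to observe the effect at runtime, assuming the shared build of the library and its DPDK dependencies are loadable; the second symbol name below is hypothetical, standing in for any internal helper:

#include <dlfcn.h>
#include <stdio.h>

int
main(void)
{
	void *h = dlopen("libspdk_env_dpdk.so", RTLD_NOW | RTLD_GLOBAL);

	if (h == NULL) {
		fprintf(stderr, "dlopen: %s\n", dlerror());
		return 1;
	}

	/* Listed under global: above, so this resolves to a real address. */
	printf("spdk_malloc -> %p\n", dlsym(h, "spdk_malloc"));
	/* Swept up by local: *; dlsym() returns NULL (hypothetical internal name). */
	printf("internal    -> %p\n", dlsym(h, "some_internal_helper"));

	dlclose(h);
	return 0;
}
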
diff --git a/src/spdk/lib/env_dpdk/threads.c b/src/spdk/lib/env_dpdk/threads.c
new file mode 100644
index 000000000..01c7b8d9f
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/threads.c
@@ -0,0 +1,108 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include <rte_config.h>
+#include <rte_lcore.h>
+
+uint32_t
+spdk_env_get_core_count(void)
+{
+ return rte_lcore_count();
+}
+
+uint32_t
+spdk_env_get_current_core(void)
+{
+ return rte_lcore_id();
+}
+
+uint32_t
+spdk_env_get_first_core(void)
+{
+ return rte_get_next_lcore(-1, 0, 0);
+}
+
+uint32_t
+spdk_env_get_last_core(void)
+{
+ uint32_t i;
+ uint32_t last_core = UINT32_MAX;
+
+ SPDK_ENV_FOREACH_CORE(i) {
+ last_core = i;
+ }
+
+ assert(last_core != UINT32_MAX);
+
+ return last_core;
+}
+
+uint32_t
+spdk_env_get_next_core(uint32_t prev_core)
+{
+ unsigned lcore;
+
+ lcore = rte_get_next_lcore(prev_core, 0, 0);
+ if (lcore == RTE_MAX_LCORE) {
+ return UINT32_MAX;
+ }
+ return lcore;
+}
+
+uint32_t
+spdk_env_get_socket_id(uint32_t core)
+{
+ if (core >= RTE_MAX_LCORE) {
+ return SPDK_ENV_SOCKET_ID_ANY;
+ }
+
+ return rte_lcore_to_socket_id(core);
+}
+
+int
+spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg)
+{
+ int rc;
+
+ rc = rte_eal_remote_launch(fn, arg, core);
+
+ return rc;
+}
+
+void
+spdk_env_thread_wait_all(void)
+{
+ rte_eal_mp_wait_lcore();
+}
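
threads.c closes out the library by mapping SPDK's core API onto DPDK lcores. Two conventions are worth spelling out: spdk_env_get_first_core() passes -1 so that DPDK's internal increment wraps to 0 and the scan starts at the first enabled lcore, and spdk_env_get_next_core() translates DPDK's RTE_MAX_LCORE end marker into SPDK's UINT32_MAX. spdk_env_thread_launch_pinned() is a thin wrapper over rte_eal_remote_launch(), which runs fn(arg) on the EAL thread already pinned to the requested lcore. A minimal sketch tying the helpers together; worker() and its payload are illustrative:

#include <stdio.h>

#include "spdk/env.h"

static int
worker(void *arg)
{
	printf("hello from core %u\n", spdk_env_get_current_core());
	return 0;
}

static void
run_on_all_worker_cores(void)
{
	uint32_t i;

	/* SPDK_ENV_FOREACH_CORE() is built on the first/next helpers above. */
	SPDK_ENV_FOREACH_CORE(i) {
		if (i == spdk_env_get_current_core()) {
			continue; /* skip the calling lcore; remote launch targets other lcores */
		}
		spdk_env_thread_launch_pinned(i, worker, NULL);
	}

	/* Blocks until every launched worker has returned. */
	spdk_env_thread_wait_all();
}
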