Diffstat (limited to 'src/spdk/lib/env_dpdk')
-rw-r--r--   src/spdk/lib/env_dpdk/Makefile          42
-rw-r--r--   src/spdk/lib/env_dpdk/env.c            419
-rw-r--r--   src/spdk/lib/env_dpdk/env.mk           112
-rw-r--r--   src/spdk/lib/env_dpdk/env_internal.h   104
-rw-r--r--   src/spdk/lib/env_dpdk/init.c           401
-rw-r--r--   src/spdk/lib/env_dpdk/memory.c         712
-rw-r--r--   src/spdk/lib/env_dpdk/pci.c            551
-rw-r--r--   src/spdk/lib/env_dpdk/pci_ioat.c       123
-rw-r--r--   src/spdk/lib/env_dpdk/pci_nvme.c        89
-rw-r--r--   src/spdk/lib/env_dpdk/pci_virtio.c      80
-rw-r--r--   src/spdk/lib/env_dpdk/threads.c        108
-rw-r--r--   src/spdk/lib/env_dpdk/vtophys.c        691
12 files changed, 3432 insertions, 0 deletions
diff --git a/src/spdk/lib/env_dpdk/Makefile b/src/spdk/lib/env_dpdk/Makefile
new file mode 100644
index 00000000..b7a6961f
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/Makefile
@@ -0,0 +1,42 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+CFLAGS += $(ENV_CFLAGS)
+C_SRCS = env.c memory.c pci.c vtophys.c init.c threads.c
+C_SRCS += pci_nvme.c pci_ioat.c pci_virtio.c
+LIBNAME = env_dpdk
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/env_dpdk/env.c b/src/spdk/lib/env_dpdk/env.c
new file mode 100644
index 00000000..a5238e54
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/env.c
@@ -0,0 +1,419 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+
+#include <rte_config.h>
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_mempool.h>
+#include <rte_memzone.h>
+#include <rte_version.h>
+
+static uint64_t
+virt_to_phys(void *vaddr)
+{
+ uint64_t ret;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+ ret = rte_malloc_virt2iova(vaddr);
+ if (ret != RTE_BAD_IOVA) {
+ return ret;
+ }
+#else
+ ret = rte_malloc_virt2phy(vaddr);
+ if (ret != RTE_BAD_PHYS_ADDR) {
+ return ret;
+ }
+#endif
+
+ return spdk_vtophys(vaddr);
+}
+
+void *
+spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags)
+{
+ if (flags == 0) {
+ return NULL;
+ }
+
+ void *buf = rte_malloc_socket(NULL, size, align, socket_id);
+ if (buf && phys_addr) {
+ *phys_addr = virt_to_phys(buf);
+ }
+ return buf;
+}
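A minimal caller sketch (an illustration, not part of this patch): allocate a DMA-capable buffer and retrieve its IOVA via the virt_to_phys() fallback chain above.

#include "spdk/env.h"

static int example_dma_alloc(void)
{
	uint64_t phys = 0;
	/* 4 KB buffer, 4 KB-aligned, any NUMA socket; phys receives the
	 * rte_malloc IOVA or, failing that, the spdk_vtophys() result. */
	void *buf = spdk_malloc(0x1000, 0x1000, &phys,
				SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);

	if (buf == NULL) {
		return -1;
	}
	spdk_free(buf);
	return 0;
}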
+
+void *
+spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags)
+{
+ void *buf = spdk_malloc(size, align, phys_addr, socket_id, flags);
+ if (buf) {
+ memset(buf, 0, size);
+ }
+ return buf;
+}
+
+void
+spdk_free(void *buf)
+{
+ rte_free(buf);
+}
+
+void *
+spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id)
+{
+ return spdk_malloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE));
+}
+
+void *
+spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id)
+{
+ return spdk_zmalloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE));
+}
+
+void *
+spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr)
+{
+ return spdk_dma_malloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY);
+}
+
+void *
+spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr)
+{
+ return spdk_dma_zmalloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY);
+}
+
+void *
+spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr)
+{
+ void *new_buf = rte_realloc(buf, size, align);
+ if (new_buf && phys_addr) {
+ *phys_addr = virt_to_phys(new_buf);
+ }
+ return new_buf;
+}
+
+void
+spdk_dma_free(void *buf)
+{
+ spdk_free(buf);
+}
+
+void *
+spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id,
+ unsigned flags, unsigned align)
+{
+ const struct rte_memzone *mz;
+ unsigned dpdk_flags = 0;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
+ /* Older DPDK versions do not offer this flag since their
+ * memzones are IOVA-contiguous by default.
+ */
+ if ((flags & SPDK_MEMZONE_NO_IOVA_CONTIG) == 0) {
+ dpdk_flags |= RTE_MEMZONE_IOVA_CONTIG;
+ }
+#endif
+
+ if (socket_id == SPDK_ENV_SOCKET_ID_ANY) {
+ socket_id = SOCKET_ID_ANY;
+ }
+
+ mz = rte_memzone_reserve_aligned(name, len, socket_id, dpdk_flags, align);
+
+ if (mz != NULL) {
+ memset(mz->addr, 0, len);
+ return mz->addr;
+ } else {
+ return NULL;
+ }
+}
+
+void *
+spdk_memzone_reserve(const char *name, size_t len, int socket_id, unsigned flags)
+{
+ return spdk_memzone_reserve_aligned(name, len, socket_id, flags,
+ RTE_CACHE_LINE_SIZE);
+}
+
+void *
+spdk_memzone_lookup(const char *name)
+{
+ const struct rte_memzone *mz = rte_memzone_lookup(name);
+
+ if (mz != NULL) {
+ return mz->addr;
+ } else {
+ return NULL;
+ }
+}
+
+int
+spdk_memzone_free(const char *name)
+{
+ const struct rte_memzone *mz = rte_memzone_lookup(name);
+
+ if (mz != NULL) {
+ return rte_memzone_free(mz);
+ }
+
+ return -1;
+}
+
+void
+spdk_memzone_dump(FILE *f)
+{
+ rte_memzone_dump(f);
+}
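A usage sketch for the memzone wrappers above (hypothetical caller, assuming an already-initialized SPDK environment):

#include <assert.h>
#include "spdk/env.h"

static void example_memzone(void)
{
	/* Reserve a named, zeroed, cache-line-aligned 1 MB region... */
	void *a = spdk_memzone_reserve("example_mz", 1024 * 1024,
				       SPDK_ENV_SOCKET_ID_ANY, 0);
	/* ...and find it again by name from anywhere in the process. */
	void *b = spdk_memzone_lookup("example_mz");

	assert(a != NULL && a == b);
	spdk_memzone_free("example_mz");
}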
+
+struct spdk_mempool *
+spdk_mempool_create_ctor(const char *name, size_t count,
+ size_t ele_size, size_t cache_size, int socket_id,
+ spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg)
+{
+ struct rte_mempool *mp;
+ size_t tmp;
+
+ if (socket_id == SPDK_ENV_SOCKET_ID_ANY) {
+ socket_id = SOCKET_ID_ANY;
+ }
+
+ /* No more than half of all elements can be in cache */
+ tmp = (count / 2) / rte_lcore_count();
+ if (cache_size > tmp) {
+ cache_size = tmp;
+ }
+
+ if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+ cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
+ }
+
+ mp = rte_mempool_create(name, count, ele_size, cache_size,
+ 0, NULL, NULL, (rte_mempool_obj_cb_t *)obj_init, obj_init_arg,
+ socket_id, MEMPOOL_F_NO_PHYS_CONTIG);
+
+ return (struct spdk_mempool *)mp;
+}
+
+
+struct spdk_mempool *
+spdk_mempool_create(const char *name, size_t count,
+ size_t ele_size, size_t cache_size, int socket_id)
+{
+ return spdk_mempool_create_ctor(name, count, ele_size, cache_size, socket_id,
+ NULL, NULL);
+}
+
+char *
+spdk_mempool_get_name(struct spdk_mempool *mp)
+{
+ return ((struct rte_mempool *)mp)->name;
+}
+
+void
+spdk_mempool_free(struct spdk_mempool *mp)
+{
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 7, 0, 1)
+ rte_mempool_free((struct rte_mempool *)mp);
+#endif
+}
+
+void *
+spdk_mempool_get(struct spdk_mempool *mp)
+{
+ void *ele = NULL;
+ int rc;
+
+ rc = rte_mempool_get((struct rte_mempool *)mp, &ele);
+ if (rc != 0) {
+ return NULL;
+ }
+ return ele;
+}
+
+int
+spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count)
+{
+ return rte_mempool_get_bulk((struct rte_mempool *)mp, ele_arr, count);
+}
+
+void
+spdk_mempool_put(struct spdk_mempool *mp, void *ele)
+{
+ rte_mempool_put((struct rte_mempool *)mp, ele);
+}
+
+void
+spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count)
+{
+ rte_mempool_put_bulk((struct rte_mempool *)mp, ele_arr, count);
+}
+
+size_t
+spdk_mempool_count(const struct spdk_mempool *pool)
+{
+#if RTE_VERSION < RTE_VERSION_NUM(16, 7, 0, 1)
+ return rte_mempool_count((struct rte_mempool *)pool);
+#else
+ return rte_mempool_avail_count((struct rte_mempool *)pool);
+#endif
+}
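A hypothetical mempool round trip using the wrappers above:

#include "spdk/env.h"

static void example_mempool(void)
{
	/* 1024 elements of 512 bytes with a per-core cache of up to 64
	 * entries (further capped by the count/2/lcores rule above). */
	struct spdk_mempool *mp = spdk_mempool_create("example_mp",
						      1024, 512, 64,
						      SPDK_ENV_SOCKET_ID_ANY);
	void *ele;

	if (mp == NULL) {
		return;
	}
	ele = spdk_mempool_get(mp);	/* NULL once the pool is empty */
	if (ele != NULL) {
		spdk_mempool_put(mp, ele);
	}
	spdk_mempool_free(mp);
}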
+
+bool
+spdk_process_is_primary(void)
+{
+ return (rte_eal_process_type() == RTE_PROC_PRIMARY);
+}
+
+uint64_t spdk_get_ticks(void)
+{
+ return rte_get_timer_cycles();
+}
+
+uint64_t spdk_get_ticks_hz(void)
+{
+ return rte_get_timer_hz();
+}
+
+void spdk_delay_us(unsigned int us)
+{
+ rte_delay_us(us);
+}
+
+void
+spdk_unaffinitize_thread(void)
+{
+ rte_cpuset_t new_cpuset;
+ long num_cores, i;
+
+ CPU_ZERO(&new_cpuset);
+
+ num_cores = sysconf(_SC_NPROCESSORS_CONF);
+
+ /* Create a mask containing all CPUs */
+ for (i = 0; i < num_cores; i++) {
+ CPU_SET(i, &new_cpuset);
+ }
+
+ rte_thread_set_affinity(&new_cpuset);
+}
+
+void *
+spdk_call_unaffinitized(void *cb(void *arg), void *arg)
+{
+ rte_cpuset_t orig_cpuset;
+ void *ret;
+
+ if (cb == NULL) {
+ return NULL;
+ }
+
+ rte_thread_get_affinity(&orig_cpuset);
+
+ spdk_unaffinitize_thread();
+
+ ret = cb(arg);
+
+ rte_thread_set_affinity(&orig_cpuset);
+
+ return ret;
+}
+
+struct spdk_ring *
+spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id)
+{
+ char ring_name[64];
+ static uint32_t ring_num = 0;
+ unsigned flags = 0;
+
+ switch (type) {
+ case SPDK_RING_TYPE_SP_SC:
+ flags = RING_F_SP_ENQ | RING_F_SC_DEQ;
+ break;
+ case SPDK_RING_TYPE_MP_SC:
+ flags = RING_F_SC_DEQ;
+ break;
+ case SPDK_RING_TYPE_MP_MC:
+ flags = 0;
+ break;
+ default:
+ return NULL;
+ }
+
+ snprintf(ring_name, sizeof(ring_name), "ring_%u_%d",
+ __sync_fetch_and_add(&ring_num, 1), getpid());
+
+ return (struct spdk_ring *)rte_ring_create(ring_name, count, socket_id, flags);
+}
+
+void
+spdk_ring_free(struct spdk_ring *ring)
+{
+ rte_ring_free((struct rte_ring *)ring);
+}
+
+size_t
+spdk_ring_count(struct spdk_ring *ring)
+{
+ return rte_ring_count((struct rte_ring *)ring);
+}
+
+size_t
+spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count)
+{
+ int rc;
+#if RTE_VERSION < RTE_VERSION_NUM(17, 5, 0, 0)
+ rc = rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count);
+ if (rc == 0) {
+ return count;
+ }
+
+ return 0;
+#else
+ rc = rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count, NULL);
+ return rc;
+#endif
+}
+
+size_t
+spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count)
+{
+#if RTE_VERSION < RTE_VERSION_NUM(17, 5, 0, 0)
+ return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count);
+#else
+ return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count, NULL);
+#endif
+}
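A hypothetical ring round trip using the wrappers above:

#include "spdk/env.h"

static void example_ring(void)
{
	/* Single-producer/single-consumer ring; rte_ring_create() requires
	 * a power-of-two count. */
	struct spdk_ring *ring = spdk_ring_create(SPDK_RING_TYPE_SP_SC, 64,
						  SPDK_ENV_SOCKET_ID_ANY);
	int value = 42;
	void *obj = &value, *out[1];

	if (ring == NULL) {
		return;
	}
	if (spdk_ring_enqueue(ring, &obj, 1) == 1) {
		spdk_ring_dequeue(ring, out, 1);	/* out[0] == &value */
	}
	spdk_ring_free(ring);
}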
diff --git a/src/spdk/lib/env_dpdk/env.mk b/src/spdk/lib/env_dpdk/env.mk
new file mode 100644
index 00000000..989bdd11
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/env.mk
@@ -0,0 +1,112 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+# This makefile snippet must define the following flags:
+# ENV_CFLAGS
+# ENV_CXXFLAGS
+# ENV_LIBS
+# ENV_LINKER_ARGS
+
+DPDK_DIR = $(CONFIG_DPDK_DIR)
+
+export DPDK_ABS_DIR = $(abspath $(DPDK_DIR))
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/include/rte_config.h))
+DPDK_INC_DIR := $(DPDK_ABS_DIR)/include
+else
+DPDK_INC_DIR := $(DPDK_ABS_DIR)/include/dpdk
+endif
+DPDK_INC := -I$(DPDK_INC_DIR)
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_eal.a))
+DPDK_LIB_EXT = .a
+else
+DPDK_LIB_EXT = .so
+endif
+
+DPDK_LIB_LIST = rte_eal rte_mempool rte_ring
+
+# librte_mempool_ring was added in DPDK 17.05. Link this library for the
+# ring-based mempool management API.
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_mempool_ring.*))
+DPDK_LIB_LIST += rte_mempool_ring
+endif
+
+# librte_malloc was removed after DPDK 2.1. Link this library conditionally based on its
+# existence to maintain backward compatibility.
+ifneq ($(wildcard $(DPDK_ABS_DIR)/lib/librte_malloc.*),)
+DPDK_LIB_LIST += rte_malloc
+endif
+
+# librte_pci and librte_bus_pci were added in DPDK 17.11. Link these libraries conditionally
+# based on their existence to maintain backward compatibility.
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_pci.*))
+DPDK_LIB_LIST += rte_pci
+endif
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_bus_pci.*))
+DPDK_LIB_LIST += rte_bus_pci
+endif
+
+ifeq ($(CONFIG_CRYPTO),y)
+DPDK_LIB_LIST += rte_cryptodev rte_reorder rte_bus_vdev rte_pmd_aesni_mb rte_pmd_qat rte_mbuf
+endif
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_kvargs.*))
+DPDK_LIB_LIST += rte_kvargs
+endif
+
+DPDK_LIB = $(DPDK_LIB_LIST:%=$(DPDK_ABS_DIR)/lib/lib%$(DPDK_LIB_EXT))
+ifeq ($(CONFIG_CRYPTO),y)
+DPDK_LIB += $(SPDK_ROOT_DIR)/intel-ipsec-mb/libIPSec_MB.a
+endif
+
+# SPDK memory registration requires experimental (deprecated) rte_memory API for DPDK 18.05
+ENV_CFLAGS = $(DPDK_INC) -Wno-deprecated-declarations
+ENV_CXXFLAGS = $(ENV_CFLAGS)
+ENV_DPDK_FILE = $(call spdk_lib_list_to_static_libs,env_dpdk)
+ENV_LIBS = $(ENV_DPDK_FILE) $(DPDK_LIB)
+ENV_LINKER_ARGS = $(ENV_DPDK_FILE) -Wl,--whole-archive $(DPDK_LIB) -Wl,--no-whole-archive
+
+ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_config.h))
+ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_config.h))
+ENV_LINKER_ARGS += -lnuma
+endif
+endif
+
+ifeq ($(OS),Linux)
+ENV_LINKER_ARGS += -ldl
+endif
+ifeq ($(OS),FreeBSD)
+ENV_LINKER_ARGS += -lexecinfo
+endif
diff --git a/src/spdk/lib/env_dpdk/env_internal.h b/src/spdk/lib/env_dpdk/env_internal.h
new file mode 100644
index 00000000..d95084ea
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/env_internal.h
@@ -0,0 +1,104 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ENV_INTERNAL_H
+#define SPDK_ENV_INTERNAL_H
+
+#include "spdk/stdinc.h"
+
+#define spdk_pci_device rte_pci_device
+
+#include "spdk/env.h"
+
+#include <rte_config.h>
+#include <rte_version.h>
+#include <rte_eal.h>
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 0)
+#include <rte_bus.h>
+extern struct rte_pci_bus rte_pci_bus;
+#endif
+#include <rte_pci.h>
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 1)
+#include <rte_bus_pci.h>
+#endif
+#include <rte_dev.h>
+
+/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47],
+ * which is enough to cover 256 TB.
+ */
+#define SHIFT_256TB 48 /* (1 << 48) == 256 TB */
+#define MASK_256TB ((1ULL << SHIFT_256TB) - 1)
+
+#define SHIFT_1GB 30 /* (1 << 30) == 1 GB */
+#define MASK_1GB ((1ULL << SHIFT_1GB) - 1)
+
+#define SHIFT_2MB 21 /* (1 << 21) == 2MB */
+#define MASK_2MB ((1ULL << SHIFT_2MB) - 1)
+#define VALUE_2MB (1 << SHIFT_2MB)
+
+#define SHIFT_4KB 12 /* (1 << 12) == 4KB */
+#define MASK_4KB ((1ULL << SHIFT_4KB) - 1)
+
+struct spdk_pci_enum_ctx {
+ struct rte_pci_driver driver;
+ spdk_pci_enum_cb cb_fn;
+ void *cb_arg;
+ pthread_mutex_t mtx;
+ bool is_registered;
+};
+
+int spdk_pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
+int spdk_pci_device_fini(struct rte_pci_device *device);
+
+int spdk_pci_enumerate(struct spdk_pci_enum_ctx *ctx, spdk_pci_enum_cb enum_cb, void *enum_ctx);
+int spdk_pci_device_attach(struct spdk_pci_enum_ctx *ctx, spdk_pci_enum_cb enum_cb, void *enum_ctx,
+ struct spdk_pci_addr *pci_address);
+
+int spdk_mem_map_init(void);
+int spdk_vtophys_init(void);
+
+/**
+ * Report a DMA-capable PCI device to the vtophys translation code.
+ * Increases the refcount of active DMA-capable devices managed by SPDK.
+ * This must be called after a `rte_pci_device` is created.
+ */
+void spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device);
+
+/**
+ * Report the removal of a DMA-capable PCI device to the vtophys translation code.
+ * Decreases the refcount of active DMA-capable devices managed by SPDK.
+ * This must be called before a `rte_pci_device` is destroyed.
+ */
+void spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device);
+
+#endif
diff --git a/src/spdk/lib/env_dpdk/init.c b/src/spdk/lib/env_dpdk/init.c
new file mode 100644
index 00000000..1a2fafe1
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/init.c
@@ -0,0 +1,401 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "env_internal.h"
+
+#include "spdk/version.h"
+
+#include <rte_config.h>
+#include <rte_eal.h>
+
+#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk"
+#define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1
+#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1
+#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1
+#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1
+#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1"
+
+static char **eal_cmdline;
+static int eal_cmdline_argcount;
+
+static char *
+_sprintf_alloc(const char *format, ...)
+{
+ va_list args;
+ va_list args_copy;
+ char *buf;
+ size_t bufsize;
+ int rc;
+
+ va_start(args, format);
+
+ /* Try with a small buffer first. */
+ bufsize = 32;
+
+ /* Limit maximum buffer size to something reasonable so we don't loop forever. */
+ while (bufsize <= 1024 * 1024) {
+ buf = malloc(bufsize);
+ if (buf == NULL) {
+ va_end(args);
+ return NULL;
+ }
+
+ va_copy(args_copy, args);
+ rc = vsnprintf(buf, bufsize, format, args_copy);
+ va_end(args_copy);
+
+ /*
+ * If vsnprintf() returned a count within our current buffer size, we are done.
+ * The count does not include the \0 terminator, so rc == bufsize is not OK.
+ */
+ if (rc >= 0 && (size_t)rc < bufsize) {
+ va_end(args);
+ return buf;
+ }
+
+ /*
+ * vsnprintf() should return the required space, but some libc versions do not
+ * implement this correctly, so just double the buffer size and try again.
+ *
+ * We don't need the data in buf, so rather than realloc(), use free() and malloc()
+ * again to avoid a copy.
+ */
+ free(buf);
+ bufsize *= 2;
+ }
+
+ va_end(args);
+ return NULL;
+}
+
+static void
+spdk_env_unlink_shared_files(void)
+{
+ /* Starting with DPDK 18.05, there are more files with unpredictable paths
+ * and filenames. The --no-shconf option prevents them from being created,
+ * but only on DPDK 18.08+. For DPDK 18.05 we just leave them be.
+ */
+#if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0)
+ char buffer[PATH_MAX];
+
+ snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid());
+ if (unlink(buffer)) {
+ fprintf(stderr, "Unable to unlink shared memory file: %s. Error code: %d\n", buffer, errno);
+ }
+#endif
+}
+
+void
+spdk_env_opts_init(struct spdk_env_opts *opts)
+{
+ if (!opts) {
+ return;
+ }
+
+ memset(opts, 0, sizeof(*opts));
+
+ opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
+ opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
+ opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
+ opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
+ opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE;
+ opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
+}
+
+static void
+spdk_free_args(char **args, int argcount)
+{
+ int i;
+
+ for (i = 0; i < argcount; i++) {
+ free(args[i]);
+ }
+
+ if (argcount) {
+ free(args);
+ }
+}
+
+static char **
+spdk_push_arg(char *args[], int *argcount, char *arg)
+{
+ char **tmp;
+
+ if (arg == NULL) {
+ fprintf(stderr, "%s: NULL arg supplied\n", __func__);
+ spdk_free_args(args, *argcount);
+ return NULL;
+ }
+
+ tmp = realloc(args, sizeof(char *) * (*argcount + 1));
+ if (tmp == NULL) {
+ spdk_free_args(args, *argcount);
+ return NULL;
+ }
+
+ tmp[*argcount] = arg;
+ (*argcount)++;
+
+ return tmp;
+}
+
+static void
+spdk_destruct_eal_cmdline(void)
+{
+ spdk_free_args(eal_cmdline, eal_cmdline_argcount);
+}
+
+
+static int
+spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
+{
+ int argcount = 0;
+ char **args;
+
+ args = NULL;
+
+ /* set the program name */
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* Disable shared configuration files when in single-process mode. This allows for a cleaner shutdown. */
+ if (opts->shm_id < 0) {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set the coremask */
+ /* NOTE: If coremask starts with '[' and ends with ']' it is a core list
+ */
+ if (opts->core_mask[0] == '[') {
+ char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
+ /* _sprintf_alloc() can fail; spdk_push_arg() below handles a NULL arg. */
+ if (l_arg != NULL) {
+ int len = strlen(l_arg);
+ if (l_arg[len - 1] == ']') {
+ l_arg[len - 1] = '\0';
+ }
+ }
+ args = spdk_push_arg(args, &argcount, l_arg);
+ } else {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
+ }
+
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* set the memory channel number */
+ if (opts->mem_channel > 0) {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set the memory size */
+ if (opts->mem_size >= 0) {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set the master core */
+ if (opts->master_core > 0) {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
+ opts->master_core));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set no pci if enabled */
+ if (opts->no_pci) {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* create just one hugetlbfs file */
+ if (opts->hugepage_single_segments) {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* unlink hugepages after initialization */
+ if (opts->unlink_hugepage) {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0)
+ /* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("--legacy-mem"));
+ if (args == NULL) {
+ return -1;
+ }
+#endif
+
+ if (opts->num_pci_addr) {
+ size_t i;
+ char bdf[32];
+ struct spdk_pci_addr *pci_addr =
+ opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist;
+
+ for (i = 0; i < opts->num_pci_addr; i++) {
+ spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s=%s",
+ (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
+ bdf));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+ }
+
+#ifdef __linux__
+ if (opts->shm_id < 0) {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
+ getpid()));
+ if (args == NULL) {
+ return -1;
+ }
+ } else {
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
+ opts->shm_id));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* Set the base virtual address - it must be an address that is not in the
+ * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
+ * mmap hint.
+ *
+ * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
+ */
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x200000000000"));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* set the process type */
+ args = spdk_push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+#endif
+
+ eal_cmdline = args;
+ eal_cmdline_argcount = argcount;
+ if (atexit(spdk_destruct_eal_cmdline) != 0) {
+ fprintf(stderr, "Failed to register cleanup handler\n");
+ }
+
+ return argcount;
+}
+
+int spdk_env_init(const struct spdk_env_opts *opts)
+{
+ char **dpdk_args = NULL;
+ int i, rc;
+ int orig_optind;
+
+ rc = spdk_build_eal_cmdline(opts);
+ if (rc < 0) {
+ fprintf(stderr, "Invalid arguments to initialize DPDK\n");
+ return -1;
+ }
+
+ printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
+ printf("[ DPDK EAL parameters: ");
+ for (i = 0; i < eal_cmdline_argcount; i++) {
+ printf("%s ", eal_cmdline[i]);
+ }
+ printf("]\n");
+
+ /* DPDK rearranges the array we pass to it, so make a copy
+ * before passing so we can still free the individual strings
+ * correctly.
+ */
+ dpdk_args = calloc(eal_cmdline_argcount, sizeof(char *));
+ if (dpdk_args == NULL) {
+ fprintf(stderr, "Failed to allocate dpdk_args\n");
+ return -1;
+ }
+ memcpy(dpdk_args, eal_cmdline, sizeof(char *) * eal_cmdline_argcount);
+
+ fflush(stdout);
+ orig_optind = optind;
+ optind = 1;
+ rc = rte_eal_init(eal_cmdline_argcount, dpdk_args);
+ optind = orig_optind;
+
+ free(dpdk_args);
+
+ if (rc < 0) {
+ fprintf(stderr, "Failed to initialize DPDK\n");
+ return -1;
+ }
+
+ if (opts->shm_id < 0 && !opts->hugepage_single_segments) {
+ /*
+ * Unlink hugepage and config info files after init. This will ensure they get
+ * deleted on app exit, even if the app crashes and does not exit normally.
+ * Only do this when not in multi-process mode, since for multi-process other
+ * apps will need to open these files. These files are not created for
+ * "single file segments".
+ */
+ spdk_env_unlink_shared_files();
+ }
+
+ if (spdk_mem_map_init() < 0) {
+ fprintf(stderr, "Failed to allocate mem_map\n");
+ return -1;
+ }
+ if (spdk_vtophys_init() < 0) {
+ fprintf(stderr, "Failed to initialize vtophys\n");
+ return -1;
+ }
+
+ return 0;
+}
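A minimal application entry point exercising this initialization path (a sketch; the app name and core mask are arbitrary):

#include "spdk/stdinc.h"
#include "spdk/env.h"

int main(void)
{
	struct spdk_env_opts opts;

	spdk_env_opts_init(&opts);	/* defaults: name "spdk", mask 0x1, shm_id -1 */
	opts.name = "hello_env";
	opts.core_mask = "0x3";

	if (spdk_env_init(&opts) < 0) {
		fprintf(stderr, "spdk_env_init() failed\n");
		return 1;
	}
	/* The DPDK EAL, memory registration map and vtophys map are ready. */
	return 0;
}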
diff --git a/src/spdk/lib/env_dpdk/memory.c b/src/spdk/lib/env_dpdk/memory.c
new file mode 100644
index 00000000..eaeccb90
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/memory.c
@@ -0,0 +1,712 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "env_internal.h"
+
+#include <rte_config.h>
+#include <rte_eal_memconfig.h>
+
+#include "spdk_internal/assert.h"
+
+#include "spdk/assert.h"
+#include "spdk/likely.h"
+#include "spdk/queue.h"
+#include "spdk/util.h"
+
+#if DEBUG
+#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define DEBUG_PRINT(...)
+#endif
+
+#define FN_2MB_TO_4KB(fn) (fn << (SHIFT_2MB - SHIFT_4KB))
+#define FN_4KB_TO_2MB(fn) (fn >> (SHIFT_2MB - SHIFT_4KB))
+
+#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
+#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
+
+/* Page is registered */
+#define REG_MAP_REGISTERED (1ULL << 62)
+
+/* A notification region barrier. The 2MB translation entry that's marked
+ * with this flag must be unregistered separately. This allows contiguous
+ * regions to be unregistered in the same chunks they were registered.
+ */
+#define REG_MAP_NOTIFY_START (1ULL << 63)
+
+/* Translation of a single 2MB page. */
+struct map_2mb {
+ uint64_t translation_2mb;
+};
+
+/* Second-level map table indexed by bits [21..29] of the virtual address.
+ * Each entry holds the translation for one 2 MB page, or the map's default
+ * translation for pages that have not been set yet.
+ */
+struct map_1gb {
+ struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
+};
+
+/* Top-level map table indexed by bits [30..47] of the virtual address.
+ * Each entry points to a second-level map table or NULL.
+ */
+struct map_256tb {
+ struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
+};
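For reference, a worked two-level lookup using the index macros above (values follow from SHIFT_2MB = 21 and SHIFT_1GB = 30 in env_internal.h):

/* For vaddr = 0x7f0040200000:
 *   vfn_2mb   = vaddr >> SHIFT_2MB     = 0x3f80201
 *   idx_256tb = MAP_256TB_IDX(vfn_2mb) = vfn_2mb >> 9    = 0x1fc01
 *   idx_1gb   = MAP_1GB_IDX(vfn_2mb)   = vfn_2mb & 0x1ff = 0x001
 * so its translation lives in map_256tb.map[0x1fc01]->map[0x001].
 */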
+
+/* Page-granularity memory address translation */
+struct spdk_mem_map {
+ struct map_256tb map_256tb;
+ pthread_mutex_t mutex;
+ uint64_t default_translation;
+ struct spdk_mem_map_ops ops;
+ void *cb_ctx;
+ TAILQ_ENTRY(spdk_mem_map) tailq;
+};
+
+/* Registrations map. The 64 bit translations are bit fields with the
+ * following layout (starting with the low bits):
+ * 0 - 61 : reserved
+ * 62 - 63 : flags
+ */
+static struct spdk_mem_map *g_mem_reg_map;
+static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
+static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Walk the currently registered memory via the main memory registration map
+ * and call the new map's notify callback for each virtually contiguous region.
+ */
+static int
+spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
+{
+ size_t idx_256tb;
+ uint64_t idx_1gb;
+ uint64_t contig_start = UINT64_MAX;
+ uint64_t contig_end = UINT64_MAX;
+ struct map_1gb *map_1gb;
+ int rc;
+
+ if (!g_mem_reg_map) {
+ return -EINVAL;
+ }
+
+ /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
+ pthread_mutex_lock(&g_mem_reg_map->mutex);
+
+ for (idx_256tb = 0;
+ idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
+ idx_256tb++) {
+ map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
+
+ if (!map_1gb) {
+ if (contig_start != UINT64_MAX) {
+ /* End of a virtually contiguous range */
+ rc = map->ops.notify_cb(map->cb_ctx, map, action,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ /* Don't bother handling unregister failures. It can't be any worse */
+ if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
+ goto err_unregister;
+ }
+ }
+ contig_start = UINT64_MAX;
+ continue;
+ }
+
+ for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
+ if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
+ (contig_start == UINT64_MAX ||
+ (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
+ /* Rebuild the virtual address from the indexes */
+ uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
+
+ if (contig_start == UINT64_MAX) {
+ contig_start = vaddr;
+ }
+
+ contig_end = vaddr;
+ } else {
+ if (contig_start != UINT64_MAX) {
+ /* End of a virtually contiguous range */
+ rc = map->ops.notify_cb(map->cb_ctx, map, action,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ /* Don't bother handling unregister failures. It can't be any worse */
+ if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
+ goto err_unregister;
+ }
+
+ /* This page might be a part of a neighbour region, so process
+ * it again. The idx_1gb will be incremented immediately.
+ */
+ idx_1gb--;
+ }
+ contig_start = UINT64_MAX;
+ }
+ }
+ }
+
+ pthread_mutex_unlock(&g_mem_reg_map->mutex);
+ return 0;
+
+err_unregister:
+ /* Unwind to the first empty translation so we don't unregister
+ * a region that just failed to register.
+ */
+ idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
+ idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
+ contig_start = UINT64_MAX;
+ contig_end = UINT64_MAX;
+
+ /* Unregister any memory we managed to register before the failure */
+ for (; idx_256tb < SIZE_MAX; idx_256tb--) {
+ map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
+
+ if (!map_1gb) {
+ if (contig_end != UINT64_MAX) {
+ /* End of a virtually contiguous range */
+ map->ops.notify_cb(map->cb_ctx, map,
+ SPDK_MEM_MAP_NOTIFY_UNREGISTER,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ }
+ contig_end = UINT64_MAX;
+ continue;
+ }
+
+ for (; idx_1gb < UINT64_MAX; idx_1gb--) {
+ if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
+ (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
+ /* Rebuild the virtual address from the indexes */
+ uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
+
+ if (contig_end == UINT64_MAX) {
+ contig_end = vaddr;
+ }
+ contig_start = vaddr;
+ } else {
+ if (contig_end != UINT64_MAX) {
+ /* End of a virtually contiguous range */
+ map->ops.notify_cb(map->cb_ctx, map,
+ SPDK_MEM_MAP_NOTIFY_UNREGISTER,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ idx_1gb++;
+ }
+ contig_end = UINT64_MAX;
+ }
+ }
+ idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
+ }
+
+ pthread_mutex_unlock(&g_mem_reg_map->mutex);
+ return rc;
+}
+
+struct spdk_mem_map *
+spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
+{
+ struct spdk_mem_map *map;
+ int rc;
+
+ map = calloc(1, sizeof(*map));
+ if (map == NULL) {
+ return NULL;
+ }
+
+ if (pthread_mutex_init(&map->mutex, NULL)) {
+ free(map);
+ return NULL;
+ }
+
+ map->default_translation = default_translation;
+ map->cb_ctx = cb_ctx;
+ if (ops) {
+ map->ops = *ops;
+ }
+
+ if (ops && ops->notify_cb) {
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+ rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ DEBUG_PRINT("Initial mem_map notify failed\n");
+ pthread_mutex_destroy(&map->mutex);
+ free(map);
+ return NULL;
+ }
+ TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ }
+
+ return map;
+}
+
+void
+spdk_mem_map_free(struct spdk_mem_map **pmap)
+{
+ struct spdk_mem_map *map;
+ size_t i;
+
+ if (!pmap) {
+ return;
+ }
+
+ map = *pmap;
+
+ if (!map) {
+ return;
+ }
+
+ if (map->ops.notify_cb) {
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+ spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
+ TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ }
+
+ for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
+ free(map->map_256tb.map[i]);
+ }
+
+ pthread_mutex_destroy(&map->mutex);
+
+ free(map);
+ *pmap = NULL;
+}
+
+int
+spdk_mem_register(void *vaddr, size_t len)
+{
+ struct spdk_mem_map *map;
+ int rc;
+ void *seg_vaddr;
+ size_t seg_len;
+ uint64_t reg;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+ __func__, vaddr, len);
+ return -EINVAL;
+ }
+
+ if (len == 0) {
+ return 0;
+ }
+
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+
+ seg_vaddr = vaddr;
+ seg_len = len;
+ while (seg_len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ if (reg & REG_MAP_REGISTERED) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -EBUSY;
+ }
+ seg_vaddr += VALUE_2MB;
+ seg_len -= VALUE_2MB;
+ }
+
+ seg_vaddr = vaddr;
+ seg_len = 0;
+ while (len > 0) {
+ spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
+ seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
+ seg_len += VALUE_2MB;
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+
+ TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
+ rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return rc;
+ }
+ }
+
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return 0;
+}
+
+int
+spdk_mem_unregister(void *vaddr, size_t len)
+{
+ struct spdk_mem_map *map;
+ int rc;
+ void *seg_vaddr;
+ size_t seg_len;
+ uint64_t reg, newreg;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+ __func__, vaddr, len);
+ return -EINVAL;
+ }
+
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+
+ /* The first page must be a start of a region. Also check if it's
+ * registered to make sure we don't return -ERANGE for non-registered
+ * regions.
+ */
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
+ if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -ERANGE;
+ }
+
+ seg_vaddr = vaddr;
+ seg_len = len;
+ while (seg_len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ if ((reg & REG_MAP_REGISTERED) == 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -EINVAL;
+ }
+ seg_vaddr += VALUE_2MB;
+ seg_len -= VALUE_2MB;
+ }
+
+ newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ /* If the next page is registered, it must be a start of a region as well,
+ * otherwise we'd be unregistering only a part of a region.
+ */
+ if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -ERANGE;
+ }
+ seg_vaddr = vaddr;
+ seg_len = 0;
+
+ while (len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
+ spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
+
+ if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
+ TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
+ rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return rc;
+ }
+ }
+
+ seg_vaddr = vaddr;
+ seg_len = VALUE_2MB;
+ } else {
+ seg_len += VALUE_2MB;
+ }
+
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+
+ if (seg_len > 0) {
+ TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
+ rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return rc;
+ }
+ }
+ }
+
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return 0;
+}
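A hypothetical caller sketch for the pair above. spdk_mem_register() requires 2 MB alignment, and mmap() only guarantees page alignment, so the sketch over-allocates and registers an aligned window:

#include <sys/mman.h>
#include "spdk/env.h"
#include "env_internal.h"	/* VALUE_2MB, MASK_2MB */

static void example_mem_register(void)
{
	void *raw = mmap(NULL, 2 * VALUE_2MB, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *aligned;

	if (raw == MAP_FAILED) {
		return;
	}
	aligned = (void *)(((uintptr_t)raw + MASK_2MB) & ~MASK_2MB);

	if (spdk_mem_register(aligned, VALUE_2MB) == 0) {
		/* Every registered spdk_mem_map was just notified of
		 * [aligned, aligned + 2 MB). */
		spdk_mem_unregister(aligned, VALUE_2MB);
	}
	munmap(raw, 2 * VALUE_2MB);
}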
+
+static struct map_1gb *
+spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
+{
+ struct map_1gb *map_1gb;
+ uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
+ size_t i;
+
+ if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
+ return NULL;
+ }
+
+ map_1gb = map->map_256tb.map[idx_256tb];
+
+ if (!map_1gb) {
+ pthread_mutex_lock(&map->mutex);
+
+ /* Recheck to make sure nobody else got the mutex first. */
+ map_1gb = map->map_256tb.map[idx_256tb];
+ if (!map_1gb) {
+ map_1gb = malloc(sizeof(struct map_1gb));
+ if (map_1gb) {
+ /* initialize all entries to default translation */
+ for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
+ map_1gb->map[i].translation_2mb = map->default_translation;
+ }
+ map->map_256tb.map[idx_256tb] = map_1gb;
+ }
+ }
+
+ pthread_mutex_unlock(&map->mutex);
+
+ if (!map_1gb) {
+ DEBUG_PRINT("allocation failed\n");
+ return NULL;
+ }
+ }
+
+ return map_1gb;
+}
+
+int
+spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
+ uint64_t translation)
+{
+ uint64_t vfn_2mb;
+ struct map_1gb *map_1gb;
+ uint64_t idx_1gb;
+ struct map_2mb *map_2mb;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
+ return -EINVAL;
+ }
+
+ /* For now, only 2 MB-aligned registrations are supported */
+ if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
+ __func__, vaddr, size);
+ return -EINVAL;
+ }
+
+ vfn_2mb = vaddr >> SHIFT_2MB;
+
+ while (size) {
+ map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
+ if (!map_1gb) {
+ DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
+ return -ENOMEM;
+ }
+
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+ map_2mb = &map_1gb->map[idx_1gb];
+ map_2mb->translation_2mb = translation;
+
+ size -= VALUE_2MB;
+ vfn_2mb++;
+ }
+
+ return 0;
+}
+
+int
+spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
+{
+ uint64_t vfn_2mb;
+ struct map_1gb *map_1gb;
+ uint64_t idx_1gb;
+ struct map_2mb *map_2mb;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
+ return -EINVAL;
+ }
+
+ /* For now, only 2 MB-aligned registrations are supported */
+ if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
+ __func__, vaddr, size);
+ return -EINVAL;
+ }
+
+ vfn_2mb = vaddr >> SHIFT_2MB;
+
+ while (size) {
+ map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
+ if (!map_1gb) {
+ DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
+ return -ENOMEM;
+ }
+
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+ map_2mb = &map_1gb->map[idx_1gb];
+ map_2mb->translation_2mb = map->default_translation;
+
+ size -= VALUE_2MB;
+ vfn_2mb++;
+ }
+
+ return 0;
+}
+
+uint64_t
+spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
+{
+ const struct map_1gb *map_1gb;
+ const struct map_2mb *map_2mb;
+ uint64_t idx_256tb;
+ uint64_t idx_1gb;
+ uint64_t vfn_2mb;
+ uint64_t total_size = 0;
+ uint64_t cur_size;
+ uint64_t prev_translation;
+
+ if (size != NULL) {
+ total_size = *size;
+ *size = 0;
+ }
+
+ if (spdk_unlikely(vaddr & ~MASK_256TB)) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
+ return map->default_translation;
+ }
+
+ vfn_2mb = vaddr >> SHIFT_2MB;
+ idx_256tb = MAP_256TB_IDX(vfn_2mb);
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+
+ map_1gb = map->map_256tb.map[idx_256tb];
+ if (spdk_unlikely(!map_1gb)) {
+ return map->default_translation;
+ }
+
+ cur_size = VALUE_2MB;
+ if (size != NULL) {
+ *size = VALUE_2MB;
+ }
+
+ map_2mb = &map_1gb->map[idx_1gb];
+ if (size == NULL || map->ops.are_contiguous == NULL ||
+ map_2mb->translation_2mb == map->default_translation) {
+ return map_2mb->translation_2mb;
+ }
+
+ prev_translation = map_2mb->translation_2mb;
+ while (cur_size < total_size) {
+ vfn_2mb++;
+ idx_256tb = MAP_256TB_IDX(vfn_2mb);
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+
+ map_1gb = map->map_256tb.map[idx_256tb];
+ if (spdk_unlikely(!map_1gb)) {
+ break;
+ }
+
+ map_2mb = &map_1gb->map[idx_1gb];
+ if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
+ break;
+ }
+
+ cur_size += VALUE_2MB;
+ prev_translation = map_2mb->translation_2mb;
+ }
+
+ *size = cur_size;
+ return prev_translation;
+}
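A self-contained sketch of the map API above, using UINT64_MAX as an arbitrary "no translation" sentinel and no notify callbacks:

#include <assert.h>
#include "spdk/env.h"
#include "env_internal.h"	/* VALUE_2MB */

static void example_mem_map(void)
{
	struct spdk_mem_map *map = spdk_mem_map_alloc(UINT64_MAX, NULL, NULL);
	uint64_t vaddr = 0x200000000ULL;	/* 2 MB-aligned */

	if (map == NULL) {
		return;
	}
	spdk_mem_map_set_translation(map, vaddr, VALUE_2MB, 0x1234ULL);
	assert(spdk_mem_map_translate(map, vaddr, NULL) == 0x1234ULL);
	/* Unset pages fall back to the default translation. */
	assert(spdk_mem_map_translate(map, vaddr + VALUE_2MB, NULL) == UINT64_MAX);
	spdk_mem_map_free(&map);
}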
+
+#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
+static void
+memory_hotplug_cb(enum rte_mem_event event_type,
+ const void *addr, size_t len, void *arg)
+{
+ if (event_type == RTE_MEM_EVENT_ALLOC) {
+ while (len > 0) {
+ struct rte_memseg *seg;
+
+ seg = rte_mem_virt2memseg(addr, NULL);
+ assert(seg != NULL);
+ assert(len >= seg->hugepage_sz);
+
+ spdk_mem_register((void *)seg->addr, seg->hugepage_sz);
+ addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
+ len -= seg->hugepage_sz;
+ }
+ } else if (event_type == RTE_MEM_EVENT_FREE) {
+ spdk_mem_unregister((void *)addr, len);
+ }
+}
+
+static int
+memory_iter_cb(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg)
+{
+ return spdk_mem_register(ms->addr, len);
+}
+#endif
+
+int
+spdk_mem_map_init(void)
+{
+ g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
+ if (g_mem_reg_map == NULL) {
+ DEBUG_PRINT("memory registration map allocation failed\n");
+ return -1;
+ }
+
+ /*
+ * Walk all DPDK memory segments and register them
+ * with the master memory map
+ */
+#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
+ rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
+ rte_memseg_contig_walk(memory_iter_cb, NULL);
+#else
+ struct rte_mem_config *mcfg;
+ size_t seg_idx;
+
+ mcfg = rte_eal_get_configuration()->mem_config;
+ for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
+ struct rte_memseg *seg = &mcfg->memseg[seg_idx];
+
+ if (seg->addr == NULL) {
+ break;
+ }
+
+ spdk_mem_register(seg->addr, seg->len);
+ }
+#endif
+ return 0;
+}
diff --git a/src/spdk/lib/env_dpdk/pci.c b/src/spdk/lib/env_dpdk/pci.c
new file mode 100644
index 00000000..4153ac93
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci.c
@@ -0,0 +1,551 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/env.h"
+
+#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers"
+
+#define PCI_CFG_SIZE 256
+#define PCI_EXT_CAP_ID_SN 0x03
+
+int
+spdk_pci_device_init(struct rte_pci_driver *driver,
+ struct rte_pci_device *device)
+{
+ struct spdk_pci_enum_ctx *ctx = (struct spdk_pci_enum_ctx *)driver;
+ int rc;
+
+ if (!ctx->cb_fn) {
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4)
+ rte_pci_unmap_device(device);
+#elif RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+ rte_eal_pci_unmap_device(device);
+#endif
+
+ /* Return a positive value to indicate that this device does not belong to this driver, but
+ * this isn't an error. */
+ return 1;
+ }
+
+ rc = ctx->cb_fn(ctx->cb_arg, (struct spdk_pci_device *)device);
+ if (rc != 0) {
+ return rc;
+ }
+
+ spdk_vtophys_pci_device_added(device);
+ return 0;
+}
+
+int
+spdk_pci_device_fini(struct rte_pci_device *device)
+{
+ spdk_vtophys_pci_device_removed(device);
+ return 0;
+}
+
+void
+spdk_pci_device_detach(struct spdk_pci_device *device)
+{
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+#if RTE_VERSION < RTE_VERSION_NUM(17, 05, 0, 0)
+ rte_eal_device_remove(&device->device);
+#endif
+#endif
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+ struct spdk_pci_addr addr;
+ char bdf[32];
+
+ addr.domain = device->addr.domain;
+ addr.bus = device->addr.bus;
+ addr.dev = device->addr.devid;
+ addr.func = device->addr.function;
+
+ spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr);
+ if (rte_eal_dev_detach(&device->device) < 0) {
+ fprintf(stderr, "Failed to detach PCI device %s (device already removed?).\n", bdf);
+ }
+#elif RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4)
+ rte_pci_detach(&device->addr);
+#else
+ rte_eal_pci_detach(&device->addr);
+#endif
+}
+
+int
+spdk_pci_device_attach(struct spdk_pci_enum_ctx *ctx,
+ spdk_pci_enum_cb enum_cb,
+ void *enum_ctx, struct spdk_pci_addr *pci_address)
+{
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+ char bdf[32];
+
+ spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address);
+#else
+ struct rte_pci_addr addr;
+
+ addr.domain = pci_address->domain;
+ addr.bus = pci_address->bus;
+ addr.devid = pci_address->dev;
+ addr.function = pci_address->func;
+#endif
+
+ pthread_mutex_lock(&ctx->mtx);
+
+ if (!ctx->is_registered) {
+ ctx->is_registered = true;
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4)
+ rte_pci_register(&ctx->driver);
+#else
+ rte_eal_pci_register(&ctx->driver);
+#endif
+ }
+
+ ctx->cb_fn = enum_cb;
+ ctx->cb_arg = enum_ctx;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+ if (rte_eal_dev_attach(bdf, "") != 0) {
+#elif RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4)
+ if (rte_pci_probe_one(&addr) != 0) {
+#else
+ if (rte_eal_pci_probe_one(&addr) != 0) {
+#endif
+ ctx->cb_arg = NULL;
+ ctx->cb_fn = NULL;
+ pthread_mutex_unlock(&ctx->mtx);
+ return -1;
+ }
+
+ ctx->cb_arg = NULL;
+ ctx->cb_fn = NULL;
+ pthread_mutex_unlock(&ctx->mtx);
+
+ return 0;
+}
+
+/* Note: You can call spdk_pci_enumerate from more than one thread
+ * simultaneously safely, but you cannot call spdk_pci_enumerate
+ * and rte_eal_pci_probe simultaneously.
+ */
+int
+spdk_pci_enumerate(struct spdk_pci_enum_ctx *ctx,
+ spdk_pci_enum_cb enum_cb,
+ void *enum_ctx)
+{
+ pthread_mutex_lock(&ctx->mtx);
+
+ if (!ctx->is_registered) {
+ ctx->is_registered = true;
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4)
+ rte_pci_register(&ctx->driver);
+#else
+ rte_eal_pci_register(&ctx->driver);
+#endif
+ }
+
+ ctx->cb_fn = enum_cb;
+ ctx->cb_arg = enum_ctx;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+ if (rte_bus_probe() != 0) {
+#elif RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4)
+ if (rte_pci_probe() != 0) {
+#else
+ if (rte_eal_pci_probe() != 0) {
+#endif
+ ctx->cb_arg = NULL;
+ ctx->cb_fn = NULL;
+ pthread_mutex_unlock(&ctx->mtx);
+ return -1;
+ }
+
+ ctx->cb_arg = NULL;
+ ctx->cb_fn = NULL;
+ pthread_mutex_unlock(&ctx->mtx);
+
+ return 0;
+}
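A hypothetical enumeration callback sketch; it would be handed to one of the per-class wrappers built on this context (e.g. spdk_pci_nvme_enumerate() from pci_nvme.c, assumed here) and runs once per probed device:

#include "spdk/env.h"

static int example_enum_cb(void *cb_ctx, struct spdk_pci_device *dev)
{
	printf("claiming device %04x:%04x on NUMA node %d\n",
	       spdk_pci_device_get_vendor_id(dev),
	       spdk_pci_device_get_device_id(dev),
	       spdk_pci_device_get_socket_id(dev));
	return 0;	/* 0 claims the device; nonzero rejects it */
}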
+
+int
+spdk_pci_device_map_bar(struct spdk_pci_device *device, uint32_t bar,
+ void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
+{
+ struct rte_pci_device *dev = device;
+
+ *mapped_addr = dev->mem_resource[bar].addr;
+ *phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
+ *size = (uint64_t)dev->mem_resource[bar].len;
+
+ return 0;
+}
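+
+/* Usage sketch: map BAR 0 of an attached device to access its registers.
+ * With this DPDK backend the BAR was already mapped at probe time
+ * (RTE_PCI_DRV_NEED_MAPPING), so this just returns the existing mapping.
+ *
+ *   void *regs;
+ *   uint64_t phys_addr, size;
+ *
+ *   if (spdk_pci_device_map_bar(dev, 0, &regs, &phys_addr, &size) == 0) {
+ *           ... access device registers through regs ...
+ *   }
+ */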
+
+int
+spdk_pci_device_unmap_bar(struct spdk_pci_device *device, uint32_t bar, void *addr)
+{
+ return 0;
+}
+
+uint32_t
+spdk_pci_device_get_domain(struct spdk_pci_device *dev)
+{
+ return dev->addr.domain;
+}
+
+uint8_t
+spdk_pci_device_get_bus(struct spdk_pci_device *dev)
+{
+ return dev->addr.bus;
+}
+
+uint8_t
+spdk_pci_device_get_dev(struct spdk_pci_device *dev)
+{
+ return dev->addr.devid;
+}
+
+uint8_t
+spdk_pci_device_get_func(struct spdk_pci_device *dev)
+{
+ return dev->addr.function;
+}
+
+uint16_t
+spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
+{
+ return dev->id.vendor_id;
+}
+
+uint16_t
+spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
+{
+ return dev->id.device_id;
+}
+
+uint16_t
+spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
+{
+ return dev->id.subsystem_vendor_id;
+}
+
+uint16_t
+spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
+{
+ return dev->id.subsystem_device_id;
+}
+
+struct spdk_pci_id
+spdk_pci_device_get_id(struct spdk_pci_device *pci_dev)
+{
+ struct spdk_pci_id pci_id;
+
+ pci_id.vendor_id = spdk_pci_device_get_vendor_id(pci_dev);
+ pci_id.device_id = spdk_pci_device_get_device_id(pci_dev);
+ pci_id.subvendor_id = spdk_pci_device_get_subvendor_id(pci_dev);
+ pci_id.subdevice_id = spdk_pci_device_get_subdevice_id(pci_dev);
+
+ return pci_id;
+}
+
+int
+spdk_pci_device_get_socket_id(struct spdk_pci_device *pci_dev)
+{
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+ return pci_dev->device.numa_node;
+#else
+ return pci_dev->numa_node;
+#endif
+}
+
+int
+spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ int rc;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4)
+ rc = rte_pci_read_config(dev, value, len, offset);
+#else
+ rc = rte_eal_pci_read_config(dev, value, len, offset);
+#endif
+ return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
+}
+
+int
+spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ int rc;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4)
+ rc = rte_pci_write_config(dev, value, len, offset);
+#else
+ rc = rte_eal_pci_write_config(dev, value, len, offset);
+#endif
+ return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
+}
+
+int
+spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_read(dev, value, 1, offset);
+}
+
+int
+spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_write(dev, &value, 1, offset);
+}
+
+int
+spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_read(dev, value, 2, offset);
+}
+
+int
+spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_write(dev, &value, 2, offset);
+}
+
+int
+spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_read(dev, value, 4, offset);
+}
+
+int
+spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_write(dev, &value, 4, offset);
+}
+
+int
+spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
+{
+ int err;
+ uint32_t pos, header = 0;
+ uint32_t i, buf[2];
+
+ if (len < 17) {
+ return -1;
+ }
+
+ err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
+ if (err || !header) {
+ return -1;
+ }
+
+ pos = PCI_CFG_SIZE;
+ while (1) {
+ if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
+ if (pos) {
+ /* skip the header */
+ pos += 4;
+ for (i = 0; i < 2; i++) {
+ err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
+ if (err) {
+ return -1;
+ }
+ }
+ snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
+ return 0;
+ }
+ }
+ pos = (header >> 20) & 0xffc;
+ /* 0 if no other items exist */
+ if (pos < PCI_CFG_SIZE) {
+ return -1;
+ }
+ err = spdk_pci_device_cfg_read32(dev, &header, pos);
+ if (err) {
+ return -1;
+ }
+ }
+ return -1;
+}
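+
+/* Usage sketch: the serial number is formatted as 16 hex digits plus a
+ * terminating NUL, so the destination buffer must hold at least 17 bytes
+ * (hence the len < 17 check above).
+ *
+ *   char sn[17];
+ *
+ *   if (spdk_pci_device_get_serial_number(dev, sn, sizeof(sn)) == 0) {
+ *           printf("serial number: %s\n", sn);
+ *   }
+ */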
+
+struct spdk_pci_addr
+spdk_pci_device_get_addr(struct spdk_pci_device *pci_dev)
+{
+ struct spdk_pci_addr pci_addr;
+
+ pci_addr.domain = spdk_pci_device_get_domain(pci_dev);
+ pci_addr.bus = spdk_pci_device_get_bus(pci_dev);
+ pci_addr.dev = spdk_pci_device_get_dev(pci_dev);
+ pci_addr.func = spdk_pci_device_get_func(pci_dev);
+
+ return pci_addr;
+}
+
+int
+spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
+{
+ if (a1->domain > a2->domain) {
+ return 1;
+ } else if (a1->domain < a2->domain) {
+ return -1;
+ } else if (a1->bus > a2->bus) {
+ return 1;
+ } else if (a1->bus < a2->bus) {
+ return -1;
+ } else if (a1->dev > a2->dev) {
+ return 1;
+ } else if (a1->dev < a2->dev) {
+ return -1;
+ } else if (a1->func > a2->func) {
+ return 1;
+ } else if (a1->func < a2->func) {
+ return -1;
+ }
+
+ return 0;
+}
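+
+/* spdk_pci_addr_compare() follows memcmp()-style semantics, so it can back
+ * a qsort() comparator directly, e.g. (illustrative only):
+ *
+ *   static int
+ *   addr_cmp(const void *a, const void *b)
+ *   {
+ *           return spdk_pci_addr_compare(a, b);
+ *   }
+ *
+ *   qsort(addrs, num_addrs, sizeof(struct spdk_pci_addr), addr_cmp);
+ */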
+
+#ifdef __linux__
+int
+spdk_pci_device_claim(const struct spdk_pci_addr *pci_addr)
+{
+ int dev_fd;
+ char dev_name[64];
+ int pid;
+ void *dev_map;
+ struct flock pcidev_lock = {
+ .l_type = F_WRLCK,
+ .l_whence = SEEK_SET,
+ .l_start = 0,
+ .l_len = 0,
+ };
+
+ snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", pci_addr->domain,
+ pci_addr->bus,
+ pci_addr->dev, pci_addr->func);
+
+ dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+ if (dev_fd == -1) {
+ fprintf(stderr, "could not open %s\n", dev_name);
+ return -1;
+ }
+
+ if (ftruncate(dev_fd, sizeof(int)) != 0) {
+ fprintf(stderr, "could not truncate %s\n", dev_name);
+ close(dev_fd);
+ return -1;
+ }
+
+ dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
+ MAP_SHARED, dev_fd, 0);
+ if (dev_map == MAP_FAILED) {
+ fprintf(stderr, "could not mmap dev %s (%d)\n", dev_name, errno);
+ close(dev_fd);
+ return -1;
+ }
+
+ if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
+ pid = *(int *)dev_map;
+ fprintf(stderr, "Cannot create lock on device %s, probably"
+ " process %d has claimed it\n", dev_name, pid);
+ munmap(dev_map, sizeof(int));
+ close(dev_fd);
+ return -1;
+ }
+
+ *(int *)dev_map = (int)getpid();
+ munmap(dev_map, sizeof(int));
+ /* Keep dev_fd open to maintain the lock. */
+ return dev_fd;
+}
+#endif /* __linux__ */
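+
+/* Usage sketch: claiming serializes ownership of a device across processes.
+ * The returned descriptor must stay open for as long as the claim is held;
+ * closing it releases the advisory lock.
+ *
+ *   int claim_fd = spdk_pci_device_claim(&addr);
+ *
+ *   if (claim_fd < 0) {
+ *           ... another process has already claimed the device ...
+ *   }
+ */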
+
+#ifdef __FreeBSD__
+int
+spdk_pci_device_claim(const struct spdk_pci_addr *pci_addr)
+{
+ /* TODO */
+ return 0;
+}
+#endif /* __FreeBSD__ */
+
+int
+spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
+{
+ unsigned domain, bus, dev, func;
+
+ if (addr == NULL || bdf == NULL) {
+ return -EINVAL;
+ }
+
+ if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
+ (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
+ /* Matched a full address - all variables are initialized */
+ } else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
+ func = 0;
+ } else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
+ (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
+ domain = 0;
+ } else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
+ (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
+ domain = 0;
+ func = 0;
+ } else {
+ return -EINVAL;
+ }
+
+ if (bus > 0xFF || dev > 0x1F || func > 7) {
+ return -EINVAL;
+ }
+
+ addr->domain = domain;
+ addr->bus = bus;
+ addr->dev = dev;
+ addr->func = func;
+
+ return 0;
+}
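+
+/* Usage sketch showing the accepted BDF spellings; each of the following
+ * parses to the same address (values are illustrative):
+ *
+ *   struct spdk_pci_addr addr;
+ *
+ *   spdk_pci_addr_parse(&addr, "0000:01:02.3");
+ *   spdk_pci_addr_parse(&addr, "0000.01.02.3");
+ *   spdk_pci_addr_parse(&addr, "01:02.3");       (domain defaults to 0)
+ */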
+
+int
+spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
+{
+ int rc;
+
+ rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
+ addr->domain, addr->bus,
+ addr->dev, addr->func);
+
+ if (rc > 0 && (size_t)rc < sz) {
+ return 0;
+ }
+
+ return -1;
+}
diff --git a/src/spdk/lib/env_dpdk/pci_ioat.c b/src/spdk/lib/env_dpdk/pci_ioat.c
new file mode 100644
index 00000000..b9640283
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_ioat.c
@@ -0,0 +1,123 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) RTE_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID)
+static struct rte_pci_id ioat_driver_id[] = {
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB9)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW9)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX9)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SKX)},
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+static struct spdk_pci_enum_ctx g_ioat_pci_drv = {
+ .driver = {
+ .drv_flags = RTE_PCI_DRV_NEED_MAPPING,
+ .id_table = ioat_driver_id,
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+ .probe = spdk_pci_device_init,
+ .remove = spdk_pci_device_fini,
+ .driver.name = "spdk_ioat",
+#else
+ .devinit = spdk_pci_device_init,
+ .devuninit = spdk_pci_device_fini,
+ .name = "spdk_ioat",
+#endif
+ },
+
+ .cb_fn = NULL,
+ .cb_arg = NULL,
+ .mtx = PTHREAD_MUTEX_INITIALIZER,
+ .is_registered = false,
+};
+
+int
+spdk_pci_ioat_device_attach(spdk_pci_enum_cb enum_cb, void *enum_ctx,
+ struct spdk_pci_addr *pci_address)
+{
+ return spdk_pci_device_attach(&g_ioat_pci_drv, enum_cb, enum_ctx, pci_address);
+}
+
+int
+spdk_pci_ioat_enumerate(spdk_pci_enum_cb enum_cb, void *enum_ctx)
+{
+ return spdk_pci_enumerate(&g_ioat_pci_drv, enum_cb, enum_ctx);
+}
diff --git a/src/spdk/lib/env_dpdk/pci_nvme.c b/src/spdk/lib/env_dpdk/pci_nvme.c
new file mode 100644
index 00000000..4f3b84d1
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_nvme.c
@@ -0,0 +1,89 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+static struct rte_pci_id nvme_pci_driver_id[] = {
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 7, 0, 1)
+ {
+ .class_id = SPDK_PCI_CLASS_NVME,
+ .vendor_id = PCI_ANY_ID,
+ .device_id = PCI_ANY_ID,
+ .subsystem_vendor_id = PCI_ANY_ID,
+ .subsystem_device_id = PCI_ANY_ID,
+ },
+#else
+ {RTE_PCI_DEVICE(0x8086, 0x0953)},
+#endif
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+static struct spdk_pci_enum_ctx g_nvme_pci_drv = {
+ .driver = {
+ .drv_flags = RTE_PCI_DRV_NEED_MAPPING
+#if RTE_VERSION >= RTE_VERSION_NUM(18, 8, 0, 0)
+ | RTE_PCI_DRV_WC_ACTIVATE
+#endif
+ ,
+ .id_table = nvme_pci_driver_id,
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+ .probe = spdk_pci_device_init,
+ .remove = spdk_pci_device_fini,
+ .driver.name = "spdk_nvme",
+#else
+ .devinit = spdk_pci_device_init,
+ .devuninit = spdk_pci_device_fini,
+ .name = "spdk_nvme",
+#endif
+ },
+
+ .cb_fn = NULL,
+ .cb_arg = NULL,
+ .mtx = PTHREAD_MUTEX_INITIALIZER,
+ .is_registered = false,
+};
+
+int
+spdk_pci_nvme_device_attach(spdk_pci_enum_cb enum_cb,
+ void *enum_ctx, struct spdk_pci_addr *pci_address)
+{
+ return spdk_pci_device_attach(&g_nvme_pci_drv, enum_cb, enum_ctx, pci_address);
+}
+
+int
+spdk_pci_nvme_enumerate(spdk_pci_enum_cb enum_cb, void *enum_ctx)
+{
+ return spdk_pci_enumerate(&g_nvme_pci_drv, enum_cb, enum_ctx);
+}
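+
+/* Usage sketch (my_enum_cb and my_ctx are illustrative): enumerate all
+ * NVMe-class devices on the bus and invoke the callback for each one.
+ *
+ *   if (spdk_pci_nvme_enumerate(my_enum_cb, &my_ctx) != 0) {
+ *           fprintf(stderr, "NVMe enumeration failed\n");
+ *   }
+ */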
diff --git a/src/spdk/lib/env_dpdk/pci_virtio.c b/src/spdk/lib/env_dpdk/pci_virtio.c
new file mode 100644
index 00000000..1fcb80d7
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_virtio.c
@@ -0,0 +1,80 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+static struct rte_pci_id virtio_pci_driver_id[] = {
+ { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) },
+ { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) },
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+static struct spdk_pci_enum_ctx g_virtio_pci_drv = {
+ .driver = {
+ .drv_flags = RTE_PCI_DRV_NEED_MAPPING
+#if RTE_VERSION >= RTE_VERSION_NUM(18, 8, 0, 0)
+ | RTE_PCI_DRV_WC_ACTIVATE
+#endif
+ ,
+ .id_table = virtio_pci_driver_id,
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+ .probe = spdk_pci_device_init,
+ .remove = spdk_pci_device_fini,
+ .driver.name = "spdk_virtio",
+#else
+ .devinit = spdk_pci_device_init,
+ .devuninit = spdk_pci_device_fini,
+ .name = "spdk_virtio",
+#endif
+ },
+
+ .cb_fn = NULL,
+ .cb_arg = NULL,
+ .mtx = PTHREAD_MUTEX_INITIALIZER,
+ .is_registered = false,
+};
+
+int
+spdk_pci_virtio_device_attach(spdk_pci_enum_cb enum_cb,
+ void *enum_ctx, struct spdk_pci_addr *pci_address)
+{
+ return spdk_pci_device_attach(&g_virtio_pci_drv, enum_cb, enum_ctx, pci_address);
+}
+
+int
+spdk_pci_virtio_enumerate(spdk_pci_enum_cb enum_cb, void *enum_ctx)
+{
+ return spdk_pci_enumerate(&g_virtio_pci_drv, enum_cb, enum_ctx);
+}
diff --git a/src/spdk/lib/env_dpdk/threads.c b/src/spdk/lib/env_dpdk/threads.c
new file mode 100644
index 00000000..55b0bbb6
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/threads.c
@@ -0,0 +1,108 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/env.h"
+
+#include <rte_config.h>
+#include <rte_lcore.h>
+
+uint32_t
+spdk_env_get_core_count(void)
+{
+ return rte_lcore_count();
+}
+
+uint32_t
+spdk_env_get_current_core(void)
+{
+ return rte_lcore_id();
+}
+
+uint32_t
+spdk_env_get_first_core(void)
+{
+ return rte_get_next_lcore(-1, 0, 0);
+}
+
+uint32_t
+spdk_env_get_last_core(void)
+{
+ uint32_t i;
+ uint32_t last_core = UINT32_MAX;
+
+ SPDK_ENV_FOREACH_CORE(i) {
+ last_core = i;
+ }
+
+ assert(last_core != UINT32_MAX);
+
+ return last_core;
+}
+
+uint32_t
+spdk_env_get_next_core(uint32_t prev_core)
+{
+ unsigned lcore;
+
+ lcore = rte_get_next_lcore(prev_core, 0, 0);
+ if (lcore == RTE_MAX_LCORE) {
+ return UINT32_MAX;
+ }
+ return lcore;
+}
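+
+/* Usage sketch: iterate over every lcore with the first/next pair;
+ * spdk_env_get_next_core() returns UINT32_MAX once the set is exhausted.
+ *
+ *   uint32_t core;
+ *
+ *   for (core = spdk_env_get_first_core(); core != UINT32_MAX;
+ *        core = spdk_env_get_next_core(core)) {
+ *           printf("core %u on socket %u\n", core, spdk_env_get_socket_id(core));
+ *   }
+ */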
+
+uint32_t
+spdk_env_get_socket_id(uint32_t core)
+{
+ if (core >= RTE_MAX_LCORE) {
+ return SPDK_ENV_SOCKET_ID_ANY;
+ }
+
+ return rte_lcore_to_socket_id(core);
+}
+
+int
+spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg)
+{
+ int rc;
+
+ rc = rte_eal_remote_launch(fn, arg, core);
+
+ return rc;
+}
+
+void
+spdk_env_thread_wait_all(void)
+{
+ rte_eal_mp_wait_lcore();
+}
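+
+/* Usage sketch (worker_fn is an illustrative thread_start_fn): launch a
+ * function pinned to a specific remote core, then block until every
+ * launched thread has returned.
+ *
+ *   spdk_env_thread_launch_pinned(core, worker_fn, arg);
+ *   ...
+ *   spdk_env_thread_wait_all();
+ */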
diff --git a/src/spdk/lib/env_dpdk/vtophys.c b/src/spdk/lib/env_dpdk/vtophys.c
new file mode 100644
index 00000000..00e8bb6d
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/vtophys.c
@@ -0,0 +1,691 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "env_internal.h"
+
+#include <rte_config.h>
+#include <rte_eal_memconfig.h>
+
+#include "spdk_internal/assert.h"
+
+#include "spdk/assert.h"
+#include "spdk/likely.h"
+#include "spdk/queue.h"
+#include "spdk/util.h"
+
+#ifdef __FreeBSD__
+#define SPDK_VFIO_ENABLED 0
+#else
+#include <linux/version.h>
+/*
+ * DPDK versions before 17.11 don't provide a way to get VFIO information in the public API,
+ * and we can't link to internal symbols when built against a shared-library DPDK,
+ * so disable VFIO entirely in that case.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) && \
+ (RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) || !defined(RTE_BUILD_SHARED_LIB))
+
+#define SPDK_VFIO_ENABLED 1
+#include <linux/vfio.h>
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+#include <rte_vfio.h>
+#else
+/* Internal DPDK function forward declaration */
+int pci_vfio_is_enabled(void);
+#endif
+
+struct spdk_vfio_dma_map {
+ struct vfio_iommu_type1_dma_map map;
+ struct vfio_iommu_type1_dma_unmap unmap;
+ TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
+};
+
+struct vfio_cfg {
+ int fd;
+ bool enabled;
+ unsigned device_ref;
+ TAILQ_HEAD(, spdk_vfio_dma_map) maps;
+ pthread_mutex_t mutex;
+};
+
+static struct vfio_cfg g_vfio = {
+ .fd = -1,
+ .enabled = false,
+ .device_ref = 0,
+ .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
+ .mutex = PTHREAD_MUTEX_INITIALIZER
+};
+
+#else
+#define SPDK_VFIO_ENABLED 0
+#endif
+#endif
+
+#if DEBUG
+#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define DEBUG_PRINT(...)
+#endif
+
+struct spdk_vtophys_pci_device {
+ struct rte_pci_device *pci_device;
+ TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
+ uint64_t ref;
+};
+
+static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
+static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
+ TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
+
+static struct spdk_mem_map *g_vtophys_map;
+
+#if SPDK_VFIO_ENABLED
+static int
+vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
+{
+ struct spdk_vfio_dma_map *dma_map;
+ int ret;
+
+ dma_map = calloc(1, sizeof(*dma_map));
+ if (dma_map == NULL) {
+ return -ENOMEM;
+ }
+
+ dma_map->map.argsz = sizeof(dma_map->map);
+ dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ dma_map->map.vaddr = vaddr;
+ dma_map->map.iova = iova;
+ dma_map->map.size = size;
+
+ dma_map->unmap.argsz = sizeof(dma_map->unmap);
+ dma_map->unmap.flags = 0;
+ dma_map->unmap.iova = iova;
+ dma_map->unmap.size = size;
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ if (g_vfio.device_ref == 0) {
+ /* VFIO requires at least one device (IOMMU group) to be added to
+ * a VFIO container before it is possible to perform any IOMMU
+ * operations on that container. This memory will be mapped once
+ * the first device (IOMMU group) is hotplugged.
+ *
+ * Since the vfio container is managed internally by DPDK, it is
+ * also possible that some device is already in that container but
+ * not managed by SPDK - e.g. a NIC attached internally within
+ * DPDK. We could map the memory straight away in such a scenario,
+ * but there's no need to: DPDK devices don't need our mappings,
+ * so we defer the mapping unconditionally until the first
+ * SPDK-managed device is hotplugged.
+ */
+ goto out_insert;
+ }
+
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+ if (ret) {
+ DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ free(dma_map);
+ return ret;
+ }
+
+out_insert:
+ TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return 0;
+}
+
+static int
+vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
+{
+ struct spdk_vfio_dma_map *dma_map;
+ int ret;
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+ if (dma_map->map.iova == iova) {
+ break;
+ }
+ }
+
+ if (dma_map == NULL) {
+ DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return -ENXIO;
+ }
+
+ /* Don't support partial or multiple-page unmap for now. */
+ assert(dma_map->map.size == size);
+
+ if (g_vfio.device_ref == 0) {
+ /* Memory is not mapped anymore; just remove its reference. */
+ goto out_remove;
+ }
+
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+ if (ret) {
+ DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return ret;
+ }
+
+out_remove:
+ TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ free(dma_map);
+ return 0;
+}
+#endif
+
+static uint64_t
+vtophys_get_paddr_memseg(uint64_t vaddr)
+{
+ uintptr_t paddr;
+ struct rte_memseg *seg;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
+ seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
+ if (seg != NULL) {
+ paddr = seg->phys_addr;
+ if (paddr == RTE_BAD_IOVA) {
+ return SPDK_VTOPHYS_ERROR;
+ }
+ paddr += (vaddr - (uintptr_t)seg->addr);
+ return paddr;
+ }
+#else
+ struct rte_mem_config *mcfg;
+ uint32_t seg_idx;
+
+ mcfg = rte_eal_get_configuration()->mem_config;
+ for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
+ seg = &mcfg->memseg[seg_idx];
+ if (seg->addr == NULL) {
+ break;
+ }
+
+ if (vaddr >= (uintptr_t)seg->addr &&
+ vaddr < ((uintptr_t)seg->addr + seg->len)) {
+ paddr = seg->phys_addr;
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+ if (paddr == RTE_BAD_IOVA) {
+#else
+ if (paddr == RTE_BAD_PHYS_ADDR) {
+#endif
+ return SPDK_VTOPHYS_ERROR;
+ }
+ paddr += (vaddr - (uintptr_t)seg->addr);
+ return paddr;
+ }
+ }
+#endif
+
+ return SPDK_VTOPHYS_ERROR;
+}
+
+/* Try to get the paddr from /proc/self/pagemap */
+static uint64_t
+vtophys_get_paddr_pagemap(uint64_t vaddr)
+{
+ uintptr_t paddr;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+#define BAD_ADDR RTE_BAD_IOVA
+#define VTOPHYS rte_mem_virt2iova
+#else
+#define BAD_ADDR RTE_BAD_PHYS_ADDR
+#define VTOPHYS rte_mem_virt2phy
+#endif
+
+ /*
+ * Note: the virt2phy/virt2iova functions have changed over time, such
+ * that older versions may return 0 while recent versions will never
+ * return 0 but RTE_BAD_PHYS_ADDR/IOVA instead. To support older and
+ * newer versions, check for both return values.
+ */
+ paddr = VTOPHYS((void *)vaddr);
+ if (paddr == 0 || paddr == BAD_ADDR) {
+ /*
+ * The vaddr may be valid but doesn't have a backing page
+ * assigned yet. Touch the page to ensure a backing page
+ * gets assigned, then try to translate again.
+ */
+ rte_atomic64_read((rte_atomic64_t *)vaddr);
+ paddr = VTOPHYS((void *)vaddr);
+ }
+ if (paddr == 0 || paddr == BAD_ADDR) {
+ /* Unable to get to the physical address. */
+ return SPDK_VTOPHYS_ERROR;
+ }
+
+#undef BAD_ADDR
+#undef VTOPHYS
+
+ return paddr;
+}
+
+/* Try to get the paddr from pci devices */
+static uint64_t
+vtophys_get_paddr_pci(uint64_t vaddr)
+{
+ struct spdk_vtophys_pci_device *vtophys_dev;
+ uintptr_t paddr;
+ struct rte_pci_device *dev;
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 1)
+ struct rte_mem_resource *res;
+#else
+ struct rte_pci_resource *res;
+#endif
+ unsigned r;
+
+ pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+ TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+ dev = vtophys_dev->pci_device;
+
+ for (r = 0; r < PCI_MAX_RESOURCE; r++) {
+ res = &dev->mem_resource[r];
+ if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
+ vaddr < (uint64_t)res->addr + res->len) {
+ paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
+ DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
+ (void *)paddr);
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+ return paddr;
+ }
+ }
+ }
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+ return SPDK_VTOPHYS_ERROR;
+}
+
+static int
+spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
+ enum spdk_mem_map_notify_action action,
+ void *vaddr, size_t len)
+{
+ int rc = 0, pci_phys = 0;
+ uint64_t paddr;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+ __func__, vaddr, len);
+ return -EINVAL;
+ }
+
+ while (len > 0) {
+ /* Get the physical address from the DPDK memsegs */
+ paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
+
+ switch (action) {
+ case SPDK_MEM_MAP_NOTIFY_REGISTER:
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ /* This is not an address that DPDK is managing. */
+#if SPDK_VFIO_ENABLED
+ if (g_vfio.enabled) {
+ /* We'll use the virtual address as the iova. DPDK
+ * currently uses physical addresses as the iovas (or counts
+ * up from 0 if it can't get physical addresses), so
+ * the range of user space virtual addresses and physical
+ * addresses will never overlap.
+ */
+ paddr = (uint64_t)vaddr;
+ rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
+ if (rc) {
+ return -EFAULT;
+ }
+ } else
+#endif
+ {
+ /* Get the physical address from /proc/self/pagemap. */
+ paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ /* Get the physical address from PCI devices */
+ paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
+ return -EFAULT;
+ }
+ pci_phys = 1;
+ }
+ }
+ }
+ /* A PCI BAR paddr can break the 2 MiB physical alignment, so skip this check for it. */
+ if (!pci_phys && (paddr & MASK_2MB)) {
+ DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
+ return -EINVAL;
+ }
+
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
+ break;
+ case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
+#if SPDK_VFIO_ENABLED
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ /*
+ * This is not an address that DPDK is managing. If vfio is enabled,
+ * we need to unmap the range from the IOMMU
+ */
+ if (g_vfio.enabled) {
+ uint64_t buffer_len;
+ paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
+ if (buffer_len != VALUE_2MB) {
+ return -EINVAL;
+ }
+ rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
+ if (rc) {
+ return -EFAULT;
+ }
+ }
+ }
+#endif
+ rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ }
+
+ if (rc != 0) {
+ return rc;
+ }
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+
+ return rc;
+}
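+
+/* Usage sketch: the notify callback above is driven by the generic memory
+ * map code, so registering a 2 MB-aligned region makes it translatable via
+ * spdk_vtophys() (buf is an illustrative, suitably aligned allocation):
+ *
+ *   if (spdk_mem_register(buf, VALUE_2MB) == 0) {
+ *           uint64_t paddr = spdk_vtophys(buf);
+ *           ...
+ *           spdk_mem_unregister(buf, VALUE_2MB);
+ *   }
+ */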
+
+#if SPDK_VFIO_ENABLED
+
+static bool
+spdk_vfio_enabled(void)
+{
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+ return rte_vfio_is_enabled("vfio_pci");
+#else
+ return pci_vfio_is_enabled();
+#endif
+}
+
+static void
+spdk_vtophys_iommu_init(void)
+{
+ char proc_fd_path[PATH_MAX + 1];
+ char link_path[PATH_MAX + 1];
+ const char vfio_path[] = "/dev/vfio/vfio";
+ DIR *dir;
+ struct dirent *d;
+
+ if (!spdk_vfio_enabled()) {
+ return;
+ }
+
+ dir = opendir("/proc/self/fd");
+ if (!dir) {
+ DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
+ return;
+ }
+
+ while ((d = readdir(dir)) != NULL) {
+ if (d->d_type != DT_LNK) {
+ continue;
+ }
+
+ snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
+ if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
+ continue;
+ }
+
+ if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
+ sscanf(d->d_name, "%d", &g_vfio.fd);
+ break;
+ }
+ }
+
+ closedir(dir);
+
+ if (g_vfio.fd < 0) {
+ DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
+ return;
+ }
+
+ g_vfio.enabled = true;
+
+ return;
+}
+#endif
+
+void
+spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
+{
+ struct spdk_vtophys_pci_device *vtophys_dev;
+ bool found = false;
+
+ pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+ TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+ if (vtophys_dev->pci_device == pci_device) {
+ vtophys_dev->ref++;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ vtophys_dev = calloc(1, sizeof(*vtophys_dev));
+ if (vtophys_dev) {
+ vtophys_dev->pci_device = pci_device;
+ vtophys_dev->ref = 1;
+ TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
+ } else {
+ DEBUG_PRINT("Memory allocation error\n");
+ }
+ }
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if SPDK_VFIO_ENABLED
+ struct spdk_vfio_dma_map *dma_map;
+ int ret;
+
+ if (!g_vfio.enabled) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ g_vfio.device_ref++;
+ if (g_vfio.device_ref > 1) {
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return;
+ }
+
+ /* This is the first SPDK device using DPDK vfio. This means the first
+ * IOMMU group might have just been added to the DPDK vfio container.
+ * From this point on, the deferred memory mappings can be applied.
+ */
+ TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+ if (ret) {
+ DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+void
+spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
+{
+ struct spdk_vtophys_pci_device *vtophys_dev;
+
+ pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+ TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+ if (vtophys_dev->pci_device == pci_device) {
+ assert(vtophys_dev->ref > 0);
+ if (--vtophys_dev->ref == 0) {
+ TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
+ free(vtophys_dev);
+ }
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if SPDK_VFIO_ENABLED
+ struct spdk_vfio_dma_map *dma_map;
+ int ret;
+
+ if (!g_vfio.enabled) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ assert(g_vfio.device_ref > 0);
+ g_vfio.device_ref--;
+ if (g_vfio.device_ref > 0) {
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return;
+ }
+
+ /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
+ * any additional devices using its vfio container, all the mappings
+ * will be automatically removed by the Linux vfio driver. We unmap
+ * the memory manually here to be able to easily re-map it later,
+ * regardless of other, external factors.
+ */
+ TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+ if (ret) {
+ DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+int
+spdk_vtophys_init(void)
+{
+ const struct spdk_mem_map_ops vtophys_map_ops = {
+ .notify_cb = spdk_vtophys_notify,
+ .are_contiguous = NULL
+ };
+
+#if SPDK_VFIO_ENABLED
+ spdk_vtophys_iommu_init();
+#endif
+
+ g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
+ if (g_vtophys_map == NULL) {
+ DEBUG_PRINT("vtophys map allocation failed\n");
+ return -1;
+ }
+ return 0;
+}
+
+uint64_t
+spdk_vtophys(void *buf)
+{
+ uint64_t vaddr, paddr_2mb;
+
+ vaddr = (uint64_t)buf;
+
+ paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, NULL);
+
+ /*
+ * SPDK_VTOPHYS_ERROR has all bits set. The translation used to be
+ * combined with the buffer offset using bitwise-or, which left
+ * SPDK_VTOPHYS_ERROR intact on lookup failure. Now that we use
+ * addition instead (PCI BAR translations may be unaligned), the
+ * return value must be checked before adding the offset.
+ */
+ SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
+ if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
+ return SPDK_VTOPHYS_ERROR;
+ } else {
+ return paddr_2mb + ((uint64_t)buf & MASK_2MB);
+ }
+}
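+
+/* Usage sketch: memory obtained from spdk_dma_malloc() is already
+ * registered with the vtophys map, so it translates directly:
+ *
+ *   void *buf = spdk_dma_malloc(4096, 0x1000, NULL);
+ *
+ *   if (buf != NULL) {
+ *           uint64_t paddr = spdk_vtophys(buf);
+ *
+ *           assert(paddr != SPDK_VTOPHYS_ERROR);
+ *           spdk_dma_free(buf);
+ *   }
+ */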
+
+static int
+spdk_bus_scan(void)
+{
+ return 0;
+}
+
+static int
+spdk_bus_probe(void)
+{
+ return 0;
+}
+
+static struct rte_device *
+spdk_bus_find_device(const struct rte_device *start,
+ rte_dev_cmp_t cmp, const void *data)
+{
+ return NULL;
+}
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+static enum rte_iova_mode
+spdk_bus_get_iommu_class(void)
+{
+ /* Since we register our PCI drivers after EAL init, we have no chance
+ * of switching into the RTE_IOVA_VA (virtual addresses as iova) iommu
+ * class. DPDK uses RTE_IOVA_PA by default because for some platforms
+ * it's the only supported mode, but SPDK does not support those
+ * platforms anyway and doesn't mind defaulting to RTE_IOVA_VA. The
+ * rte_pci bus will still force RTE_IOVA_PA if RTE_IOVA_VA simply cannot
+ * be used (i.e. at least one device on the system is bound to
+ * uio_pci_generic), so we simply return RTE_IOVA_VA here.
+ */
+ return RTE_IOVA_VA;
+}
+#endif
+
+struct rte_bus spdk_bus = {
+ .scan = spdk_bus_scan,
+ .probe = spdk_bus_probe,
+ .find_device = spdk_bus_find_device,
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+ .get_iommu_class = spdk_bus_get_iommu_class,
+#endif
+};
+
+RTE_REGISTER_BUS(spdk, spdk_bus);