Diffstat (limited to 'src/spdk/lib/env_dpdk')
-rw-r--r-- | src/spdk/lib/env_dpdk/Makefile | 47
-rw-r--r-- | src/spdk/lib/env_dpdk/env.c | 451
-rw-r--r-- | src/spdk/lib/env_dpdk/env.mk | 176
-rw-r--r-- | src/spdk/lib/env_dpdk/env_internal.h | 98
-rw-r--r-- | src/spdk/lib/env_dpdk/init.c | 604
-rw-r--r-- | src/spdk/lib/env_dpdk/memory.c | 1442
-rw-r--r-- | src/spdk/lib/env_dpdk/pci.c | 1063
-rw-r--r-- | src/spdk/lib/env_dpdk/pci_idxd.c | 50
-rw-r--r-- | src/spdk/lib/env_dpdk/pci_ioat.c | 98
-rw-r--r-- | src/spdk/lib/env_dpdk/pci_virtio.c | 53
-rw-r--r-- | src/spdk/lib/env_dpdk/pci_vmd.c | 50
-rw-r--r-- | src/spdk/lib/env_dpdk/spdk_env_dpdk.map | 114
-rw-r--r-- | src/spdk/lib/env_dpdk/threads.c | 108
13 files changed, 4354 insertions, 0 deletions
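
For orientation before the per-file diffs: a minimal sketch of how an application consumes the env layer these files add. The function names are taken from env.c and init.c below; error handling is abbreviated, and the build is assumed to link with the flags env.mk exports (ENV_CFLAGS, ENV_LINKER_ARGS).

#include <stdio.h>

#include "spdk/env.h"

int
main(void)
{
	struct spdk_env_opts opts;
	struct spdk_mempool *pool;
	void *buf, *obj;

	spdk_env_opts_init(&opts);
	opts.name = "env_demo";

	if (spdk_env_init(&opts) < 0) {
		fprintf(stderr, "spdk_env_init() failed\n");
		return 1;
	}

	/* Zeroed, DMA-safe 4 KiB buffer from the DPDK hugepage heap. */
	buf = spdk_dma_zmalloc(4096, 64, NULL);

	/* 128 fixed-size objects; cache_size 0 keeps the example simple. */
	pool = spdk_mempool_create("demo_pool", 128, 512, 0,
				   SPDK_ENV_SOCKET_ID_ANY);
	if (buf == NULL || pool == NULL) {
		fprintf(stderr, "allocation failed\n");
		return 1;
	}

	obj = spdk_mempool_get(pool);
	if (obj != NULL) {
		spdk_mempool_put(pool, obj);
	}

	spdk_mempool_free(pool);
	spdk_dma_free(buf);
	spdk_env_fini();
	return 0;
}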
diff --git a/src/spdk/lib/env_dpdk/Makefile b/src/spdk/lib/env_dpdk/Makefile new file mode 100644 index 000000000..11433fe86 --- /dev/null +++ b/src/spdk/lib/env_dpdk/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 5 +SO_MINOR := 0 + +CFLAGS += $(ENV_CFLAGS) +C_SRCS = env.c memory.c pci.c init.c threads.c +C_SRCS += pci_ioat.c pci_virtio.c pci_vmd.c pci_idxd.c +LIBNAME = env_dpdk + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_env_dpdk.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/env_dpdk/env.c b/src/spdk/lib/env_dpdk/env.c new file mode 100644 index 000000000..94b709de9 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.c @@ -0,0 +1,451 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/util.h" +#include "spdk/env_dpdk.h" + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_cycles.h> +#include <rte_malloc.h> +#include <rte_mempool.h> +#include <rte_memzone.h> +#include <rte_version.h> + +static uint64_t +virt_to_phys(void *vaddr) +{ + uint64_t ret; + + ret = rte_malloc_virt2iova(vaddr); + if (ret != RTE_BAD_IOVA) { + return ret; + } + + return spdk_vtophys(vaddr, NULL); +} + +void * +spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + void *buf; + + if (flags == 0) { + return NULL; + } + + align = spdk_max(align, RTE_CACHE_LINE_SIZE); + buf = rte_malloc_socket(NULL, size, align, socket_id); + if (buf && phys_addr) { +#ifdef DEBUG + fprintf(stderr, "phys_addr param in spdk_*malloc() is deprecated\n"); +#endif + *phys_addr = virt_to_phys(buf); + } + return buf; +} + +void * +spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + void *buf = spdk_malloc(size, align, phys_addr, socket_id, flags); + if (buf) { + memset(buf, 0, size); + } + return buf; +} + +void * +spdk_realloc(void *buf, size_t size, size_t align) +{ + align = spdk_max(align, RTE_CACHE_LINE_SIZE); + return rte_realloc(buf, size, align); +} + +void +spdk_free(void *buf) +{ + rte_free(buf); +} + +void * +spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_malloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_zmalloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_malloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_zmalloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr) +{ + void *new_buf; + + align = spdk_max(align, RTE_CACHE_LINE_SIZE); + new_buf = rte_realloc(buf, size, align); + if (new_buf && phys_addr) { + *phys_addr = virt_to_phys(new_buf); + } + return new_buf; +} + +void +spdk_dma_free(void *buf) +{ + spdk_free(buf); +} + +void * +spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align) +{ + const struct rte_memzone *mz; + unsigned dpdk_flags = 0; + + if ((flags & SPDK_MEMZONE_NO_IOVA_CONTIG) == 0) { + dpdk_flags |= RTE_MEMZONE_IOVA_CONTIG; + } + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + mz = rte_memzone_reserve_aligned(name, len, socket_id, dpdk_flags, align); + + if (mz != NULL) { + memset(mz->addr, 0, len); + return mz->addr; + } else { + return NULL; + } +} + +void * +spdk_memzone_reserve(const char *name, 
size_t len, int socket_id, unsigned flags) +{ + return spdk_memzone_reserve_aligned(name, len, socket_id, flags, + RTE_CACHE_LINE_SIZE); +} + +void * +spdk_memzone_lookup(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return mz->addr; + } else { + return NULL; + } +} + +int +spdk_memzone_free(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return rte_memzone_free(mz); + } + + return -1; +} + +void +spdk_memzone_dump(FILE *f) +{ + rte_memzone_dump(f); +} + +struct spdk_mempool * +spdk_mempool_create_ctor(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id, + spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg) +{ + struct rte_mempool *mp; + size_t tmp; + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + /* No more than half of all elements can be in cache */ + tmp = (count / 2) / rte_lcore_count(); + if (cache_size > tmp) { + cache_size = tmp; + } + + if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) { + cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE; + } + + mp = rte_mempool_create(name, count, ele_size, cache_size, + 0, NULL, NULL, (rte_mempool_obj_cb_t *)obj_init, obj_init_arg, + socket_id, MEMPOOL_F_NO_PHYS_CONTIG); + + return (struct spdk_mempool *)mp; +} + + +struct spdk_mempool * +spdk_mempool_create(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id) +{ + return spdk_mempool_create_ctor(name, count, ele_size, cache_size, socket_id, + NULL, NULL); +} + +char * +spdk_mempool_get_name(struct spdk_mempool *mp) +{ + return ((struct rte_mempool *)mp)->name; +} + +void +spdk_mempool_free(struct spdk_mempool *mp) +{ + rte_mempool_free((struct rte_mempool *)mp); +} + +void * +spdk_mempool_get(struct spdk_mempool *mp) +{ + void *ele = NULL; + int rc; + + rc = rte_mempool_get((struct rte_mempool *)mp, &ele); + if (rc != 0) { + return NULL; + } + return ele; +} + +int +spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + return rte_mempool_get_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +void +spdk_mempool_put(struct spdk_mempool *mp, void *ele) +{ + rte_mempool_put((struct rte_mempool *)mp, ele); +} + +void +spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + rte_mempool_put_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +size_t +spdk_mempool_count(const struct spdk_mempool *pool) +{ + return rte_mempool_avail_count((struct rte_mempool *)pool); +} + +uint32_t +spdk_mempool_obj_iter(struct spdk_mempool *mp, spdk_mempool_obj_cb_t obj_cb, + void *obj_cb_arg) +{ + return rte_mempool_obj_iter((struct rte_mempool *)mp, (rte_mempool_obj_cb_t *)obj_cb, + obj_cb_arg); +} + +struct spdk_mempool * +spdk_mempool_lookup(const char *name) +{ + return (struct spdk_mempool *)rte_mempool_lookup(name); +} + +bool +spdk_process_is_primary(void) +{ + return (rte_eal_process_type() == RTE_PROC_PRIMARY); +} + +uint64_t spdk_get_ticks(void) +{ + return rte_get_timer_cycles(); +} + +uint64_t spdk_get_ticks_hz(void) +{ + return rte_get_timer_hz(); +} + +void spdk_delay_us(unsigned int us) +{ + rte_delay_us(us); +} + +void spdk_pause(void) +{ + rte_pause(); +} + +void +spdk_unaffinitize_thread(void) +{ + rte_cpuset_t new_cpuset, orig_cpuset; + long num_cores, i, orig_num_cores; + + CPU_ZERO(&new_cpuset); + + num_cores = sysconf(_SC_NPROCESSORS_CONF); + + /* Create a mask containing all CPUs */ + for (i = 0; i < num_cores; i++) { + CPU_SET(i, 
&new_cpuset); + } + + rte_thread_get_affinity(&orig_cpuset); + orig_num_cores = CPU_COUNT(&orig_cpuset); + if (orig_num_cores < num_cores) { + for (i = 0; i < orig_num_cores; i++) { + if (CPU_ISSET(i, &orig_cpuset)) { + CPU_CLR(i, &new_cpuset); + } + } + } + + rte_thread_set_affinity(&new_cpuset); +} + +void * +spdk_call_unaffinitized(void *cb(void *arg), void *arg) +{ + rte_cpuset_t orig_cpuset; + void *ret; + + if (cb == NULL) { + return NULL; + } + + rte_thread_get_affinity(&orig_cpuset); + + spdk_unaffinitize_thread(); + + ret = cb(arg); + + rte_thread_set_affinity(&orig_cpuset); + + return ret; +} + +struct spdk_ring * +spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id) +{ + char ring_name[64]; + static uint32_t ring_num = 0; + unsigned flags = RING_F_EXACT_SZ; + + switch (type) { + case SPDK_RING_TYPE_SP_SC: + flags |= RING_F_SP_ENQ | RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_SC: + flags |= RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_MC: + flags |= 0; + break; + default: + return NULL; + } + + snprintf(ring_name, sizeof(ring_name), "ring_%u_%d", + __atomic_fetch_add(&ring_num, 1, __ATOMIC_RELAXED), getpid()); + + return (struct spdk_ring *)rte_ring_create(ring_name, count, socket_id, flags); +} + +void +spdk_ring_free(struct spdk_ring *ring) +{ + rte_ring_free((struct rte_ring *)ring); +} + +size_t +spdk_ring_count(struct spdk_ring *ring) +{ + return rte_ring_count((struct rte_ring *)ring); +} + +size_t +spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count, + size_t *free_space) +{ + return rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count, + (unsigned int *)free_space); +} + +size_t +spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count) +{ + return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count, NULL); +} + +void +spdk_env_dpdk_dump_mem_stats(FILE *file) +{ + fprintf(file, "DPDK memory size %lu\n", rte_eal_get_physmem_size()); + fprintf(file, "DPDK memory layout\n"); + rte_dump_physmem_layout(file); + fprintf(file, "DPDK memzones.\n"); + rte_memzone_dump(file); + fprintf(file, "DPDK mempools.\n"); + rte_mempool_list_dump(file); + fprintf(file, "DPDK malloc stats.\n"); + rte_malloc_dump_stats(file, NULL); + fprintf(file, "DPDK malloc heaps.\n"); + rte_malloc_dump_heaps(file); +} diff --git a/src/spdk/lib/env_dpdk/env.mk b/src/spdk/lib/env_dpdk/env.mk new file mode 100644 index 000000000..c2bfb0d19 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.mk @@ -0,0 +1,176 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# This makefile snippet must define the following flags: +# ENV_CFLAGS +# ENV_CXXFLAGS +# ENV_LIBS +# ENV_LINKER_ARGS + +DPDK_DIR = $(CONFIG_DPDK_DIR) + +export DPDK_ABS_DIR = $(abspath $(DPDK_DIR)) + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/include/rte_config.h)) +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include +else +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include/dpdk +endif +DPDK_INC := -I$(DPDK_INC_DIR) + +ifeq ($(CONFIG_SHARED),y) +DPDK_LIB_EXT = .so +else +DPDK_LIB_EXT = .a +endif + +DPDK_LIB_LIST = rte_eal rte_mempool rte_ring rte_mbuf + +# librte_mempool_ring was new added from DPDK 17.05. Link this library used for +# ring based mempool management API. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_mempool_ring.*)) +DPDK_LIB_LIST += rte_mempool_ring +endif + +# librte_malloc was removed after DPDK 2.1. Link this library conditionally based on its +# existence to maintain backward compatibility. +ifneq ($(wildcard $(DPDK_ABS_DIR)/lib/librte_malloc.*),) +DPDK_LIB_LIST += rte_malloc +endif + +# librte_pci and librte_bus_pci were added in DPDK 17.11. Link these libraries conditionally +# based on their existence to maintain backward compatibility. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_pci.*)) +DPDK_LIB_LIST += rte_pci +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_bus_pci.*)) +DPDK_LIB_LIST += rte_bus_pci +endif + +# DPDK 20.05 eal dependency +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_telemetry.*)) +DPDK_LIB_LIST += rte_telemetry +endif + +# There are some complex dependencies when using crypto, reduce or both so +# here we add the feature specific ones and set a flag to add the common +# ones after that. 
+DPDK_FRAMEWORK=n +ifeq ($(CONFIG_CRYPTO),y) +DPDK_FRAMEWORK=y +DPDK_LIB_LIST += rte_pmd_aesni_mb rte_reorder +endif + +ifeq ($(CONFIG_REDUCE),y) +DPDK_FRAMEWORK=y +DPDK_LIB_LIST += rte_pmd_isal +endif + +ifeq ($(DPDK_FRAMEWORK),y) +DPDK_LIB_LIST += rte_cryptodev rte_compressdev rte_bus_vdev rte_pmd_qat +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_kvargs.*)) +DPDK_LIB_LIST += rte_kvargs +endif + +LINK_HASH=n + +ifeq ($(CONFIG_VHOST),y) +ifneq ($(CONFIG_VHOST_INTERNAL_LIB),y) +DPDK_LIB_LIST += rte_vhost rte_net +LINK_HASH=y +ifneq ($(DPDK_FRAMEWORK),y) +DPDK_LIB_LIST += rte_cryptodev +endif +endif +endif + +ifeq ($(CONFIG_RAID5),y) +LINK_HASH=y +endif + +ifeq ($(LINK_HASH),y) +DPDK_LIB_LIST += rte_hash +endif + +define dpdk_lib_list_to_libs +$(1:%=$(DPDK_ABS_DIR)/lib/lib%$(DPDK_LIB_EXT)) +endef + +define dpdk_env_linker_args +$(ENV_DPDK_FILE) -Wl,--whole-archive,--no-as-needed $(call dpdk_lib_list_to_libs,$1) -Wl,--no-whole-archive +endef + +DPDK_LIB = $(call dpdk_lib_list_to_libs,$(DPDK_LIB_LIST)) + +# SPDK memory registration requires experimental (deprecated) rte_memory API for DPDK 18.05 +ENV_CFLAGS = $(DPDK_INC) -Wno-deprecated-declarations +ENV_CXXFLAGS = $(ENV_CFLAGS) +ifeq ($(CONFIG_SHARED),y) +ENV_DPDK_FILE = $(call spdk_lib_list_to_shared_libs,env_dpdk) +else +ENV_DPDK_FILE = $(call spdk_lib_list_to_static_libs,env_dpdk) +endif +ENV_LIBS = $(ENV_DPDK_FILE) $(DPDK_LIB) +ENV_LINKER_ARGS = -Wl,-rpath-link $(DPDK_ABS_DIR)/lib +ENV_LINKER_ARGS += $(call dpdk_env_linker_args,$(DPDK_LIB_LIST)) + +ifeq ($(CONFIG_IPSEC_MB),y) +ENV_LINKER_ARGS += -lIPSec_MB -L$(IPSEC_MB_DIR) +endif + +ifeq ($(CONFIG_REDUCE),y) +ENV_LINKER_ARGS += -lisal -L$(ISAL_DIR)/.libs +endif + +ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_config.h)) +ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_config.h)) +ENV_LINKER_ARGS += -lnuma +endif +endif + +# DPDK built with meson puts those defines elsewhere +ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_build_config.h)) +ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_build_config.h)) +ENV_LINKER_ARGS += -lnuma +endif +endif + +ifeq ($(OS),Linux) +ENV_LINKER_ARGS += -ldl +endif +ifeq ($(OS),FreeBSD) +ENV_LINKER_ARGS += -lexecinfo +endif diff --git a/src/spdk/lib/env_dpdk/env_internal.h b/src/spdk/lib/env_dpdk/env_internal.h new file mode 100644 index 000000000..c7900d9d3 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env_internal.h @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ENV_INTERNAL_H +#define SPDK_ENV_INTERNAL_H + +#include "spdk/stdinc.h" + +#include "spdk/env.h" + +#include <rte_config.h> +#include <rte_version.h> +#include <rte_eal.h> +#include <rte_bus.h> +#include <rte_pci.h> +#include <rte_bus_pci.h> +#include <rte_dev.h> + +#if RTE_VERSION < RTE_VERSION_NUM(18, 11, 0, 0) +#error RTE_VERSION is too old! Minimum 18.11 is required. +#endif + +/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47], + * which is enough to cover 256 TB. + */ +#define SHIFT_256TB 48 /* (1 << 48) == 256 TB */ +#define MASK_256TB ((1ULL << SHIFT_256TB) - 1) + +#define SHIFT_1GB 30 /* (1 << 30) == 1 GB */ +#define MASK_1GB ((1ULL << SHIFT_1GB) - 1) + +#define SPDK_PCI_DRIVER_MAX_NAME_LEN 32 +struct spdk_pci_driver { + struct rte_pci_driver driver; + + const char *name; + const struct spdk_pci_id *id_table; + uint32_t drv_flags; + + spdk_pci_enum_cb cb_fn; + void *cb_arg; + TAILQ_ENTRY(spdk_pci_driver) tailq; +}; + +int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device); +int pci_device_fini(struct rte_pci_device *device); + +void pci_env_init(void); +void pci_env_reinit(void); +void pci_env_fini(void); +int mem_map_init(bool legacy_mem); +int vtophys_init(void); + +/** + * Report a DMA-capable PCI device to the vtophys translation code. + * Increases the refcount of active DMA-capable devices managed by SPDK. + * This must be called after a `rte_pci_device` is created. + */ +void vtophys_pci_device_added(struct rte_pci_device *pci_device); + +/** + * Report the removal of a DMA-capable PCI device to the vtophys translation code. + * Decreases the refcount of active DMA-capable devices managed by SPDK. + * This must be called before a `rte_pci_device` is destroyed. + */ +void vtophys_pci_device_removed(struct rte_pci_device *pci_device); + +#endif diff --git a/src/spdk/lib/env_dpdk/init.c b/src/spdk/lib/env_dpdk/init.c new file mode 100644 index 000000000..0376dbe7b --- /dev/null +++ b/src/spdk/lib/env_dpdk/init.c @@ -0,0 +1,604 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include "spdk/version.h" +#include "spdk/env_dpdk.h" + +#include <rte_config.h> +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_vfio.h> + +#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" +#define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 +#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 +#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" +#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 + +static char **g_eal_cmdline; +static int g_eal_cmdline_argcount; +static bool g_external_init = true; + +static char * +_sprintf_alloc(const char *format, ...) +{ + va_list args; + va_list args_copy; + char *buf; + size_t bufsize; + int rc; + + va_start(args, format); + + /* Try with a small buffer first. */ + bufsize = 32; + + /* Limit maximum buffer size to something reasonable so we don't loop forever. */ + while (bufsize <= 1024 * 1024) { + buf = malloc(bufsize); + if (buf == NULL) { + va_end(args); + return NULL; + } + + va_copy(args_copy, args); + rc = vsnprintf(buf, bufsize, format, args_copy); + va_end(args_copy); + + /* + * If vsnprintf() returned a count within our current buffer size, we are done. + * The count does not include the \0 terminator, so rc == bufsize is not OK. + */ + if (rc >= 0 && (size_t)rc < bufsize) { + va_end(args); + return buf; + } + + /* + * vsnprintf() should return the required space, but some libc versions do not + * implement this correctly, so just double the buffer size and try again. + * + * We don't need the data in buf, so rather than realloc(), use free() and malloc() + * again to avoid a copy. 
+ */ + free(buf); + bufsize *= 2; + } + + va_end(args); + return NULL; +} + +void +spdk_env_opts_init(struct spdk_env_opts *opts) +{ + if (!opts) { + return; + } + + memset(opts, 0, sizeof(*opts)); + + opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; + opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; + opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; + opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; + opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; + opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; + opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; +} + +static void +free_args(char **args, int argcount) +{ + int i; + + if (args == NULL) { + return; + } + + for (i = 0; i < argcount; i++) { + free(args[i]); + } + + if (argcount) { + free(args); + } +} + +static char ** +push_arg(char *args[], int *argcount, char *arg) +{ + char **tmp; + + if (arg == NULL) { + fprintf(stderr, "%s: NULL arg supplied\n", __func__); + free_args(args, *argcount); + return NULL; + } + + tmp = realloc(args, sizeof(char *) * (*argcount + 1)); + if (tmp == NULL) { + free(arg); + free_args(args, *argcount); + return NULL; + } + + tmp[*argcount] = arg; + (*argcount)++; + + return tmp; +} + +#if defined(__linux__) && defined(__x86_64__) + +/* TODO: Can likely get this value from rlimits in the future */ +#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 +#define VTD_CAP_MGAW_SHIFT 16 +#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) + +static int +get_iommu_width(void) +{ + DIR *dir; + FILE *file; + struct dirent *entry; + char mgaw_path[64]; + char buf[64]; + char *end; + long long int val; + int width, tmp; + + dir = opendir("/sys/devices/virtual/iommu/"); + if (dir == NULL) { + return -EINVAL; + } + + width = 0; + + while ((entry = readdir(dir)) != NULL) { + /* Find directories named "dmar0", "dmar1", etc */ + if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) { + continue; + } + + tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", + entry->d_name); + if ((unsigned)tmp >= sizeof(mgaw_path)) { + continue; + } + + file = fopen(mgaw_path, "r"); + if (file == NULL) { + continue; + } + + if (fgets(buf, sizeof(buf), file) == NULL) { + fclose(file); + continue; + } + + val = strtoll(buf, &end, 16); + if (val == LLONG_MIN || val == LLONG_MAX) { + fclose(file); + continue; + } + + tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; + if (width == 0 || tmp < width) { + width = tmp; + } + + fclose(file); + } + + closedir(dir); + + return width; +} + +#endif + +static int +build_eal_cmdline(const struct spdk_env_opts *opts) +{ + int argcount = 0; + char **args; + + args = NULL; + + /* set the program name */ + args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); + if (args == NULL) { + return -1; + } + + /* disable shared configuration files when in single process mode. 
This allows for cleaner shutdown */ + if (opts->shm_id < 0) { + args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); + if (args == NULL) { + return -1; + } + } + + /* set the coremask */ + /* NOTE: If coremask starts with '[' and ends with ']' it is a core list + */ + if (opts->core_mask[0] == '[') { + char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); + + if (l_arg != NULL) { + int len = strlen(l_arg); + + if (l_arg[len - 1] == ']') { + l_arg[len - 1] = '\0'; + } + } + args = push_arg(args, &argcount, l_arg); + } else { + args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); + } + + if (args == NULL) { + return -1; + } + + /* set the memory channel number */ + if (opts->mem_channel > 0) { + args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); + if (args == NULL) { + return -1; + } + } + + /* set the memory size */ + if (opts->mem_size >= 0) { + args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); + if (args == NULL) { + return -1; + } + } + + /* set the master core */ + if (opts->master_core > 0) { + args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", + opts->master_core)); + if (args == NULL) { + return -1; + } + } + + /* set no pci if enabled */ + if (opts->no_pci) { + args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); + if (args == NULL) { + return -1; + } + } + + /* create just one hugetlbfs file */ + if (opts->hugepage_single_segments) { + args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); + if (args == NULL) { + return -1; + } + } + + /* unlink hugepages after initialization */ + if (opts->unlink_hugepage) { + args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); + if (args == NULL) { + return -1; + } + } + + /* use a specific hugetlbfs mount */ + if (opts->hugedir) { + args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); + if (args == NULL) { + return -1; + } + } + + if (opts->num_pci_addr) { + size_t i; + char bdf[32]; + struct spdk_pci_addr *pci_addr = + opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; + + for (i = 0; i < opts->num_pci_addr; i++) { + spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); + args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", + (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), + bdf)); + if (args == NULL) { + return -1; + } + } + } + + /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. + * This can be overridden by specifying the same option in opts->env_context + */ + args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); + if (args == NULL) { + return -1; + } + + /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. + * This can be overridden by specifying the same option in opts->env_context + */ + args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); + if (args == NULL) { + return -1; + } + + /* `user1` log type is used by rte_vhost, which prints an INFO log for each received + * vhost user message. We don't want that. The same log type is also used by a couple + * of other DPDK libs, but none of which we make use right now. If necessary, this can + * be overridden via opts->env_context. 
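+ * (Numeric level 6 is RTE_LOG_NOTICE, the same level set for lib.eal
+ * above, so rte_vhost's per-message INFO prints are suppressed.)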
+ */ + args = push_arg(args, &argcount, strdup("--log-level=user1:6")); + if (args == NULL) { + return -1; + } + + if (opts->env_context) { + args = push_arg(args, &argcount, strdup(opts->env_context)); + if (args == NULL) { + return -1; + } + } + +#ifdef __linux__ + + if (opts->iova_mode) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); + if (args == NULL) { + return -1; + } + } else { + /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, + * but DPDK guesses it should be iova-mode=va. Add a check and force + * iova-mode=pa here. */ + if (rte_vfio_noiommu_is_enabled()) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } + +#if defined(__x86_64__) + /* DPDK by default guesses that it should be using iova-mode=va so that it can + * support running as an unprivileged user. However, some systems (especially + * virtual machines) don't have an IOMMU capable of handling the full virtual + * address space and DPDK doesn't currently catch that. Add a check in SPDK + * and force iova-mode=pa here. */ + if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } +#elif defined(__PPC64__) + /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly + * auto-detect at the moment, so we'll just force it here. */ + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } +#endif + } + + + /* Set the base virtual address - it must be an address that is not in the + * ASAN shadow region, otherwise ASAN-enabled builds will ignore the + * mmap hint. + * + * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm + */ + args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); + if (args == NULL) { + return -1; + } + + /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. + * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two + * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split + * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 
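+ * Concretely: if DPDK merged two separate allocations and a mempool element
+ * later straddled the original boundary, that element would span two SPDK
+ * memory registrations and no single RDMA memory region (MR) would cover it.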
+ */ +#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) + if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { + args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); + if (args == NULL) { + return -1; + } + } +#endif + + if (opts->shm_id < 0) { + args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", + getpid())); + if (args == NULL) { + return -1; + } + } else { + args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", + opts->shm_id)); + if (args == NULL) { + return -1; + } + + /* set the process type */ + args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); + if (args == NULL) { + return -1; + } + } +#endif + + g_eal_cmdline = args; + g_eal_cmdline_argcount = argcount; + return argcount; +} + +int +spdk_env_dpdk_post_init(bool legacy_mem) +{ + int rc; + + pci_env_init(); + + rc = mem_map_init(legacy_mem); + if (rc < 0) { + fprintf(stderr, "Failed to allocate mem_map\n"); + return rc; + } + + rc = vtophys_init(); + if (rc < 0) { + fprintf(stderr, "Failed to initialize vtophys\n"); + return rc; + } + + return 0; +} + +void +spdk_env_dpdk_post_fini(void) +{ + pci_env_fini(); + + free_args(g_eal_cmdline, g_eal_cmdline_argcount); + g_eal_cmdline = NULL; + g_eal_cmdline_argcount = 0; +} + +int +spdk_env_init(const struct spdk_env_opts *opts) +{ + char **dpdk_args = NULL; + int i, rc; + int orig_optind; + bool legacy_mem; + + /* If SPDK env has been initialized before, then only pci env requires + * reinitialization. + */ + if (g_external_init == false) { + if (opts != NULL) { + fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); + return -EINVAL; + } + + printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); + pci_env_reinit(); + + return 0; + } + + if (opts == NULL) { + fprintf(stderr, "NULL arguments to initialize DPDK\n"); + return -EINVAL; + } + + rc = build_eal_cmdline(opts); + if (rc < 0) { + fprintf(stderr, "Invalid arguments to initialize DPDK\n"); + return -EINVAL; + } + + printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); + printf("[ DPDK EAL parameters: "); + for (i = 0; i < g_eal_cmdline_argcount; i++) { + printf("%s ", g_eal_cmdline[i]); + } + printf("]\n"); + + /* DPDK rearranges the array we pass to it, so make a copy + * before passing so we can still free the individual strings + * correctly. 
+ */ + dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); + if (dpdk_args == NULL) { + fprintf(stderr, "Failed to allocate dpdk_args\n"); + return -ENOMEM; + } + memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); + + fflush(stdout); + orig_optind = optind; + optind = 1; + rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); + optind = orig_optind; + + free(dpdk_args); + + if (rc < 0) { + if (rte_errno == EALREADY) { + fprintf(stderr, "DPDK already initialized\n"); + } else { + fprintf(stderr, "Failed to initialize DPDK\n"); + } + return -rte_errno; + } + + legacy_mem = false; + if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { + legacy_mem = true; + } + + rc = spdk_env_dpdk_post_init(legacy_mem); + if (rc == 0) { + g_external_init = false; + } + + return rc; +} + +void +spdk_env_fini(void) +{ + spdk_env_dpdk_post_fini(); +} + +bool +spdk_env_dpdk_external_init(void) +{ + return g_external_init; +} diff --git a/src/spdk/lib/env_dpdk/memory.c b/src/spdk/lib/env_dpdk/memory.c new file mode 100644 index 000000000..4c2205a46 --- /dev/null +++ b/src/spdk/lib/env_dpdk/memory.c @@ -0,0 +1,1442 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_memory.h> +#include <rte_eal_memconfig.h> + +#include "spdk_internal/assert.h" + +#include "spdk/assert.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/env_dpdk.h" + +#ifdef __FreeBSD__ +#define VFIO_ENABLED 0 +#else +#include <linux/version.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#define VFIO_ENABLED 1 +#include <linux/vfio.h> +#include <rte_vfio.h> + +struct spdk_vfio_dma_map { + struct vfio_iommu_type1_dma_map map; + struct vfio_iommu_type1_dma_unmap unmap; + TAILQ_ENTRY(spdk_vfio_dma_map) tailq; +}; + +struct vfio_cfg { + int fd; + bool enabled; + bool noiommu_enabled; + unsigned device_ref; + TAILQ_HEAD(, spdk_vfio_dma_map) maps; + pthread_mutex_t mutex; +}; + +static struct vfio_cfg g_vfio = { + .fd = -1, + .enabled = false, + .noiommu_enabled = false, + .device_ref = 0, + .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps), + .mutex = PTHREAD_MUTEX_INITIALIZER +}; + +#else +#define VFIO_ENABLED 0 +#endif +#endif + +#if DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +#define FN_2MB_TO_4KB(fn) (fn << (SHIFT_2MB - SHIFT_4KB)) +#define FN_4KB_TO_2MB(fn) (fn >> (SHIFT_2MB - SHIFT_4KB)) + +#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB)) +#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1)) + +/* Page is registered */ +#define REG_MAP_REGISTERED (1ULL << 62) + +/* A notification region barrier. The 2MB translation entry that's marked + * with this flag must be unregistered separately. This allows contiguous + * regions to be unregistered in the same chunks they were registered. + */ +#define REG_MAP_NOTIFY_START (1ULL << 63) + +/* Translation of a single 2MB page. */ +struct map_2mb { + uint64_t translation_2mb; +}; + +/* Second-level map table indexed by bits [21..29] of the virtual address. + * Each entry contains the address translation or error for entries that haven't + * been retrieved yet. + */ +struct map_1gb { + struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)]; +}; + +/* Top-level map table indexed by bits [30..47] of the virtual address. + * Each entry points to a second-level map table or NULL. + */ +struct map_256tb { + struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)]; +}; + +/* Page-granularity memory address translation */ +struct spdk_mem_map { + struct map_256tb map_256tb; + pthread_mutex_t mutex; + uint64_t default_translation; + struct spdk_mem_map_ops ops; + void *cb_ctx; + TAILQ_ENTRY(spdk_mem_map) tailq; +}; + +/* Registrations map. The 64 bit translations are bit fields with the + * following layout (starting with the low bits): + * 0 - 61 : reserved + * 62 - 63 : flags + */ +static struct spdk_mem_map *g_mem_reg_map; +static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps = + TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps); +static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER; + +static bool g_legacy_mem; + +/* + * Walk the currently registered memory via the main memory registration map + * and call the new map's notify callback for each virtually contiguous region. 
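+ *
+ * Illustrative use of the map API implemented in this file (hypothetical
+ * callback name, not part of the original source):
+ *
+ *   static int
+ *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
+ *             enum spdk_mem_map_notify_action action,
+ *             void *vaddr, size_t len)
+ *   {
+ *           (pin or unpin [vaddr, vaddr + len) here)
+ *           return 0;
+ *   }
+ *
+ *   const struct spdk_mem_map_ops ops = { .notify_cb = my_notify };
+ *   struct spdk_mem_map *map =
+ *           spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &ops, NULL);
+ *
+ * On allocation, mem_map_notify_walk() below replays every region already
+ * registered via spdk_mem_register() into the new map.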
+ */ +static int +mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action) +{ + size_t idx_256tb; + uint64_t idx_1gb; + uint64_t contig_start = UINT64_MAX; + uint64_t contig_end = UINT64_MAX; + struct map_1gb *map_1gb; + int rc; + + if (!g_mem_reg_map) { + return -EINVAL; + } + + /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */ + pthread_mutex_lock(&g_mem_reg_map->mutex); + + for (idx_256tb = 0; + idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]); + idx_256tb++) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + } + contig_start = UINT64_MAX; + continue; + } + + for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_start == UINT64_MAX || + (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_start == UINT64_MAX) { + contig_start = vaddr; + } + + contig_end = vaddr; + } else { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + + /* This page might be a part of a neighbour region, so process + * it again. The idx_1gb will be incremented immediately. + */ + idx_1gb--; + } + contig_start = UINT64_MAX; + } + } + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return 0; + +err_unregister: + /* Unwind to the first empty translation so we don't unregister + * a region that just failed to register. 
+ */ + idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1); + idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1); + contig_start = UINT64_MAX; + contig_end = UINT64_MAX; + + /* Unregister any memory we managed to register before the failure */ + for (; idx_256tb < SIZE_MAX; idx_256tb--) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + } + contig_end = UINT64_MAX; + continue; + } + + for (; idx_1gb < UINT64_MAX; idx_1gb--) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_end == UINT64_MAX) { + contig_end = vaddr; + } + contig_start = vaddr; + } else { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + idx_1gb++; + } + contig_end = UINT64_MAX; + } + } + idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1; + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return rc; +} + +struct spdk_mem_map * +spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx) +{ + struct spdk_mem_map *map; + int rc; + + map = calloc(1, sizeof(*map)); + if (map == NULL) { + return NULL; + } + + if (pthread_mutex_init(&map->mutex, NULL)) { + free(map); + return NULL; + } + + map->default_translation = default_translation; + map->cb_ctx = cb_ctx; + if (ops) { + map->ops = *ops; + } + + if (ops && ops->notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + DEBUG_PRINT("Initial mem_map notify failed\n"); + pthread_mutex_destroy(&map->mutex); + free(map); + return NULL; + } + TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + return map; +} + +void +spdk_mem_map_free(struct spdk_mem_map **pmap) +{ + struct spdk_mem_map *map; + size_t i; + + if (!pmap) { + return; + } + + map = *pmap; + + if (!map) { + return; + } + + if (map->ops.notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER); + TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) { + free(map->map_256tb.map[i]); + } + + pthread_mutex_destroy(&map->mutex); + + free(map); + *pmap = NULL; +} + +int +spdk_mem_register(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + if (len == 0) { + return 0; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { 
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if (reg & REG_MAP_REGISTERED) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EBUSY; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + seg_vaddr = vaddr; + seg_len = 0; + while (len > 0) { + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, + seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED); + seg_len += VALUE_2MB; + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +int +spdk_mem_unregister(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg, newreg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + /* The first page must be a start of a region. Also check if it's + * registered to make sure we don't return -ERANGE for non-registered + * regions. + */ + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EINVAL; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + /* If the next page is registered, it must be a start of a region as well, + * otherwise we'd be unregistering only a part of a region. 
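+ * For example, after spdk_mem_register(buf, 3 * VALUE_2MB) creates a single
+ * region, spdk_mem_unregister(buf + VALUE_2MB, VALUE_2MB) fails with -ERANGE:
+ * only whole regions, starting at a region boundary, can be unregistered.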
+ */ + if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + seg_vaddr = vaddr; + seg_len = 0; + + while (len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0); + + if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) { + TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + seg_vaddr = vaddr; + seg_len = VALUE_2MB; + } else { + seg_len += VALUE_2MB; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + if (seg_len > 0) { + TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +int +spdk_mem_reserve(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + void *seg_vaddr; + size_t seg_len; + uint64_t reg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + if (len == 0) { + return 0; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + /* Check if any part of this range is already registered */ + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if (reg & REG_MAP_REGISTERED) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EBUSY; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + /* Simply set the translation to the memory map's default. This allocates the space in the + * map but does not provide a valid translation. */ + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len, + g_mem_reg_map->default_translation); + + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation); + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +static struct map_1gb * +mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb) +{ + struct map_1gb *map_1gb; + uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb); + size_t i; + + if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) { + return NULL; + } + + map_1gb = map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + pthread_mutex_lock(&map->mutex); + + /* Recheck to make sure nobody else got the mutex first. 
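+ * (Double-checked locking: the slot is re-read under map->mutex because
+ * another thread may have populated the same map_1gb between the lock-free
+ * check above and the lock acquisition.)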
*/ + map_1gb = map->map_256tb.map[idx_256tb]; + if (!map_1gb) { + map_1gb = malloc(sizeof(struct map_1gb)); + if (map_1gb) { + /* initialize all entries to default translation */ + for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) { + map_1gb->map[i].translation_2mb = map->default_translation; + } + map->map_256tb.map[idx_256tb] = map_1gb; + } + } + + pthread_mutex_unlock(&map->mutex); + + if (!map_1gb) { + DEBUG_PRINT("allocation failed\n"); + return NULL; + } + } + + return map_1gb; +} + +int +spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size, + uint64_t translation) +{ + uint64_t vfn_2mb; + struct map_1gb *map_1gb; + uint64_t idx_1gb; + struct map_2mb *map_2mb; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr); + return -EINVAL; + } + + /* For now, only 2 MB-aligned registrations are supported */ + if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n", + __func__, vaddr, size); + return -EINVAL; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + + while (size) { + map_1gb = mem_map_get_map_1gb(map, vfn_2mb); + if (!map_1gb) { + DEBUG_PRINT("could not get %p map\n", (void *)vaddr); + return -ENOMEM; + } + + idx_1gb = MAP_1GB_IDX(vfn_2mb); + map_2mb = &map_1gb->map[idx_1gb]; + map_2mb->translation_2mb = translation; + + size -= VALUE_2MB; + vfn_2mb++; + } + + return 0; +} + +int +spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size) +{ + return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation); +} + +inline uint64_t +spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size) +{ + const struct map_1gb *map_1gb; + const struct map_2mb *map_2mb; + uint64_t idx_256tb; + uint64_t idx_1gb; + uint64_t vfn_2mb; + uint64_t cur_size; + uint64_t prev_translation; + uint64_t orig_translation; + + if (spdk_unlikely(vaddr & ~MASK_256TB)) { + DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr); + return map->default_translation; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + return map->default_translation; + } + + cur_size = VALUE_2MB - _2MB_OFFSET(vaddr); + map_2mb = &map_1gb->map[idx_1gb]; + if (size == NULL || map->ops.are_contiguous == NULL || + map_2mb->translation_2mb == map->default_translation) { + if (size != NULL) { + *size = spdk_min(*size, cur_size); + } + return map_2mb->translation_2mb; + } + + orig_translation = map_2mb->translation_2mb; + prev_translation = orig_translation; + while (cur_size < *size) { + vfn_2mb++; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + break; + } + + map_2mb = &map_1gb->map[idx_1gb]; + if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) { + break; + } + + cur_size += VALUE_2MB; + prev_translation = map_2mb->translation_2mb; + } + + *size = spdk_min(*size, cur_size); + return orig_translation; +} + +static void +memory_hotplug_cb(enum rte_mem_event event_type, + const void *addr, size_t len, void *arg) +{ + if (event_type == RTE_MEM_EVENT_ALLOC) { + spdk_mem_register((void *)addr, len); + +#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) + if (!spdk_env_dpdk_external_init()) { + return; + } +#endif + + /* Prior to DPDK 19.02, we have to worry about DPDK + * 
freeing memory in different units than it was allocated. + * That doesn't work with things like RDMA MRs. So for + * those versions of DPDK, mark each segment so that DPDK + * won't later free it. That ensures we don't have to deal + * with that scenario. + * + * DPDK 19.02 added the --match-allocations RTE flag to + * avoid this condition. + * + * Note: if the user initialized DPDK separately, we can't + * be sure that --match-allocations was specified, so need + * to still mark the segments so they aren't freed. + */ + while (len > 0) { + struct rte_memseg *seg; + + seg = rte_mem_virt2memseg(addr, NULL); + assert(seg != NULL); + seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; + addr = (void *)((uintptr_t)addr + seg->hugepage_sz); + len -= seg->hugepage_sz; + } + } else if (event_type == RTE_MEM_EVENT_FREE) { + spdk_mem_unregister((void *)addr, len); + } +} + +static int +memory_iter_cb(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg) +{ + return spdk_mem_register(ms->addr, len); +} + +int +mem_map_init(bool legacy_mem) +{ + g_legacy_mem = legacy_mem; + + g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL); + if (g_mem_reg_map == NULL) { + DEBUG_PRINT("memory registration map allocation failed\n"); + return -ENOMEM; + } + + /* + * Walk all DPDK memory segments and register them + * with the master memory map + */ + rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL); + rte_memseg_contig_walk(memory_iter_cb, NULL); + return 0; +} + +bool +spdk_iommu_is_enabled(void) +{ +#if VFIO_ENABLED + return g_vfio.enabled && !g_vfio.noiommu_enabled; +#else + return false; +#endif +} + +struct spdk_vtophys_pci_device { + struct rte_pci_device *pci_device; + TAILQ_ENTRY(spdk_vtophys_pci_device) tailq; +}; + +static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices = + TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices); + +static struct spdk_mem_map *g_vtophys_map; +static struct spdk_mem_map *g_phys_ref_map; + +#if VFIO_ENABLED +static int +vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size) +{ + struct spdk_vfio_dma_map *dma_map; + uint64_t refcount; + int ret; + + refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL); + assert(refcount < UINT64_MAX); + if (refcount > 0) { + spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1); + return 0; + } + + dma_map = calloc(1, sizeof(*dma_map)); + if (dma_map == NULL) { + return -ENOMEM; + } + + dma_map->map.argsz = sizeof(dma_map->map); + dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + dma_map->map.vaddr = vaddr; + dma_map->map.iova = iova; + dma_map->map.size = size; + + dma_map->unmap.argsz = sizeof(dma_map->unmap); + dma_map->unmap.flags = 0; + dma_map->unmap.iova = iova; + dma_map->unmap.size = size; + + pthread_mutex_lock(&g_vfio.mutex); + if (g_vfio.device_ref == 0) { + /* VFIO requires at least one device (IOMMU group) to be added to + * a VFIO container before it is possible to perform any IOMMU + * operations on that container. This memory will be mapped once + * the first device (IOMMU group) is hotplugged. + * + * Since the vfio container is managed internally by DPDK, it is + * also possible that some device is already in that container, but + * it's not managed by SPDK - e.g. an NIC attached internally + * inside DPDK. We could map the memory straight away in such + * scenario, but there's no need to do it. 
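+		 * (All mappings deferred here are replayed later from
+		 * vtophys_pci_device_added() once the first SPDK device shows up.)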
DPDK devices clearly
+		 * don't need our mappings and hence we defer the mapping
+		 * unconditionally until the first SPDK-managed device is
+		 * hotplugged.
+		 */
+		goto out_insert;
+	}
+
+	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+	if (ret) {
+		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		free(dma_map);
+		return ret;
+	}
+
+out_insert:
+	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
+	pthread_mutex_unlock(&g_vfio.mutex);
+	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
+	return 0;
+}
+
+static int
+vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
+{
+	struct spdk_vfio_dma_map *dma_map;
+	uint64_t refcount;
+	int ret;
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		if (dma_map->map.iova == iova) {
+			break;
+		}
+	}
+
+	if (dma_map == NULL) {
+		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return -ENXIO;
+	}
+
+	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
+	assert(refcount < UINT64_MAX);
+	if (refcount > 0) {
+		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
+	}
+
+	/* We still have outstanding references, don't clear it. */
+	if (refcount > 1) {
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return 0;
+	}
+
+	/* don't support partial or multiple-page unmap for now */
+	assert(dma_map->map.size == size);
+
+	if (g_vfio.device_ref == 0) {
+		/* Memory is not mapped anymore, just remove its references */
+		goto out_remove;
+	}
+
+	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+	if (ret) {
+		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return ret;
+	}
+
+out_remove:
+	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
+	pthread_mutex_unlock(&g_vfio.mutex);
+	free(dma_map);
+	return 0;
+}
+#endif
+
+static uint64_t
+vtophys_get_paddr_memseg(uint64_t vaddr)
+{
+	uintptr_t paddr;
+	struct rte_memseg *seg;
+
+	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
+	if (seg != NULL) {
+		paddr = seg->phys_addr;
+		if (paddr == RTE_BAD_IOVA) {
+			return SPDK_VTOPHYS_ERROR;
+		}
+		paddr += (vaddr - (uintptr_t)seg->addr);
+		return paddr;
+	}
+
+	return SPDK_VTOPHYS_ERROR;
+}
+
+/* Try to get the paddr from /proc/self/pagemap */
+static uint64_t
+vtophys_get_paddr_pagemap(uint64_t vaddr)
+{
+	uintptr_t paddr;
+
+	/* Silence static analyzers */
+	assert(vaddr != 0);
+	paddr = rte_mem_virt2iova((void *)vaddr);
+	if (paddr == RTE_BAD_IOVA) {
+		/*
+		 * The vaddr may be valid but doesn't have a backing page
+		 * assigned yet. Touch the page to ensure a backing page
+		 * gets assigned, then try to translate again.
+		 */
+		rte_atomic64_read((rte_atomic64_t *)vaddr);
+		paddr = rte_mem_virt2iova((void *)vaddr);
+	}
+	if (paddr == RTE_BAD_IOVA) {
+		/* Unable to get to the physical address.
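+		 * This can happen if the page could not be populated, or if
+		 * the process lacks the privilege to read physical frame
+		 * numbers from /proc/self/pagemap (unprivileged reads return
+		 * zeroes on recent kernels).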
*/ + return SPDK_VTOPHYS_ERROR; + } + + return paddr; +} + +/* Try to get the paddr from pci devices */ +static uint64_t +vtophys_get_paddr_pci(uint64_t vaddr) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + uintptr_t paddr; + struct rte_pci_device *dev; + struct rte_mem_resource *res; + unsigned r; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) { + dev = vtophys_dev->pci_device; + + for (r = 0; r < PCI_MAX_RESOURCE; r++) { + res = &dev->mem_resource[r]; + if (res->phys_addr && vaddr >= (uint64_t)res->addr && + vaddr < (uint64_t)res->addr + res->len) { + paddr = res->phys_addr + (vaddr - (uint64_t)res->addr); + DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr, + (void *)paddr); + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + return paddr; + } + } + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + + return SPDK_VTOPHYS_ERROR; +} + +static int +vtophys_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t len) +{ + int rc = 0, pci_phys = 0; + uint64_t paddr; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n", + vaddr, len); + return -EINVAL; + } + + /* Get the physical address from the DPDK memsegs */ + paddr = vtophys_get_paddr_memseg((uint64_t)vaddr); + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + if (paddr == SPDK_VTOPHYS_ERROR) { + /* This is not an address that DPDK is managing. */ +#if VFIO_ENABLED + enum rte_iova_mode iova_mode; + +#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0) + iova_mode = rte_eal_iova_mode(); +#else + iova_mode = rte_eal_get_configuration()->iova_mode; +#endif + + if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) { + /* We'll use the virtual address as the iova to match DPDK. */ + paddr = (uint64_t)vaddr; + rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len); + if (rc) { + return -EFAULT; + } + while (len > 0) { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + vaddr += VALUE_2MB; + paddr += VALUE_2MB; + len -= VALUE_2MB; + } + } else +#endif + { + /* Get the physical address from /proc/self/pagemap. */ + paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + /* Get the physical address from PCI devices */ + paddr = vtophys_get_paddr_pci((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + /* The beginning of this address range points to a PCI resource, + * so the rest must point to a PCI resource as well. + */ + pci_phys = 1; + } + + /* Get paddr for each 2MB chunk in this address range */ + while (len > 0) { + /* Get the physical address from /proc/self/pagemap. */ + if (pci_phys) { + paddr = vtophys_get_paddr_pci((uint64_t)vaddr); + } else { + paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); + } + + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + + /* Since PCI paddr can break the 2MiB physical alignment skip this check for that. 
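+				 * A PCI BAR is only guaranteed to be aligned to its own
+				 * size, which may be smaller than 2MB, so a 2MB-aligned
+				 * vaddr inside a BAR mapping does not necessarily map to
+				 * a 2MB-aligned physical address.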
*/ + if (!pci_phys && (paddr & MASK_2MB)) { + DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr); + return -EINVAL; + } +#if VFIO_ENABLED + /* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory + * with the IOMMU using the physical address to match. */ + if (spdk_iommu_is_enabled()) { + rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB); + if (rc) { + DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr); + return -EFAULT; + } + } +#endif + + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + } + } else { + /* This is an address managed by DPDK. Just setup the translations. */ + while (len > 0) { + paddr = vtophys_get_paddr_memseg((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + } + + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: +#if VFIO_ENABLED + if (paddr == SPDK_VTOPHYS_ERROR) { + /* + * This is not an address that DPDK is managing. If vfio is enabled, + * we need to unmap the range from the IOMMU + */ + if (spdk_iommu_is_enabled()) { + uint64_t buffer_len = len; + uint8_t *va = vaddr; + enum rte_iova_mode iova_mode; + +#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0) + iova_mode = rte_eal_iova_mode(); +#else + iova_mode = rte_eal_get_configuration()->iova_mode; +#endif + /* + * In virtual address mode, the region is contiguous and can be done in + * one unmap. + */ + if (iova_mode == RTE_IOVA_VA) { + paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len); + if (buffer_len != len || paddr != (uintptr_t)va) { + DEBUG_PRINT("Unmapping %p with length %lu failed because " + "translation had address 0x%" PRIx64 " and length %lu\n", + va, len, paddr, buffer_len); + return -EINVAL; + } + rc = vtophys_iommu_unmap_dma(paddr, len); + if (rc) { + DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr); + return -EFAULT; + } + } else if (iova_mode == RTE_IOVA_PA) { + /* Get paddr for each 2MB chunk in this address range */ + while (buffer_len > 0) { + paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL); + + if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) { + DEBUG_PRINT("could not get phys addr for %p\n", va); + return -EFAULT; + } + + rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB); + if (rc) { + DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr); + return -EFAULT; + } + + va += VALUE_2MB; + buffer_len -= VALUE_2MB; + } + } + } + } +#endif + while (len > 0) { + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + break; + default: + SPDK_UNREACHABLE(); + } + + return rc; +} + +static int +vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2) +{ + /* This function is always called with paddrs for two subsequent + * 2MB chunks in virtual address space, so those chunks will be only + * physically contiguous if the physical addresses are 2MB apart + * from each other as well. 
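+	 * For example, paddr1 = 0x200000 and paddr2 = 0x400000 are
+	 * contiguous (0x400000 - 0x200000 == VALUE_2MB), while
+	 * paddr1 = 0x200000 and paddr2 = 0x800000 are not.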
+	 */
+	return (paddr2 - paddr1 == VALUE_2MB);
+}
+
+#if VFIO_ENABLED
+
+static bool
+vfio_enabled(void)
+{
+	return rte_vfio_is_enabled("vfio_pci");
+}
+
+/* Check if IOMMU is enabled on the system */
+static bool
+has_iommu_groups(void)
+{
+	struct dirent *d;
+	int count = 0;
+	DIR *dir = opendir("/sys/kernel/iommu_groups");
+
+	if (dir == NULL) {
+		return false;
+	}
+
+	while (count < 3 && (d = readdir(dir)) != NULL) {
+		count++;
+	}
+
+	closedir(dir);
+	/* there will always be ./ and ../ entries */
+	return count > 2;
+}
+
+static bool
+vfio_noiommu_enabled(void)
+{
+	return rte_vfio_noiommu_is_enabled();
+}
+
+static void
+vtophys_iommu_init(void)
+{
+	char proc_fd_path[PATH_MAX + 1];
+	char link_path[PATH_MAX + 1];
+	const char vfio_path[] = "/dev/vfio/vfio";
+	DIR *dir;
+	struct dirent *d;
+
+	if (!vfio_enabled()) {
+		return;
+	}
+
+	if (vfio_noiommu_enabled()) {
+		g_vfio.noiommu_enabled = true;
+	} else if (!has_iommu_groups()) {
+		return;
+	}
+
+	dir = opendir("/proc/self/fd");
+	if (!dir) {
+		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
+		return;
+	}
+
+	while ((d = readdir(dir)) != NULL) {
+		if (d->d_type != DT_LNK) {
+			continue;
+		}
+
+		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
+		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
+			continue;
+		}
+
+		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
+			sscanf(d->d_name, "%d", &g_vfio.fd);
+			break;
+		}
+	}
+
+	closedir(dir);
+
+	if (g_vfio.fd < 0) {
+		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
+		return;
+	}
+
+	g_vfio.enabled = true;
+
+	return;
+}
+#endif
+
+void
+vtophys_pci_device_added(struct rte_pci_device *pci_device)
+{
+	struct spdk_vtophys_pci_device *vtophys_dev;
+
+	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+
+	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
+	if (vtophys_dev) {
+		vtophys_dev->pci_device = pci_device;
+		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
+	} else {
+		DEBUG_PRINT("Memory allocation error\n");
+	}
+	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if VFIO_ENABLED
+	struct spdk_vfio_dma_map *dma_map;
+	int ret;
+
+	if (!g_vfio.enabled) {
+		return;
+	}
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	g_vfio.device_ref++;
+	if (g_vfio.device_ref > 1) {
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return;
+	}
+
+	/* This is the first SPDK device using DPDK vfio. This means that the first
+	 * IOMMU group might have just been added to the DPDK vfio container.
+	 * From this point it is certain that the memory can be mapped now.
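+	 * Replay every mapping that vtophys_iommu_map_dma() had to defer
+	 * while no SPDK device was attached.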
+	 */
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+		if (ret) {
+			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
+			break;
+		}
+	}
+	pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+void
+vtophys_pci_device_removed(struct rte_pci_device *pci_device)
+{
+	struct spdk_vtophys_pci_device *vtophys_dev;
+
+	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+		if (vtophys_dev->pci_device == pci_device) {
+			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
+			free(vtophys_dev);
+			break;
+		}
+	}
+	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if VFIO_ENABLED
+	struct spdk_vfio_dma_map *dma_map;
+	int ret;
+
+	if (!g_vfio.enabled) {
+		return;
+	}
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	assert(g_vfio.device_ref > 0);
+	g_vfio.device_ref--;
+	if (g_vfio.device_ref > 0) {
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return;
+	}
+
+	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
+	 * any additional devices using its vfio container, all the mappings
+	 * will be automatically removed by the Linux vfio driver. We unmap
+	 * the memory manually to be able to easily re-map it later regardless
+	 * of other external factors.
+	 */
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+		if (ret) {
+			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
+			break;
+		}
+	}
+	pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+int
+vtophys_init(void)
+{
+	const struct spdk_mem_map_ops vtophys_map_ops = {
+		.notify_cb = vtophys_notify,
+		.are_contiguous = vtophys_check_contiguous_entries,
+	};
+
+	const struct spdk_mem_map_ops phys_ref_map_ops = {
+		.notify_cb = NULL,
+		.are_contiguous = NULL,
+	};
+
+#if VFIO_ENABLED
+	vtophys_iommu_init();
+#endif
+
+	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
+	if (g_phys_ref_map == NULL) {
+		DEBUG_PRINT("phys_ref map allocation failed.\n");
+		return -ENOMEM;
+	}
+
+	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
+	if (g_vtophys_map == NULL) {
+		DEBUG_PRINT("vtophys map allocation failed\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+uint64_t
+spdk_vtophys(void *buf, uint64_t *size)
+{
+	uint64_t vaddr, paddr_2mb;
+
+	vaddr = (uint64_t)buf;
+	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
+
+	/*
+	 * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR,
+	 * we will still bitwise-or it with the buf offset below, but the result will still be
+	 * SPDK_VTOPHYS_ERROR. However, now that we use + rather than | (due to PCI vtophys
+	 * being unaligned), we must check the return value before the addition.
+	 */
+	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
+	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
+		return SPDK_VTOPHYS_ERROR;
+	} else {
+		return paddr_2mb + (vaddr & MASK_2MB);
+	}
+}
diff --git a/src/spdk/lib/env_dpdk/pci.c b/src/spdk/lib/env_dpdk/pci.c
new file mode 100644
index 000000000..5fd1b4abd
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci.c
@@ -0,0 +1,1063 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include <rte_alarm.h> +#include <rte_devargs.h> +#include "spdk/env.h" + +#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers" + +#define PCI_CFG_SIZE 256 +#define PCI_EXT_CAP_ID_SN 0x03 + +/* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time + * might cause the internal IPC to misbehave. Just retry in such case. + */ +#define DPDK_HOTPLUG_RETRY_COUNT 4 + +/* DPDK alarm/interrupt thread */ +static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices); +/* devices hotplugged on a dpdk thread */ +static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices = + TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices); +static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers); + +static int +map_bar_rte(struct spdk_pci_device *device, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + struct rte_pci_device *dev = device->dev_handle; + + *mapped_addr = dev->mem_resource[bar].addr; + *phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr; + *size = (uint64_t)dev->mem_resource[bar].len; + + return 0; +} + +static int +unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr) +{ + return 0; +} + +static int +cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + + rc = rte_pci_read_config(dev->dev_handle, value, len, offset); + + return (rc > 0 && (uint32_t) rc == len) ? 0 : -1; +} + +static int +cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + + rc = rte_pci_write_config(dev->dev_handle, value, len, offset); + +#ifdef __FreeBSD__ + /* DPDK returns 0 on success and -1 on failure */ + return rc; +#endif + return (rc > 0 && (uint32_t) rc == len) ? 
0 : -1; +} + +static void +remove_rte_dev(struct rte_pci_device *rte_dev) +{ + char bdf[32]; + int i = 0, rc; + + snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name); + do { + rc = rte_eal_hotplug_remove("pci", bdf); + } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT); +} + +static void +detach_rte_cb(void *_dev) +{ + remove_rte_dev(_dev); +} + +static void +detach_rte(struct spdk_pci_device *dev) +{ + struct rte_pci_device *rte_dev = dev->dev_handle; + int i; + bool removed; + + if (!spdk_process_is_primary()) { + remove_rte_dev(rte_dev); + return; + } + + pthread_mutex_lock(&g_pci_mutex); + dev->internal.attached = false; + /* prevent the hotremove notification from removing this device */ + dev->internal.pending_removal = true; + pthread_mutex_unlock(&g_pci_mutex); + + rte_eal_alarm_set(1, detach_rte_cb, rte_dev); + + /* wait up to 2s for the cb to execute */ + for (i = 2000; i > 0; i--) { + + spdk_delay_us(1000); + pthread_mutex_lock(&g_pci_mutex); + removed = dev->internal.removed; + pthread_mutex_unlock(&g_pci_mutex); + + if (removed) { + break; + } + } + + /* besides checking the removed flag, we also need to wait + * for the dpdk detach function to unwind, as it's doing some + * operations even after calling our detach callback. Simply + * cancel the alarm - if it started executing already, this + * call will block and wait for it to finish. + */ + rte_eal_alarm_cancel(detach_rte_cb, rte_dev); + + /* the device could have been finally removed, so just check + * it again. + */ + pthread_mutex_lock(&g_pci_mutex); + removed = dev->internal.removed; + pthread_mutex_unlock(&g_pci_mutex); + if (!removed) { + fprintf(stderr, "Timeout waiting for DPDK to remove PCI device %s.\n", + rte_dev->name); + /* If we reach this state, then the device couldn't be removed and most likely + a subsequent hot add of a device in the same BDF will fail */ + } +} + +void +spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags) +{ + struct spdk_pci_driver *driver; + + driver = calloc(1, sizeof(*driver)); + if (!driver) { + /* we can't do any better than bailing atm */ + return; + } + + driver->name = name; + driver->id_table = id_table; + driver->drv_flags = flags; + TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq); +} + +struct spdk_pci_driver * +spdk_pci_nvme_get_driver(void) +{ + return spdk_pci_get_driver("nvme"); +} + +struct spdk_pci_driver * +spdk_pci_get_driver(const char *name) +{ + struct spdk_pci_driver *driver; + + TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { + if (strcmp(driver->name, name) == 0) { + return driver; + } + } + + return NULL; +} + +static void +pci_device_rte_hotremove(const char *device_name, + enum rte_dev_event_type event, + void *cb_arg) +{ + struct spdk_pci_device *dev; + bool can_detach = false; + + if (event != RTE_DEV_EVENT_REMOVE) { + return; + } + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + struct rte_pci_device *rte_dev = dev->dev_handle; + + if (strcmp(rte_dev->name, device_name) == 0 && + !dev->internal.pending_removal) { + can_detach = !dev->internal.attached; + /* prevent any further attaches */ + dev->internal.pending_removal = true; + break; + } + } + pthread_mutex_unlock(&g_pci_mutex); + + if (dev != NULL && can_detach) { + /* if device is not attached we can remove it right away. + * Otherwise it will be removed at detach. 
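+		 * (The detach path ends up in detach_rte(), which issues the
+		 * same rte_eal_hotplug_remove() call through a DPDK alarm.)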
+ */ + remove_rte_dev(dev->dev_handle); + } +} + +static void +cleanup_pci_devices(void) +{ + struct spdk_pci_device *dev, *tmp; + + pthread_mutex_lock(&g_pci_mutex); + /* cleanup removed devices */ + TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) { + if (!dev->internal.removed) { + continue; + } + + vtophys_pci_device_removed(dev->dev_handle); + TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq); + free(dev); + } + + /* add newly-attached devices */ + TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) { + TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq); + TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq); + vtophys_pci_device_added(dev->dev_handle); + } + pthread_mutex_unlock(&g_pci_mutex); +} + +static int scan_pci_bus(bool delay_init); + +/* translate spdk_pci_driver to an rte_pci_driver and register it to dpdk */ +static int +register_rte_driver(struct spdk_pci_driver *driver) +{ + unsigned pci_id_count = 0; + struct rte_pci_id *rte_id_table; + char *rte_name; + size_t rte_name_len; + uint32_t rte_flags; + + assert(driver->id_table); + while (driver->id_table[pci_id_count].vendor_id) { + pci_id_count++; + } + assert(pci_id_count > 0); + + rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table)); + if (!rte_id_table) { + return -ENOMEM; + } + + while (pci_id_count > 0) { + struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1]; + const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1]; + + rte_id->class_id = spdk_id->class_id; + rte_id->vendor_id = spdk_id->vendor_id; + rte_id->device_id = spdk_id->device_id; + rte_id->subsystem_vendor_id = spdk_id->subvendor_id; + rte_id->subsystem_device_id = spdk_id->subdevice_id; + pci_id_count--; + } + + assert(driver->name); + rte_name_len = strlen(driver->name) + strlen("spdk_") + 1; + rte_name = calloc(rte_name_len, 1); + if (!rte_name) { + free(rte_id_table); + return -ENOMEM; + } + + snprintf(rte_name, rte_name_len, "spdk_%s", driver->name); + driver->driver.driver.name = rte_name; + driver->driver.id_table = rte_id_table; + + rte_flags = 0; + if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) { + rte_flags |= RTE_PCI_DRV_NEED_MAPPING; + } + if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) { + rte_flags |= RTE_PCI_DRV_WC_ACTIVATE; + } + driver->driver.drv_flags = rte_flags; + + driver->driver.probe = pci_device_init; + driver->driver.remove = pci_device_fini; + + rte_pci_register(&driver->driver); + return 0; +} + +static inline void +_pci_env_init(void) +{ + /* We assume devices were present on the bus for more than 2 seconds + * before initializing SPDK and there's no need to wait more. We scan + * the bus, but we don't blacklist any devices. + */ + scan_pci_bus(false); + + /* Register a single hotremove callback for all devices. */ + if (spdk_process_is_primary()) { + rte_dev_event_callback_register(NULL, pci_device_rte_hotremove, NULL); + } +} + +void +pci_env_init(void) +{ + struct spdk_pci_driver *driver; + + TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { + register_rte_driver(driver); + } + + _pci_env_init(); +} + +void +pci_env_reinit(void) +{ + /* There is no need to register pci drivers again, since they were + * already pre-registered in pci_env_init. 
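+	 * The spdk_pci_driver structures in g_pci_drivers persist across
+	 * re-initialization, and re-running rte_pci_register() here would
+	 * insert the same embedded rte_pci_driver node into DPDK's driver
+	 * list a second time.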
+ */ + + _pci_env_init(); +} + +void +pci_env_fini(void) +{ + struct spdk_pci_device *dev; + char bdf[32]; + + cleanup_pci_devices(); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (dev->internal.attached) { + spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr); + fprintf(stderr, "Device %s is still attached at shutdown!\n", bdf); + } + } + + if (spdk_process_is_primary()) { + rte_dev_event_callback_unregister(NULL, pci_device_rte_hotremove, NULL); + } +} + +int +pci_device_init(struct rte_pci_driver *_drv, + struct rte_pci_device *_dev) +{ + struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv; + struct spdk_pci_device *dev; + int rc; + + dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + return -1; + } + + dev->dev_handle = _dev; + + dev->addr.domain = _dev->addr.domain; + dev->addr.bus = _dev->addr.bus; + dev->addr.dev = _dev->addr.devid; + dev->addr.func = _dev->addr.function; + dev->id.class_id = _dev->id.class_id; + dev->id.vendor_id = _dev->id.vendor_id; + dev->id.device_id = _dev->id.device_id; + dev->id.subvendor_id = _dev->id.subsystem_vendor_id; + dev->id.subdevice_id = _dev->id.subsystem_device_id; + dev->socket_id = _dev->device.numa_node; + dev->type = "pci"; + + dev->map_bar = map_bar_rte; + dev->unmap_bar = unmap_bar_rte; + dev->cfg_read = cfg_read_rte; + dev->cfg_write = cfg_write_rte; + + dev->internal.driver = driver; + dev->internal.claim_fd = -1; + + if (driver->cb_fn != NULL) { + rc = driver->cb_fn(driver->cb_arg, dev); + if (rc != 0) { + free(dev); + return rc; + } + dev->internal.attached = true; + } + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq); + pthread_mutex_unlock(&g_pci_mutex); + return 0; +} + +int +pci_device_fini(struct rte_pci_device *_dev) +{ + struct spdk_pci_device *dev; + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (dev->dev_handle == _dev) { + break; + } + } + + if (dev == NULL || dev->internal.attached) { + /* The device might be still referenced somewhere in SPDK. */ + pthread_mutex_unlock(&g_pci_mutex); + return -1; + } + + /* remove our whitelist_at option */ + if (_dev->device.devargs) { + _dev->device.devargs->data = NULL; + } + + assert(!dev->internal.removed); + dev->internal.removed = true; + pthread_mutex_unlock(&g_pci_mutex); + return 0; + +} + +void +spdk_pci_device_detach(struct spdk_pci_device *dev) +{ + assert(dev->internal.attached); + + if (dev->internal.claim_fd >= 0) { + spdk_pci_device_unclaim(dev); + } + + if (strcmp(dev->type, "pci") == 0) { + /* if it's a physical device we need to deal with DPDK on + * a different process and we can't just unset one flag + * here. We also want to stop using any device resources + * so that the device isn't "in use" by the userspace driver + * once we detach it. This would allow attaching the device + * to a different process, or to a kernel driver like nvme. 
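+		 * detach_rte() synchronizes with the DPDK alarm/interrupt
+		 * thread and blocks until the removal is confirmed or a
+		 * 2 second timeout expires.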
+ */ + detach_rte(dev); + } else { + dev->internal.attached = false; + } + + cleanup_pci_devices(); +} + +static int +scan_pci_bus(bool delay_init) +{ + struct spdk_pci_driver *driver; + struct rte_pci_device *rte_dev; + uint64_t now; + + rte_bus_scan(); + now = spdk_get_ticks(); + + driver = TAILQ_FIRST(&g_pci_drivers); + if (!driver) { + return 0; + } + + TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) { + struct rte_devargs *da; + + da = rte_dev->device.devargs; + if (!da) { + char devargs_str[128]; + + /* the device was never blacklisted or whitelisted */ + da = calloc(1, sizeof(*da)); + if (!da) { + return -1; + } + + snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name); + if (rte_devargs_parse(da, devargs_str) != 0) { + free(da); + return -1; + } + + rte_devargs_insert(&da); + rte_dev->device.devargs = da; + } + + if (da->data) { + uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data; + + /* this device was seen by spdk before... */ + if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) { + da->policy = RTE_DEV_WHITELISTED; + } + } else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST && + da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) { + /* override the policy only if not permanently blacklisted */ + + if (delay_init) { + da->policy = RTE_DEV_BLACKLISTED; + da->data = (void *)(now + 2 * spdk_get_ticks_hz()); + } else { + da->policy = RTE_DEV_WHITELISTED; + da->data = (void *)(uintptr_t)now; + } + } + } + + return 0; +} + +int +spdk_pci_device_attach(struct spdk_pci_driver *driver, + spdk_pci_enum_cb enum_cb, + void *enum_ctx, struct spdk_pci_addr *pci_address) +{ + struct spdk_pci_device *dev; + struct rte_pci_device *rte_dev; + struct rte_devargs *da; + int rc; + char bdf[32]; + + spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address); + + cleanup_pci_devices(); + + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { + break; + } + } + + if (dev != NULL && dev->internal.driver == driver) { + pthread_mutex_lock(&g_pci_mutex); + if (dev->internal.attached || dev->internal.pending_removal) { + pthread_mutex_unlock(&g_pci_mutex); + return -1; + } + + rc = enum_cb(enum_ctx, dev); + if (rc == 0) { + dev->internal.attached = true; + } + pthread_mutex_unlock(&g_pci_mutex); + return rc; + } + + driver->cb_fn = enum_cb; + driver->cb_arg = enum_ctx; + + int i = 0; + + do { + rc = rte_eal_hotplug_add("pci", bdf, ""); + } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT); + + if (i > 1 && rc == -EEXIST) { + /* Even though the previous request timed out, the device + * was attached successfully. + */ + rc = 0; + } + + driver->cb_arg = NULL; + driver->cb_fn = NULL; + + cleanup_pci_devices(); + + if (rc != 0) { + return -1; + } + + /* explicit attach ignores the whitelist, so if we blacklisted this + * device before let's enable it now - just for clarity. + */ + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { + break; + } + } + assert(dev != NULL); + + rte_dev = dev->dev_handle; + da = rte_dev->device.devargs; + if (da && da->data) { + da->data = (void *)(uintptr_t)spdk_get_ticks(); + da->policy = RTE_DEV_WHITELISTED; + } + + return 0; +} + +/* Note: You can call spdk_pci_enumerate from more than one thread + * simultaneously safely, but you cannot call spdk_pci_enumerate + * and rte_eal_pci_probe simultaneously. 
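+ *
+ * A minimal usage sketch (attach_cb is a hypothetical name, not part of
+ * this file); returning 0 from the callback claims the device, a positive
+ * value skips it, and a negative value aborts the enumeration:
+ *
+ *	static int attach_cb(void *ctx, struct spdk_pci_device *dev)
+ *	{
+ *		return 0;
+ *	}
+ *
+ *	spdk_pci_enumerate(spdk_pci_nvme_get_driver(), attach_cb, NULL);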
+ */ +int +spdk_pci_enumerate(struct spdk_pci_driver *driver, + spdk_pci_enum_cb enum_cb, + void *enum_ctx) +{ + struct spdk_pci_device *dev; + int rc; + + cleanup_pci_devices(); + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (dev->internal.attached || + dev->internal.driver != driver || + dev->internal.pending_removal) { + continue; + } + + rc = enum_cb(enum_ctx, dev); + if (rc == 0) { + dev->internal.attached = true; + } else if (rc < 0) { + pthread_mutex_unlock(&g_pci_mutex); + return -1; + } + } + pthread_mutex_unlock(&g_pci_mutex); + + if (scan_pci_bus(true) != 0) { + return -1; + } + + driver->cb_fn = enum_cb; + driver->cb_arg = enum_ctx; + + if (rte_bus_probe() != 0) { + driver->cb_arg = NULL; + driver->cb_fn = NULL; + return -1; + } + + driver->cb_arg = NULL; + driver->cb_fn = NULL; + + cleanup_pci_devices(); + return 0; +} + +struct spdk_pci_device * +spdk_pci_get_first_device(void) +{ + return TAILQ_FIRST(&g_pci_devices); +} + +struct spdk_pci_device * +spdk_pci_get_next_device(struct spdk_pci_device *prev) +{ + return TAILQ_NEXT(prev, internal.tailq); +} + +int +spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + return dev->map_bar(dev, bar, mapped_addr, phys_addr, size); +} + +int +spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr) +{ + return dev->unmap_bar(dev, bar, addr); +} + +uint32_t +spdk_pci_device_get_domain(struct spdk_pci_device *dev) +{ + return dev->addr.domain; +} + +uint8_t +spdk_pci_device_get_bus(struct spdk_pci_device *dev) +{ + return dev->addr.bus; +} + +uint8_t +spdk_pci_device_get_dev(struct spdk_pci_device *dev) +{ + return dev->addr.dev; +} + +uint8_t +spdk_pci_device_get_func(struct spdk_pci_device *dev) +{ + return dev->addr.func; +} + +uint16_t +spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev) +{ + return dev->id.vendor_id; +} + +uint16_t +spdk_pci_device_get_device_id(struct spdk_pci_device *dev) +{ + return dev->id.device_id; +} + +uint16_t +spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev) +{ + return dev->id.subvendor_id; +} + +uint16_t +spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev) +{ + return dev->id.subdevice_id; +} + +struct spdk_pci_id +spdk_pci_device_get_id(struct spdk_pci_device *dev) +{ + return dev->id; +} + +int +spdk_pci_device_get_socket_id(struct spdk_pci_device *dev) +{ + return dev->socket_id; +} + +int +spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + return dev->cfg_read(dev, value, len, offset); +} + +int +spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + return dev->cfg_write(dev, value, len, offset); +} + +int +spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 1, offset); +} + +int +spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 1, offset); +} + +int +spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 2, offset); +} + +int +spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 2, offset); +} + +int +spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, 
uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 4, offset); +} + +int +spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 4, offset); +} + +int +spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len) +{ + int err; + uint32_t pos, header = 0; + uint32_t i, buf[2]; + + if (len < 17) { + return -1; + } + + err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE); + if (err || !header) { + return -1; + } + + pos = PCI_CFG_SIZE; + while (1) { + if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) { + if (pos) { + /* skip the header */ + pos += 4; + for (i = 0; i < 2; i++) { + err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i); + if (err) { + return -1; + } + } + snprintf(sn, len, "%08x%08x", buf[1], buf[0]); + return 0; + } + } + pos = (header >> 20) & 0xffc; + /* 0 if no other items exist */ + if (pos < PCI_CFG_SIZE) { + return -1; + } + err = spdk_pci_device_cfg_read32(dev, &header, pos); + if (err) { + return -1; + } + } + return -1; +} + +struct spdk_pci_addr +spdk_pci_device_get_addr(struct spdk_pci_device *dev) +{ + return dev->addr; +} + +bool +spdk_pci_device_is_removed(struct spdk_pci_device *dev) +{ + return dev->internal.pending_removal; +} + +int +spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2) +{ + if (a1->domain > a2->domain) { + return 1; + } else if (a1->domain < a2->domain) { + return -1; + } else if (a1->bus > a2->bus) { + return 1; + } else if (a1->bus < a2->bus) { + return -1; + } else if (a1->dev > a2->dev) { + return 1; + } else if (a1->dev < a2->dev) { + return -1; + } else if (a1->func > a2->func) { + return 1; + } else if (a1->func < a2->func) { + return -1; + } + + return 0; +} + +#ifdef __linux__ +int +spdk_pci_device_claim(struct spdk_pci_device *dev) +{ + int dev_fd; + char dev_name[64]; + int pid; + void *dev_map; + struct flock pcidev_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + + snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", + dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func); + + dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (dev_fd == -1) { + fprintf(stderr, "could not open %s\n", dev_name); + return -errno; + } + + if (ftruncate(dev_fd, sizeof(int)) != 0) { + fprintf(stderr, "could not truncate %s\n", dev_name); + close(dev_fd); + return -errno; + } + + dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED, dev_fd, 0); + if (dev_map == MAP_FAILED) { + fprintf(stderr, "could not mmap dev %s (%d)\n", dev_name, errno); + close(dev_fd); + return -errno; + } + + if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) { + pid = *(int *)dev_map; + fprintf(stderr, "Cannot create lock on device %s, probably" + " process %d has claimed it\n", dev_name, pid); + munmap(dev_map, sizeof(int)); + close(dev_fd); + /* F_SETLK returns unspecified errnos, normalize them */ + return -EACCES; + } + + *(int *)dev_map = (int)getpid(); + munmap(dev_map, sizeof(int)); + dev->internal.claim_fd = dev_fd; + /* Keep dev_fd open to maintain the lock. 
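+	 * The fcntl() write lock is released by the kernel automatically
+	 * when the fd is closed or the process exits, so a crashed process
+	 * cannot leave a stale claim behind.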
*/ + return 0; +} + +void +spdk_pci_device_unclaim(struct spdk_pci_device *dev) +{ + char dev_name[64]; + + snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", + dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func); + + close(dev->internal.claim_fd); + dev->internal.claim_fd = -1; + unlink(dev_name); +} +#endif /* __linux__ */ + +#ifdef __FreeBSD__ +int +spdk_pci_device_claim(struct spdk_pci_device *dev) +{ + /* TODO */ + return 0; +} + +void +spdk_pci_device_unclaim(struct spdk_pci_device *dev) +{ + /* TODO */ +} +#endif /* __FreeBSD__ */ + +int +spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf) +{ + unsigned domain, bus, dev, func; + + if (addr == NULL || bdf == NULL) { + return -EINVAL; + } + + if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) || + (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) { + /* Matched a full address - all variables are initialized */ + } else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) { + func = 0; + } else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) || + (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) { + domain = 0; + } else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) || + (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) { + domain = 0; + func = 0; + } else { + return -EINVAL; + } + + if (bus > 0xFF || dev > 0x1F || func > 7) { + return -EINVAL; + } + + addr->domain = domain; + addr->bus = bus; + addr->dev = dev; + addr->func = func; + + return 0; +} + +int +spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr) +{ + int rc; + + rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x", + addr->domain, addr->bus, + addr->dev, addr->func); + + if (rc > 0 && (size_t)rc < sz) { + return 0; + } + + return -1; +} + +void +spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev) +{ + assert(dev->map_bar != NULL); + assert(dev->unmap_bar != NULL); + assert(dev->cfg_read != NULL); + assert(dev->cfg_write != NULL); + dev->internal.driver = drv; + TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq); +} + +void +spdk_pci_unhook_device(struct spdk_pci_device *dev) +{ + assert(!dev->internal.attached); + TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq); +} + +const char * +spdk_pci_device_get_type(const struct spdk_pci_device *dev) +{ + return dev->type; +} diff --git a/src/spdk/lib/env_dpdk/pci_idxd.c b/src/spdk/lib/env_dpdk/pci_idxd.c new file mode 100644 index 000000000..eddbfa4af --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_idxd.c @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +#define SPDK_IDXD_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct spdk_pci_id idxd_driver_id[] = { + {SPDK_IDXD_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IDXD)}, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_idxd_get_driver(void) +{ + return spdk_pci_get_driver("idxd"); +} + +SPDK_PCI_DRIVER_REGISTER("idxd", idxd_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING); diff --git a/src/spdk/lib/env_dpdk/pci_ioat.c b/src/spdk/lib/env_dpdk/pci_ioat.c new file mode 100644 index 000000000..28b7bdb44 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_ioat.c @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct spdk_pci_id ioat_driver_id[] = { + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SKX)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_ICX)}, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_ioat_get_driver(void) +{ + return spdk_pci_get_driver("ioat"); +} + +SPDK_PCI_DRIVER_REGISTER("ioat", ioat_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING); diff --git a/src/spdk/lib/env_dpdk/pci_virtio.c b/src/spdk/lib/env_dpdk/pci_virtio.c new file mode 100644 index 000000000..e525a4a8e --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_virtio.c @@ -0,0 +1,53 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct spdk_pci_id virtio_pci_driver_id[] = { + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_LEGACY) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_LEGACY) }, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_virtio_get_driver(void) +{ + return spdk_pci_get_driver("virtio"); +} + +SPDK_PCI_DRIVER_REGISTER("virtio", virtio_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); diff --git a/src/spdk/lib/env_dpdk/pci_vmd.c b/src/spdk/lib/env_dpdk/pci_vmd.c new file mode 100644 index 000000000..fb6860873 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_vmd.c @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct spdk_pci_id vmd_pci_driver_id[] = { + { SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, PCI_DEVICE_ID_INTEL_VMD) }, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_vmd_get_driver(void) +{ + return spdk_pci_get_driver("vmd"); +} + +SPDK_PCI_DRIVER_REGISTER("vmd", vmd_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); diff --git a/src/spdk/lib/env_dpdk/spdk_env_dpdk.map b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map new file mode 100644 index 000000000..a465f0938 --- /dev/null +++ b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map @@ -0,0 +1,114 @@ +{ + global: + + # Public functions in env.h + spdk_malloc; + spdk_zmalloc; + spdk_realloc; + spdk_free; + spdk_env_opts_init; + spdk_env_init; + spdk_env_fini; + spdk_dma_malloc; + spdk_dma_malloc_socket; + spdk_dma_zmalloc; + spdk_dma_zmalloc_socket; + spdk_dma_realloc; + spdk_dma_free; + spdk_memzone_reserve; + spdk_memzone_reserve_aligned; + spdk_memzone_lookup; + spdk_memzone_free; + spdk_memzone_dump; + spdk_mempool_create; + spdk_mempool_create_ctor; + spdk_mempool_get_name; + spdk_mempool_free; + spdk_mempool_get; + spdk_mempool_get_bulk; + spdk_mempool_put; + spdk_mempool_put_bulk; + spdk_mempool_count; + spdk_mempool_obj_iter; + spdk_mempool_lookup; + spdk_env_get_core_count; + spdk_env_get_current_core; + spdk_env_get_first_core; + spdk_env_get_last_core; + spdk_env_get_next_core; + spdk_env_get_socket_id; + spdk_env_thread_launch_pinned; + spdk_env_thread_wait_all; + spdk_process_is_primary; + spdk_get_ticks; + spdk_get_ticks_hz; + spdk_delay_us; + spdk_pause; + spdk_ring_create; + spdk_ring_free; + spdk_ring_count; + spdk_ring_enqueue; + spdk_ring_dequeue; + spdk_iommu_is_enabled; + spdk_vtophys; + spdk_pci_get_driver; + spdk_pci_driver_register; + spdk_pci_nvme_get_driver; + spdk_pci_vmd_get_driver; + spdk_pci_idxd_get_driver; + spdk_pci_ioat_get_driver; + spdk_pci_virtio_get_driver; + spdk_pci_enumerate; + spdk_pci_get_first_device; + spdk_pci_get_next_device; + spdk_pci_device_map_bar; + spdk_pci_device_unmap_bar; + spdk_pci_device_get_domain; + spdk_pci_device_get_bus; + spdk_pci_device_get_dev; + spdk_pci_device_get_func; + spdk_pci_device_get_addr; + spdk_pci_device_get_vendor_id; + spdk_pci_device_get_device_id; + spdk_pci_device_get_subvendor_id; + spdk_pci_device_get_subdevice_id; + spdk_pci_device_get_id; + spdk_pci_device_get_socket_id; + spdk_pci_device_get_serial_number; + spdk_pci_device_claim; + spdk_pci_device_unclaim; + spdk_pci_device_detach; + spdk_pci_device_attach; + spdk_pci_device_cfg_read; + spdk_pci_device_cfg_write; + spdk_pci_device_cfg_read8; + spdk_pci_device_cfg_write8; + spdk_pci_device_cfg_read16; + spdk_pci_device_cfg_write16; + spdk_pci_device_cfg_read32; + spdk_pci_device_cfg_write32; + spdk_pci_device_is_removed; + spdk_pci_addr_compare; + spdk_pci_addr_parse; + spdk_pci_addr_fmt; + spdk_pci_hook_device; + spdk_pci_unhook_device; + spdk_pci_device_get_type; 
+ spdk_unaffinitize_thread; + spdk_call_unaffinitized; + spdk_mem_map_alloc; + spdk_mem_map_free; + spdk_mem_map_set_translation; + spdk_mem_map_clear_translation; + spdk_mem_map_translate; + spdk_mem_register; + spdk_mem_unregister; + + # Public functions in env_dpdk.h + spdk_env_dpdk_post_init; + spdk_env_dpdk_post_fini; + spdk_env_dpdk_external_init; + spdk_env_dpdk_dump_mem_stats; + + local: *; +}; diff --git a/src/spdk/lib/env_dpdk/threads.c b/src/spdk/lib/env_dpdk/threads.c new file mode 100644 index 000000000..01c7b8d9f --- /dev/null +++ b/src/spdk/lib/env_dpdk/threads.c @@ -0,0 +1,108 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_lcore.h> + +uint32_t +spdk_env_get_core_count(void) +{ + return rte_lcore_count(); +} + +uint32_t +spdk_env_get_current_core(void) +{ + return rte_lcore_id(); +} + +uint32_t +spdk_env_get_first_core(void) +{ + return rte_get_next_lcore(-1, 0, 0); +} + +uint32_t +spdk_env_get_last_core(void) +{ + uint32_t i; + uint32_t last_core = UINT32_MAX; + + SPDK_ENV_FOREACH_CORE(i) { + last_core = i; + } + + assert(last_core != UINT32_MAX); + + return last_core; +} + +uint32_t +spdk_env_get_next_core(uint32_t prev_core) +{ + unsigned lcore; + + lcore = rte_get_next_lcore(prev_core, 0, 0); + if (lcore == RTE_MAX_LCORE) { + return UINT32_MAX; + } + return lcore; +} + +uint32_t +spdk_env_get_socket_id(uint32_t core) +{ + if (core >= RTE_MAX_LCORE) { + return SPDK_ENV_SOCKET_ID_ANY; + } + + return rte_lcore_to_socket_id(core); +} + +int +spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg) +{ + int rc; + + rc = rte_eal_remote_launch(fn, arg, core); + + return rc; +} + +void +spdk_env_thread_wait_all(void) +{ + rte_eal_mp_wait_lcore(); +} |