| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
| commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) | |
| tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/spdk/lib/env_dpdk | |
| parent | Initial commit. (diff) | |
| download | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip | |
Adding upstream version 14.2.21. (tags: upstream/14.2.21, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/env_dpdk')
| -rw-r--r-- | src/spdk/lib/env_dpdk/Makefile | 42 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/env.c | 419 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/env.mk | 112 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/env_internal.h | 104 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/init.c | 401 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/memory.c | 712 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/pci.c | 551 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/pci_ioat.c | 123 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/pci_nvme.c | 89 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/pci_virtio.c | 80 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/threads.c | 108 |
| -rw-r--r-- | src/spdk/lib/env_dpdk/vtophys.c | 691 |
12 files changed, 3432 insertions, 0 deletions
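The diff below adds SPDK's DPDK-based environment layer: env.c wraps DPDK allocation, memzone, mempool, and ring APIs behind spdk_* equivalents; init.c builds an EAL command line from spdk_env_opts and boots DPDK; memory.c implements 2 MB-granularity address-translation maps; and pci.c papers over PCI-probing differences across DPDK versions. As a rough orientation before the diff itself, here is a hypothetical consumer of this library (not part of the commit); the function signatures and option defaults are taken from env.c and init.c in the diff, while the program name and sizes are illustrative:

```c
/*
 * Hypothetical usage sketch (not part of this commit): initialize the
 * DPDK-backed env layer and allocate a DMA-safe, zeroed buffer.
 * Signatures and defaults follow env.c/init.c below; error handling
 * is abbreviated.
 */
#include "spdk/env.h"

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	struct spdk_env_opts opts;
	uint64_t phys_addr;
	void *buf;

	/* Defaults from init.c: name "spdk", core_mask "0x1", shm_id -1, ... */
	spdk_env_opts_init(&opts);
	opts.name = "env_dpdk_example"; /* illustrative name */

	/* Builds the EAL command line and calls rte_eal_init(). */
	if (spdk_env_init(&opts) < 0) {
		fprintf(stderr, "spdk_env_init() failed\n");
		return 1;
	}

	/* 4 KiB zeroed buffer, 4 KiB alignment; phys_addr receives the IOVA. */
	buf = spdk_dma_zmalloc(0x1000, 0x1000, &phys_addr);
	if (buf == NULL) {
		fprintf(stderr, "spdk_dma_zmalloc() failed\n");
		return 1;
	}
	printf("vaddr=%p iova=0x%" PRIx64 "\n", buf, phys_addr);

	spdk_dma_free(buf);
	return 0;
}
```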
diff --git a/src/spdk/lib/env_dpdk/Makefile b/src/spdk/lib/env_dpdk/Makefile new file mode 100644 index 00000000..b7a6961f --- /dev/null +++ b/src/spdk/lib/env_dpdk/Makefile @@ -0,0 +1,42 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += $(ENV_CFLAGS) +C_SRCS = env.c memory.c pci.c vtophys.c init.c threads.c +C_SRCS += pci_nvme.c pci_ioat.c pci_virtio.c +LIBNAME = env_dpdk + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/env_dpdk/env.c b/src/spdk/lib/env_dpdk/env.c new file mode 100644 index 00000000..a5238e54 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.c @@ -0,0 +1,419 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" + +#include <rte_config.h> +#include <rte_cycles.h> +#include <rte_malloc.h> +#include <rte_mempool.h> +#include <rte_memzone.h> +#include <rte_version.h> + +static uint64_t +virt_to_phys(void *vaddr) +{ + uint64_t ret; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + ret = rte_malloc_virt2iova(vaddr); + if (ret != RTE_BAD_IOVA) { + return ret; + } +#else + ret = rte_malloc_virt2phy(vaddr); + if (ret != RTE_BAD_PHYS_ADDR) { + return ret; + } +#endif + + return spdk_vtophys(vaddr); +} + +void * +spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + if (flags == 0) { + return NULL; + } + + void *buf = rte_malloc_socket(NULL, size, align, socket_id); + if (buf && phys_addr) { + *phys_addr = virt_to_phys(buf); + } + return buf; +} + +void * +spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + void *buf = spdk_malloc(size, align, phys_addr, socket_id, flags); + if (buf) { + memset(buf, 0, size); + } + return buf; +} + +void +spdk_free(void *buf) +{ + rte_free(buf); +} + +void * +spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_malloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_zmalloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_malloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_zmalloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr) +{ + void *new_buf = rte_realloc(buf, size, align); + if (new_buf && phys_addr) { + *phys_addr = virt_to_phys(new_buf); + } + return new_buf; +} + +void +spdk_dma_free(void *buf) +{ + spdk_free(buf); +} + +void * +spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align) +{ + const struct rte_memzone *mz; + unsigned dpdk_flags = 0; + +#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) + /* Older DPDKs do not offer such flag since their + * memzones are iova-contiguous by default. 
+ */ + if ((flags & SPDK_MEMZONE_NO_IOVA_CONTIG) == 0) { + dpdk_flags |= RTE_MEMZONE_IOVA_CONTIG; + } +#endif + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + mz = rte_memzone_reserve_aligned(name, len, socket_id, dpdk_flags, align); + + if (mz != NULL) { + memset(mz->addr, 0, len); + return mz->addr; + } else { + return NULL; + } +} + +void * +spdk_memzone_reserve(const char *name, size_t len, int socket_id, unsigned flags) +{ + return spdk_memzone_reserve_aligned(name, len, socket_id, flags, + RTE_CACHE_LINE_SIZE); +} + +void * +spdk_memzone_lookup(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return mz->addr; + } else { + return NULL; + } +} + +int +spdk_memzone_free(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return rte_memzone_free(mz); + } + + return -1; +} + +void +spdk_memzone_dump(FILE *f) +{ + rte_memzone_dump(f); +} + +struct spdk_mempool * +spdk_mempool_create_ctor(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id, + spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg) +{ + struct rte_mempool *mp; + size_t tmp; + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + /* No more than half of all elements can be in cache */ + tmp = (count / 2) / rte_lcore_count(); + if (cache_size > tmp) { + cache_size = tmp; + } + + if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) { + cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE; + } + + mp = rte_mempool_create(name, count, ele_size, cache_size, + 0, NULL, NULL, (rte_mempool_obj_cb_t *)obj_init, obj_init_arg, + socket_id, MEMPOOL_F_NO_PHYS_CONTIG); + + return (struct spdk_mempool *)mp; +} + + +struct spdk_mempool * +spdk_mempool_create(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id) +{ + return spdk_mempool_create_ctor(name, count, ele_size, cache_size, socket_id, + NULL, NULL); +} + +char * +spdk_mempool_get_name(struct spdk_mempool *mp) +{ + return ((struct rte_mempool *)mp)->name; +} + +void +spdk_mempool_free(struct spdk_mempool *mp) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(16, 7, 0, 1) + rte_mempool_free((struct rte_mempool *)mp); +#endif +} + +void * +spdk_mempool_get(struct spdk_mempool *mp) +{ + void *ele = NULL; + int rc; + + rc = rte_mempool_get((struct rte_mempool *)mp, &ele); + if (rc != 0) { + return NULL; + } + return ele; +} + +int +spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + return rte_mempool_get_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +void +spdk_mempool_put(struct spdk_mempool *mp, void *ele) +{ + rte_mempool_put((struct rte_mempool *)mp, ele); +} + +void +spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + rte_mempool_put_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +size_t +spdk_mempool_count(const struct spdk_mempool *pool) +{ +#if RTE_VERSION < RTE_VERSION_NUM(16, 7, 0, 1) + return rte_mempool_count((struct rte_mempool *)pool); +#else + return rte_mempool_avail_count((struct rte_mempool *)pool); +#endif +} + +bool +spdk_process_is_primary(void) +{ + return (rte_eal_process_type() == RTE_PROC_PRIMARY); +} + +uint64_t spdk_get_ticks(void) +{ + return rte_get_timer_cycles(); +} + +uint64_t spdk_get_ticks_hz(void) +{ + return rte_get_timer_hz(); +} + +void spdk_delay_us(unsigned int us) +{ + rte_delay_us(us); +} + +void +spdk_unaffinitize_thread(void) +{ + rte_cpuset_t new_cpuset; + long num_cores, i; + 
+ CPU_ZERO(&new_cpuset); + + num_cores = sysconf(_SC_NPROCESSORS_CONF); + + /* Create a mask containing all CPUs */ + for (i = 0; i < num_cores; i++) { + CPU_SET(i, &new_cpuset); + } + + rte_thread_set_affinity(&new_cpuset); +} + +void * +spdk_call_unaffinitized(void *cb(void *arg), void *arg) +{ + rte_cpuset_t orig_cpuset; + void *ret; + + if (cb == NULL) { + return NULL; + } + + rte_thread_get_affinity(&orig_cpuset); + + spdk_unaffinitize_thread(); + + ret = cb(arg); + + rte_thread_set_affinity(&orig_cpuset); + + return ret; +} + +struct spdk_ring * +spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id) +{ + char ring_name[64]; + static uint32_t ring_num = 0; + unsigned flags = 0; + + switch (type) { + case SPDK_RING_TYPE_SP_SC: + flags = RING_F_SP_ENQ | RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_SC: + flags = RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_MC: + flags = 0; + break; + default: + return NULL; + } + + snprintf(ring_name, sizeof(ring_name), "ring_%u_%d", + __sync_fetch_and_add(&ring_num, 1), getpid()); + + return (struct spdk_ring *)rte_ring_create(ring_name, count, socket_id, flags); +} + +void +spdk_ring_free(struct spdk_ring *ring) +{ + rte_ring_free((struct rte_ring *)ring); +} + +size_t +spdk_ring_count(struct spdk_ring *ring) +{ + return rte_ring_count((struct rte_ring *)ring); +} + +size_t +spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count) +{ + int rc; +#if RTE_VERSION < RTE_VERSION_NUM(17, 5, 0, 0) + rc = rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count); + if (rc == 0) { + return count; + } + + return 0; +#else + rc = rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count, NULL); + return rc; +#endif +} + +size_t +spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count) +{ +#if RTE_VERSION < RTE_VERSION_NUM(17, 5, 0, 0) + return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count); +#else + return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count, NULL); +#endif +} diff --git a/src/spdk/lib/env_dpdk/env.mk b/src/spdk/lib/env_dpdk/env.mk new file mode 100644 index 00000000..989bdd11 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.mk @@ -0,0 +1,112 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# This makefile snippet must define the following flags: +# ENV_CFLAGS +# ENV_CXXFLAGS +# ENV_LIBS +# ENV_LINKER_ARGS + +DPDK_DIR = $(CONFIG_DPDK_DIR) + +export DPDK_ABS_DIR = $(abspath $(DPDK_DIR)) + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/include/rte_config.h)) +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include +else +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include/dpdk +endif +DPDK_INC := -I$(DPDK_INC_DIR) + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_eal.a)) +DPDK_LIB_EXT = .a +else +DPDK_LIB_EXT = .so +endif + +DPDK_LIB_LIST = rte_eal rte_mempool rte_ring + +# librte_mempool_ring was new added from DPDK 17.05. Link this library used for +# ring based mempool management API. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_mempool_ring.*)) +DPDK_LIB_LIST += rte_mempool_ring +endif + +# librte_malloc was removed after DPDK 2.1. Link this library conditionally based on its +# existence to maintain backward compatibility. +ifneq ($(wildcard $(DPDK_ABS_DIR)/lib/librte_malloc.*),) +DPDK_LIB_LIST += rte_malloc +endif + +# librte_pci and librte_bus_pci were added in DPDK 17.11. Link these libraries conditionally +# based on their existence to maintain backward compatibility. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_pci.*)) +DPDK_LIB_LIST += rte_pci +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_bus_pci.*)) +DPDK_LIB_LIST += rte_bus_pci +endif + +ifeq ($(CONFIG_CRYPTO),y) +DPDK_LIB_LIST += rte_cryptodev rte_reorder rte_bus_vdev rte_pmd_aesni_mb rte_pmd_qat rte_mbuf +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_kvargs.*)) +DPDK_LIB_LIST += rte_kvargs +endif + +DPDK_LIB = $(DPDK_LIB_LIST:%=$(DPDK_ABS_DIR)/lib/lib%$(DPDK_LIB_EXT)) +ifeq ($(CONFIG_CRYPTO),y) +DPDK_LIB += $(SPDK_ROOT_DIR)/intel-ipsec-mb/libIPSec_MB.a +endif + +# SPDK memory registration requires experimental (deprecated) rte_memory API for DPDK 18.05 +ENV_CFLAGS = $(DPDK_INC) -Wno-deprecated-declarations +ENV_CXXFLAGS = $(ENV_CFLAGS) +ENV_DPDK_FILE = $(call spdk_lib_list_to_static_libs,env_dpdk) +ENV_LIBS = $(ENV_DPDK_FILE) $(DPDK_LIB) +ENV_LINKER_ARGS = $(ENV_DPDK_FILE) -Wl,--whole-archive $(DPDK_LIB) -Wl,--no-whole-archive + +ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_config.h)) +ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_config.h)) +ENV_LINKER_ARGS += -lnuma +endif +endif + +ifeq ($(OS),Linux) +ENV_LINKER_ARGS += -ldl +endif +ifeq ($(OS),FreeBSD) +ENV_LINKER_ARGS += -lexecinfo +endif diff --git a/src/spdk/lib/env_dpdk/env_internal.h b/src/spdk/lib/env_dpdk/env_internal.h new file mode 100644 index 00000000..d95084ea --- /dev/null +++ b/src/spdk/lib/env_dpdk/env_internal.h @@ -0,0 +1,104 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ENV_INTERNAL_H +#define SPDK_ENV_INTERNAL_H + +#include "spdk/stdinc.h" + +#define spdk_pci_device rte_pci_device + +#include "spdk/env.h" + +#include <rte_config.h> +#include <rte_version.h> +#include <rte_eal.h> +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 0) +#include <rte_bus.h> +extern struct rte_pci_bus rte_pci_bus; +#endif +#include <rte_pci.h> +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 1) +#include <rte_bus_pci.h> +#endif +#include <rte_dev.h> + +/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47], + * which is enough to cover 256 TB. + */ +#define SHIFT_256TB 48 /* (1 << 48) == 256 TB */ +#define MASK_256TB ((1ULL << SHIFT_256TB) - 1) + +#define SHIFT_1GB 30 /* (1 << 30) == 1 GB */ +#define MASK_1GB ((1ULL << SHIFT_1GB) - 1) + +#define SHIFT_2MB 21 /* (1 << 21) == 2MB */ +#define MASK_2MB ((1ULL << SHIFT_2MB) - 1) +#define VALUE_2MB (1 << SHIFT_2MB) + +#define SHIFT_4KB 12 /* (1 << 12) == 4KB */ +#define MASK_4KB ((1ULL << SHIFT_4KB) - 1) + +struct spdk_pci_enum_ctx { + struct rte_pci_driver driver; + spdk_pci_enum_cb cb_fn; + void *cb_arg; + pthread_mutex_t mtx; + bool is_registered; +}; + +int spdk_pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device); +int spdk_pci_device_fini(struct rte_pci_device *device); + +int spdk_pci_enumerate(struct spdk_pci_enum_ctx *ctx, spdk_pci_enum_cb enum_cb, void *enum_ctx); +int spdk_pci_device_attach(struct spdk_pci_enum_ctx *ctx, spdk_pci_enum_cb enum_cb, void *enum_ctx, + struct spdk_pci_addr *pci_address); + +int spdk_mem_map_init(void); +int spdk_vtophys_init(void); + +/** + * Report a DMA-capable PCI device to the vtophys translation code. + * Increases the refcount of active DMA-capable devices managed by SPDK. + * This must be called after a `rte_pci_device` is created. + */ +void spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device); + +/** + * Report the removal of a DMA-capable PCI device to the vtophys translation code. 
+ * Decreases the refcount of active DMA-capable devices managed by SPDK. + * This must be called before a `rte_pci_device` is destroyed. + */ +void spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device); + +#endif diff --git a/src/spdk/lib/env_dpdk/init.c b/src/spdk/lib/env_dpdk/init.c new file mode 100644 index 00000000..1a2fafe1 --- /dev/null +++ b/src/spdk/lib/env_dpdk/init.c @@ -0,0 +1,401 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include "spdk/version.h" + +#include <rte_config.h> +#include <rte_eal.h> + +#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" +#define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 +#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 +#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" + +static char **eal_cmdline; +static int eal_cmdline_argcount; + +static char * +_sprintf_alloc(const char *format, ...) +{ + va_list args; + va_list args_copy; + char *buf; + size_t bufsize; + int rc; + + va_start(args, format); + + /* Try with a small buffer first. */ + bufsize = 32; + + /* Limit maximum buffer size to something reasonable so we don't loop forever. */ + while (bufsize <= 1024 * 1024) { + buf = malloc(bufsize); + if (buf == NULL) { + va_end(args); + return NULL; + } + + va_copy(args_copy, args); + rc = vsnprintf(buf, bufsize, format, args_copy); + va_end(args_copy); + + /* + * If vsnprintf() returned a count within our current buffer size, we are done. + * The count does not include the \0 terminator, so rc == bufsize is not OK. + */ + if (rc >= 0 && (size_t)rc < bufsize) { + va_end(args); + return buf; + } + + /* + * vsnprintf() should return the required space, but some libc versions do not + * implement this correctly, so just double the buffer size and try again. + * + * We don't need the data in buf, so rather than realloc(), use free() and malloc() + * again to avoid a copy. 
+ */ + free(buf); + bufsize *= 2; + } + + va_end(args); + return NULL; +} + +static void +spdk_env_unlink_shared_files(void) +{ + /* Starting with DPDK 18.05, there are more files with unpredictable paths + * and filenames. The --no-shconf option prevents from creating them, but + * only for DPDK 18.08+. For DPDK 18.05 we just leave them be. + */ +#if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0) + char buffer[PATH_MAX]; + + snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid()); + if (unlink(buffer)) { + fprintf(stderr, "Unable to unlink shared memory file: %s. Error code: %d\n", buffer, errno); + } +#endif +} + +void +spdk_env_opts_init(struct spdk_env_opts *opts) +{ + if (!opts) { + return; + } + + memset(opts, 0, sizeof(*opts)); + + opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; + opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; + opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; + opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; + opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; + opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; +} + +static void +spdk_free_args(char **args, int argcount) +{ + int i; + + for (i = 0; i < argcount; i++) { + free(args[i]); + } + + if (argcount) { + free(args); + } +} + +static char ** +spdk_push_arg(char *args[], int *argcount, char *arg) +{ + char **tmp; + + if (arg == NULL) { + fprintf(stderr, "%s: NULL arg supplied\n", __func__); + spdk_free_args(args, *argcount); + return NULL; + } + + tmp = realloc(args, sizeof(char *) * (*argcount + 1)); + if (tmp == NULL) { + spdk_free_args(args, *argcount); + return NULL; + } + + tmp[*argcount] = arg; + (*argcount)++; + + return tmp; +} + +static void +spdk_destruct_eal_cmdline(void) +{ + spdk_free_args(eal_cmdline, eal_cmdline_argcount); +} + + +static int +spdk_build_eal_cmdline(const struct spdk_env_opts *opts) +{ + int argcount = 0; + char **args; + + args = NULL; + + /* set the program name */ + args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); + if (args == NULL) { + return -1; + } + + /* disable shared configuration files when in single process mode. 
This allows for cleaner shutdown */ + if (opts->shm_id < 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); + if (args == NULL) { + return -1; + } + } + + /* set the coremask */ + /* NOTE: If coremask starts with '[' and ends with ']' it is a core list + */ + if (opts->core_mask[0] == '[') { + char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); + int len = strlen(l_arg); + if (l_arg[len - 1] == ']') { + l_arg[len - 1] = '\0'; + } + args = spdk_push_arg(args, &argcount, l_arg); + } else { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); + } + + if (args == NULL) { + return -1; + } + + /* set the memory channel number */ + if (opts->mem_channel > 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); + if (args == NULL) { + return -1; + } + } + + /* set the memory size */ + if (opts->mem_size >= 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); + if (args == NULL) { + return -1; + } + } + + /* set the master core */ + if (opts->master_core > 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", + opts->master_core)); + if (args == NULL) { + return -1; + } + } + + /* set no pci if enabled */ + if (opts->no_pci) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--no-pci")); + if (args == NULL) { + return -1; + } + } + + /* create just one hugetlbfs file */ + if (opts->hugepage_single_segments) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); + if (args == NULL) { + return -1; + } + } + + /* unlink hugepages after initialization */ + if (opts->unlink_hugepage) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); + if (args == NULL) { + return -1; + } + } + +#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0) + /* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */ + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--legacy-mem")); + if (args == NULL) { + return -1; + } +#endif + + if (opts->num_pci_addr) { + size_t i; + char bdf[32]; + struct spdk_pci_addr *pci_addr = + opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; + + for (i = 0; i < opts->num_pci_addr; i++) { + spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); + args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s=%s", + (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), + bdf)); + if (args == NULL) { + return -1; + } + } + } + +#ifdef __linux__ + if (opts->shm_id < 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", + getpid())); + if (args == NULL) { + return -1; + } + } else { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", + opts->shm_id)); + if (args == NULL) { + return -1; + } + + /* Set the base virtual address - it must be an address that is not in the + * ASAN shadow region, otherwise ASAN-enabled builds will ignore the + * mmap hint. 
+ * + * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm + */ + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x200000000000")); + if (args == NULL) { + return -1; + } + + /* set the process type */ + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); + if (args == NULL) { + return -1; + } + } +#endif + + eal_cmdline = args; + eal_cmdline_argcount = argcount; + if (atexit(spdk_destruct_eal_cmdline) != 0) { + fprintf(stderr, "Failed to register cleanup handler\n"); + } + + return argcount; +} + +int spdk_env_init(const struct spdk_env_opts *opts) +{ + char **dpdk_args = NULL; + int i, rc; + int orig_optind; + + rc = spdk_build_eal_cmdline(opts); + if (rc < 0) { + fprintf(stderr, "Invalid arguments to initialize DPDK\n"); + return -1; + } + + printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); + printf("[ DPDK EAL parameters: "); + for (i = 0; i < eal_cmdline_argcount; i++) { + printf("%s ", eal_cmdline[i]); + } + printf("]\n"); + + /* DPDK rearranges the array we pass to it, so make a copy + * before passing so we can still free the individual strings + * correctly. + */ + dpdk_args = calloc(eal_cmdline_argcount, sizeof(char *)); + if (dpdk_args == NULL) { + fprintf(stderr, "Failed to allocate dpdk_args\n"); + return -1; + } + memcpy(dpdk_args, eal_cmdline, sizeof(char *) * eal_cmdline_argcount); + + fflush(stdout); + orig_optind = optind; + optind = 1; + rc = rte_eal_init(eal_cmdline_argcount, dpdk_args); + optind = orig_optind; + + free(dpdk_args); + + if (rc < 0) { + fprintf(stderr, "Failed to initialize DPDK\n"); + return -1; + } + + if (opts->shm_id < 0 && !opts->hugepage_single_segments) { + /* + * Unlink hugepage and config info files after init. This will ensure they get + * deleted on app exit, even if the app crashes and does not exit normally. + * Only do this when not in multi-process mode, since for multi-process other + * apps will need to open these files. These files are not created for + * "single file segments". + */ + spdk_env_unlink_shared_files(); + } + + if (spdk_mem_map_init() < 0) { + fprintf(stderr, "Failed to allocate mem_map\n"); + return -1; + } + if (spdk_vtophys_init() < 0) { + fprintf(stderr, "Failed to initialize vtophys\n"); + return -1; + } + + return 0; +} diff --git a/src/spdk/lib/env_dpdk/memory.c b/src/spdk/lib/env_dpdk/memory.c new file mode 100644 index 00000000..eaeccb90 --- /dev/null +++ b/src/spdk/lib/env_dpdk/memory.c @@ -0,0 +1,712 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_eal_memconfig.h> + +#include "spdk_internal/assert.h" + +#include "spdk/assert.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +#if DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +#define FN_2MB_TO_4KB(fn) (fn << (SHIFT_2MB - SHIFT_4KB)) +#define FN_4KB_TO_2MB(fn) (fn >> (SHIFT_2MB - SHIFT_4KB)) + +#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB)) +#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1)) + +/* Page is registered */ +#define REG_MAP_REGISTERED (1ULL << 62) + +/* A notification region barrier. The 2MB translation entry that's marked + * with this flag must be unregistered separately. This allows contiguous + * regions to be unregistered in the same chunks they were registered. + */ +#define REG_MAP_NOTIFY_START (1ULL << 63) + +/* Translation of a single 2MB page. */ +struct map_2mb { + uint64_t translation_2mb; +}; + +/* Second-level map table indexed by bits [21..29] of the virtual address. + * Each entry contains the address translation or error for entries that haven't + * been retrieved yet. + */ +struct map_1gb { + struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)]; +}; + +/* Top-level map table indexed by bits [30..47] of the virtual address. + * Each entry points to a second-level map table or NULL. + */ +struct map_256tb { + struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)]; +}; + +/* Page-granularity memory address translation */ +struct spdk_mem_map { + struct map_256tb map_256tb; + pthread_mutex_t mutex; + uint64_t default_translation; + struct spdk_mem_map_ops ops; + void *cb_ctx; + TAILQ_ENTRY(spdk_mem_map) tailq; +}; + +/* Registrations map. The 64 bit translations are bit fields with the + * following layout (starting with the low bits): + * 0 - 61 : reserved + * 62 - 63 : flags + */ +static struct spdk_mem_map *g_mem_reg_map; +static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps); +static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* + * Walk the currently registered memory via the main memory registration map + * and call the new map's notify callback for each virtually contiguous region. 
+ */ +static int +spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action) +{ + size_t idx_256tb; + uint64_t idx_1gb; + uint64_t contig_start = UINT64_MAX; + uint64_t contig_end = UINT64_MAX; + struct map_1gb *map_1gb; + int rc; + + if (!g_mem_reg_map) { + return -EINVAL; + } + + /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */ + pthread_mutex_lock(&g_mem_reg_map->mutex); + + for (idx_256tb = 0; + idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]); + idx_256tb++) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + } + contig_start = UINT64_MAX; + continue; + } + + for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_start == UINT64_MAX || + (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_start == UINT64_MAX) { + contig_start = vaddr; + } + + contig_end = vaddr; + } else { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + + /* This page might be a part of a neighbour region, so process + * it again. The idx_1gb will be incremented immediately. + */ + idx_1gb--; + } + contig_start = UINT64_MAX; + } + } + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return 0; + +err_unregister: + /* Unwind to the first empty translation so we don't unregister + * a region that just failed to register. 
+ */ + idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1); + idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1); + contig_start = UINT64_MAX; + contig_end = UINT64_MAX; + + /* Unregister any memory we managed to register before the failure */ + for (; idx_256tb < SIZE_MAX; idx_256tb--) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + } + contig_end = UINT64_MAX; + continue; + } + + for (; idx_1gb < UINT64_MAX; idx_1gb--) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_end == UINT64_MAX) { + contig_end = vaddr; + } + contig_start = vaddr; + } else { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + idx_1gb++; + } + contig_end = UINT64_MAX; + } + } + idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1; + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return rc; +} + +struct spdk_mem_map * +spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx) +{ + struct spdk_mem_map *map; + int rc; + + map = calloc(1, sizeof(*map)); + if (map == NULL) { + return NULL; + } + + if (pthread_mutex_init(&map->mutex, NULL)) { + free(map); + return NULL; + } + + map->default_translation = default_translation; + map->cb_ctx = cb_ctx; + if (ops) { + map->ops = *ops; + } + + if (ops && ops->notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + DEBUG_PRINT("Initial mem_map notify failed\n"); + pthread_mutex_destroy(&map->mutex); + free(map); + return NULL; + } + TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + return map; +} + +void +spdk_mem_map_free(struct spdk_mem_map **pmap) +{ + struct spdk_mem_map *map; + size_t i; + + if (!pmap) { + return; + } + + map = *pmap; + + if (!map) { + return; + } + + if (map->ops.notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER); + TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) { + free(map->map_256tb.map[i]); + } + + pthread_mutex_destroy(&map->mutex); + + free(map); + *pmap = NULL; +} + +int +spdk_mem_register(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + if (len == 0) { + return 0; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + seg_vaddr = vaddr; + seg_len = len; + while 
(seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if (reg & REG_MAP_REGISTERED) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EBUSY; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + seg_vaddr = vaddr; + seg_len = 0; + while (len > 0) { + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, + seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED); + seg_len += VALUE_2MB; + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +int +spdk_mem_unregister(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg, newreg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + /* The first page must be a start of a region. Also check if it's + * registered to make sure we don't return -ERANGE for non-registered + * regions. + */ + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EINVAL; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + /* If the next page is registered, it must be a start of a region as well, + * otherwise we'd be unregistering only a part of a region. 
+ */ + if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + seg_vaddr = vaddr; + seg_len = 0; + + while (len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0); + + if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) { + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + seg_vaddr = vaddr; + seg_len = VALUE_2MB; + } else { + seg_len += VALUE_2MB; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + if (seg_len > 0) { + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +static struct map_1gb * +spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb) +{ + struct map_1gb *map_1gb; + uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb); + size_t i; + + if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) { + return NULL; + } + + map_1gb = map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + pthread_mutex_lock(&map->mutex); + + /* Recheck to make sure nobody else got the mutex first. */ + map_1gb = map->map_256tb.map[idx_256tb]; + if (!map_1gb) { + map_1gb = malloc(sizeof(struct map_1gb)); + if (map_1gb) { + /* initialize all entries to default translation */ + for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) { + map_1gb->map[i].translation_2mb = map->default_translation; + } + map->map_256tb.map[idx_256tb] = map_1gb; + } + } + + pthread_mutex_unlock(&map->mutex); + + if (!map_1gb) { + DEBUG_PRINT("allocation failed\n"); + return NULL; + } + } + + return map_1gb; +} + +int +spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size, + uint64_t translation) +{ + uint64_t vfn_2mb; + struct map_1gb *map_1gb; + uint64_t idx_1gb; + struct map_2mb *map_2mb; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr); + return -EINVAL; + } + + /* For now, only 2 MB-aligned registrations are supported */ + if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n", + __func__, vaddr, size); + return -EINVAL; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + + while (size) { + map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb); + if (!map_1gb) { + DEBUG_PRINT("could not get %p map\n", (void *)vaddr); + return -ENOMEM; + } + + idx_1gb = MAP_1GB_IDX(vfn_2mb); + map_2mb = &map_1gb->map[idx_1gb]; + map_2mb->translation_2mb = translation; + + size -= VALUE_2MB; + vfn_2mb++; + } + + return 0; +} + +int +spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size) +{ + uint64_t vfn_2mb; + struct map_1gb *map_1gb; + uint64_t idx_1gb; + struct map_2mb *map_2mb; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr); + return -EINVAL; + } + + /* For now, only 2 MB-aligned registrations are supported */ + if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n", + __func__, vaddr, size); + return -EINVAL; + } + + vfn_2mb = vaddr >> 
SHIFT_2MB; + + while (size) { + map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb); + if (!map_1gb) { + DEBUG_PRINT("could not get %p map\n", (void *)vaddr); + return -ENOMEM; + } + + idx_1gb = MAP_1GB_IDX(vfn_2mb); + map_2mb = &map_1gb->map[idx_1gb]; + map_2mb->translation_2mb = map->default_translation; + + size -= VALUE_2MB; + vfn_2mb++; + } + + return 0; +} + +uint64_t +spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size) +{ + const struct map_1gb *map_1gb; + const struct map_2mb *map_2mb; + uint64_t idx_256tb; + uint64_t idx_1gb; + uint64_t vfn_2mb; + uint64_t total_size = 0; + uint64_t cur_size; + uint64_t prev_translation; + + if (size != NULL) { + total_size = *size; + *size = 0; + } + + if (spdk_unlikely(vaddr & ~MASK_256TB)) { + DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr); + return map->default_translation; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + return map->default_translation; + } + + cur_size = VALUE_2MB; + if (size != NULL) { + *size = VALUE_2MB; + } + + map_2mb = &map_1gb->map[idx_1gb]; + if (size == NULL || map->ops.are_contiguous == NULL || + map_2mb->translation_2mb == map->default_translation) { + return map_2mb->translation_2mb; + } + + prev_translation = map_2mb->translation_2mb;; + while (cur_size < total_size) { + vfn_2mb++; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + break; + } + + map_2mb = &map_1gb->map[idx_1gb]; + if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) { + break; + } + + cur_size += VALUE_2MB; + prev_translation = map_2mb->translation_2mb; + } + + *size = cur_size; + return prev_translation; +} + +#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) +static void +memory_hotplug_cb(enum rte_mem_event event_type, + const void *addr, size_t len, void *arg) +{ + if (event_type == RTE_MEM_EVENT_ALLOC) { + while (len > 0) { + struct rte_memseg *seg; + + seg = rte_mem_virt2memseg(addr, NULL); + assert(seg != NULL); + assert(len >= seg->hugepage_sz); + + spdk_mem_register((void *)seg->addr, seg->hugepage_sz); + addr = (void *)((uintptr_t)addr + seg->hugepage_sz); + len -= seg->hugepage_sz; + } + } else if (event_type == RTE_MEM_EVENT_FREE) { + spdk_mem_unregister((void *)addr, len); + } +} + +static int +memory_iter_cb(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg) +{ + return spdk_mem_register(ms->addr, len); +} +#endif + +int +spdk_mem_map_init(void) +{ + g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL); + if (g_mem_reg_map == NULL) { + DEBUG_PRINT("memory registration map allocation failed\n"); + return -1; + } + + /* + * Walk all DPDK memory segments and register them + * with the master memory map + */ +#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) + rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL); + rte_memseg_contig_walk(memory_iter_cb, NULL); +#else + struct rte_mem_config *mcfg; + size_t seg_idx; + + mcfg = rte_eal_get_configuration()->mem_config; + for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) { + struct rte_memseg *seg = &mcfg->memseg[seg_idx]; + + if (seg->addr == NULL) { + break; + } + + spdk_mem_register(seg->addr, seg->len); + } +#endif + return 0; +} diff --git a/src/spdk/lib/env_dpdk/pci.c b/src/spdk/lib/env_dpdk/pci.c new file mode 100644 index 
00000000..4153ac93 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci.c @@ -0,0 +1,551 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/env.h" + +#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers" + +#define PCI_CFG_SIZE 256 +#define PCI_EXT_CAP_ID_SN 0x03 + +int +spdk_pci_device_init(struct rte_pci_driver *driver, + struct rte_pci_device *device) +{ + struct spdk_pci_enum_ctx *ctx = (struct spdk_pci_enum_ctx *)driver; + int rc; + + if (!ctx->cb_fn) { +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rte_pci_unmap_device(device); +#elif RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + rte_eal_pci_unmap_device(device); +#endif + + /* Return a positive value to indicate that this device does not belong to this driver, but + * this isn't an error. 
*/ + return 1; + } + + rc = ctx->cb_fn(ctx->cb_arg, (struct spdk_pci_device *)device); + if (rc != 0) { + return rc; + } + + spdk_vtophys_pci_device_added(device); + return 0; +} + +int +spdk_pci_device_fini(struct rte_pci_device *device) +{ + spdk_vtophys_pci_device_removed(device); + return 0; +} + +void +spdk_pci_device_detach(struct spdk_pci_device *device) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) +#if RTE_VERSION < RTE_VERSION_NUM(17, 05, 0, 0) + rte_eal_device_remove(&device->device); +#endif +#endif + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + struct spdk_pci_addr addr; + char bdf[32]; + + addr.domain = device->addr.domain; + addr.bus = device->addr.bus; + addr.dev = device->addr.devid; + addr.func = device->addr.function; + + spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr); + if (rte_eal_dev_detach(&device->device) < 0) { + fprintf(stderr, "Failed to detach PCI device %s (device already removed?).\n", bdf); + } +#elif RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rte_pci_detach(&device->addr); +#else + rte_eal_pci_detach(&device->addr); +#endif +} + +int +spdk_pci_device_attach(struct spdk_pci_enum_ctx *ctx, + spdk_pci_enum_cb enum_cb, + void *enum_ctx, struct spdk_pci_addr *pci_address) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + char bdf[32]; + + spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address); +#else + struct rte_pci_addr addr; + + addr.domain = pci_address->domain; + addr.bus = pci_address->bus; + addr.devid = pci_address->dev; + addr.function = pci_address->func; +#endif + + pthread_mutex_lock(&ctx->mtx); + + if (!ctx->is_registered) { + ctx->is_registered = true; +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rte_pci_register(&ctx->driver); +#else + rte_eal_pci_register(&ctx->driver); +#endif + } + + ctx->cb_fn = enum_cb; + ctx->cb_arg = enum_ctx; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + if (rte_eal_dev_attach(bdf, "") != 0) { +#elif RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + if (rte_pci_probe_one(&addr) != 0) { +#else + if (rte_eal_pci_probe_one(&addr) != 0) { +#endif + ctx->cb_arg = NULL; + ctx->cb_fn = NULL; + pthread_mutex_unlock(&ctx->mtx); + return -1; + } + + ctx->cb_arg = NULL; + ctx->cb_fn = NULL; + pthread_mutex_unlock(&ctx->mtx); + + return 0; +} + +/* Note: You can call spdk_pci_enumerate from more than one thread + * simultaneously safely, but you cannot call spdk_pci_enumerate + * and rte_eal_pci_probe simultaneously. 
+ */ +int +spdk_pci_enumerate(struct spdk_pci_enum_ctx *ctx, + spdk_pci_enum_cb enum_cb, + void *enum_ctx) +{ + pthread_mutex_lock(&ctx->mtx); + + if (!ctx->is_registered) { + ctx->is_registered = true; +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rte_pci_register(&ctx->driver); +#else + rte_eal_pci_register(&ctx->driver); +#endif + } + + ctx->cb_fn = enum_cb; + ctx->cb_arg = enum_ctx; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + if (rte_bus_probe() != 0) { +#elif RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + if (rte_pci_probe() != 0) { +#else + if (rte_eal_pci_probe() != 0) { +#endif + ctx->cb_arg = NULL; + ctx->cb_fn = NULL; + pthread_mutex_unlock(&ctx->mtx); + return -1; + } + + ctx->cb_arg = NULL; + ctx->cb_fn = NULL; + pthread_mutex_unlock(&ctx->mtx); + + return 0; +} + +int +spdk_pci_device_map_bar(struct spdk_pci_device *device, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + struct rte_pci_device *dev = device; + + *mapped_addr = dev->mem_resource[bar].addr; + *phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr; + *size = (uint64_t)dev->mem_resource[bar].len; + + return 0; +} + +int +spdk_pci_device_unmap_bar(struct spdk_pci_device *device, uint32_t bar, void *addr) +{ + return 0; +} + +uint32_t +spdk_pci_device_get_domain(struct spdk_pci_device *dev) +{ + return dev->addr.domain; +} + +uint8_t +spdk_pci_device_get_bus(struct spdk_pci_device *dev) +{ + return dev->addr.bus; +} + +uint8_t +spdk_pci_device_get_dev(struct spdk_pci_device *dev) +{ + return dev->addr.devid; +} + +uint8_t +spdk_pci_device_get_func(struct spdk_pci_device *dev) +{ + return dev->addr.function; +} + +uint16_t +spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev) +{ + return dev->id.vendor_id; +} + +uint16_t +spdk_pci_device_get_device_id(struct spdk_pci_device *dev) +{ + return dev->id.device_id; +} + +uint16_t +spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev) +{ + return dev->id.subsystem_vendor_id; +} + +uint16_t +spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev) +{ + return dev->id.subsystem_device_id; +} + +struct spdk_pci_id +spdk_pci_device_get_id(struct spdk_pci_device *pci_dev) +{ + struct spdk_pci_id pci_id; + + pci_id.vendor_id = spdk_pci_device_get_vendor_id(pci_dev); + pci_id.device_id = spdk_pci_device_get_device_id(pci_dev); + pci_id.subvendor_id = spdk_pci_device_get_subvendor_id(pci_dev); + pci_id.subdevice_id = spdk_pci_device_get_subdevice_id(pci_dev); + + return pci_id; +} + +int +spdk_pci_device_get_socket_id(struct spdk_pci_device *pci_dev) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + return pci_dev->device.numa_node; +#else + return pci_dev->numa_node; +#endif +} + +int +spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rc = rte_pci_read_config(dev, value, len, offset); +#else + rc = rte_eal_pci_read_config(dev, value, len, offset); +#endif + return (rc > 0 && (uint32_t) rc == len) ? 0 : -1; +} + +int +spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rc = rte_pci_write_config(dev, value, len, offset); +#else + rc = rte_eal_pci_write_config(dev, value, len, offset); +#endif + return (rc > 0 && (uint32_t) rc == len) ? 
0 : -1; +} + +int +spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 1, offset); +} + +int +spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 1, offset); +} + +int +spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 2, offset); +} + +int +spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 2, offset); +} + +int +spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 4, offset); +} + +int +spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 4, offset); +} + +int +spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len) +{ + int err; + uint32_t pos, header = 0; + uint32_t i, buf[2]; + + if (len < 17) { + return -1; + } + + err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE); + if (err || !header) { + return -1; + } + + pos = PCI_CFG_SIZE; + while (1) { + if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) { + if (pos) { + /* skip the header */ + pos += 4; + for (i = 0; i < 2; i++) { + err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i); + if (err) { + return -1; + } + } + snprintf(sn, len, "%08x%08x", buf[1], buf[0]); + return 0; + } + } + pos = (header >> 20) & 0xffc; + /* 0 if no other items exist */ + if (pos < PCI_CFG_SIZE) { + return -1; + } + err = spdk_pci_device_cfg_read32(dev, &header, pos); + if (err) { + return -1; + } + } + return -1; +} + +struct spdk_pci_addr +spdk_pci_device_get_addr(struct spdk_pci_device *pci_dev) +{ + struct spdk_pci_addr pci_addr; + + pci_addr.domain = spdk_pci_device_get_domain(pci_dev); + pci_addr.bus = spdk_pci_device_get_bus(pci_dev); + pci_addr.dev = spdk_pci_device_get_dev(pci_dev); + pci_addr.func = spdk_pci_device_get_func(pci_dev); + + return pci_addr; +} + +int +spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2) +{ + if (a1->domain > a2->domain) { + return 1; + } else if (a1->domain < a2->domain) { + return -1; + } else if (a1->bus > a2->bus) { + return 1; + } else if (a1->bus < a2->bus) { + return -1; + } else if (a1->dev > a2->dev) { + return 1; + } else if (a1->dev < a2->dev) { + return -1; + } else if (a1->func > a2->func) { + return 1; + } else if (a1->func < a2->func) { + return -1; + } + + return 0; +} + +#ifdef __linux__ +int +spdk_pci_device_claim(const struct spdk_pci_addr *pci_addr) +{ + int dev_fd; + char dev_name[64]; + int pid; + void *dev_map; + struct flock pcidev_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + + snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", pci_addr->domain, + pci_addr->bus, + pci_addr->dev, pci_addr->func); + + dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (dev_fd == -1) { + fprintf(stderr, "could not open %s\n", dev_name); + return -1; + } + + if (ftruncate(dev_fd, sizeof(int)) != 0) { + fprintf(stderr, "could not truncate %s\n", dev_name); + close(dev_fd); + return -1; + } + + dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED, dev_fd, 0); + if (dev_map == MAP_FAILED) { + fprintf(stderr, "could not mmap dev 
%s (%d)\n", dev_name, errno); + close(dev_fd); + return -1; + } + + if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) { + pid = *(int *)dev_map; + fprintf(stderr, "Cannot create lock on device %s, probably" + " process %d has claimed it\n", dev_name, pid); + munmap(dev_map, sizeof(int)); + close(dev_fd); + return -1; + } + + *(int *)dev_map = (int)getpid(); + munmap(dev_map, sizeof(int)); + /* Keep dev_fd open to maintain the lock. */ + return dev_fd; +} +#endif /* __linux__ */ + +#ifdef __FreeBSD__ +int +spdk_pci_device_claim(const struct spdk_pci_addr *pci_addr) +{ + /* TODO */ + return 0; +} +#endif /* __FreeBSD__ */ + +int +spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf) +{ + unsigned domain, bus, dev, func; + + if (addr == NULL || bdf == NULL) { + return -EINVAL; + } + + if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) || + (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) { + /* Matched a full address - all variables are initialized */ + } else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) { + func = 0; + } else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) || + (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) { + domain = 0; + } else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) || + (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) { + domain = 0; + func = 0; + } else { + return -EINVAL; + } + + if (bus > 0xFF || dev > 0x1F || func > 7) { + return -EINVAL; + } + + addr->domain = domain; + addr->bus = bus; + addr->dev = dev; + addr->func = func; + + return 0; +} + +int +spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr) +{ + int rc; + + rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x", + addr->domain, addr->bus, + addr->dev, addr->func); + + if (rc > 0 && (size_t)rc < sz) { + return 0; + } + + return -1; +} diff --git a/src/spdk/lib/env_dpdk/pci_ioat.c b/src/spdk/lib/env_dpdk/pci_ioat.c new file mode 100644 index 00000000..b9640283 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_ioat.c @@ -0,0 +1,123 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) RTE_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct rte_pci_id ioat_driver_id[] = { + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SKX)}, + { .vendor_id = 0, /* sentinel */ }, +}; + +static struct 
spdk_pci_enum_ctx g_ioat_pci_drv = { + .driver = { + .drv_flags = RTE_PCI_DRV_NEED_MAPPING, + .id_table = ioat_driver_id, +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + .probe = spdk_pci_device_init, + .remove = spdk_pci_device_fini, + .driver.name = "spdk_ioat", +#else + .devinit = spdk_pci_device_init, + .devuninit = spdk_pci_device_fini, + .name = "spdk_ioat", +#endif + }, + + .cb_fn = NULL, + .cb_arg = NULL, + .mtx = PTHREAD_MUTEX_INITIALIZER, + .is_registered = false, +}; + +int +spdk_pci_ioat_device_attach(spdk_pci_enum_cb enum_cb, void *enum_ctx, + struct spdk_pci_addr *pci_address) +{ + return spdk_pci_device_attach(&g_ioat_pci_drv, enum_cb, enum_ctx, pci_address); +} + +int +spdk_pci_ioat_enumerate(spdk_pci_enum_cb enum_cb, void *enum_ctx) +{ + return spdk_pci_enumerate(&g_ioat_pci_drv, enum_cb, enum_ctx); +} diff --git a/src/spdk/lib/env_dpdk/pci_nvme.c b/src/spdk/lib/env_dpdk/pci_nvme.c new file mode 100644 index 00000000..4f3b84d1 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_nvme.c @@ -0,0 +1,89 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct rte_pci_id nvme_pci_driver_id[] = { +#if RTE_VERSION >= RTE_VERSION_NUM(16, 7, 0, 1) + { + .class_id = SPDK_PCI_CLASS_NVME, + .vendor_id = PCI_ANY_ID, + .device_id = PCI_ANY_ID, + .subsystem_vendor_id = PCI_ANY_ID, + .subsystem_device_id = PCI_ANY_ID, + }, +#else + {RTE_PCI_DEVICE(0x8086, 0x0953)}, +#endif + { .vendor_id = 0, /* sentinel */ }, +}; + +static struct spdk_pci_enum_ctx g_nvme_pci_drv = { + .driver = { + .drv_flags = RTE_PCI_DRV_NEED_MAPPING +#if RTE_VERSION >= RTE_VERSION_NUM(18, 8, 0, 0) + | RTE_PCI_DRV_WC_ACTIVATE +#endif + , + .id_table = nvme_pci_driver_id, +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + .probe = spdk_pci_device_init, + .remove = spdk_pci_device_fini, + .driver.name = "spdk_nvme", +#else + .devinit = spdk_pci_device_init, + .devuninit = spdk_pci_device_fini, + .name = "spdk_nvme", +#endif + }, + + .cb_fn = NULL, + .cb_arg = NULL, + .mtx = PTHREAD_MUTEX_INITIALIZER, + .is_registered = false, +}; + +int +spdk_pci_nvme_device_attach(spdk_pci_enum_cb enum_cb, + void *enum_ctx, struct spdk_pci_addr *pci_address) +{ + return spdk_pci_device_attach(&g_nvme_pci_drv, enum_cb, enum_ctx, pci_address); +} + +int +spdk_pci_nvme_enumerate(spdk_pci_enum_cb enum_cb, void *enum_ctx) +{ + return spdk_pci_enumerate(&g_nvme_pci_drv, enum_cb, enum_ctx); +} diff --git a/src/spdk/lib/env_dpdk/pci_virtio.c b/src/spdk/lib/env_dpdk/pci_virtio.c new file mode 100644 index 00000000..1fcb80d7 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_virtio.c @@ -0,0 +1,80 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct rte_pci_id virtio_pci_driver_id[] = { + { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) }, + { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) }, + { .vendor_id = 0, /* sentinel */ }, +}; + +static struct spdk_pci_enum_ctx g_virtio_pci_drv = { + .driver = { + .drv_flags = RTE_PCI_DRV_NEED_MAPPING +#if RTE_VERSION >= RTE_VERSION_NUM(18, 8, 0, 0) + | RTE_PCI_DRV_WC_ACTIVATE +#endif + , + .id_table = virtio_pci_driver_id, +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + .probe = spdk_pci_device_init, + .remove = spdk_pci_device_fini, + .driver.name = "spdk_virtio", +#else + .devinit = spdk_pci_device_init, + .devuninit = spdk_pci_device_fini, + .name = "spdk_virtio", +#endif + }, + + .cb_fn = NULL, + .cb_arg = NULL, + .mtx = PTHREAD_MUTEX_INITIALIZER, + .is_registered = false, +}; + +int +spdk_pci_virtio_device_attach(spdk_pci_enum_cb enum_cb, + void *enum_ctx, struct spdk_pci_addr *pci_address) +{ + return spdk_pci_device_attach(&g_virtio_pci_drv, enum_cb, enum_ctx, pci_address); +} + +int +spdk_pci_virtio_enumerate(spdk_pci_enum_cb enum_cb, void *enum_ctx) +{ + return spdk_pci_enumerate(&g_virtio_pci_drv, enum_cb, enum_ctx); +} diff --git a/src/spdk/lib/env_dpdk/threads.c b/src/spdk/lib/env_dpdk/threads.c new file mode 100644 index 00000000..55b0bbb6 --- /dev/null +++ b/src/spdk/lib/env_dpdk/threads.c @@ -0,0 +1,108 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/env.h" + +#include <rte_config.h> +#include <rte_lcore.h> + +uint32_t +spdk_env_get_core_count(void) +{ + return rte_lcore_count(); +} + +uint32_t +spdk_env_get_current_core(void) +{ + return rte_lcore_id(); +} + +uint32_t +spdk_env_get_first_core(void) +{ + return rte_get_next_lcore(-1, 0, 0); +} + +uint32_t +spdk_env_get_last_core(void) +{ + uint32_t i; + uint32_t last_core = UINT32_MAX; + + SPDK_ENV_FOREACH_CORE(i) { + last_core = i; + } + + assert(last_core != UINT32_MAX); + + return last_core; +} + +uint32_t +spdk_env_get_next_core(uint32_t prev_core) +{ + unsigned lcore; + + lcore = rte_get_next_lcore(prev_core, 0, 0); + if (lcore == RTE_MAX_LCORE) { + return UINT32_MAX; + } + return lcore; +} + +uint32_t +spdk_env_get_socket_id(uint32_t core) +{ + if (core >= RTE_MAX_LCORE) { + return SPDK_ENV_SOCKET_ID_ANY; + } + + return rte_lcore_to_socket_id(core); +} + +int +spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg) +{ + int rc; + + rc = rte_eal_remote_launch(fn, arg, core); + + return rc; +} + +void +spdk_env_thread_wait_all(void) +{ + rte_eal_mp_wait_lcore(); +} diff --git a/src/spdk/lib/env_dpdk/vtophys.c b/src/spdk/lib/env_dpdk/vtophys.c new file mode 100644 index 00000000..00e8bb6d --- /dev/null +++ b/src/spdk/lib/env_dpdk/vtophys.c @@ -0,0 +1,691 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_eal_memconfig.h> + +#include "spdk_internal/assert.h" + +#include "spdk/assert.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +#ifdef __FreeBSD__ +#define SPDK_VFIO_ENABLED 0 +#else +#include <linux/version.h> +/* + * DPDK versions before 17.11 don't provide a way to get VFIO information in the public API, + * and we can't link to internal symbols when built against shared library DPDK, + * so disable VFIO entirely in that case. 
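+ *
+ * Roughly, the #if below therefore requires:
+ *
+ *   Linux >= 3.6 (the first kernel release with VFIO) &&
+ *       (DPDK >= 17.11 || DPDK built as a static library)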
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) && \
+	(RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) || !defined(RTE_BUILD_SHARED_LIB))
+
+#define SPDK_VFIO_ENABLED 1
+#include <linux/vfio.h>
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+#include <rte_vfio.h>
+#else
+/* Internal DPDK function forward declaration */
+int pci_vfio_is_enabled(void);
+#endif
+
+struct spdk_vfio_dma_map {
+	struct vfio_iommu_type1_dma_map map;
+	struct vfio_iommu_type1_dma_unmap unmap;
+	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
+};
+
+struct vfio_cfg {
+	int fd;
+	bool enabled;
+	unsigned device_ref;
+	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
+	pthread_mutex_t mutex;
+};
+
+static struct vfio_cfg g_vfio = {
+	.fd = -1,
+	.enabled = false,
+	.device_ref = 0,
+	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
+	.mutex = PTHREAD_MUTEX_INITIALIZER
+};
+
+#else
+#define SPDK_VFIO_ENABLED 0
+#endif
+#endif
+
+#if DEBUG
+#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define DEBUG_PRINT(...)
+#endif
+
+struct spdk_vtophys_pci_device {
+	struct rte_pci_device *pci_device;
+	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
+	uint64_t ref;
+};
+
+static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
+static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
+	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
+
+static struct spdk_mem_map *g_vtophys_map;
+
+#if SPDK_VFIO_ENABLED
+static int
+vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
+{
+	struct spdk_vfio_dma_map *dma_map;
+	int ret;
+
+	dma_map = calloc(1, sizeof(*dma_map));
+	if (dma_map == NULL) {
+		return -ENOMEM;
+	}
+
+	dma_map->map.argsz = sizeof(dma_map->map);
+	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	dma_map->map.vaddr = vaddr;
+	dma_map->map.iova = iova;
+	dma_map->map.size = size;
+
+	dma_map->unmap.argsz = sizeof(dma_map->unmap);
+	dma_map->unmap.flags = 0;
+	dma_map->unmap.iova = iova;
+	dma_map->unmap.size = size;
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	if (g_vfio.device_ref == 0) {
+		/* VFIO requires at least one device (IOMMU group) to be added to
+		 * a VFIO container before it is possible to perform any IOMMU
+		 * operations on that container. This memory will be mapped once
+		 * the first device (IOMMU group) is hotplugged.
+		 *
+		 * Since the vfio container is managed internally by DPDK, it is
+		 * also possible that some device is already in that container, but
+		 * it's not managed by SPDK - e.g. a NIC attached internally
+		 * inside DPDK. We could map the memory straight away in such a
+		 * scenario, but there's no need to do it. DPDK devices clearly
+		 * don't need our mappings and hence we defer the mapping
+		 * unconditionally until the first SPDK-managed device is
+		 * hotplugged.
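+		 *
+		 * Concretely, in that case the entry is only queued on
+		 * g_vfio.maps below; the deferred VFIO_IOMMU_MAP_DMA ioctls
+		 * are replayed by spdk_vtophys_pci_device_added() once
+		 * g_vfio.device_ref becomes non-zero.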
+		 */
+		goto out_insert;
+	}
+
+	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+	if (ret) {
+		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		free(dma_map);
+		return ret;
+	}
+
+out_insert:
+	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
+	pthread_mutex_unlock(&g_vfio.mutex);
+	return 0;
+}
+
+static int
+vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
+{
+	struct spdk_vfio_dma_map *dma_map;
+	int ret;
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		if (dma_map->map.iova == iova) {
+			break;
+		}
+	}
+
+	if (dma_map == NULL) {
+		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return -ENXIO;
+	}
+
+	/** don't support partial or multiple-page unmap for now */
+	assert(dma_map->map.size == size);
+
+	if (g_vfio.device_ref == 0) {
+		/* Memory is not mapped anymore, just remove its references */
+		goto out_remove;
+	}
+
+	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+	if (ret) {
+		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return ret;
+	}
+
+out_remove:
+	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
+	pthread_mutex_unlock(&g_vfio.mutex);
+	free(dma_map);
+	return 0;
+}
+#endif
+
+static uint64_t
+vtophys_get_paddr_memseg(uint64_t vaddr)
+{
+	uintptr_t paddr;
+	struct rte_memseg *seg;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
+	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
+	if (seg != NULL) {
+		paddr = seg->phys_addr;
+		if (paddr == RTE_BAD_IOVA) {
+			return SPDK_VTOPHYS_ERROR;
+		}
+		paddr += (vaddr - (uintptr_t)seg->addr);
+		return paddr;
+	}
+#else
+	struct rte_mem_config *mcfg;
+	uint32_t seg_idx;
+
+	mcfg = rte_eal_get_configuration()->mem_config;
+	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
+		seg = &mcfg->memseg[seg_idx];
+		if (seg->addr == NULL) {
+			break;
+		}
+
+		if (vaddr >= (uintptr_t)seg->addr &&
+		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
+			paddr = seg->phys_addr;
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+			if (paddr == RTE_BAD_IOVA) {
+#else
+			if (paddr == RTE_BAD_PHYS_ADDR) {
+#endif
+				return SPDK_VTOPHYS_ERROR;
+			}
+			paddr += (vaddr - (uintptr_t)seg->addr);
+			return paddr;
+		}
+	}
+#endif
+
+	return SPDK_VTOPHYS_ERROR;
+}
+
+/* Try to get the paddr from /proc/self/pagemap */
+static uint64_t
+vtophys_get_paddr_pagemap(uint64_t vaddr)
+{
+	uintptr_t paddr;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+#define BAD_ADDR RTE_BAD_IOVA
+#define VTOPHYS rte_mem_virt2iova
+#else
+#define BAD_ADDR RTE_BAD_PHYS_ADDR
+#define VTOPHYS rte_mem_virt2phy
+#endif
+
+	/*
+	 * Note: the virt2phy/virt2iova functions have changed over time, such
+	 * that older versions may return 0 while recent versions will never
+	 * return 0 but RTE_BAD_PHYS_ADDR/IOVA instead. To support older and
+	 * newer versions, check for both return values.
+	 */
+	paddr = VTOPHYS((void *)vaddr);
+	if (paddr == 0 || paddr == BAD_ADDR) {
+		/*
+		 * The vaddr may be valid but doesn't have a backing page
+		 * assigned yet. Touch the page to ensure a backing page
+		 * gets assigned, then try to translate again.
+		 */
+		rte_atomic64_read((rte_atomic64_t *)vaddr);
+		paddr = VTOPHYS((void *)vaddr);
+	}
+	if (paddr == 0 || paddr == BAD_ADDR) {
+		/* Unable to get the physical address.
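+		 * A likely cause on Linux is /proc/self/pagemap hiding page
+		 * frame numbers from unprivileged readers (recent kernels
+		 * require CAP_SYS_ADMIN to see them); the caller in
+		 * spdk_vtophys_notify() then falls back to
+		 * vtophys_get_paddr_pci().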
		 */
+		return SPDK_VTOPHYS_ERROR;
+	}
+
+#undef BAD_ADDR
+#undef VTOPHYS
+
+	return paddr;
+}
+
+/* Try to get the paddr from pci devices */
+static uint64_t
+vtophys_get_paddr_pci(uint64_t vaddr)
+{
+	struct spdk_vtophys_pci_device *vtophys_dev;
+	uintptr_t paddr;
+	struct rte_pci_device *dev;
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 1)
+	struct rte_mem_resource *res;
+#else
+	struct rte_pci_resource *res;
+#endif
+	unsigned r;
+
+	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+		dev = vtophys_dev->pci_device;
+
+		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
+			res = &dev->mem_resource[r];
+			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
+			    vaddr < (uint64_t)res->addr + res->len) {
+				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
+				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
+					    (void *)paddr);
+				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+				return paddr;
+			}
+		}
+	}
+	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+	return SPDK_VTOPHYS_ERROR;
+}
+
+static int
+spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
+		    enum spdk_mem_map_notify_action action,
+		    void *vaddr, size_t len)
+{
+	int rc = 0, pci_phys = 0;
+	uint64_t paddr;
+
+	if ((uintptr_t)vaddr & ~MASK_256TB) {
+		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+		return -EINVAL;
+	}
+
+	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+			    __func__, vaddr, len);
+		return -EINVAL;
+	}
+
+	while (len > 0) {
+		/* Get the physical address from the DPDK memsegs */
+		paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
+
+		switch (action) {
+		case SPDK_MEM_MAP_NOTIFY_REGISTER:
+			if (paddr == SPDK_VTOPHYS_ERROR) {
+				/* This is not an address that DPDK is managing. */
+#if SPDK_VFIO_ENABLED
+				if (g_vfio.enabled) {
+					/* We'll use the virtual address as the iova. DPDK
+					 * currently uses physical addresses as the iovas (or counts
+					 * up from 0 if it can't get physical addresses), so
+					 * the range of user space virtual addresses and physical
+					 * addresses will never overlap.
+					 */
+					paddr = (uint64_t)vaddr;
+					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
+					if (rc) {
+						return -EFAULT;
+					}
+				} else
+#endif
+				{
+					/* Get the physical address from /proc/self/pagemap. */
+					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
+					if (paddr == SPDK_VTOPHYS_ERROR) {
+						/* Get the physical address from PCI devices */
+						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
+						if (paddr == SPDK_VTOPHYS_ERROR) {
+							DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
+							return -EFAULT;
+						}
+						pci_phys = 1;
+					}
+				}
+			}
+			/* A PCI BAR paddr need not be 2MiB aligned, so skip this check for PCI memory. */
+			if (!pci_phys && (paddr & MASK_2MB)) {
+				DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
+				return -EINVAL;
+			}
+
+			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
+			break;
+		case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
+#if SPDK_VFIO_ENABLED
+			if (paddr == SPDK_VTOPHYS_ERROR) {
+				/*
+				 * This is not an address that DPDK is managing. If vfio is
+				 * enabled, we need to unmap the range from the IOMMU.
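+				 * (The iova passed to vtophys_iommu_unmap_dma() below is
+				 * the translation stored in the map at registration time;
+				 * for the VFIO path that was the virtual address itself.)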
+				 */
+				if (g_vfio.enabled) {
+					uint64_t buffer_len;
+					paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
+					if (buffer_len != VALUE_2MB) {
+						return -EINVAL;
+					}
+					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
+					if (rc) {
+						return -EFAULT;
+					}
+				}
+			}
+#endif
+			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
+			break;
+		default:
+			SPDK_UNREACHABLE();
+		}
+
+		if (rc != 0) {
+			return rc;
+		}
+		vaddr += VALUE_2MB;
+		len -= VALUE_2MB;
+	}
+
+	return rc;
+}
+
+#if SPDK_VFIO_ENABLED
+
+static bool
+spdk_vfio_enabled(void)
+{
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+	return rte_vfio_is_enabled("vfio_pci");
+#else
+	return pci_vfio_is_enabled();
+#endif
+}
+
+static void
+spdk_vtophys_iommu_init(void)
+{
+	char proc_fd_path[PATH_MAX + 1];
+	char link_path[PATH_MAX + 1];
+	const char vfio_path[] = "/dev/vfio/vfio";
+	DIR *dir;
+	struct dirent *d;
+
+	if (!spdk_vfio_enabled()) {
+		return;
+	}
+
+	dir = opendir("/proc/self/fd");
+	if (!dir) {
+		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
+		return;
+	}
+
+	while ((d = readdir(dir)) != NULL) {
+		if (d->d_type != DT_LNK) {
+			continue;
+		}
+
+		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
+		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
+			continue;
+		}
+
+		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
+			sscanf(d->d_name, "%d", &g_vfio.fd);
+			break;
+		}
+	}
+
+	closedir(dir);
+
+	if (g_vfio.fd < 0) {
+		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
+		return;
+	}
+
+	g_vfio.enabled = true;
+
+	return;
+}
+#endif
+
+void
+spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
+{
+	struct spdk_vtophys_pci_device *vtophys_dev;
+	bool found = false;
+
+	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+		if (vtophys_dev->pci_device == pci_device) {
+			vtophys_dev->ref++;
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		vtophys_dev = calloc(1, sizeof(*vtophys_dev));
+		if (vtophys_dev) {
+			vtophys_dev->pci_device = pci_device;
+			vtophys_dev->ref = 1;
+			TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
+		} else {
+			DEBUG_PRINT("Memory allocation error\n");
+		}
+	}
+	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if SPDK_VFIO_ENABLED
+	struct spdk_vfio_dma_map *dma_map;
+	int ret;
+
+	if (!g_vfio.enabled) {
+		return;
+	}
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	g_vfio.device_ref++;
+	if (g_vfio.device_ref > 1) {
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return;
+	}
+
+	/* This is the first SPDK device using DPDK vfio. This means that the first
+	 * IOMMU group might have just been added to the DPDK vfio container.
+	 * From this point it is certain that the memory can be mapped now.
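+	 *
+	 * Replay every mapping that vtophys_iommu_map_dma() queued up while
+	 * device_ref was still zero; spdk_vtophys_pci_device_removed() below
+	 * performs the symmetric unmaps when the last SPDK device goes away.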
+	 */
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+		if (ret) {
+			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
+			break;
+		}
+	}
+	pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+void
+spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
+{
+	struct spdk_vtophys_pci_device *vtophys_dev;
+
+	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+		if (vtophys_dev->pci_device == pci_device) {
+			assert(vtophys_dev->ref > 0);
+			if (--vtophys_dev->ref == 0) {
+				TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
+				free(vtophys_dev);
+			}
+			break;
+		}
+	}
+	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if SPDK_VFIO_ENABLED
+	struct spdk_vfio_dma_map *dma_map;
+	int ret;
+
+	if (!g_vfio.enabled) {
+		return;
+	}
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	assert(g_vfio.device_ref > 0);
+	g_vfio.device_ref--;
+	if (g_vfio.device_ref > 0) {
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return;
+	}
+
+	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
+	 * any additional devices using its vfio container, all the mappings
+	 * will be automatically removed by the Linux vfio driver. We unmap
+	 * the memory manually to be able to easily re-map it later regardless
+	 * of other, external factors.
+	 */
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+		if (ret) {
+			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
+			break;
+		}
+	}
+	pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+int
+spdk_vtophys_init(void)
+{
+	const struct spdk_mem_map_ops vtophys_map_ops = {
+		.notify_cb = spdk_vtophys_notify,
+		.are_contiguous = NULL
+	};
+
+#if SPDK_VFIO_ENABLED
+	spdk_vtophys_iommu_init();
+#endif
+
+	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
+	if (g_vtophys_map == NULL) {
+		DEBUG_PRINT("vtophys map allocation failed\n");
+		return -1;
+	}
+	return 0;
+}
+
+uint64_t
+spdk_vtophys(void *buf)
+{
+	uint64_t vaddr, paddr_2mb;
+
+	vaddr = (uint64_t)buf;
+
+	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, NULL);
+
+	/*
+	 * SPDK_VTOPHYS_ERROR has all bits set. Bitwise-or'ing the buf offset
+	 * into it would therefore leave it unchanged, but since we now add
+	 * the offset rather than or it in (PCI vtophys results may be
+	 * unaligned), we must check the return value before the addition.
+	 */
+	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
+	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
+		return SPDK_VTOPHYS_ERROR;
+	} else {
+		return paddr_2mb + ((uint64_t)buf & MASK_2MB);
+	}
+}
+
+static int
+spdk_bus_scan(void)
+{
+	return 0;
+}
+
+static int
+spdk_bus_probe(void)
+{
+	return 0;
+}
+
+static struct rte_device *
+spdk_bus_find_device(const struct rte_device *start,
+		     rte_dev_cmp_t cmp, const void *data)
+{
+	return NULL;
+}
+
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+static enum rte_iova_mode
+spdk_bus_get_iommu_class(void)
+{
+	/* Since we register our PCI drivers after EAL init, we have no chance
+	 * of switching into RTE_IOVA_VA (virtual addresses as iova) iommu
+	 * class. DPDK uses RTE_IOVA_PA by default because for some platforms
+	 * it's the only supported mode, but then SPDK does not support those
+	 * platforms and doesn't mind defaulting to RTE_IOVA_VA.
+	 * The rte_pci bus will force RTE_IOVA_PA if RTE_IOVA_VA simply cannot
+	 * be used (i.e. at least one device on the system is bound to
+	 * uio_pci_generic), so we simply return RTE_IOVA_VA here.
+	 */
+	return RTE_IOVA_VA;
+}
+#endif
+
+struct rte_bus spdk_bus = {
+	.scan = spdk_bus_scan,
+	.probe = spdk_bus_probe,
+	.find_device = spdk_bus_find_device,
+#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
+	.get_iommu_class = spdk_bus_get_iommu_class,
+#endif
+};
+
+RTE_REGISTER_BUS(spdk, spdk_bus);