diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch) | |
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/lib/vhost | |
parent | Initial commit. (diff) | |
download | ceph-upstream.tar.xz ceph-upstream.zip |
Adding upstream version 16.2.11+ds. [upstream/16.2.11+ds] [upstream]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/vhost')
-rw-r--r-- | src/spdk/lib/vhost/Makefile | 54 | ||||
-rw-r--r-- | src/spdk/lib/vhost/rte_vhost_compat.c | 402 | ||||
-rw-r--r-- | src/spdk/lib/vhost/spdk_vhost.map | 27 | ||||
-rw-r--r-- | src/spdk/lib/vhost/vhost.c | 1634 | ||||
-rw-r--r-- | src/spdk/lib/vhost/vhost_blk.c | 1354 | ||||
-rw-r--r-- | src/spdk/lib/vhost/vhost_internal.h | 496 | ||||
-rw-r--r-- | src/spdk/lib/vhost/vhost_nvme.c | 1500 | ||||
-rw-r--r-- | src/spdk/lib/vhost/vhost_rpc.c | 652 | ||||
-rw-r--r-- | src/spdk/lib/vhost/vhost_scsi.c | 1536 |
9 files changed, 7655 insertions, 0 deletions
diff --git a/src/spdk/lib/vhost/Makefile b/src/spdk/lib/vhost/Makefile new file mode 100644 index 000000000..1fe9b6e40 --- /dev/null +++ b/src/spdk/lib/vhost/Makefile @@ -0,0 +1,54 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 4 +SO_MINOR := 0 + +CFLAGS += -I. 
+CFLAGS += $(ENV_CFLAGS) + +C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c rte_vhost_compat.c + +ifeq ($(CONFIG_VHOST_INTERNAL_LIB),y) +C_SRCS += vhost_nvme.c +CFLAGS := -I../rte_vhost $(CFLAGS) +endif + +LIBNAME = vhost + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_vhost.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/vhost/rte_vhost_compat.c b/src/spdk/lib/vhost/rte_vhost_compat.c new file mode 100644 index 000000000..53f31bfd7 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost_compat.c @@ -0,0 +1,402 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * Set of workarounds for rte_vhost to make it work with device types + * other than vhost-net. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" + +#include "spdk_internal/vhost_user.h" + +static inline void +vhost_session_mem_region_calc(uint64_t *previous_start, uint64_t *start, uint64_t *end, + uint64_t *len, struct rte_vhost_mem_region *region) +{ + *start = FLOOR_2MB(region->mmap_addr); + *end = CEIL_2MB(region->mmap_addr + region->mmap_size); + if (*start == *previous_start) { + *start += (size_t) VALUE_2MB; + } + *previous_start = *start; + *len = *end - *start; +} + +void +vhost_session_mem_register(struct rte_vhost_memory *mem) +{ + uint64_t start, end, len; + uint32_t i; + uint64_t previous_start = UINT64_MAX; + + + for (i = 0; i < mem->nregions; i++) { + vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]); + SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n", + start, len); + + if (spdk_mem_register((void *)start, len) != 0) { + SPDK_WARNLOG("Failed to register memory region %"PRIu32". 
Future vtophys translation might fail.\n", + i); + continue; + } + } +} + +void +vhost_session_mem_unregister(struct rte_vhost_memory *mem) +{ + uint64_t start, end, len; + uint32_t i; + uint64_t previous_start = UINT64_MAX; + + for (i = 0; i < mem->nregions; i++) { + vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]); + if (spdk_vtophys((void *) start, NULL) == SPDK_VTOPHYS_ERROR) { + continue; /* region has not been registered */ + } + + if (spdk_mem_unregister((void *)start, len) != 0) { + assert(false); + } + } +} + +static int +new_connection(int vid) +{ + char ifname[PATH_MAX]; + + if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) { + SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid); + return -1; + } + + return vhost_new_connection_cb(vid, ifname); +} + +static int +start_device(int vid) +{ + return vhost_start_device_cb(vid); +} + +static void +stop_device(int vid) +{ + vhost_stop_device_cb(vid); +} + +static void +destroy_connection(int vid) +{ + vhost_destroy_connection_cb(vid); +} + +static const struct vhost_device_ops g_spdk_vhost_ops = { + .new_device = start_device, + .destroy_device = stop_device, + .new_connection = new_connection, + .destroy_connection = destroy_connection, +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + .get_config = vhost_get_config_cb, + .set_config = vhost_set_config_cb, + .vhost_nvme_admin_passthrough = vhost_nvme_admin_passthrough, + .vhost_nvme_set_cq_call = vhost_nvme_set_cq_call, + .vhost_nvme_get_cap = vhost_nvme_get_cap, + .vhost_nvme_set_bar_mr = vhost_nvme_set_bar_mr, +#endif +}; + +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB + +static enum rte_vhost_msg_result +extern_vhost_pre_msg_handler(int vid, void *_msg) +{ + struct vhost_user_msg *msg = _msg; + struct spdk_vhost_session *vsession; + + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid); + assert(false); + return 
RTE_VHOST_MSG_RESULT_ERR; + } + + switch (msg->request) { + case VHOST_USER_GET_VRING_BASE: + if (vsession->forced_polling && vsession->started) { + /* Our queue is stopped for whatever reason, but we may still + * need to poll it after it's initialized again. + */ + g_spdk_vhost_ops.destroy_device(vid); + } + break; + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ADDR: + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_KICK: + if (vsession->forced_polling && vsession->started) { + /* Additional queues are being initialized, so we either processed + * enough I/Os and are switching from SeaBIOS to the OS now, or + * we were never in SeaBIOS in the first place. Either way, we + * don't need our workaround anymore. + */ + g_spdk_vhost_ops.destroy_device(vid); + vsession->forced_polling = false; + } + break; + case VHOST_USER_SET_VRING_CALL: + /* rte_vhost will close the previous callfd and won't notify + * us about any change. This will effectively make SPDK fail + * to deliver any subsequent interrupts until a session is + * restarted. We stop the session here before closing the previous + * fd (so that all interrupts must have been delivered by the + * time the descriptor is closed) and start right after (which + * will make SPDK retrieve the latest, up-to-date callfd from + * rte_vhost. + */ + case VHOST_USER_SET_MEM_TABLE: + /* rte_vhost will unmap previous memory that SPDK may still + * have pending DMA operations on. We can't let that happen, + * so stop the device before letting rte_vhost unmap anything. + * This will block until all pending I/Os are finished. + * We will start the device again from the post-processing + * message handler. 
+ */ + if (vsession->started) { + g_spdk_vhost_ops.destroy_device(vid); + vsession->needs_restart = true; + } + break; + case VHOST_USER_GET_CONFIG: { + int rc = 0; + + spdk_vhost_lock(); + if (vsession->vdev->backend->vhost_get_config) { + rc = vsession->vdev->backend->vhost_get_config(vsession->vdev, + msg->payload.cfg.region, msg->payload.cfg.size); + if (rc != 0) { + msg->size = 0; + } + } + spdk_vhost_unlock(); + + return RTE_VHOST_MSG_RESULT_REPLY; + } + case VHOST_USER_SET_CONFIG: { + int rc = 0; + + spdk_vhost_lock(); + if (vsession->vdev->backend->vhost_set_config) { + rc = vsession->vdev->backend->vhost_set_config(vsession->vdev, + msg->payload.cfg.region, msg->payload.cfg.offset, + msg->payload.cfg.size, msg->payload.cfg.flags); + } + spdk_vhost_unlock(); + + return rc == 0 ? RTE_VHOST_MSG_RESULT_OK : RTE_VHOST_MSG_RESULT_ERR; + } + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +static enum rte_vhost_msg_result +extern_vhost_post_msg_handler(int vid, void *_msg) +{ + struct vhost_user_msg *msg = _msg; + struct spdk_vhost_session *vsession; + + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid); + assert(false); + return RTE_VHOST_MSG_RESULT_ERR; + } + + if (vsession->needs_restart) { + g_spdk_vhost_ops.new_device(vid); + vsession->needs_restart = false; + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; + } + + switch (msg->request) { + case VHOST_USER_SET_FEATURES: + /* rte_vhost requires all queues to be fully initialized in order + * to start I/O processing. This behavior is not compliant with the + * vhost-user specification and doesn't work with QEMU 2.12+, which + * will only initialize 1 I/O queue for the SeaBIOS boot. + * Theoretically, we should start polling each virtqueue individually + * after receiving its SET_VRING_KICK message, but rte_vhost is not + * designed to poll individual queues. 
So here we use a workaround + * to detect when the vhost session could be potentially at that SeaBIOS + * stage and we mark it to start polling as soon as its first virtqueue + * gets initialized. This doesn't hurt any non-QEMU vhost slaves + * and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent + * at any time, but QEMU will send it at least once on SeaBIOS + * initialization - whenever powered-up or rebooted. + */ + vsession->forced_polling = true; + break; + case VHOST_USER_SET_VRING_KICK: + /* vhost-user spec tells us to start polling a queue after receiving + * its SET_VRING_KICK message. Let's do it! + */ + if (vsession->forced_polling && !vsession->started) { + g_spdk_vhost_ops.new_device(vid); + } + break; + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +struct rte_vhost_user_extern_ops g_spdk_extern_vhost_ops = { + .pre_msg_handle = extern_vhost_pre_msg_handler, + .post_msg_handle = extern_vhost_post_msg_handler, +}; + +void +vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession) +{ + int rc; + + rc = rte_vhost_extern_callback_register(vsession->vid, &g_spdk_extern_vhost_ops, NULL); + if (rc != 0) { + SPDK_ERRLOG("rte_vhost_extern_callback_register() failed for vid = %d\n", + vsession->vid); + return; + } +} + +#else /* SPDK_CONFIG_VHOST_INTERNAL_LIB */ + +void +vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession) +{ + /* nothing to do. all the changes are already incorporated into rte_vhost */ +} + +#endif + +int +vhost_register_unix_socket(const char *path, const char *ctrl_name, + uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features) +{ + struct stat file_stat; +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB + uint64_t features = 0; +#endif + + /* Register vhost driver to handle vhost messages. 
*/ + if (stat(path, &file_stat) != -1) { + if (!S_ISSOCK(file_stat.st_mode)) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The file already exists and is not a socket.\n", + path); + return -EIO; + } else if (unlink(path) != 0) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The socket already exists and failed to unlink.\n", + path); + return -EIO; + } + } + + if (rte_vhost_driver_register(path, 0) != 0) { + SPDK_ERRLOG("Could not register controller %s with vhost library\n", ctrl_name); + SPDK_ERRLOG("Check if domain socket %s already exists\n", path); + return -EIO; + } + if (rte_vhost_driver_set_features(path, virtio_features) || + rte_vhost_driver_disable_features(path, disabled_features)) { + SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", ctrl_name); + + rte_vhost_driver_unregister(path); + return -EIO; + } + + if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) { + rte_vhost_driver_unregister(path); + SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", ctrl_name); + return -EIO; + } + +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB + rte_vhost_driver_get_protocol_features(path, &features); + features |= protocol_features; + rte_vhost_driver_set_protocol_features(path, features); +#endif + + if (rte_vhost_driver_start(path) != 0) { + SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n", + ctrl_name, errno, spdk_strerror(errno)); + rte_vhost_driver_unregister(path); + return -EIO; + } + + return 0; +} + +int +vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + return rte_vhost_get_mem_table(vid, mem); +} + +int +vhost_driver_unregister(const char *path) +{ + return rte_vhost_driver_unregister(path); +} + +int +vhost_get_negotiated_features(int vid, uint64_t *negotiated_features) +{ + return rte_vhost_get_negotiated_features(vid, negotiated_features); +} diff --git a/src/spdk/lib/vhost/spdk_vhost.map b/src/spdk/lib/vhost/spdk_vhost.map new file mode 
100644 index 000000000..de38e5a5e --- /dev/null +++ b/src/spdk/lib/vhost/spdk_vhost.map @@ -0,0 +1,27 @@ +{ + global: + + # public functions + spdk_vhost_set_socket_path; + spdk_vhost_init; + spdk_vhost_fini; + spdk_vhost_config_json; + spdk_vhost_shutdown_cb; + spdk_vhost_lock; + spdk_vhost_trylock; + spdk_vhost_unlock; + spdk_vhost_dev_find; + spdk_vhost_dev_next; + spdk_vhost_dev_get_name; + spdk_vhost_dev_get_cpumask; + spdk_vhost_set_coalescing; + spdk_vhost_get_coalescing; + spdk_vhost_scsi_dev_construct; + spdk_vhost_scsi_dev_add_tgt; + spdk_vhost_scsi_dev_get_tgt; + spdk_vhost_scsi_dev_remove_tgt; + spdk_vhost_blk_construct; + spdk_vhost_dev_remove; + + local: *; +}; diff --git a/src/spdk/lib/vhost/vhost.c b/src/spdk/lib/vhost/vhost.c new file mode 100644 index 000000000..b904d8bf9 --- /dev/null +++ b/src/spdk/lib/vhost/vhost.c @@ -0,0 +1,1634 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" + +static struct spdk_cpuset g_vhost_core_mask; + +/* Path to folder where character device will be created. Can be set by user. */ +static char dev_dirname[PATH_MAX] = ""; + +/* Thread performing all vhost management operations */ +static struct spdk_thread *g_vhost_init_thread; + +static spdk_vhost_fini_cb g_fini_cpl_cb; + +/** + * DPDK calls our callbacks synchronously but the work those callbacks + * perform needs to be async. Luckily, all DPDK callbacks are called on + * a DPDK-internal pthread, so we'll just wait on a semaphore in there. + */ +static sem_t g_dpdk_sem; + +/** Return code for the current DPDK callback */ +static int g_dpdk_response; + +struct vhost_session_fn_ctx { + /** Device pointer obtained before enqueuing the event */ + struct spdk_vhost_dev *vdev; + + /** ID of the session to send event to. */ + uint32_t vsession_id; + + /** User provided function to be executed on session's thread. */ + spdk_vhost_session_fn cb_fn; + + /** + * User provided function to be called on the init thread + * after iterating through all sessions. 
+ */ + spdk_vhost_dev_fn cpl_fn; + + /** Custom user context */ + void *user_ctx; +}; + +static TAILQ_HEAD(, spdk_vhost_dev) g_vhost_devices = TAILQ_HEAD_INITIALIZER( + g_vhost_devices); +static pthread_mutex_t g_vhost_mutex = PTHREAD_MUTEX_INITIALIZER; + +void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len) +{ + void *vva; + uint64_t newlen; + + newlen = len; + vva = (void *)rte_vhost_va_from_guest_pa(vsession->mem, addr, &newlen); + if (newlen != len) { + return NULL; + } + + return vva; + +} + +static void +vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_id) +{ + struct vring_desc *desc, *desc_table; + uint32_t desc_table_size; + int rc; + + if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) { + return; + } + + rc = vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Can't log used ring descriptors!\n"); + return; + } + + do { + if (vhost_vring_desc_is_wr(desc)) { + /* To be honest, only pages realy touched should be logged, but + * doing so would require tracking those changes in each backed. + * Also backend most likely will touch all/most of those pages so + * for lets assume we touched all pages passed to as writeable buffers. 
*/ + rte_vhost_log_write(vsession->vid, desc->addr, desc->len); + } + vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + } while (desc); +} + +static void +vhost_log_used_vring_elem(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t idx) +{ + uint64_t offset, len; + + if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) { + return; + } + + if (spdk_unlikely(virtqueue->packed.packed_ring)) { + offset = idx * sizeof(struct vring_packed_desc); + len = sizeof(struct vring_packed_desc); + } else { + offset = offsetof(struct vring_used, ring[idx]); + len = sizeof(virtqueue->vring.used->ring[idx]); + } + + rte_vhost_log_used_vring(vsession->vid, virtqueue->vring_idx, offset, len); +} + +static void +vhost_log_used_vring_idx(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue) +{ + uint64_t offset, len; + uint16_t vq_idx; + + if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) { + return; + } + + offset = offsetof(struct vring_used, idx); + len = sizeof(virtqueue->vring.used->idx); + vq_idx = virtqueue - vsession->virtqueue; + + rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len); +} + +/* + * Get available requests from avail ring. + */ +uint16_t +vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs, + uint16_t reqs_len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_avail *avail = vring->avail; + uint16_t size_mask = vring->size - 1; + uint16_t last_idx = virtqueue->last_avail_idx, avail_idx = avail->idx; + uint16_t count, i; + + count = avail_idx - last_idx; + if (spdk_likely(count == 0)) { + return 0; + } + + if (spdk_unlikely(count > vring->size)) { + /* TODO: the queue is unrecoverably broken and should be marked so. + * For now we will fail silently and report there are no new avail entries. 
+ */ + return 0; + } + + count = spdk_min(count, reqs_len); + virtqueue->last_avail_idx += count; + for (i = 0; i < count; i++) { + reqs[i] = vring->avail->ring[(last_idx + i) & size_mask]; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n", + last_idx, avail_idx, count); + + return count; +} + +static bool +vhost_vring_desc_is_indirect(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_INDIRECT); +} + +static bool +vhost_vring_packed_desc_is_indirect(struct vring_packed_desc *cur_desc) +{ + return (cur_desc->flags & VRING_DESC_F_INDIRECT) != 0; +} + +int +vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size) +{ + if (spdk_unlikely(req_idx >= virtqueue->vring.size)) { + return -1; + } + + *desc = &virtqueue->vring.desc[req_idx]; + + if (vhost_vring_desc_is_indirect(*desc)) { + *desc_table_size = (*desc)->len / sizeof(**desc); + *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr, + sizeof(**desc) * *desc_table_size); + *desc = *desc_table; + if (*desc == NULL) { + return -1; + } + + return 0; + } + + *desc_table = virtqueue->vring.desc; + *desc_table_size = virtqueue->vring.size; + + return 0; +} + +int +vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_packed_desc **desc, + struct vring_packed_desc **desc_table, uint32_t *desc_table_size) +{ + *desc = &virtqueue->vring.desc_packed[req_idx]; + + /* In packed ring when the desc is non-indirect we get next desc + * by judging (desc->flag & VRING_DESC_F_NEXT) != 0. When the desc + * is indirect we get next desc by idx and desc_table_size. It's + * different from split ring. 
+ */ + if (vhost_vring_packed_desc_is_indirect(*desc)) { + *desc_table_size = (*desc)->len / sizeof(struct vring_packed_desc); + *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr, + (*desc)->len); + *desc = *desc_table; + if (spdk_unlikely(*desc == NULL)) { + return -1; + } + } else { + *desc_table = NULL; + *desc_table_size = 0; + } + + return 0; +} + +int +vhost_vq_used_signal(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue) +{ + if (virtqueue->used_req_cnt == 0) { + return 0; + } + + virtqueue->req_cnt += virtqueue->used_req_cnt; + virtqueue->used_req_cnt = 0; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n", + virtqueue - vsession->virtqueue, virtqueue->last_used_idx); + + if (rte_vhost_vring_call(vsession->vid, virtqueue->vring_idx) == 0) { + /* interrupt signalled */ + return 1; + } else { + /* interrupt not signalled */ + return 0; + } +} + + +static void +check_session_io_stats(struct spdk_vhost_session *vsession, uint64_t now) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint32_t irq_delay_base = vsession->coalescing_delay_time_base; + uint32_t io_threshold = vsession->coalescing_io_rate_threshold; + int32_t irq_delay; + uint32_t req_cnt; + uint16_t q_idx; + + if (now < vsession->next_stats_check_time) { + return; + } + + vsession->next_stats_check_time = now + vsession->stats_check_interval; + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + virtqueue = &vsession->virtqueue[q_idx]; + + req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt; + if (req_cnt <= io_threshold) { + continue; + } + + irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold; + virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay); + + virtqueue->req_cnt = 0; + virtqueue->next_event_time = now; + } +} + +static inline bool +vhost_vq_event_is_suppressed(struct spdk_vhost_virtqueue *vq) +{ + if (spdk_unlikely(vq->packed.packed_ring)) { + if 
(vq->vring.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE) { + return true; + } + } else { + if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { + return true; + } + } + + return false; +} + +void +vhost_session_used_signal(struct spdk_vhost_session *vsession) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint64_t now; + uint16_t q_idx; + + if (vsession->coalescing_delay_time_base == 0) { + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + virtqueue = &vsession->virtqueue[q_idx]; + + if (virtqueue->vring.desc == NULL) { + continue; + } + + if (vhost_vq_event_is_suppressed(virtqueue)) { + continue; + } + + vhost_vq_used_signal(vsession, virtqueue); + } + } else { + now = spdk_get_ticks(); + check_session_io_stats(vsession, now); + + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + virtqueue = &vsession->virtqueue[q_idx]; + + /* No need for event right now */ + if (now < virtqueue->next_event_time) { + continue; + } + + if (vhost_vq_event_is_suppressed(virtqueue)) { + continue; + } + + if (!vhost_vq_used_signal(vsession, virtqueue)) { + continue; + } + + /* Syscall is quite long so update time */ + now = spdk_get_ticks(); + virtqueue->next_event_time = now + virtqueue->irq_delay_time; + } + } +} + +static int +vhost_session_set_coalescing(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *ctx) +{ + vsession->coalescing_delay_time_base = + vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL; + vsession->coalescing_io_rate_threshold = + vdev->coalescing_iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U; + return 0; +} + +static int +vhost_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, + uint32_t iops_threshold) +{ + uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL; + uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U; + + if (delay_time_base >= UINT32_MAX) { + SPDK_ERRLOG("Delay time of %"PRIu32" is to big\n", 
delay_base_us); + return -EINVAL; + } else if (io_rate == 0) { + SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate, + 1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS); + return -EINVAL; + } + + vdev->coalescing_delay_us = delay_base_us; + vdev->coalescing_iops_threshold = iops_threshold; + return 0; +} + +int +spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, + uint32_t iops_threshold) +{ + int rc; + + rc = vhost_dev_set_coalescing(vdev, delay_base_us, iops_threshold); + if (rc != 0) { + return rc; + } + + vhost_dev_foreach_session(vdev, vhost_session_set_coalescing, NULL, NULL); + return 0; +} + +void +spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us, + uint32_t *iops_threshold) +{ + if (delay_base_us) { + *delay_base_us = vdev->coalescing_delay_us; + } + + if (iops_threshold) { + *iops_threshold = vdev->coalescing_iops_threshold; + } +} + +/* + * Enqueue id and len to used ring. + */ +void +vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t id, uint32_t len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_used *used = vring->used; + uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1); + uint16_t vq_idx = virtqueue->vring_idx; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n", + virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len); + + vhost_log_req_desc(vsession, virtqueue, id); + + virtqueue->last_used_idx++; + used->ring[last_idx].id = id; + used->ring[last_idx].len = len; + + /* Ensure the used ring is updated before we log it or increment used->idx. 
*/ + spdk_smp_wmb(); + + rte_vhost_set_last_inflight_io_split(vsession->vid, vq_idx, id); + + vhost_log_used_vring_elem(vsession, virtqueue, last_idx); + * (volatile uint16_t *) &used->idx = virtqueue->last_used_idx; + vhost_log_used_vring_idx(vsession, virtqueue); + + rte_vhost_clr_inflight_desc_split(vsession->vid, vq_idx, virtqueue->last_used_idx, id); + + virtqueue->used_req_cnt++; +} + +void +vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t num_descs, uint16_t buffer_id, + uint32_t length) +{ + struct vring_packed_desc *desc = &virtqueue->vring.desc_packed[virtqueue->last_used_idx]; + bool used, avail; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - RING: buffer_id=%"PRIu16"\n", + virtqueue - vsession->virtqueue, buffer_id); + + /* When the descriptor is used, two flags in descriptor + * avail flag and used flag are set to equal + * and used flag value == used_wrap_counter. + */ + used = !!(desc->flags & VRING_DESC_F_USED); + avail = !!(desc->flags & VRING_DESC_F_AVAIL); + if (spdk_unlikely(used == virtqueue->packed.used_phase && used == avail)) { + SPDK_ERRLOG("descriptor has been used before\n"); + return; + } + + /* In used desc addr is unused and len specifies the buffer length + * that has been written to by the device. + */ + desc->addr = 0; + desc->len = length; + + /* This bit specifies whether any data has been written by the device */ + if (length != 0) { + desc->flags |= VRING_DESC_F_WRITE; + } + + /* Buffer ID is included in the last descriptor in the list. + * The driver needs to keep track of the size of the list corresponding + * to each buffer ID. + */ + desc->id = buffer_id; + + /* A device MUST NOT make the descriptor used before buffer_id is + * written to the descriptor. + */ + spdk_smp_wmb(); + /* To mark a desc as used, the device sets the F_USED bit in flags to match + * the internal Device ring wrap counter. 
It also sets the F_AVAIL bit to + * match the same value. + */ + if (virtqueue->packed.used_phase) { + desc->flags |= VRING_DESC_F_AVAIL_USED; + } else { + desc->flags &= ~VRING_DESC_F_AVAIL_USED; + } + + vhost_log_used_vring_elem(vsession, virtqueue, virtqueue->last_used_idx); + virtqueue->last_used_idx += num_descs; + if (virtqueue->last_used_idx >= virtqueue->vring.size) { + virtqueue->last_used_idx -= virtqueue->vring.size; + virtqueue->packed.used_phase = !virtqueue->packed.used_phase; + } + + virtqueue->used_req_cnt++; +} + +bool +vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue) +{ + uint16_t flags = virtqueue->vring.desc_packed[virtqueue->last_avail_idx].flags; + + /* To mark a desc as available, the driver sets the F_AVAIL bit in flags + * to match the internal avail wrap counter. It also sets the F_USED bit to + * match the inverse value but it's not mandatory. + */ + return (!!(flags & VRING_DESC_F_AVAIL) == virtqueue->packed.avail_phase); +} + +bool +vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc) +{ + return (cur_desc->flags & VRING_DESC_F_WRITE) != 0; +} + +int +vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx, + struct spdk_vhost_virtqueue *vq, + struct vring_packed_desc *desc_table, + uint32_t desc_table_size) +{ + if (desc_table != NULL) { + /* When the desc_table isn't NULL means it's indirect and we get the next + * desc by req_idx and desc_table_size. The return value is NULL means + * we reach the last desc of this request. + */ + (*req_idx)++; + if (*req_idx < desc_table_size) { + *desc = &desc_table[*req_idx]; + } else { + *desc = NULL; + } + } else { + /* When the desc_table is NULL means it's non-indirect and we get the next + * desc by req_idx and F_NEXT in flags. The return value is NULL means + * we reach the last desc of this request. When return new desc + * we update the req_idx too. 
+ */ + if (((*desc)->flags & VRING_DESC_F_NEXT) == 0) { + *desc = NULL; + return 0; + } + + *req_idx = (*req_idx + 1) % vq->vring.size; + *desc = &vq->vring.desc_packed[*req_idx]; + } + + return 0; +} + +static int +vhost_vring_desc_payload_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, uintptr_t payload, uint64_t remaining) +{ + uintptr_t vva; + uint64_t len; + + do { + if (*iov_index >= SPDK_VHOST_IOVS_MAX) { + SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX); + return -1; + } + len = remaining; + vva = (uintptr_t)rte_vhost_va_from_guest_pa(vsession->mem, payload, &len); + if (vva == 0 || len == 0) { + SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload); + return -1; + } + iov[*iov_index].iov_base = (void *)vva; + iov[*iov_index].iov_len = len; + remaining -= len; + payload += len; + (*iov_index)++; + } while (remaining); + + return 0; +} + +int +vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_packed_desc *desc) +{ + return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index, + desc->addr, desc->len); +} + +/* 1, Traverse the desc chain to get the buffer_id and return buffer_id as task_idx. + * 2, Update the vq->last_avail_idx to point next available desc chain. + * 3, Update the avail_wrap_counter if last_avail_idx overturn. 
+ */ +uint16_t +vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx, + uint16_t *num_descs) +{ + struct vring_packed_desc *desc; + uint16_t desc_head = req_idx; + + *num_descs = 1; + + desc = &vq->vring.desc_packed[req_idx]; + if (!vhost_vring_packed_desc_is_indirect(desc)) { + while ((desc->flags & VRING_DESC_F_NEXT) != 0) { + req_idx = (req_idx + 1) % vq->vring.size; + desc = &vq->vring.desc_packed[req_idx]; + (*num_descs)++; + } + } + + /* Queue Size doesn't have to be a power of 2 + * Device maintains last_avail_idx so we can make sure + * the value is valid(0 ~ vring.size - 1) + */ + vq->last_avail_idx = (req_idx + 1) % vq->vring.size; + if (vq->last_avail_idx < desc_head) { + vq->packed.avail_phase = !vq->packed.avail_phase; + } + + return desc->id; +} + +int +vhost_vring_desc_get_next(struct vring_desc **desc, + struct vring_desc *desc_table, uint32_t desc_table_size) +{ + struct vring_desc *old_desc = *desc; + uint16_t next_idx; + + if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) { + *desc = NULL; + return 0; + } + + next_idx = old_desc->next; + if (spdk_unlikely(next_idx >= desc_table_size)) { + *desc = NULL; + return -1; + } + + *desc = &desc_table[next_idx]; + return 0; +} + +int +vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_desc *desc) +{ + return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index, + desc->addr, desc->len); +} + +static struct spdk_vhost_session * +vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id) +{ + struct spdk_vhost_session *vsession; + + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->id == id) { + return vsession; + } + } + + return NULL; +} + +struct spdk_vhost_session * +vhost_session_find_by_vid(int vid) +{ + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + + TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) { + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) 
{ + if (vsession->vid == vid) { + return vsession; + } + } + } + + return NULL; +} + +struct spdk_vhost_dev * +spdk_vhost_dev_next(struct spdk_vhost_dev *vdev) +{ + if (vdev == NULL) { + return TAILQ_FIRST(&g_vhost_devices); + } + + return TAILQ_NEXT(vdev, tailq); +} + +struct spdk_vhost_dev * +spdk_vhost_dev_find(const char *ctrlr_name) +{ + struct spdk_vhost_dev *vdev; + size_t dev_dirname_len = strlen(dev_dirname); + + if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) { + ctrlr_name += dev_dirname_len; + } + + TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) { + if (strcmp(vdev->name, ctrlr_name) == 0) { + return vdev; + } + } + + return NULL; +} + +static int +vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask) +{ + int rc; + + if (cpumask == NULL) { + return -1; + } + + if (mask == NULL) { + spdk_cpuset_copy(cpumask, &g_vhost_core_mask); + return 0; + } + + rc = spdk_cpuset_parse(cpumask, mask); + if (rc < 0) { + SPDK_ERRLOG("invalid cpumask %s\n", mask); + return -1; + } + + spdk_cpuset_and(cpumask, &g_vhost_core_mask); + + if (spdk_cpuset_count(cpumask) == 0) { + SPDK_ERRLOG("no cpu is selected among core mask(=%s)\n", + spdk_cpuset_fmt(&g_vhost_core_mask)); + return -1; + } + + return 0; +} + +static void +vhost_setup_core_mask(void *ctx) +{ + struct spdk_thread *thread = spdk_get_thread(); + spdk_cpuset_or(&g_vhost_core_mask, spdk_thread_get_cpumask(thread)); +} + +static void +vhost_setup_core_mask_done(void *ctx) +{ + spdk_vhost_init_cb init_cb = ctx; + + if (spdk_cpuset_count(&g_vhost_core_mask) == 0) { + init_cb(-ECHILD); + return; + } + + init_cb(0); +} + +static void +vhost_dev_thread_exit(void *arg1) +{ + spdk_thread_exit(spdk_get_thread()); +} + +int +vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct spdk_vhost_dev_backend *backend) +{ + char path[PATH_MAX]; + struct spdk_cpuset cpumask = {}; + int rc; + + assert(vdev); + if (name == NULL) { + SPDK_ERRLOG("Can't register 
controller with no name\n"); + return -EINVAL; + } + + if (vhost_parse_core_mask(mask_str, &cpumask) != 0) { + SPDK_ERRLOG("cpumask %s is invalid (core mask is 0x%s)\n", + mask_str, spdk_cpuset_fmt(&g_vhost_core_mask)); + return -EINVAL; + } + + if (spdk_vhost_dev_find(name)) { + SPDK_ERRLOG("vhost controller %s already exists.\n", name); + return -EEXIST; + } + + if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) { + SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname, + name); + return -EINVAL; + } + + vdev->name = strdup(name); + vdev->path = strdup(path); + if (vdev->name == NULL || vdev->path == NULL) { + rc = -EIO; + goto out; + } + + vdev->thread = spdk_thread_create(vdev->name, &cpumask); + if (vdev->thread == NULL) { + SPDK_ERRLOG("Failed to create thread for vhost controller %s.\n", name); + rc = -EIO; + goto out; + } + + vdev->registered = true; + vdev->backend = backend; + TAILQ_INIT(&vdev->vsessions); + + vhost_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US, + SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD); + + if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features, + vdev->protocol_features)) { + spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL); + rc = -EIO; + goto out; + } + + TAILQ_INSERT_TAIL(&g_vhost_devices, vdev, tailq); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name); + return 0; + +out: + free(vdev->name); + free(vdev->path); + return rc; +} + +int +vhost_dev_unregister(struct spdk_vhost_dev *vdev) +{ + if (!TAILQ_EMPTY(&vdev->vsessions)) { + SPDK_ERRLOG("Controller %s has still valid connection.\n", vdev->name); + return -EBUSY; + } + + if (vdev->registered && vhost_driver_unregister(vdev->path) != 0) { + SPDK_ERRLOG("Could not unregister controller %s with vhost library\n" + "Check if domain socket %s still exists\n", + vdev->name, vdev->path); + return -EIO; + } + + 
SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name); + + spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL); + + free(vdev->name); + free(vdev->path); + TAILQ_REMOVE(&g_vhost_devices, vdev, tailq); + return 0; +} + +const char * +spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return vdev->name; +} + +const struct spdk_cpuset * +spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return spdk_thread_get_cpumask(vdev->thread); +} + +static void +wait_for_semaphore(int timeout_sec, const char *errmsg) +{ + struct timespec timeout; + int rc; + + clock_gettime(CLOCK_REALTIME, &timeout); + timeout.tv_sec += timeout_sec; + rc = sem_timedwait(&g_dpdk_sem, &timeout); + if (rc != 0) { + SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg); + sem_wait(&g_dpdk_sem); + } +} + +static void +vhost_session_cb_done(int rc) +{ + g_dpdk_response = rc; + sem_post(&g_dpdk_sem); +} + +void +vhost_session_start_done(struct spdk_vhost_session *vsession, int response) +{ + if (response == 0) { + vsession->started = true; + + assert(vsession->vdev->active_session_num < UINT32_MAX); + vsession->vdev->active_session_num++; + } + + vhost_session_cb_done(response); +} + +void +vhost_session_stop_done(struct spdk_vhost_session *vsession, int response) +{ + if (response == 0) { + vsession->started = false; + + assert(vsession->vdev->active_session_num > 0); + vsession->vdev->active_session_num--; + } + + vhost_session_cb_done(response); +} + +static void +vhost_event_cb(void *arg1) +{ + struct vhost_session_fn_ctx *ctx = arg1; + struct spdk_vhost_session *vsession; + + if (pthread_mutex_trylock(&g_vhost_mutex) != 0) { + spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1); + return; + } + + vsession = vhost_session_find_by_id(ctx->vdev, ctx->vsession_id); + ctx->cb_fn(ctx->vdev, vsession, NULL); + pthread_mutex_unlock(&g_vhost_mutex); +} + +int +vhost_session_send_event(struct 
spdk_vhost_session *vsession, + spdk_vhost_session_fn cb_fn, unsigned timeout_sec, + const char *errmsg) +{ + struct vhost_session_fn_ctx ev_ctx = {0}; + struct spdk_vhost_dev *vdev = vsession->vdev; + + ev_ctx.vdev = vdev; + ev_ctx.vsession_id = vsession->id; + ev_ctx.cb_fn = cb_fn; + + spdk_thread_send_msg(vdev->thread, vhost_event_cb, &ev_ctx); + + pthread_mutex_unlock(&g_vhost_mutex); + wait_for_semaphore(timeout_sec, errmsg); + pthread_mutex_lock(&g_vhost_mutex); + + return g_dpdk_response; +} + +static void +foreach_session_finish_cb(void *arg1) +{ + struct vhost_session_fn_ctx *ev_ctx = arg1; + struct spdk_vhost_dev *vdev = ev_ctx->vdev; + + if (pthread_mutex_trylock(&g_vhost_mutex) != 0) { + spdk_thread_send_msg(spdk_get_thread(), + foreach_session_finish_cb, arg1); + return; + } + + assert(vdev->pending_async_op_num > 0); + vdev->pending_async_op_num--; + if (ev_ctx->cpl_fn != NULL) { + ev_ctx->cpl_fn(vdev, ev_ctx->user_ctx); + } + + pthread_mutex_unlock(&g_vhost_mutex); + free(ev_ctx); +} + +static void +foreach_session(void *arg1) +{ + struct vhost_session_fn_ctx *ev_ctx = arg1; + struct spdk_vhost_session *vsession; + struct spdk_vhost_dev *vdev = ev_ctx->vdev; + int rc; + + if (pthread_mutex_trylock(&g_vhost_mutex) != 0) { + spdk_thread_send_msg(spdk_get_thread(), foreach_session, arg1); + return; + } + + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->initialized) { + rc = ev_ctx->cb_fn(vdev, vsession, ev_ctx->user_ctx); + if (rc < 0) { + goto out; + } + } + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + + spdk_thread_send_msg(g_vhost_init_thread, foreach_session_finish_cb, arg1); +} + +void +vhost_dev_foreach_session(struct spdk_vhost_dev *vdev, + spdk_vhost_session_fn fn, + spdk_vhost_dev_fn cpl_fn, + void *arg) +{ + struct vhost_session_fn_ctx *ev_ctx; + + ev_ctx = calloc(1, sizeof(*ev_ctx)); + if (ev_ctx == NULL) { + SPDK_ERRLOG("Failed to alloc vhost event.\n"); + assert(false); + return; + } + + ev_ctx->vdev = vdev; + 
ev_ctx->cb_fn = fn; + ev_ctx->cpl_fn = cpl_fn; + ev_ctx->user_ctx = arg; + + assert(vdev->pending_async_op_num < UINT32_MAX); + vdev->pending_async_op_num++; + + spdk_thread_send_msg(vdev->thread, foreach_session, ev_ctx); +} + +static int +_stop_session(struct spdk_vhost_session *vsession) +{ + struct spdk_vhost_dev *vdev = vsession->vdev; + struct spdk_vhost_virtqueue *q; + int rc; + uint16_t i; + + rc = vdev->backend->stop_session(vsession); + if (rc != 0) { + SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid); + pthread_mutex_unlock(&g_vhost_mutex); + return rc; + } + + for (i = 0; i < vsession->max_queues; i++) { + q = &vsession->virtqueue[i]; + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (q->vring.desc == NULL) { + continue; + } + + /* Packed virtqueues support up to 2^15 entries each + * so left one bit can be used as wrap counter. + */ + if (q->packed.packed_ring) { + q->last_avail_idx = q->last_avail_idx | + ((uint16_t)q->packed.avail_phase << 15); + q->last_used_idx = q->last_used_idx | + ((uint16_t)q->packed.used_phase << 15); + } + + rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx); + } + + vhost_session_mem_unregister(vsession->mem); + free(vsession->mem); + + return 0; +} + +int +vhost_stop_device_cb(int vid) +{ + struct spdk_vhost_session *vsession; + int rc; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + pthread_mutex_unlock(&g_vhost_mutex); + return -EINVAL; + } + + if (!vsession->started) { + /* already stopped, nothing to do */ + pthread_mutex_unlock(&g_vhost_mutex); + return -EALREADY; + } + + rc = _stop_session(vsession); + pthread_mutex_unlock(&g_vhost_mutex); + + return rc; +} + +int +vhost_start_device_cb(int vid) +{ + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + int 
rc = -1; + uint16_t i; + bool packed_ring; + + pthread_mutex_lock(&g_vhost_mutex); + + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + goto out; + } + + vdev = vsession->vdev; + if (vsession->started) { + /* already started, nothing to do */ + rc = 0; + goto out; + } + + if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid); + goto out; + } + + packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0); + + vsession->max_queues = 0; + memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue)); + for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) { + struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i]; + + q->vring_idx = -1; + if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) { + continue; + } + q->vring_idx = i; + rte_vhost_get_vhost_ring_inflight(vid, i, &q->vring_inflight); + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (q->vring.desc == NULL || q->vring.size == 0) { + continue; + } + + if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) { + q->vring.desc = NULL; + continue; + } + + if (packed_ring) { + /* Packed virtqueues support up to 2^15 entries each + * so left one bit can be used as wrap counter. + */ + q->packed.avail_phase = q->last_avail_idx >> 15; + q->last_avail_idx = q->last_avail_idx & 0x7FFF; + q->packed.used_phase = q->last_used_idx >> 15; + q->last_used_idx = q->last_used_idx & 0x7FFF; + + /* Disable I/O submission notifications, we'll be polling. */ + q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE; + } else { + /* Disable I/O submission notifications, we'll be polling. 
*/ + q->vring.used->flags = VRING_USED_F_NO_NOTIFY; + } + + q->packed.packed_ring = packed_ring; + vsession->max_queues = i + 1; + } + + if (vhost_get_mem_table(vid, &vsession->mem) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid); + goto out; + } + + /* + * Not sure right now but this look like some kind of QEMU bug and guest IO + * might be frozed without kicking all queues after live-migration. This look like + * the previous vhost instance failed to effectively deliver all interrupts before + * the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts + * should be ignored by guest virtio driver. + * + * Tested on QEMU 2.10.91 and 2.11.50. + */ + for (i = 0; i < vsession->max_queues; i++) { + struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i]; + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (q->vring.desc != NULL && q->vring.size > 0) { + rte_vhost_vring_call(vsession->vid, q->vring_idx); + } + } + + vhost_session_set_coalescing(vdev, vsession, NULL); + vhost_session_mem_register(vsession->mem); + vsession->initialized = true; + rc = vdev->backend->start_session(vsession); + if (rc != 0) { + vhost_session_mem_unregister(vsession->mem); + free(vsession->mem); + goto out; + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + return rc; +} + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB +int +vhost_get_config_cb(int vid, uint8_t *config, uint32_t len) +{ + struct spdk_vhost_session *vsession; + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + goto out; + } + + vdev = vsession->vdev; + if (vdev->backend->vhost_get_config) { + rc = vdev->backend->vhost_get_config(vdev, config, len); + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + return rc; +} + +int 
+vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags) +{ + struct spdk_vhost_session *vsession; + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + goto out; + } + + vdev = vsession->vdev; + if (vdev->backend->vhost_set_config) { + rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags); + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + return rc; +} +#endif + +int +spdk_vhost_set_socket_path(const char *basename) +{ + int ret; + + if (basename && strlen(basename) > 0) { + ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename); + if (ret <= 0) { + return -EINVAL; + } + if ((size_t)ret >= sizeof(dev_dirname) - 2) { + SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret); + return -EINVAL; + } + + if (dev_dirname[ret - 1] != '/') { + dev_dirname[ret] = '/'; + dev_dirname[ret + 1] = '\0'; + } + } + + return 0; +} + +void +vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + assert(vdev->backend->dump_info_json != NULL); + vdev->backend->dump_info_json(vdev, w); +} + +int +spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev) +{ + if (vdev->pending_async_op_num) { + return -EBUSY; + } + + return vdev->backend->remove_device(vdev); +} + +int +vhost_new_connection_cb(int vid, const char *ifname) +{ + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + + pthread_mutex_lock(&g_vhost_mutex); + + vdev = spdk_vhost_dev_find(ifname); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid); + pthread_mutex_unlock(&g_vhost_mutex); + return -1; + } + + /* We expect sessions inside vdev->vsessions to be sorted in ascending + * order in regard of vsession->id. 
For now we always set id = vsessions_cnt++ + * and append each session to the very end of the vsessions list. + * This is required for spdk_vhost_dev_foreach_session() to work. + */ + if (vdev->vsessions_num == UINT_MAX) { + assert(false); + return -EINVAL; + } + + if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) + + vdev->backend->session_ctx_size)) { + SPDK_ERRLOG("vsession alloc failed\n"); + pthread_mutex_unlock(&g_vhost_mutex); + return -1; + } + memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size); + + vsession->vdev = vdev; + vsession->vid = vid; + vsession->id = vdev->vsessions_num++; + vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid); + if (vsession->name == NULL) { + SPDK_ERRLOG("vsession alloc failed\n"); + pthread_mutex_unlock(&g_vhost_mutex); + free(vsession); + return -1; + } + vsession->started = false; + vsession->initialized = false; + vsession->next_stats_check_time = 0; + vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS * + spdk_get_ticks_hz() / 1000UL; + TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq); + + vhost_session_install_rte_compat_hooks(vsession); + pthread_mutex_unlock(&g_vhost_mutex); + return 0; +} + +int +vhost_destroy_connection_cb(int vid) +{ + struct spdk_vhost_session *vsession; + int rc = 0; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + pthread_mutex_unlock(&g_vhost_mutex); + return -EINVAL; + } + + if (vsession->started) { + rc = _stop_session(vsession); + } + + TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq); + free(vsession->name); + free(vsession); + pthread_mutex_unlock(&g_vhost_mutex); + + return rc; +} + +void +spdk_vhost_lock(void) +{ + pthread_mutex_lock(&g_vhost_mutex); +} + +int +spdk_vhost_trylock(void) +{ + return -pthread_mutex_trylock(&g_vhost_mutex); +} + +void 
+spdk_vhost_unlock(void) +{ + pthread_mutex_unlock(&g_vhost_mutex); +} + +void +spdk_vhost_init(spdk_vhost_init_cb init_cb) +{ + size_t len; + int ret; + + g_vhost_init_thread = spdk_get_thread(); + assert(g_vhost_init_thread != NULL); + + if (dev_dirname[0] == '\0') { + if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) { + SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno)); + ret = -1; + goto out; + } + + len = strlen(dev_dirname); + if (dev_dirname[len - 1] != '/') { + dev_dirname[len] = '/'; + dev_dirname[len + 1] = '\0'; + } + } + + ret = sem_init(&g_dpdk_sem, 0, 0); + if (ret != 0) { + SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n"); + ret = -1; + goto out; + } + + ret = vhost_scsi_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost controllers\n"); + goto out; + } + + ret = vhost_blk_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost block controllers\n"); + goto out; + } + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + ret = vhost_nvme_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n"); + goto out; + } +#endif + + spdk_cpuset_zero(&g_vhost_core_mask); + + /* iterate threads instead of using SPDK_ENV_FOREACH_CORE to ensure that threads are really + * created. + */ + spdk_for_each_thread(vhost_setup_core_mask, init_cb, vhost_setup_core_mask_done); + return; +out: + init_cb(ret); +} + +static void +vhost_fini(void *arg1) +{ + struct spdk_vhost_dev *vdev, *tmp; + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_next(NULL); + while (vdev != NULL) { + tmp = spdk_vhost_dev_next(vdev); + spdk_vhost_dev_remove(vdev); + /* don't care if it fails, there's nothing we can do for now */ + vdev = tmp; + } + spdk_vhost_unlock(); + + spdk_cpuset_zero(&g_vhost_core_mask); + + /* All devices are removed now. 
*/ + sem_destroy(&g_dpdk_sem); + + g_fini_cpl_cb(); +} + +static void * +session_shutdown(void *arg) +{ + struct spdk_vhost_dev *vdev = NULL; + + TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) { + vhost_driver_unregister(vdev->path); + vdev->registered = false; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n"); + spdk_thread_send_msg(g_vhost_init_thread, vhost_fini, NULL); + return NULL; +} + +void +spdk_vhost_fini(spdk_vhost_fini_cb fini_cb) +{ + pthread_t tid; + int rc; + + assert(spdk_get_thread() == g_vhost_init_thread); + g_fini_cpl_cb = fini_cb; + + /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK + * ops for stopping a device or removing a connection, we need to call it from + * a separate thread to avoid deadlock. + */ + rc = pthread_create(&tid, NULL, &session_shutdown, NULL); + if (rc < 0) { + SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc)); + abort(); + } + pthread_detach(tid); +} + +void +spdk_vhost_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_dev *vdev; + uint32_t delay_base_us; + uint32_t iops_threshold; + + spdk_json_write_array_begin(w); + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_next(NULL); + while (vdev != NULL) { + vdev->backend->write_config_json(vdev, w); + + spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold); + if (delay_base_us) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_controller_set_coalescing"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us); + spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + vdev = spdk_vhost_dev_next(vdev); + } + spdk_vhost_unlock(); + + spdk_json_write_array_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST) 
+SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING) diff --git a/src/spdk/lib/vhost/vhost_blk.c b/src/spdk/lib/vhost/vhost_blk.c new file mode 100644 index 000000000..d387cb27d --- /dev/null +++ b/src/spdk/lib/vhost/vhost_blk.c @@ -0,0 +1,1354 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/virtio_blk.h> + +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/bdev_module.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/vhost.h" + +#include "vhost_internal.h" +#include <rte_version.h> + +/* Minimal set of features supported by every SPDK VHOST-BLK device */ +#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \ + (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ + (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ + (1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \ + (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ + (1ULL << VIRTIO_BLK_F_MQ)) + +/* Not supported features */ +#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ + (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ + (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI)) + +/* Vhost-blk support protocol features */ +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB +#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \ + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) +#else +#define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG) +#endif + +struct spdk_vhost_blk_task { + struct spdk_bdev_io *bdev_io; + struct spdk_vhost_blk_session *bvsession; + struct spdk_vhost_virtqueue *vq; + + volatile uint8_t *status; + + uint16_t req_idx; + uint16_t num_descs; + uint16_t buffer_id; + + /* for io wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + + /* If set, the task is currently used for I/O processing. */ + bool used; + + /** Number of bytes that were written. 
*/ + uint32_t used_len; + uint16_t iovcnt; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; +}; + +struct spdk_vhost_blk_dev { + struct spdk_vhost_dev vdev; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *bdev_desc; + /* dummy_io_channel is used to hold a bdev reference */ + struct spdk_io_channel *dummy_io_channel; + bool readonly; +}; + +struct spdk_vhost_blk_session { + /* The parent session must be the very first field in this struct */ + struct spdk_vhost_session vsession; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_poller *requestq_poller; + struct spdk_io_channel *io_channel; + struct spdk_poller *stop_poller; +}; + +/* forward declaration */ +static const struct spdk_vhost_dev_backend vhost_blk_device_backend; + +static int +process_blk_request(struct spdk_vhost_blk_task *task, + struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq); + +static void +blk_task_finish(struct spdk_vhost_blk_task *task) +{ + assert(task->bvsession->vsession.task_cnt > 0); + task->bvsession->vsession.task_cnt--; + task->used = false; +} + +static void +blk_task_init(struct spdk_vhost_blk_task *task) +{ + task->used = true; + task->iovcnt = SPDK_COUNTOF(task->iovs); + task->status = NULL; + task->used_len = 0; +} + +static void +blk_task_enqueue(struct spdk_vhost_blk_task *task) +{ + if (task->vq->packed.packed_ring) { + vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq, + task->num_descs, + task->buffer_id, task->used_len); + } else { + vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, + task->req_idx, task->used_len); + } +} + +static void +invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status) +{ + if (task->status) { + *task->status = status; + } + + blk_task_enqueue(task); + blk_task_finish(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status); +} + +/* + * Process task's descriptor chain and setup data related fields. 
+ * Return + * total size of suplied buffers + * + * FIXME: Make this function return to rd_cnt and wr_cnt + */ +static int +blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_dev *vdev = vsession->vdev; + struct vring_desc *desc, *desc_table; + uint16_t out_cnt = 0, cnt = 0; + uint32_t desc_table_size, len = 0; + uint32_t desc_handled_cnt; + int rc; + + rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); + return -1; + } + + desc_handled_cnt = 0; + while (1) { + /* + * Maximum cnt reached? + * Should not happen if request is well formatted, otherwise this is a BUG. + */ + if (spdk_unlikely(cnt == *iovs_cnt)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n", + vsession->name, req_idx); + return -1; + } + + if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", + vsession->name, req_idx, cnt); + return -1; + } + + len += desc->len; + + out_cnt += vhost_vring_desc_is_wr(desc); + + rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n", + vsession->name, req_idx); + return -1; + } else if (desc == NULL) { + break; + } + + desc_handled_cnt++; + if (spdk_unlikely(desc_handled_cnt > desc_table_size)) { + /* Break a cycle and report an error, if any. 
*/ + SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n", + vsession->name, desc_table_size, desc_handled_cnt); + return -1; + } + } + + /* + * There must be least two descriptors. + * First contain request so it must be readable. + * Last descriptor contain buffer for response so it must be writable. + */ + if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { + return -1; + } + + *length = len; + *iovs_cnt = cnt; + return 0; +} + +static int +blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_dev *vdev = vsession->vdev; + struct vring_packed_desc *desc = NULL, *desc_table; + uint16_t out_cnt = 0, cnt = 0; + uint32_t desc_table_size, len = 0; + int rc = 0; + + rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc, + &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); + return rc; + } + + if (desc_table != NULL) { + req_idx = 0; + } + + while (1) { + /* + * Maximum cnt reached? + * Should not happen if request is well formatted, otherwise this is a BUG. 
+ */ + if (spdk_unlikely(cnt == *iovs_cnt)) { + SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n", + vsession->name, req_idx); + return -EINVAL; + } + + if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) { + SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", + vsession->name, req_idx, cnt); + return -EINVAL; + } + + len += desc->len; + out_cnt += vhost_vring_packed_desc_is_wr(desc); + + /* desc is NULL means we reach the last desc of this request */ + vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size); + if (desc == NULL) { + break; + } + } + + /* + * There must be least two descriptors. + * First contain request so it must be readable. + * Last descriptor contain buffer for response so it must be writable. + */ + if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { + return -EINVAL; + } + + *length = len; + *iovs_cnt = cnt; + + return 0; +} + +static void +blk_request_finish(bool success, struct spdk_vhost_blk_task *task) +{ + *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; + + blk_task_enqueue(task); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task, + task->req_idx, success ? 
"OK" : "FAIL"); + blk_task_finish(task); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_blk_task *task = cb_arg; + + spdk_bdev_free_io(bdev_io); + blk_request_finish(success, task); +} + +static void +blk_request_resubmit(void *arg) +{ + struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg; + int rc = 0; + + blk_task_init(task); + + rc = process_blk_request(task, task->bvsession, task->vq); + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task); + } +} + +static inline void +blk_request_queue_io(struct spdk_vhost_blk_task *task) +{ + int rc; + struct spdk_vhost_blk_session *bvsession = task->bvsession; + struct spdk_bdev *bdev = bvsession->bvdev->bdev; + + task->bdev_io_wait.bdev = bdev; + task->bdev_io_wait.cb_fn = blk_request_resubmit; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + } +} + +static int +process_blk_request(struct spdk_vhost_blk_task *task, + struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev; + const struct virtio_blk_outhdr *req; + struct virtio_blk_discard_write_zeroes *desc; + struct iovec *iov; + uint32_t type; + uint32_t payload_len; + uint64_t flush_bytes; + int rc; + + if (vq->packed.packed_ring) { + rc = blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, + &payload_len); + } else { + rc = blk_iovs_split_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, + &payload_len); + } + + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", 
task->req_idx); + /* Only READ and WRITE are supported for now. */ + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + iov = &task->iovs[0]; + if (spdk_unlikely(iov->iov_len != sizeof(*req))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n", + iov->iov_len, sizeof(*req), task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + req = iov->iov_base; + + iov = &task->iovs[task->iovcnt - 1]; + if (spdk_unlikely(iov->iov_len != 1)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n", + iov->iov_len, 1, task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + task->status = iov->iov_base; + payload_len -= sizeof(*req) + sizeof(*task->status); + task->iovcnt -= 2; + + type = req->type; +#ifdef VIRTIO_BLK_T_BARRIER + /* Don't care about barier for now (as QEMU's virtio-blk do). */ + type &= ~VIRTIO_BLK_T_BARRIER; +#endif + + switch (type) { + case VIRTIO_BLK_T_IN: + case VIRTIO_BLK_T_OUT: + if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) { + SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n", + type ? 
"WRITE" : "READ", task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + if (type == VIRTIO_BLK_T_IN) { + task->used_len = payload_len + sizeof(*task->status); + rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else if (!bvdev->readonly) { + task->used_len = sizeof(*task->status); + rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n"); + rc = -1; + } + + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_DISCARD: + desc = task->iovs[1].iov_base; + if (payload_len != sizeof(*desc)) { + SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + + rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel, + desc->sector * 512, desc->num_sectors * 512, + blk_request_complete_cb, task); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_WRITE_ZEROES: + desc = task->iovs[1].iov_base; + if (payload_len != sizeof(*desc)) { + SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + + /* Zeroed and Unmap the range, SPDK doen't support it. 
*/ + if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { + SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n"); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel, + desc->sector * 512, desc->num_sectors * 512, + blk_request_complete_cb, task); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_FLUSH: + flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev); + if (req->sector != 0) { + SPDK_NOTICELOG("sector must be zero for flush command\n"); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel, + 0, flush_bytes, + blk_request_complete_cb, task); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_GET_ID: + if (!task->iovcnt || !payload_len) { + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len); + spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev), + task->used_len, ' '); + blk_request_finish(true, task); + break; + default: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + return 0; +} + +static void +process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx) +{ + struct spdk_vhost_blk_task *task; + uint16_t task_idx = req_idx, num_descs; + + if (vq->packed.packed_ring) { + /* Packed ring used the buffer_id as the task_idx to get 
task struct. + * In kernel driver, it uses the vq->free_head to set the buffer_id so the value + * must be in the range of 0 ~ vring.size. The free_head value must be unique + * in the outstanding requests. + * We can't use the req_idx as the task_idx because the desc can be reused in + * the next phase even when it's not completed in the previous phase. For example, + * At phase 0, last_used_idx was 2 and desc0 was not completed.Then after moving + * phase 1, last_avail_idx is updated to 1. In this case, req_idx can not be used + * as task_idx because we will know task[0]->used is true at phase 1. + * The split queue is quite different, the desc would insert into the free list when + * device completes the request, the driver gets the desc from the free list which + * ensures the req_idx is unique in the outstanding requests. + */ + task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); + } + + task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + task->bvsession->vsession.name, task_idx); + task->used_len = 0; + blk_task_enqueue(task); + return; + } + + if (vq->packed.packed_ring) { + task->req_idx = req_idx; + task->num_descs = num_descs; + task->buffer_id = task_idx; + } + + task->bvsession->vsession.task_cnt++; + + blk_task_init(task); + + if (process_blk_request(task, task->bvsession, vq) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task, + task_idx); + } else { + SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx); + } +} + +static void +submit_inflight_desc(struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight; + spdk_vhost_resubmit_desc *resubmit_list; + uint16_t req_idx; + + if 
(spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) { + return; + } + + resubmit_list = resubmit->resubmit_list; + while (resubmit->resubmit_num-- > 0) { + req_idx = resubmit_list[resubmit->resubmit_num].index; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n", + req_idx); + + if (spdk_unlikely(req_idx >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + vsession->name, req_idx, vq->vring.size); + vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); + continue; + } + + process_blk_task(vq, req_idx); + } + + free(resubmit_list); + resubmit->resubmit_list = NULL; +} + +static void +process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS]; + uint16_t reqs_cnt, i; + + submit_inflight_desc(bvsession, vq); + + reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + if (!reqs_cnt) { + return; + } + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + vsession->name, reqs[i], vq->vring.size); + vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0); + continue; + } + + rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]); + + process_blk_task(vq, reqs[i]); + } +} + +static void +process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + uint16_t i = 0; + + while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS && + vhost_vq_packed_ring_is_avail(vq)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n", + vq->last_avail_idx); + + process_blk_task(vq, vq->last_avail_idx); + } +} + +static int 
+vdev_worker(void *arg) +{ + struct spdk_vhost_blk_session *bvsession = arg; + struct spdk_vhost_session *vsession = &bvsession->vsession; + + uint16_t q_idx; + bool packed_ring; + + /* In a session, every vq supports the same format */ + packed_ring = vsession->virtqueue[0].packed.packed_ring; + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + if (packed_ring) { + process_packed_vq(bvsession, &vsession->virtqueue[q_idx]); + } else { + process_vq(bvsession, &vsession->virtqueue[q_idx]); + } + } + + vhost_session_used_signal(vsession); + + return SPDK_POLLER_BUSY; +} + +static void +no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; + uint32_t length; + uint16_t iovcnt, req_idx; + + if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) { + return; + } + + iovcnt = SPDK_COUNTOF(iovs); + if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) { + *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx); + } + + vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); +} + +static void +no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_blk_task *task; + uint32_t length; + uint16_t req_idx = vq->last_avail_idx; + uint16_t task_idx, num_descs; + + if (!vhost_vq_packed_ring_is_avail(vq)) { + return; + } + + task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); + task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + vsession->name, req_idx); + vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, + task->buffer_id, task->used_len); 
+ return; + } + + task->req_idx = req_idx; + task->num_descs = num_descs; + task->buffer_id = task_idx; + blk_task_init(task); + + if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, + &length)) { + *(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx); + } + + task->used = false; + vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, + task->buffer_id, task->used_len); +} + +static int +no_bdev_vdev_worker(void *arg) +{ + struct spdk_vhost_blk_session *bvsession = arg; + struct spdk_vhost_session *vsession = &bvsession->vsession; + uint16_t q_idx; + bool packed_ring; + + /* In a session, every vq supports the same format */ + packed_ring = vsession->virtqueue[0].packed.packed_ring; + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + if (packed_ring) { + no_bdev_process_packed_vq(bvsession, &vsession->virtqueue[q_idx]); + } else { + no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]); + } + } + + vhost_session_used_signal(vsession); + + if (vsession->task_cnt == 0 && bvsession->io_channel) { + spdk_put_io_channel(bvsession->io_channel); + bvsession->io_channel = NULL; + } + + return SPDK_POLLER_BUSY; +} + +static struct spdk_vhost_blk_session * +to_blk_session(struct spdk_vhost_session *vsession) +{ + assert(vsession->vdev->backend == &vhost_blk_device_backend); + return (struct spdk_vhost_blk_session *)vsession; +} + +static struct spdk_vhost_blk_dev * +to_blk_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev == NULL) { + return NULL; + } + + if (vdev->backend != &vhost_blk_device_backend) { + SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev); +} + +static int +vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *ctx) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(20, 
02, 0, 0) + SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid); + rte_vhost_slave_config_change(vsession->vid, false); +#else + SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n"); +#endif + + return 0; +} + +static void +blk_resize_cb(void *resize_ctx) +{ + struct spdk_vhost_blk_dev *bvdev = resize_ctx; + + spdk_vhost_lock(); + vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb, + NULL, NULL); + spdk_vhost_unlock(); +} + +static void +vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) +{ + + /* All sessions have been notified, time to close the bdev */ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + + assert(bvdev != NULL); + spdk_put_io_channel(bvdev->dummy_io_channel); + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + bvdev->bdev = NULL; +} + +static int +vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *ctx) +{ + struct spdk_vhost_blk_session *bvsession; + + bvsession = (struct spdk_vhost_blk_session *)vsession; + if (bvsession->requestq_poller) { + spdk_poller_unregister(&bvsession->requestq_poller); + bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0); + } + + return 0; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_blk_dev *bvdev = remove_ctx; + + SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n", + bvdev->vdev.name); + + spdk_vhost_lock(); + vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb, + vhost_dev_bdev_remove_cpl_cb, NULL); + spdk_vhost_unlock(); +} + +static void +bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, + void *event_ctx) +{ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Bdev event: type %d, name %s\n", + type, + bdev->name); + + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name); + 
bdev_remove_cb(event_ctx); + break; + case SPDK_BDEV_EVENT_RESIZE: + SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name); + blk_resize_cb(event_ctx); + break; + default: + SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); + break; + } +} + +static void +free_task_pool(struct spdk_vhost_blk_session *bvsession) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_blk_session *bvsession) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_blk_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; + if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. 
(size = %"PRIu32", max = %"PRIu32")\n", + vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(bvsession); + return -1; + } + vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vq->tasks == NULL) { + SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + vsession->name, task_cnt, i); + free_task_pool(bvsession); + return -1; + } + + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_blk_task *)vq->tasks)[j]; + task->bvsession = bvsession; + task->req_idx = j; + task->vq = vq; + } + } + + return 0; +} + +static int +vhost_blk_start_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); + struct spdk_vhost_blk_dev *bvdev; + int i, rc = 0; + + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + bvsession->bvdev = bvdev; + + /* validate all I/O queues are in a contiguous index range */ + for (i = 0; i < vsession->max_queues; i++) { + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (vsession->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(bvsession); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); + goto out; + } + + if (bvdev->bdev) { + bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); + if (!bvsession->io_channel) { + free_task_pool(bvsession); + SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name); + rc = -1; + goto out; + } + } + + bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? 
vdev_worker : no_bdev_vdev_worker, + bvsession, 0); + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); +out: + vhost_session_start_done(vsession, rc); + return rc; +} + +static int +vhost_blk_start(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_blk_start_cb, + 3, "start session"); +} + +static int +destroy_session_poller_cb(void *arg) +{ + struct spdk_vhost_blk_session *bvsession = arg; + struct spdk_vhost_session *vsession = &bvsession->vsession; + int i; + + if (vsession->task_cnt > 0) { + return SPDK_POLLER_BUSY; + } + + if (spdk_vhost_trylock() != 0) { + return SPDK_POLLER_BUSY; + } + + for (i = 0; i < vsession->max_queues; i++) { + vsession->virtqueue[i].next_event_time = 0; + vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); + + if (bvsession->io_channel) { + spdk_put_io_channel(bvsession->io_channel); + bvsession->io_channel = NULL; + } + + free_task_pool(bvsession); + spdk_poller_unregister(&bvsession->stop_poller); + vhost_session_stop_done(vsession, 0); + + spdk_vhost_unlock(); + return SPDK_POLLER_BUSY; +} + +static int +vhost_blk_stop_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); + + spdk_poller_unregister(&bvsession->requestq_poller); + bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb, + bvsession, 1000); + return 0; +} + +static int +vhost_blk_stop(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_blk_stop_cb, + 3, "stop session"); +} + +static void +vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + + 
spdk_json_write_named_object_begin(w, "block"); + + spdk_json_write_named_bool(w, "readonly", bvdev->readonly); + + spdk_json_write_name(w, "bdev"); + if (bvdev->bdev) { + spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev)); + } else { + spdk_json_write_null(w); + } + + spdk_json_write_object_end(w); +} + +static void +vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + + if (!bvdev->bdev) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_create_blk_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev)); + spdk_json_write_named_string(w, "cpumask", + spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); + spdk_json_write_named_bool(w, "readonly", bvdev->readonly); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static int vhost_blk_destroy(struct spdk_vhost_dev *dev); + +static int +vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t len) +{ + struct virtio_blk_config blkcfg; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_bdev *bdev; + uint32_t blk_size; + uint64_t blkcnt; + + memset(&blkcfg, 0, sizeof(blkcfg)); + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + bdev = bvdev->bdev; + if (bdev == NULL) { + /* We can't just return -1 here as this GET_CONFIG message might + * be caused by a QEMU VM reboot. Returning -1 will indicate an + * error to QEMU, who might then decide to terminate itself. + * We don't want that. A simple reboot shouldn't break the system. 
+ * + * Presenting a block device with block size 0 and block count 0 + * doesn't cause any problems on QEMU side and the virtio-pci + * device is even still available inside the VM, but there will + * be no block device created for it - the kernel drivers will + * silently reject it. + */ + blk_size = 0; + blkcnt = 0; + } else { + blk_size = spdk_bdev_get_block_size(bdev); + blkcnt = spdk_bdev_get_num_blocks(bdev); + if (spdk_bdev_get_buf_align(bdev) > 1) { + blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE; + blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1); + } else { + blkcfg.size_max = 131072; + /* -2 for REQ and RESP and -1 for region boundary splitting */ + blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1; + } + } + + blkcfg.blk_size = blk_size; + /* minimum I/O size in blocks */ + blkcfg.min_io_size = 1; + /* expressed in 512 Bytes sectors */ + blkcfg.capacity = (blkcnt * blk_size) / 512; + /* QEMU can overwrite this value when started */ + blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES; + + if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + /* 16MiB, expressed in 512 Bytes */ + blkcfg.max_discard_sectors = 32768; + blkcfg.max_discard_seg = 1; + blkcfg.discard_sector_alignment = blk_size / 512; + } + if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + blkcfg.max_write_zeroes_sectors = 32768; + blkcfg.max_write_zeroes_seg = 1; + } + + memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg))); + + return 0; +} + +static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { + .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session), + .start_session = vhost_blk_start, + .stop_session = vhost_blk_stop, + .vhost_get_config = vhost_blk_get_config, + .dump_info_json = vhost_blk_dump_info_json, + .write_config_json = vhost_blk_write_config_json, + .remove_device = vhost_blk_destroy, +}; + +int +vhost_blk_controller_construct(void) +{ + 
struct spdk_conf_section *sp; + unsigned ctrlr_num; + char *bdev_name; + char *cpumask; + char *name; + bool readonly; + bool packed_ring; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false); + packed_ring = spdk_conf_section_get_boolval(sp, "PackedRing", false); + + bdev_name = spdk_conf_section_get_val(sp, "Dev"); + if (bdev_name == NULL) { + continue; + } + + if (spdk_vhost_blk_construct(name, cpumask, bdev_name, + readonly, packed_ring) < 0) { + return -1; + } + } + + return 0; +} + +int +spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, + bool readonly, bool packed_ring) +{ + struct spdk_vhost_blk_dev *bvdev = NULL; + struct spdk_vhost_dev *vdev; + struct spdk_bdev *bdev; + int ret = 0; + + spdk_vhost_lock(); + bdev = spdk_bdev_get_by_name(dev_name); + if (bdev == NULL) { + SPDK_ERRLOG("%s: bdev '%s' not found\n", + name, dev_name); + ret = -ENODEV; + goto out; + } + + bvdev = calloc(1, sizeof(*bvdev)); + if (bvdev == NULL) { + ret = -ENOMEM; + goto out; + } + + vdev = &bvdev->vdev; + vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE; + vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES; + vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES; + + vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED; + + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD); + } + 
if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); + } + if (readonly) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO); + } + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH); + } + + ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc); + if (ret != 0) { + SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n", + name, dev_name, ret); + goto out; + } + + /* + * When starting qemu with vhost-user-blk multiqueue, the vhost device will + * be started/stopped many times, related to the queues num, as the + * vhost-user backend doesn't know the exact number of queues used for this + * device. The target have to stop and start the device once got a valid + * IO queue. + * When stoping and starting the vhost device, the backend bdev io device + * will be deleted and created repeatedly. + * Hold a bdev reference so that in the struct spdk_vhost_blk_dev, so that + * the io device will not be deleted. + */ + bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); + + bvdev->bdev = bdev; + bvdev->readonly = readonly; + ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend); + if (ret != 0) { + spdk_put_io_channel(bvdev->dummy_io_channel); + spdk_bdev_close(bvdev->bdev_desc); + goto out; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name); +out: + if (ret != 0 && bvdev) { + free(bvdev); + } + spdk_vhost_unlock(); + return ret; +} + +static int +vhost_blk_destroy(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + int rc; + + assert(bvdev != NULL); + + rc = vhost_dev_unregister(&bvdev->vdev); + if (rc != 0) { + return rc; + } + + /* if the bdev is removed, don't need call spdk_put_io_channel. 
*/ + if (bvdev->bdev) { + spdk_put_io_channel(bvdev->dummy_io_channel); + } + + if (bvdev->bdev_desc) { + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + } + bvdev->bdev = NULL; + + free(bvdev); + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK) +SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA) diff --git a/src/spdk/lib/vhost/vhost_internal.h b/src/spdk/lib/vhost/vhost_internal.h new file mode 100644 index 000000000..3aa89768a --- /dev/null +++ b/src/spdk/lib/vhost/vhost_internal.h @@ -0,0 +1,496 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VHOST_INTERNAL_H +#define SPDK_VHOST_INTERNAL_H +#include <linux/virtio_config.h> + +#include "spdk/stdinc.h" + +#include <rte_vhost.h> + +#include "spdk_internal/vhost_user.h" +#include "spdk_internal/log.h" +#include "spdk/util.h" +#include "spdk/rpc.h" +#include "spdk/config.h" + +#define SPDK_VHOST_MAX_VQUEUES 256 +#define SPDK_VHOST_MAX_VQ_SIZE 1024 + +#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8 + +#define SPDK_VHOST_IOVS_MAX 129 + +#define SPDK_VHOST_VQ_MAX_SUBMISSIONS 32 + +/* + * Rate at which stats are checked for interrupt coalescing. + */ +#define SPDK_VHOST_STATS_CHECK_INTERVAL_MS 10 +/* + * Default threshold at which interrupts start to be coalesced. + */ +#define SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD 60000 + +/* + * Currently coalescing is not used by default. + * Setting this to value > 0 here or by RPC will enable coalescing. 
+ */ +#define SPDK_VHOST_COALESCING_DELAY_BASE_US 0 + +#define SPDK_VHOST_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_F_RING_PACKED)) + +#define SPDK_VHOST_DISABLED_FEATURES ((1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY)) + +#define VRING_DESC_F_AVAIL (1ULL << VRING_PACKED_DESC_F_AVAIL) +#define VRING_DESC_F_USED (1ULL << VRING_PACKED_DESC_F_USED) +#define VRING_DESC_F_AVAIL_USED (VRING_DESC_F_AVAIL | VRING_DESC_F_USED) + +typedef struct rte_vhost_resubmit_desc spdk_vhost_resubmit_desc; +typedef struct rte_vhost_resubmit_info spdk_vhost_resubmit_info; + +struct spdk_vhost_virtqueue { + struct rte_vhost_vring vring; + struct rte_vhost_ring_inflight vring_inflight; + uint16_t last_avail_idx; + uint16_t last_used_idx; + + struct { + /* To mark a descriptor as available in packed ring + * Equal to avail_wrap_counter in spec. + */ + uint8_t avail_phase : 1; + /* To mark a descriptor as used in packed ring + * Equal to used_wrap_counter in spec. + */ + uint8_t used_phase : 1; + uint8_t padding : 5; + bool packed_ring : 1; + } packed; + + void *tasks; + + /* Request count from last stats check */ + uint32_t req_cnt; + + /* Request count from last event */ + uint16_t used_req_cnt; + + /* How long interrupt is delayed */ + uint32_t irq_delay_time; + + /* Next time when we need to send event */ + uint64_t next_event_time; + + /* Associated vhost_virtqueue in the virtio device's virtqueue list */ + uint32_t vring_idx; +} __attribute((aligned(SPDK_CACHE_LINE_SIZE))); + +struct spdk_vhost_session { + struct spdk_vhost_dev *vdev; + + /* rte_vhost connection ID. */ + int vid; + + /* Unique session ID. */ + uint64_t id; + /* Unique session name. 
*/ + char *name; + + bool initialized; + bool started; + bool needs_restart; + bool forced_polling; + + struct rte_vhost_memory *mem; + + int task_cnt; + + uint16_t max_queues; + + uint64_t negotiated_features; + + /* Local copy of device coalescing settings. */ + uint32_t coalescing_delay_time_base; + uint32_t coalescing_io_rate_threshold; + + /* Next time when stats for event coalescing will be checked. */ + uint64_t next_stats_check_time; + + /* Interval used for event coalescing checking. */ + uint64_t stats_check_interval; + + struct spdk_vhost_virtqueue virtqueue[SPDK_VHOST_MAX_VQUEUES]; + + TAILQ_ENTRY(spdk_vhost_session) tailq; +}; + +struct spdk_vhost_dev { + char *name; + char *path; + + struct spdk_thread *thread; + bool registered; + + uint64_t virtio_features; + uint64_t disabled_features; + uint64_t protocol_features; + + const struct spdk_vhost_dev_backend *backend; + + /* Saved orginal values used to setup coalescing to avoid integer + * rounding issues during save/load config. + */ + uint32_t coalescing_delay_us; + uint32_t coalescing_iops_threshold; + + /* Current connections to the device */ + TAILQ_HEAD(, spdk_vhost_session) vsessions; + + /* Increment-only session counter */ + uint64_t vsessions_num; + + /* Number of started and actively polled sessions */ + uint32_t active_session_num; + + /* Number of pending asynchronous operations */ + uint32_t pending_async_op_num; + + TAILQ_ENTRY(spdk_vhost_dev) tailq; +}; + +/** + * \param vdev vhost device. + * \param vsession vhost session. + * \param arg user-provided parameter. + * + * \return negative values will break the foreach call, meaning + * the function won't be called again. Return codes zero and + * positive don't have any effect. + */ +typedef int (*spdk_vhost_session_fn)(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *arg); + +/** + * \param vdev vhost device. + * \param arg user-provided parameter. 
+ */ +typedef void (*spdk_vhost_dev_fn)(struct spdk_vhost_dev *vdev, void *arg); + +struct spdk_vhost_dev_backend { + /** + * Size of additional per-session context data + * allocated whenever a new client connects. + */ + size_t session_ctx_size; + + int (*start_session)(struct spdk_vhost_session *vsession); + int (*stop_session)(struct spdk_vhost_session *vsession); + + int (*vhost_get_config)(struct spdk_vhost_dev *vdev, uint8_t *config, uint32_t len); + int (*vhost_set_config)(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t offset, uint32_t size, uint32_t flags); + + void (*dump_info_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + void (*write_config_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + int (*remove_device)(struct spdk_vhost_dev *vdev); +}; + +void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len); + +uint16_t vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *vq, uint16_t *reqs, + uint16_t reqs_len); + +/** + * Get a virtio split descriptor at given index in given virtqueue. + * The descriptor will provide access to the entire descriptor + * chain. The subsequent descriptors are accesible via + * \c spdk_vhost_vring_desc_get_next. + * \param vsession vhost session + * \param vq virtqueue + * \param req_idx descriptor index + * \param desc pointer to be set to the descriptor + * \param desc_table descriptor table to be used with + * \c spdk_vhost_vring_desc_get_next. This might be either + * default virtqueue descriptor table or per-chain indirect + * table. + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid. + * If -1 is returned, the content of params is undefined. 
+ */ +int vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size); + +/** + * Get a virtio packed descriptor at given index in given virtqueue. + * The descriptor will provide access to the entire descriptor + * chain. The subsequent descriptors are accesible via + * \c vhost_vring_packed_desc_get_next. + * \param vsession vhost session + * \param vq virtqueue + * \param req_idx descriptor index + * \param desc pointer to be set to the descriptor + * \param desc_table descriptor table to be used with + * \c spdk_vhost_vring_desc_get_next. This might be either + * \c NULL or per-chain indirect table. + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid. + * If -1 is returned, the content of params is undefined. + */ +int vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_packed_desc **desc, + struct vring_packed_desc **desc_table, uint32_t *desc_table_size); + +/** + * Send IRQ/call client (if pending) for \c vq. + * \param vsession vhost session + * \param vq virtqueue + * \return + * 0 - if no interrupt was signalled + * 1 - if interrupt was signalled + */ +int vhost_vq_used_signal(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq); + + +/** + * Send IRQs for all queues that need to be signaled. + * \param vsession vhost session + * \param vq virtqueue + */ +void vhost_session_used_signal(struct spdk_vhost_session *vsession); + +void vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *vq, + uint16_t id, uint32_t len); + +/** + * Enqueue the entry to the used ring when device complete the request. + * \param vsession vhost session + * \param vq virtqueue + * \req_idx descriptor index. It's the first index of this descriptor chain. 
+ * \num_descs descriptor count. It's the count of the number of buffers in the chain. + * \buffer_id descriptor buffer ID. + * \length device write length. Specify the length of the buffer that has been initialized + * (written to) by the device + */ +void vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t num_descs, uint16_t buffer_id, + uint32_t length); + +/** + * Get subsequent descriptor from given table. + * \param desc current descriptor, will be set to the + * next descriptor (NULL in case this is the last + * descriptor in the chain or the next desc is invalid) + * \param desc_table descriptor table + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid + * The *desc* param will be set regardless of the + * return value. + */ +int vhost_vring_desc_get_next(struct vring_desc **desc, + struct vring_desc *desc_table, uint32_t desc_table_size); +static inline bool +vhost_vring_desc_is_wr(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +int vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_desc *desc); + +bool vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue); + +/** + * Get subsequent descriptor from vq or desc table. + * \param desc current descriptor, will be set to the + * next descriptor (NULL in case this is the last + * descriptor in the chain or the next desc is invalid) + * \req_idx index of current desc, will be set to the next + * index. If desc_table != NULL the req_idx is the the vring index + * or the req_idx is the desc_table index. + * \param desc_table descriptor table + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid + * The *desc* param will be set regardless of the + * return value. 
+ */ +int vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx, + struct spdk_vhost_virtqueue *vq, + struct vring_packed_desc *desc_table, + uint32_t desc_table_size); + +bool vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc); + +int vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_packed_desc *desc); + +uint16_t vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx, + uint16_t *num_descs); + +static inline bool __attribute__((always_inline)) +vhost_dev_has_feature(struct spdk_vhost_session *vsession, unsigned feature_id) +{ + return vsession->negotiated_features & (1ULL << feature_id); +} + +int vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct spdk_vhost_dev_backend *backend); +int vhost_dev_unregister(struct spdk_vhost_dev *vdev); + +int vhost_scsi_controller_construct(void); +int vhost_blk_controller_construct(void); +void vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + +/* + * Vhost callbacks for vhost_device_ops interface + */ + +int vhost_new_connection_cb(int vid, const char *ifname); +int vhost_start_device_cb(int vid); +int vhost_stop_device_cb(int vid); +int vhost_destroy_connection_cb(int vid); + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB +int vhost_get_config_cb(int vid, uint8_t *config, uint32_t len); +int vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, + uint32_t size, uint32_t flags); +#endif + +/* + * Memory registration functions used in start/stop device callbacks + */ +void vhost_session_mem_register(struct rte_vhost_memory *mem); +void vhost_session_mem_unregister(struct rte_vhost_memory *mem); + +/* + * Call a function for each session of the provided vhost device. + * The function will be called one-by-one on each session's thread. 
+ * + * \param vdev vhost device + * \param fn function to call on each session's thread + * \param cpl_fn function to be called at the end of the iteration on + * the vhost management thread. + * Optional, can be NULL. + * \param arg additional argument to the both callbacks + */ +void vhost_dev_foreach_session(struct spdk_vhost_dev *dev, + spdk_vhost_session_fn fn, + spdk_vhost_dev_fn cpl_fn, + void *arg); + +/** + * Call a function on the provided lcore and block until either + * spdk_vhost_session_start_done() or spdk_vhost_session_stop_done() + * is called. + * + * This must be called under the global vhost mutex, which this function + * will unlock for the time it's waiting. It's meant to be called only + * from start/stop session callbacks. + * + * \param vsession vhost session + * \param cb_fn the function to call. The void *arg parameter in cb_fn + * is always NULL. + * \param timeout_sec timeout in seconds. This function will still + * block after the timeout expires, but will print the provided errmsg. + * \param errmsg error message to print once the timeout expires + * \return return the code passed to spdk_vhost_session_event_done(). + */ +int vhost_session_send_event(struct spdk_vhost_session *vsession, + spdk_vhost_session_fn cb_fn, unsigned timeout_sec, + const char *errmsg); + +/** + * Finish a blocking spdk_vhost_session_send_event() call and finally + * start the session. This must be called on the target lcore, which + * will now receive all session-related messages (e.g. from + * spdk_vhost_dev_foreach_session()). + * + * Must be called under the global vhost lock. + * + * \param vsession vhost session + * \param response return code + */ +void vhost_session_start_done(struct spdk_vhost_session *vsession, int response); + +/** + * Finish a blocking spdk_vhost_session_send_event() call and finally + * stop the session. This must be called on the session's lcore which + * used to receive all session-related messages (e.g. 
from + * spdk_vhost_dev_foreach_session()). After this call, the session- + * related messages will be once again processed by any arbitrary thread. + * + * Must be called under the global vhost lock. + * + * Must be called under the global vhost mutex. + * + * \param vsession vhost session + * \param response return code + */ +void vhost_session_stop_done(struct spdk_vhost_session *vsession, int response); + +struct spdk_vhost_session *vhost_session_find_by_vid(int vid); +void vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession); +int vhost_register_unix_socket(const char *path, const char *ctrl_name, + uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features); +int vhost_driver_unregister(const char *path); +int vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); +int vhost_get_negotiated_features(int vid, uint64_t *negotiated_features); + +int remove_vhost_controller(struct spdk_vhost_dev *vdev); + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB +int vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf); +int vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd); +int vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size); +int vhost_nvme_get_cap(int vid, uint64_t *cap); +int vhost_nvme_controller_construct(void); +int vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues); +int vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev); +int vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, + const char *bdev_name); +#endif + +#endif /* SPDK_VHOST_INTERNAL_H */ diff --git a/src/spdk/lib/vhost/vhost_nvme.c b/src/spdk/lib/vhost/vhost_nvme.c new file mode 100644 index 000000000..10f53baf9 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_nvme.c @@ -0,0 +1,1500 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "spdk/bdev.h" +#include "spdk/version.h" +#include "spdk/nvme_spec.h" +#include "spdk/likely.h" + +#include "vhost_internal.h" + +#define MAX_IO_QUEUES 31 +#define MAX_IOVS 64 +#define MAX_NAMESPACE 8 +#define MAX_QUEUE_ENTRIES_SUPPORTED 256 +#define MAX_BATCH_IO 8 + +struct spdk_vhost_nvme_sq { + uint16_t sqid; + uint16_t size; + uint16_t cqid; + bool valid; + struct spdk_nvme_cmd *sq_cmd; + uint16_t sq_head; + uint16_t sq_tail; +}; + +struct spdk_vhost_nvme_cq { + uint8_t phase; + uint16_t size; + uint16_t cqid; + bool valid; + volatile struct spdk_nvme_cpl *cq_cqe; + uint16_t cq_head; + uint16_t guest_signaled_cq_head; + uint32_t need_signaled_cnt; + STAILQ_HEAD(, spdk_vhost_nvme_task) cq_full_waited_tasks; + bool irq_enabled; + int virq; +}; + +struct spdk_vhost_nvme_ns { + struct spdk_bdev *bdev; + uint32_t block_size; + uint64_t capacity; + uint32_t nsid; + uint32_t active_ns; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *bdev_io_channel; + struct spdk_nvme_ns_data nsdata; +}; + +struct spdk_vhost_nvme_task { + struct spdk_nvme_cmd cmd; + struct spdk_vhost_nvme_dev *nvme; + uint16_t sqid; + uint16_t cqid; + + /** array of iovecs to transfer. */ + struct iovec iovs[MAX_IOVS]; + + /** Number of iovecs in iovs array. */ + int iovcnt; + + /** Current iovec position. */ + int iovpos; + + /** Offset in current iovec. */ + uint32_t iov_offset; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_ns *ns; + + /* parent pointer. 
*/ + struct spdk_vhost_nvme_task *parent; + uint8_t dnr; + uint8_t sct; + uint8_t sc; + uint32_t num_children; + STAILQ_ENTRY(spdk_vhost_nvme_task) stailq; +}; + +struct spdk_vhost_nvme_dev { + struct spdk_vhost_dev vdev; + + uint32_t num_io_queues; + union spdk_nvme_cap_register cap; + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + struct spdk_nvme_ctrlr_data cdata; + + uint32_t num_sqs; + uint32_t num_cqs; + + uint32_t num_ns; + struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE]; + + volatile uint32_t *bar; + volatile uint32_t *bar_db; + uint64_t bar_size; + bool dataplane_started; + + volatile uint32_t *dbbuf_dbs; + volatile uint32_t *dbbuf_eis; + struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1]; + struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1]; + + /* The one and only session associated with this device */ + struct spdk_vhost_session *vsession; + + TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq; + STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks; + struct spdk_poller *requestq_poller; + struct spdk_poller *stop_poller; +}; + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend; + +/* + * Report the SPDK version as the firmware revision. + * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. 
+ */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +static int +nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task); + +static struct spdk_vhost_nvme_dev * +to_nvme_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev->backend != &spdk_vhost_nvme_device_backend) { + SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev); +} + +static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs); + +static inline unsigned int sq_offset(unsigned int qid, uint32_t db_stride) +{ + return qid * 2 * db_stride; +} + +static inline unsigned int cq_offset(unsigned int qid, uint32_t db_stride) +{ + return (qid * 2 + 1) * db_stride; +} + +static void +nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq) +{ + cq->cq_head++; + if (cq->cq_head >= cq->size) { + cq->cq_head = 0; + cq->phase = !cq->phase; + } +} + +static bool +nvme_cq_is_full(struct spdk_vhost_nvme_cq *cq) +{ + return ((cq->cq_head + 1) % cq->size == cq->guest_signaled_cq_head); +} + +static void +nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq) +{ + sq->sq_head = (sq->sq_head + 1) % sq->size; +} + +static struct spdk_vhost_nvme_sq * +vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->sq_queue[qid]; +} + +static struct spdk_vhost_nvme_cq * +vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->cq_queue[qid]; +} + +static inline uint32_t +vhost_nvme_get_queue_head(struct spdk_vhost_nvme_dev *nvme, uint32_t offset) +{ + if (nvme->dataplane_started) { + return nvme->dbbuf_dbs[offset]; + + } else if (nvme->bar) { + return nvme->bar_db[offset]; + } + + assert(0); + + return 0; 
+} + +static void * +vhost_nvme_gpa_to_vva(void *priv, uint64_t addr, uint64_t len) +{ + struct spdk_vhost_session *vsession = priv; + + return vhost_gpa_to_vva(vsession, addr, len); +} + +static int +vhost_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd, + struct spdk_vhost_nvme_task *task, uint32_t len) +{ + int err; + + err = spdk_nvme_map_prps(nvme->vsession, cmd, task->iovs, len, 4096, + vhost_nvme_gpa_to_vva); + if (spdk_unlikely(err < 0)) { + return err; + } + task->iovcnt = err; + return 0; +} + +static void +nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_cq *cq; + uint32_t qid, cq_head; + + assert(nvme != NULL); + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq || !cq->valid) { + continue; + } + + cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(qid, 1)); + if (cq->irq_enabled && cq->need_signaled_cnt && (cq->cq_head != cq_head)) { + eventfd_write(cq->virq, (eventfd_t)1); + cq->need_signaled_cnt = 0; + } + } +} + +static void +vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_dev *nvme = task->nvme; + struct spdk_nvme_cpl cqe = {0}; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq *sq; + struct spdk_nvme_cmd *cmd = &task->cmd; + uint16_t cqid = task->cqid; + uint16_t sqid = task->sqid; + + cq = vhost_nvme_get_cq_from_qid(nvme, cqid); + sq = vhost_nvme_get_sq_from_qid(nvme, sqid); + if (spdk_unlikely(!cq || !sq)) { + return; + } + + cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(cqid, 1)); + if (spdk_unlikely(nvme_cq_is_full(cq))) { + STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq); + return; + } + + cqe.sqid = sqid; + cqe.sqhd = sq->sq_head; + cqe.cid = cmd->cid; + cqe.status.dnr = task->dnr; + cqe.status.sct = task->sct; + cqe.status.sc = task->sc; + cqe.status.p = !cq->phase; + cq->cq_cqe[cq->cq_head] = cqe; + spdk_smp_wmb(); + 
cq->cq_cqe[cq->cq_head].status.p = cq->phase; + + nvme_inc_cq_head(cq); + cq->need_signaled_cnt++; + + /* MMIO Controll */ + if (nvme->dataplane_started) { + nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1); + } + + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *task = cb_arg; + struct spdk_nvme_cmd *cmd = &task->cmd; + int sc, sct; + uint32_t cdw0; + + assert(bdev_io != NULL); + + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); + spdk_bdev_free_io(bdev_io); + + task->dnr = !success; + task->sct = sct; + task->sc = sc; + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("I/O error, sector %u\n", cmd->cdw10); + } + + vhost_nvme_task_complete(task); +} + +static void +blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *child = cb_arg; + struct spdk_vhost_nvme_task *task = child->parent; + struct spdk_vhost_nvme_dev *nvme = task->nvme; + int sct, sc; + uint32_t cdw0; + + assert(bdev_io != NULL); + + task->num_children--; + if (!success) { + task->dnr = 1; + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); + task->sct = sct; + task->sc = sc; + } + + spdk_bdev_free_io(bdev_io); + + if (!task->num_children) { + vhost_nvme_task_complete(task); + } + + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); +} + +static struct spdk_vhost_nvme_ns * +vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid) +{ + if (spdk_unlikely(!nsid || nsid > dev->num_ns)) { + return NULL; + } + + return &dev->ns[nsid - 1]; +} + +static void +vhost_nvme_resubmit_task(void *arg) +{ + struct spdk_vhost_nvme_task *task = (struct spdk_vhost_nvme_task *)arg; + int rc; + + rc = nvme_process_sq(task->nvme, task->sq, task); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "vhost_nvme: task resubmit failed, rc = %d.\n", rc); + } +} + 
+static int +vhost_nvme_queue_task(struct spdk_vhost_nvme_task *task) +{ + int rc; + + task->bdev_io_wait.bdev = task->ns->bdev; + task->bdev_io_wait.cb_fn = vhost_nvme_resubmit_task; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(task->ns->bdev, task->ns->bdev_io_channel, &task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vhost_nvme_queue_task, rc=%d.\n", rc); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + vhost_nvme_task_complete(task); + } + + return rc; +} + +static int +nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_task *child; + struct spdk_nvme_cmd *cmd = &task->cmd; + struct spdk_vhost_nvme_ns *ns; + int ret = -1; + uint32_t len, nlba, block_size; + uint64_t slba; + struct spdk_nvme_dsm_range *range; + uint16_t i, num_ranges = 0; + + task->nvme = nvme; + task->dnr = 0; + task->sct = 0; + task->sc = 0; + + ns = vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid); + if (spdk_unlikely(!ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + vhost_nvme_task_complete(task); + return -1; + } + + block_size = ns->block_size; + task->num_children = 0; + task->cqid = sq->cqid; + task->sqid = sq->sqid; + + task->ns = ns; + + if (spdk_unlikely(!ns->active_ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + vhost_nvme_task_complete(task); + return -1; + } + + /* valid only for Read/Write commands */ + nlba = (cmd->cdw12 & 0xffff) + 1; + slba = cmd->cdw11; + slba = (slba << 32) | cmd->cdw10; + + if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE || + cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + if (cmd->psdt != SPDK_NVME_PSDT_PRP) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PSDT %u%ub in command\n", + cmd->psdt >> 1, 
cmd->psdt & 1u); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + vhost_nvme_task_complete(task); + return -1; + } + + if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + num_ranges = (cmd->cdw10 & 0xff) + 1; + len = num_ranges * sizeof(struct spdk_nvme_dsm_range); + } else { + len = nlba * block_size; + } + + ret = vhost_nvme_map_prps(nvme, cmd, task, len); + if (spdk_unlikely(ret != 0)) { + SPDK_ERRLOG("nvme command map prps failed\n"); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + vhost_nvme_task_complete(task); + return -1; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_WRITE: + ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_FLUSH: + ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel, + 0, ns->capacity, + blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base; + for (i = 0; i < num_ranges; i++) { + if (!STAILQ_EMPTY(&nvme->free_tasks)) { + child = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + SPDK_ERRLOG("No free task now\n"); + ret = -1; + break; + } + task->num_children++; + child->parent = task; + ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel, + range[i].starting_lba * block_size, + range[i].length * block_size, + blk_unmap_complete_cb, child); + if (ret) { + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); + break; + } + } + break; + default: + ret = -1; + break; + } + + if (spdk_unlikely(ret)) { + if (ret == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "No 
memory, start to queue io.\n"); + task->sq = sq; + ret = vhost_nvme_queue_task(task); + } else { + /* post error status to cqe */ + SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + vhost_nvme_task_complete(task); + } + } + + return ret; +} + +static int +nvme_worker(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_task *task; + uint32_t qid, dbbuf_sq; + int ret; + int count = -1; + + if (spdk_unlikely(!nvme->num_sqs)) { + return SPDK_POLLER_IDLE; + } + + if (spdk_unlikely(!nvme->dataplane_started && !nvme->bar)) { + return SPDK_POLLER_IDLE; + } + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + + sq = vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq->valid) { + continue; + } + cq = vhost_nvme_get_cq_from_qid(nvme, sq->cqid); + if (spdk_unlikely(!cq)) { + return SPDK_POLLER_BUSY; + } + cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(sq->cqid, 1)); + if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) && + !nvme_cq_is_full(cq))) { + task = STAILQ_FIRST(&cq->cq_full_waited_tasks); + STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq); + vhost_nvme_task_complete(task); + } + + dbbuf_sq = vhost_nvme_get_queue_head(nvme, sq_offset(qid, 1)); + sq->sq_tail = (uint16_t)dbbuf_sq; + count = 0; + + while (sq->sq_head != sq->sq_tail) { + if (spdk_unlikely(!sq->sq_cmd)) { + break; + } + if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + return SPDK_POLLER_BUSY; + } + + task->cmd = sq->sq_cmd[sq->sq_head]; + nvme_inc_sq_head(sq); + + /* processing IO */ + ret = nvme_process_sq(nvme, sq, task); + if (spdk_unlikely(ret)) { + SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, 
task->cmd.cid, sq->sq_head, + sq->sq_tail); + } + + /* MMIO Control */ + if (nvme->dataplane_started) { + nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1); + } + + /* Maximum batch I/Os to pick up at once */ + if (count++ == MAX_BATCH_IO) { + break; + } + } + } + + /* Completion Queue */ + nvme_cq_signal_fd(nvme); + + return count; +} + +static int +vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + struct spdk_vhost_session *vsession = nvme->vsession; + uint64_t dbs_dma_addr, eis_dma_addr; + + dbs_dma_addr = cmd->dptr.prp.prp1; + eis_dma_addr = cmd->dptr.prp.prp2; + + if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) { + return -1; + } + /* Guest Physical Address to Host Virtual Address */ + nvme->dbbuf_dbs = vhost_gpa_to_vva(vsession, dbs_dma_addr, 4096); + nvme->dbbuf_eis = vhost_gpa_to_vva(vsession, eis_dma_addr, 4096); + if (!nvme->dbbuf_dbs || !nvme->dbbuf_eis) { + return -1; + } + /* zeroed the doorbell buffer memory */ + memset((void *)nvme->dbbuf_dbs, 0, 4096); + memset((void *)nvme->dbbuf_eis, 0, 4096); + + cpl->status.sc = 0; + cpl->status.sct = 0; + + /* Data plane started */ + nvme->dataplane_started = true; + + return 0; +} + +static int +vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid, qsize, cqid; + uint64_t dma_addr; + uint64_t requested_len; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq *sq; + + /* physical contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + cqid = (cmd->cdw11 >> 16) & 0xffff; + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + sq = vhost_nvme_get_sq_from_qid(nvme, qid); + cq = vhost_nvme_get_cq_from_qid(nvme, cqid); + if (!sq || !cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u or CQID %u\n", + 
qid, cqid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + + sq->sqid = qid; + sq->cqid = cqid; + sq->size = qsize + 1; + sq->sq_head = sq->sq_tail = 0; + requested_len = sizeof(struct spdk_nvme_cmd) * sq->size; + sq->sq_cmd = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len); + if (!sq->sq_cmd) { + return -1; + } + nvme->num_sqs++; + sq->valid = true; + if (nvme->bar) { + nvme->bar_db[sq_offset(qid, 1)] = 0; + } + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_sq *sq; + + qid = cmd->cdw10 & 0xffff; + sq = vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq) { + return -1; + } + + /* We didn't see scenarios when deleting submission + * queue while I/O is running against the submisson + * queue for now, otherwise, we must ensure the poller + * will not run with this submission queue. 
+ */ + nvme->num_sqs--; + sq->valid = false; + + memset(sq, 0, sizeof(*sq)); + sq->sq_cmd = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + + return 0; +} + +static int +vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qsize, qid; + uint64_t dma_addr; + struct spdk_vhost_nvme_cq *cq; + uint64_t requested_len; + + /* physical contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + cq->cqid = qid; + cq->size = qsize + 1; + cq->phase = 1; + cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1; + /* Setup virq through vhost messages */ + cq->virq = -1; + cq->cq_head = 0; + cq->guest_signaled_cq_head = 0; + cq->need_signaled_cnt = 0; + requested_len = sizeof(struct spdk_nvme_cpl) * cq->size; + cq->cq_cqe = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len); + if (!cq->cq_cqe) { + return -1; + } + nvme->num_cqs++; + cq->valid = true; + if (nvme->bar) { + nvme->bar_db[cq_offset(qid, 1)] = 0; + } + STAILQ_INIT(&cq->cq_full_waited_tasks); + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_cq *cq; + + qid = cmd->cdw10 & 0xffff; + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + nvme->num_cqs--; + cq->valid = false; + + memset(cq, 0, sizeof(*cq)); + cq->cq_cqe = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static struct spdk_vhost_nvme_dev * 
+vhost_nvme_get_by_name(int vid) +{ + struct spdk_vhost_nvme_dev *nvme; + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + + TAILQ_FOREACH(nvme, &g_nvme_ctrlrs, tailq) { + vdev = &nvme->vdev; + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->vid == vid) { + return nvme; + } + } + } + + return NULL; +} + +int +vhost_nvme_get_cap(int vid, uint64_t *cap) +{ + struct spdk_vhost_nvme_dev *nvme; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + *cap = nvme->cap.raw; + return 0; +} + +int +vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf) +{ + struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd; + struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe; + struct spdk_vhost_nvme_ns *ns; + int ret = 0; + struct spdk_vhost_nvme_dev *nvme; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc); + switch (req->opc) { + case SPDK_NVME_OPC_IDENTIFY: + if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) { + memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data)); + + } else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) { + ns = vhost_nvme_get_ns_from_nsid(nvme, req->nsid); + if (!ns) { + cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE; + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + break; + } + memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data)); + } + /* successfully */ + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + case SPDK_NVME_OPC_CREATE_IO_CQ: + ret = vhost_nvme_create_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_CQ: + ret = vhost_nvme_delete_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_CREATE_IO_SQ: + ret = vhost_nvme_create_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_SQ: + ret = vhost_nvme_delete_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_GET_FEATURES: + case SPDK_NVME_OPC_SET_FEATURES: + if (req->cdw10 == 
SPDK_NVME_FEAT_NUMBER_OF_QUEUES) { + cpl->status.sc = 0; + cpl->status.sct = 0; + cpl->cdw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16); + } else { + cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD; + cpl->status.sct = SPDK_NVME_SCT_GENERIC; + } + break; + case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: + ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl); + break; + case SPDK_NVME_OPC_ABORT: + /* TODO: ABORT failed fow now */ + cpl->cdw0 = 1; + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + } + + if (ret) { + SPDK_ERRLOG("Admin Passthrough Failed with %u\n", req->opc); + } + + return 0; +} + +int +vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size) +{ + struct spdk_vhost_nvme_dev *nvme; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + nvme->bar = (volatile uint32_t *)(uintptr_t)(bar_addr); + /* BAR0 SQ/CQ doorbell registers start from offset 0x1000 */ + nvme->bar_db = (volatile uint32_t *)(uintptr_t)(bar_addr + 0x1000ull); + nvme->bar_size = bar_size; + + return 0; +} + +int +vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd) +{ + struct spdk_vhost_nvme_dev *nvme; + struct spdk_vhost_nvme_cq *cq; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + if (cq->irq_enabled) { + cq->virq = fd; + } else { + SPDK_ERRLOG("NVMe Qid %d Disabled IRQ\n", qid); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_task *task; + + while (!STAILQ_EMPTY(&nvme->free_tasks)) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + spdk_free(task); + } +} + +static int +alloc_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + uint32_t entries, i; + struct spdk_vhost_nvme_task *task; + + entries = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED; + + for (i = 0; i < entries; i++) { + task = spdk_zmalloc(sizeof(struct 
spdk_vhost_nvme_task), + SPDK_CACHE_LINE_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (task == NULL) { + SPDK_ERRLOG("Controller %s alloc task pool failed\n", + nvme->vdev.name); + free_task_pool(nvme); + return -1; + } + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); + } + + return 0; +} + +static int +vhost_nvme_start_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + int rc = 0; + + if (nvme == NULL) { + rc = -1; + goto out; + } + + rc = alloc_task_pool(nvme); + if (rc) { + goto out; + } + + SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vsession->vid, + vdev->path, spdk_env_get_current_core()); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc); + if (!ns_dev->bdev_io_channel) { + rc = -1; + goto out; + } + } + + nvme->vsession = vsession; + /* Start the NVMe Poller */ + nvme->requestq_poller = SPDK_POLLER_REGISTER(nvme_worker, nvme, 0); + +out: + vhost_session_start_done(vsession, rc); + return rc; +} + +static int +vhost_nvme_start(struct spdk_vhost_session *vsession) +{ + if (vsession->vdev->active_session_num > 0) { + /* We're trying to start a second session */ + SPDK_ERRLOG("Vhost-NVMe devices can support only one simultaneous connection.\n"); + return -1; + } + + return vhost_session_send_event(vsession, vhost_nvme_start_cb, + 3, "start session"); +} + +static void +vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns) +{ + ns->active_ns = 0; + spdk_bdev_close(ns->bdev_desc); + ns->bdev_desc = NULL; + ns->bdev = NULL; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_nvme_ns *ns = remove_ctx; + + SPDK_NOTICELOG("Removing NS %u, Block Device %s\n", + ns->nsid, spdk_bdev_get_name(ns->bdev)); + + vhost_nvme_deactive_ns(ns); +} + +static int +destroy_device_poller_cb(void 
*arg) +{ + struct spdk_vhost_nvme_dev *nvme = arg; + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n"); + + /* FIXME wait for pending I/Os to complete */ + + if (spdk_vhost_trylock() != 0) { + return SPDK_POLLER_BUSY; + } + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (ns_dev->bdev_io_channel) { + spdk_put_io_channel(ns_dev->bdev_io_channel); + ns_dev->bdev_io_channel = NULL; + } + } + /* Clear BAR space */ + if (nvme->bar) { + memset((void *)nvme->bar, 0, nvme->bar_size); + } + nvme->num_sqs = 0; + nvme->num_cqs = 0; + nvme->dbbuf_dbs = NULL; + nvme->dbbuf_eis = NULL; + nvme->dataplane_started = false; + + spdk_poller_unregister(&nvme->stop_poller); + vhost_session_stop_done(nvme->vsession, 0); + + spdk_vhost_unlock(); + return SPDK_POLLER_BUSY; +} + +static int +vhost_nvme_stop_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + + if (nvme == NULL) { + vhost_session_stop_done(vsession, -1); + return -1; + } + + free_task_pool(nvme); + SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vsession->vid, vdev->path); + + spdk_poller_unregister(&nvme->requestq_poller); + nvme->stop_poller = SPDK_POLLER_REGISTER(destroy_device_poller_cb, nvme, 1000); + + return 0; +} + +static int +vhost_nvme_stop(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_nvme_stop_cb, + 3, "start session"); +} + +static void +vhost_nvme_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return; + } + + spdk_json_write_named_array_begin(w, "namespaces"); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + 
spdk_json_write_named_uint32(w, "nsid", ns_dev->nsid); + spdk_json_write_named_string(w, "bdev", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +static void +vhost_nvme_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_create_nvme_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_uint32(w, "io_queues", nvme->num_io_queues); + spdk_json_write_named_string(w, "cpumask", + spdk_cpuset_fmt(spdk_thread_get_cpumask(nvme->vdev.thread))); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_nvme_controller_add_ns"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = { + .session_ctx_size = 0, + .start_session = vhost_nvme_start, + .stop_session = vhost_nvme_stop, + .dump_info_json = vhost_nvme_dump_info_json, + .write_config_json = vhost_nvme_write_config_json, + .remove_device = vhost_nvme_dev_remove, +}; + +static int +vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + struct spdk_nvme_ns_data *nsdata; + uint64_t num_blocks; + uint32_t i; + + /* Identify Namespace */ + cdata->nn 
= dev->num_ns; + for (i = 0; i < dev->num_ns; i++) { + nsdata = &dev->ns[i].nsdata; + if (dev->ns[i].active_ns) { + num_blocks = spdk_bdev_get_num_blocks(dev->ns[i].bdev); + nsdata->nsze = num_blocks; + /* ncap must be non-zero for active Namespace */ + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(dev->ns[i].bdev)); + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(dev->ns[i].bdev); + dev->ns[i].block_size = spdk_bdev_get_block_size(dev->ns[i].bdev); + dev->ns[i].capacity = num_blocks * dev->ns[i].block_size; + } else { + memset(nsdata, 0, sizeof(*nsdata)); + } + } + return 0; +} + +static int +vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + char sn[20]; + + /* Controller Capabilities */ + dev->cap.bits.cqr = 1; + dev->cap.bits.to = 1; + dev->cap.bits.dstrd = 0; + dev->cap.bits.css = SPDK_NVME_CAP_CSS_NVM; + dev->cap.bits.mpsmin = 0; + dev->cap.bits.mpsmax = 0; + /* MQES is 0 based value */ + dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1; + + /* Controller Configuration */ + dev->cc.bits.en = 0; + + /* Controller Status */ + dev->csts.bits.rdy = 0; + + /* Identify Controller */ + spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + cdata->vid = 0x8086; + cdata->ssvid = 0x8086; + spdk_strcpy_pad(cdata->mn, "SPDK Virtual NVMe Controller", sizeof(cdata->mn), ' '); + snprintf(sn, sizeof(sn), "NVMe_%s", dev->vdev.name); + spdk_strcpy_pad(cdata->sn, sn, sizeof(cdata->sn), ' '); + cdata->ieee[0] = 0xe4; + cdata->ieee[1] = 0xd2; + cdata->ieee[2] = 0x5c; + cdata->ver.bits.mjr = 1; + cdata->ver.bits.mnr = 0; + cdata->mdts = 5; /* 128 KiB */ + cdata->rab = 6; + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->oncs.dsm = 1; + /* Emulated NVMe controller */ + cdata->oacs.doorbell_buffer_config = 1; + + 
vhost_nvme_ns_identify_update(dev); + + return 0; +} + +int +vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues) +{ + struct spdk_vhost_nvme_dev *dev; + int rc; + + if (posix_memalign((void **)&dev, SPDK_CACHE_LINE_SIZE, sizeof(*dev))) { + return -ENOMEM; + } + memset(dev, 0, sizeof(*dev)); + + if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) { + free(dev); + return -EINVAL; + } + + spdk_vhost_lock(); + rc = vhost_dev_register(&dev->vdev, name, cpumask, + &spdk_vhost_nvme_device_backend); + + if (rc) { + free(dev); + spdk_vhost_unlock(); + return rc; + } + + dev->num_io_queues = num_io_queues; + STAILQ_INIT(&dev->free_tasks); + TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq); + + vhost_nvme_ctrlr_identify_update(dev); + + SPDK_NOTICELOG("Controller %s: Constructed\n", name); + spdk_vhost_unlock(); + return rc; +} + +int +vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns; + int rc; + uint32_t i; + + if (nvme == NULL) { + return -EINVAL; + } + + TAILQ_REMOVE(&g_nvme_ctrlrs, nvme, tailq); + for (i = 0; i < nvme->num_ns; i++) { + ns = &nvme->ns[i]; + if (ns->active_ns) { + vhost_nvme_deactive_ns(ns); + } + } + + rc = vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + + free(nvme); + return 0; +} + +int +vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns; + struct spdk_bdev *bdev; + int rc = -1; + + if (nvme == NULL) { + return -ENODEV; + } + + if (nvme->num_ns == MAX_NAMESPACE) { + SPDK_ERRLOG("Can't support %d Namespaces\n", nvme->num_ns); + return -ENOSPC; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("could not find bdev %s\n", bdev_name); + return -ENODEV; + } + + ns = &nvme->ns[nvme->num_ns]; + rc = spdk_bdev_open(bdev, true, bdev_remove_cb, ns, 
&nvme->ns[nvme->num_ns].bdev_desc); + if (rc != 0) { + SPDK_ERRLOG("Could not open bdev '%s', error=%d\n", + bdev_name, rc); + return rc; + } + + nvme->ns[nvme->num_ns].bdev = bdev; + nvme->ns[nvme->num_ns].active_ns = 1; + nvme->ns[nvme->num_ns].nsid = nvme->num_ns + 1; + nvme->num_ns++; + + vhost_nvme_ns_identify_update(nvme); + + return rc; +} + +int +vhost_nvme_controller_construct(void) +{ + struct spdk_conf_section *sp; + const char *name; + const char *bdev_name; + const char *cpumask; + int rc, i = 0; + struct spdk_vhost_dev *vdev; + uint32_t ctrlr_num, io_queues; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostNvme")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostNvme%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + rc = spdk_conf_section_get_intval(sp, "NumberOfQueues"); + if (rc > 0) { + io_queues = rc; + } else { + io_queues = 1; + } + + rc = vhost_nvme_dev_construct(name, cpumask, io_queues); + if (rc < 0) { + SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num); + return -1; + } + + vdev = spdk_vhost_dev_find(name); + if (!vdev) { + return -1; + } + + for (i = 0; spdk_conf_section_get_nval(sp, "Namespace", i) != NULL; i++) { + bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0); + if (!bdev_name) { + SPDK_ERRLOG("namespace configuration missing bdev name\n"); + break; + } + rc = vhost_nvme_dev_add_ns(vdev, bdev_name); + if (rc < 0) { + SPDK_WARNLOG("VhostNvme%u: Construct Namespace with %s failed\n", + ctrlr_num, bdev_name); + break; + } + } + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME) diff --git 
a/src/spdk/lib/vhost/vhost_rpc.c b/src/spdk/lib/vhost/vhost_rpc.c new file mode 100644 index 000000000..196d75918 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_rpc.c @@ -0,0 +1,652 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/env.h" + +#include "spdk/scsi.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" +#include "spdk/bdev.h" + +struct rpc_vhost_scsi_ctrlr { + char *ctrlr; + char *cpumask; +}; + +static void +free_rpc_vhost_scsi_ctrlr(struct rpc_vhost_scsi_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static const struct spdk_json_object_decoder rpc_vhost_create_scsi_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr, ctrlr), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_scsi_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +rpc_vhost_create_scsi_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_scsi_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_create_scsi_ctrlr, + SPDK_COUNTOF(rpc_vhost_create_scsi_ctrlr), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_construct(req.ctrlr, req.cpumask); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_scsi_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_scsi_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_create_scsi_controller", rpc_vhost_create_scsi_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_scsi_controller, construct_vhost_scsi_controller) + +struct rpc_vhost_scsi_ctrlr_add_target { + char *ctrlr; + int32_t scsi_target_num; + char *bdev_name; +}; + +static void +free_rpc_vhost_scsi_ctrlr_add_target(struct rpc_vhost_scsi_ctrlr_add_target *req) +{ 
+ free(req->ctrlr); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_vhost_scsi_ctrlr_add_target[] = { + {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, ctrlr), spdk_json_decode_string }, + {"scsi_target_num", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, scsi_target_num), spdk_json_decode_int32}, + {"bdev_name", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, bdev_name), spdk_json_decode_string }, +}; + +static void +rpc_vhost_scsi_controller_add_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_scsi_ctrlr_add_target req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_scsi_ctrlr_add_target, + SPDK_COUNTOF(rpc_vhost_scsi_ctrlr_add_target), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_add_tgt(vdev, req.scsi_target_num, req.bdev_name); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_scsi_ctrlr_add_target(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_int32(w, rc); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_scsi_ctrlr_add_target(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_scsi_controller_add_target", rpc_vhost_scsi_controller_add_target, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_add_target, add_vhost_scsi_lun) + +struct rpc_remove_vhost_scsi_ctrlr_target { + char *ctrlr; + uint32_t scsi_target_num; +}; + +static void +free_rpc_remove_vhost_scsi_ctrlr_target(struct rpc_remove_vhost_scsi_ctrlr_target *req) +{ + free(req->ctrlr); +} + 
+static const struct spdk_json_object_decoder rpc_vhost_remove_target[] = {
+	{"ctrlr", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, ctrlr), spdk_json_decode_string },
+	{"scsi_target_num", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, scsi_target_num), spdk_json_decode_uint32},
+};
+/*
+ * Completion callback handed to spdk_vhost_scsi_dev_remove_tgt().
+ * `arg` is the JSON-RPC request; the callback answers it with the boolean
+ * result `true` and returns 0. `vdev` is not used here.
+ */
+static int
+rpc_vhost_scsi_controller_remove_target_finish_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+	struct spdk_jsonrpc_request *request = arg;
+	struct spdk_json_write_ctx *w;
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+	return 0;
+}
+/*
+ * RPC handler registered as "vhost_scsi_controller_remove_target".
+ *
+ * Decodes "ctrlr" and "scsi_target_num", looks the controller up and calls
+ * spdk_vhost_scsi_dev_remove_tgt(). On the success path this function sends
+ * no response itself: the request is passed to
+ * rpc_vhost_scsi_controller_remove_target_finish_cb, which writes the result
+ * (presumably once the removal has completed - confirm against
+ * spdk_vhost_scsi_dev_remove_tgt()).
+ *
+ * On failure an SPDK_JSONRPC_ERROR_INVALID_PARAMS response carrying
+ * spdk_strerror(-rc) is sent (-EINVAL on decode failure, -ENODEV when the
+ * controller is not found, otherwise the negative return value of
+ * spdk_vhost_scsi_dev_remove_tgt()).
+ */
+static void
+rpc_vhost_scsi_controller_remove_target(struct spdk_jsonrpc_request *request,
+					const struct spdk_json_val *params)
+{
+	struct rpc_remove_vhost_scsi_ctrlr_target req = {0};
+	struct spdk_vhost_dev *vdev;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_vhost_remove_target,
+				    SPDK_COUNTOF(rpc_vhost_remove_target),
+				    &req)) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+		goto invalid;
+	}
+
+	spdk_vhost_lock();
+	vdev = spdk_vhost_dev_find(req.ctrlr);
+	if (vdev == NULL) {
+		spdk_vhost_unlock();
+		rc = -ENODEV;
+		goto invalid;
+	}
+
+	rc = spdk_vhost_scsi_dev_remove_tgt(vdev, req.scsi_target_num,
+					    rpc_vhost_scsi_controller_remove_target_finish_cb,
+					    request);
+	spdk_vhost_unlock();
+	if (rc < 0) {
+		goto invalid;
+	}
+
+	free_rpc_remove_vhost_scsi_ctrlr_target(&req);
+	return; /* no result written here; the finish callback answers the request */
+
+invalid:
+	free_rpc_remove_vhost_scsi_ctrlr_target(&req);
+	spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+					 spdk_strerror(-rc));
+}
+
+SPDK_RPC_REGISTER("vhost_scsi_controller_remove_target",
+		  rpc_vhost_scsi_controller_remove_target, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_remove_target, remove_vhost_scsi_target)
+/* Decoded parameters of the "vhost_create_blk_controller" RPC. */
+struct rpc_vhost_blk_ctrlr {
+	char *ctrlr;
+	char *dev_name;
+	char *cpumask;
+	bool readonly;
+	bool packed_ring;
+};
+/*
+ * JSON decoder table for struct rpc_vhost_blk_ctrlr. The entries for
+ * "cpumask", "readonly" and "packed_ring" carry a trailing `true`
+ * (presumably the decoder's "optional" flag - confirm against
+ * struct spdk_json_object_decoder); "ctrlr" and "dev_name" do not.
+ */
+static const struct spdk_json_object_decoder rpc_construct_vhost_blk_ctrlr[] = {
+	{"ctrlr", offsetof(struct rpc_vhost_blk_ctrlr, ctrlr), spdk_json_decode_string },
+	{"dev_name", offsetof(struct rpc_vhost_blk_ctrlr, dev_name), spdk_json_decode_string },
+	{"cpumask", offsetof(struct rpc_vhost_blk_ctrlr, cpumask), spdk_json_decode_string, true},
+	{"readonly", offsetof(struct rpc_vhost_blk_ctrlr, readonly), spdk_json_decode_bool, true},
+	{"packed_ring", offsetof(struct rpc_vhost_blk_ctrlr, packed_ring), spdk_json_decode_bool, true},
+};
+/* Release the three string members of a decoded create-blk request. */
+static void
+free_rpc_vhost_blk_ctrlr(struct rpc_vhost_blk_ctrlr *req)
+{
+	free(req->ctrlr);
+	free(req->dev_name);
+	free(req->cpumask);
+}
+/*
+ * RPC handler registered as "vhost_create_blk_controller".
+ *
+ * Decodes the parameters and calls spdk_vhost_blk_construct() with them.
+ * Unlike the handlers that operate on an existing controller, this one does
+ * not take spdk_vhost_lock() itself.
+ *
+ * Result: boolean `true` on success. On failure an
+ * SPDK_JSONRPC_ERROR_INVALID_PARAMS response carrying spdk_strerror(-rc) is
+ * sent (-EINVAL on decode failure, otherwise the negative return value of
+ * spdk_vhost_blk_construct()).
+ */
+static void
+rpc_vhost_create_blk_controller(struct spdk_jsonrpc_request *request,
+				const struct spdk_json_val *params)
+{
+	struct rpc_vhost_blk_ctrlr req = {0};
+	struct spdk_json_write_ctx *w;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_construct_vhost_blk_ctrlr,
+				    SPDK_COUNTOF(rpc_construct_vhost_blk_ctrlr),
+				    &req)) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+		goto invalid;
+	}
+
+	rc = spdk_vhost_blk_construct(req.ctrlr, req.cpumask, req.dev_name,
+				      req.readonly, req.packed_ring);
+	if (rc < 0) {
+		goto invalid;
+	}
+
+	free_rpc_vhost_blk_ctrlr(&req);
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+	return;
+
+invalid:
+	free_rpc_vhost_blk_ctrlr(&req);
+	spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+					 spdk_strerror(-rc));
+
+}
+SPDK_RPC_REGISTER("vhost_create_blk_controller", rpc_vhost_create_blk_controller,
+		  SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_blk_controller, construct_vhost_blk_controller)
+/* Decoded parameters of the "vhost_delete_controller" RPC. */
+struct rpc_delete_vhost_ctrlr {
+	char *ctrlr;
+};
+
+static const struct spdk_json_object_decoder rpc_delete_vhost_ctrlr_decoder[] = {
+	{"ctrlr", offsetof(struct rpc_delete_vhost_ctrlr, ctrlr), spdk_json_decode_string },
+};
+/* Release the string member of a decoded delete-controller request. */
+static void
+free_rpc_delete_vhost_ctrlr(struct rpc_delete_vhost_ctrlr *req)
+{
+	free(req->ctrlr);
+}
+/*
+ * RPC handler registered as "vhost_delete_controller".
+ *
+ * Looks up the controller named by "ctrlr" and calls spdk_vhost_dev_remove()
+ * on it while holding spdk_vhost_lock().
+ *
+ * Result: boolean `true` on success. On failure an
+ * SPDK_JSONRPC_ERROR_INVALID_PARAMS response carrying spdk_strerror(-rc) is
+ * sent (-EINVAL on decode failure, -ENODEV when the controller is not found,
+ * otherwise the negative return value of spdk_vhost_dev_remove()).
+ */
+static void
+rpc_vhost_delete_controller(struct spdk_jsonrpc_request *request,
+			    const struct spdk_json_val *params)
+{
+	struct rpc_delete_vhost_ctrlr req = {0};
+	struct spdk_json_write_ctx *w;
+	struct spdk_vhost_dev *vdev;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_delete_vhost_ctrlr_decoder,
+				    SPDK_COUNTOF(rpc_delete_vhost_ctrlr_decoder), &req)) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+		goto invalid;
+	}
+
+	spdk_vhost_lock();
+	vdev = spdk_vhost_dev_find(req.ctrlr);
+	if (vdev == NULL) {
+		spdk_vhost_unlock();
+		rc = -ENODEV;
+		goto invalid;
+	}
+
+	rc = spdk_vhost_dev_remove(vdev);
+	spdk_vhost_unlock();
+	if (rc < 0) {
+		goto invalid;
+	}
+
+	free_rpc_delete_vhost_ctrlr(&req);
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+
+	return;
+
+invalid:
+	free_rpc_delete_vhost_ctrlr(&req);
+	spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+					 spdk_strerror(-rc));
+
+}
+SPDK_RPC_REGISTER("vhost_delete_controller", rpc_vhost_delete_controller, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_delete_controller, remove_vhost_controller)
+/* Decoded parameters of the "vhost_get_controllers" RPC. */
+struct rpc_get_vhost_ctrlrs {
+	char *name;
+};
+/*
+ * Write one controller as a JSON object into `w`.
+ *
+ * Keys emitted, in order: "ctrlr", "cpumask" (formatted as "0x%s" from the
+ * cpumask of vdev->thread), "delay_base_us", "iops_threshold", "socket"
+ * (vdev->path) and a nested "backend_specific" object whose content is
+ * produced by vhost_dump_info_json().
+ *
+ * Both callers in this file invoke it with spdk_vhost_lock() held.
+ */
+static void
+_rpc_get_vhost_controller(struct spdk_json_write_ctx *w, struct spdk_vhost_dev *vdev)
+{
+	uint32_t delay_base_us, iops_threshold;
+
+	spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
+
+	spdk_json_write_object_begin(w);
+
+	spdk_json_write_named_string(w, "ctrlr", spdk_vhost_dev_get_name(vdev));
+	spdk_json_write_named_string_fmt(w, "cpumask", "0x%s",
+					 spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
+	spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us);
+	spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold);
+	spdk_json_write_named_string(w, "socket", vdev->path);
+
+	spdk_json_write_named_object_begin(w, "backend_specific");
+	vhost_dump_info_json(vdev, w);
+	spdk_json_write_object_end(w);
+
+	spdk_json_write_object_end(w);
+}
+
+static const struct spdk_json_object_decoder rpc_get_vhost_ctrlrs_decoders[] = {
+	{"name", offsetof(struct rpc_get_vhost_ctrlrs, name), spdk_json_decode_string, true},
+};
+/* Release the string member of a decoded get-controllers request. */
+static void
+free_rpc_get_vhost_ctrlrs(struct rpc_get_vhost_ctrlrs *req)
+{
+	free(req->name);
+}
+/*
+ * RPC handler registered as "vhost_get_controllers".
+ *
+ * `params` may be NULL; decoding is skipped in that case and `req.name`
+ * stays NULL. When a name was supplied, the result is a JSON array holding
+ * the single matching controller; otherwise it is an array of every
+ * controller returned by iterating spdk_vhost_dev_next() from NULL.
+ *
+ * Unlike the other handlers in this file, failures are reported with
+ * SPDK_JSONRPC_ERROR_INTERNAL_ERROR (message spdk_strerror(-rc)): -EINVAL on
+ * decode failure, -ENODEV when the named controller is not found.
+ */
+static void
+rpc_vhost_get_controllers(struct spdk_jsonrpc_request *request,
+			  const struct spdk_json_val *params)
+{
+	struct rpc_get_vhost_ctrlrs req = {0};
+	struct spdk_json_write_ctx *w;
+	struct spdk_vhost_dev *vdev;
+	int rc;
+
+	if (params && spdk_json_decode_object(params, rpc_get_vhost_ctrlrs_decoders,
+					      SPDK_COUNTOF(rpc_get_vhost_ctrlrs_decoders), &req)) {
+		SPDK_ERRLOG("spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+		goto invalid;
+	}
+
+	spdk_vhost_lock(); /* held while controllers are looked up and dumped */
+	if (req.name != NULL) {
+		vdev = spdk_vhost_dev_find(req.name);
+		if (vdev == NULL) {
+			spdk_vhost_unlock();
+			rc = -ENODEV;
+			goto invalid;
+		}
+
+		free_rpc_get_vhost_ctrlrs(&req); /* name is not needed once vdev has been resolved */
+
+		w = spdk_jsonrpc_begin_result(request);
+		spdk_json_write_array_begin(w);
+
+		_rpc_get_vhost_controller(w, vdev);
+		spdk_vhost_unlock();
+
+		spdk_json_write_array_end(w);
+		spdk_jsonrpc_end_result(request, w);
+		return;
+	}
+
+	free_rpc_get_vhost_ctrlrs(&req);
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_array_begin(w);
+
+	vdev = spdk_vhost_dev_next(NULL);
+	while (vdev != NULL) {
+		_rpc_get_vhost_controller(w, vdev);
+		vdev = spdk_vhost_dev_next(vdev);
+	}
+	spdk_vhost_unlock();
+
+	spdk_json_write_array_end(w);
+	spdk_jsonrpc_end_result(request, w);
+	return;
+
+invalid:
+	free_rpc_get_vhost_ctrlrs(&req);
+	spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+					 spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("vhost_get_controllers", rpc_vhost_get_controllers, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_get_controllers, get_vhost_controllers)
+
+/* Decoded parameters of the "vhost_controller_set_coalescing" RPC. */
+struct rpc_vhost_ctrlr_coalescing {
+	char *ctrlr;
+	uint32_t delay_base_us;
+	uint32_t iops_threshold;
+};
+/*
+ * JSON decoder table for struct rpc_vhost_ctrlr_coalescing: "ctrlr" is
+ * decoded as a string, "delay_base_us" and "iops_threshold" as unsigned
+ * 32-bit integers.
+ */
+static const struct spdk_json_object_decoder rpc_set_vhost_ctrlr_coalescing[] = {
+	{"ctrlr", offsetof(struct rpc_vhost_ctrlr_coalescing, ctrlr), spdk_json_decode_string },
+	{"delay_base_us", offsetof(struct rpc_vhost_ctrlr_coalescing, delay_base_us), spdk_json_decode_uint32},
+	{"iops_threshold", offsetof(struct rpc_vhost_ctrlr_coalescing, iops_threshold), spdk_json_decode_uint32},
+};
+/* Release the string member of a decoded set-coalescing request. */
+static void
+free_rpc_set_vhost_controllers_event_coalescing(struct rpc_vhost_ctrlr_coalescing *req)
+{
+	free(req->ctrlr);
+}
+/*
+ * RPC handler registered as "vhost_controller_set_coalescing".
+ *
+ * Looks up the controller named by "ctrlr" and applies the requested
+ * delay_base_us / iops_threshold through spdk_vhost_set_coalescing() while
+ * holding spdk_vhost_lock().
+ *
+ * Result: boolean `true` on success. On failure an
+ * SPDK_JSONRPC_ERROR_INVALID_PARAMS response carrying spdk_strerror(-rc) is
+ * sent (-EINVAL on decode failure, -ENODEV when the controller is not found,
+ * otherwise the non-zero return value of spdk_vhost_set_coalescing()).
+ */
+static void
+rpc_vhost_controller_set_coalescing(struct spdk_jsonrpc_request *request,
+				    const struct spdk_json_val *params)
+{
+	struct rpc_vhost_ctrlr_coalescing req = {0};
+	struct spdk_json_write_ctx *w;
+	struct spdk_vhost_dev *vdev;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_set_vhost_ctrlr_coalescing,
+				    SPDK_COUNTOF(rpc_set_vhost_ctrlr_coalescing), &req)) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+		goto invalid;
+	}
+
+	spdk_vhost_lock();
+	vdev = spdk_vhost_dev_find(req.ctrlr);
+	if (vdev == NULL) {
+		spdk_vhost_unlock();
+		rc = -ENODEV;
+		goto invalid;
+	}
+
+	rc = spdk_vhost_set_coalescing(vdev, req.delay_base_us, req.iops_threshold);
+	spdk_vhost_unlock();
+	if (rc) { /* any non-zero value is treated as failure here; sibling handlers test rc < 0 */
+		goto invalid;
+	}
+
+	free_rpc_set_vhost_controllers_event_coalescing(&req);
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+
+	return;
+
+invalid:
+	free_rpc_set_vhost_controllers_event_coalescing(&req);
+	spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+					 spdk_strerror(-rc)); /* NOTE(review): assumes rc is a negative errno - confirm spdk_vhost_set_coalescing() never returns a positive value */
+}
+SPDK_RPC_REGISTER("vhost_controller_set_coalescing", rpc_vhost_controller_set_coalescing,
+		  SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_controller_set_coalescing, set_vhost_controller_coalescing) + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + +struct rpc_vhost_nvme_ctrlr { + char *ctrlr; + uint32_t io_queues; + char *cpumask; +}; + +static const struct spdk_json_object_decoder rpc_construct_vhost_nvme_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr, ctrlr), spdk_json_decode_string }, + {"io_queues", offsetof(struct rpc_vhost_nvme_ctrlr, io_queues), spdk_json_decode_uint32}, + {"cpumask", offsetof(struct rpc_vhost_nvme_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +free_rpc_vhost_nvme_ctrlr(struct rpc_vhost_nvme_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static void +rpc_vhost_create_nvme_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_nvme_ctrlr req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_nvme_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_nvme_ctrlr), + &req)) { + rc = -EINVAL; + goto invalid; + } + + rc = vhost_nvme_dev_construct(req.ctrlr, req.cpumask, req.io_queues); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_nvme_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_nvme_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("vhost_create_nvme_controller", rpc_vhost_create_nvme_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_nvme_controller, construct_vhost_nvme_controller) + +struct rpc_vhost_nvme_ctrlr_add_ns { + char *ctrlr; + char *bdev_name; +}; + +static void +free_rpc_vhost_nvme_ctrlr_add_ns(struct rpc_vhost_nvme_ctrlr_add_ns *req) +{ + free(req->ctrlr); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder 
rpc_vhost_nvme_add_ns[] = { + {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, ctrlr), spdk_json_decode_string }, + {"bdev_name", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, bdev_name), spdk_json_decode_string }, +}; + +static void +rpc_vhost_nvme_controller_add_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_nvme_ctrlr_add_ns req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_nvme_add_ns, + SPDK_COUNTOF(rpc_vhost_nvme_add_ns), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = vhost_nvme_dev_add_ns(vdev, req.bdev_name); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + free_rpc_vhost_nvme_ctrlr_add_ns(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_nvme_ctrlr_add_ns(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_nvme_controller_add_ns", rpc_vhost_nvme_controller_add_ns, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_nvme_controller_add_ns, add_vhost_nvme_ns) + +#endif /* SPDK_CONFIG_VHOST_INTERNAL_LIB */ + +SPDK_LOG_REGISTER_COMPONENT("vhost_rpc", SPDK_LOG_VHOST_RPC) diff --git a/src/spdk/lib/vhost/vhost_scsi.c b/src/spdk/lib/vhost/vhost_scsi.c new file mode 100644 index 000000000..49e49dc76 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_scsi.c @@ -0,0 +1,1536 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <linux/virtio_scsi.h> + +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/scsi.h" +#include "spdk/scsi_spec.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/likely.h" + +#include "spdk/vhost.h" +#include "vhost_internal.h" + +/* Features supported by SPDK VHOST lib. 
*/ +#define SPDK_VHOST_SCSI_FEATURES (SPDK_VHOST_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_INOUT) | \ + (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \ + (1ULL << VIRTIO_SCSI_F_CHANGE ) | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +/* Features that are specified in VIRTIO SCSI but currently not supported: + * - Live migration not supported yet + * - T10 PI + */ +#define SPDK_VHOST_SCSI_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +#define MGMT_POLL_PERIOD_US (1000 * 5) + +#define VIRTIO_SCSI_CONTROLQ 0 +#define VIRTIO_SCSI_EVENTQ 1 +#define VIRTIO_SCSI_REQUESTQ 2 + +enum spdk_scsi_dev_vhost_status { + /* Target ID is empty. */ + VHOST_SCSI_DEV_EMPTY, + + /* Target is still being added. */ + VHOST_SCSI_DEV_ADDING, + + /* Target ID occupied. */ + VHOST_SCSI_DEV_PRESENT, + + /* Target ID is occupied but removal is in progress. */ + VHOST_SCSI_DEV_REMOVING, + + /* In session - device (SCSI target) seen but removed. */ + VHOST_SCSI_DEV_REMOVED, +}; + +/** Context for a SCSI target in a vhost device */ +struct spdk_scsi_dev_vhost_state { + struct spdk_scsi_dev *dev; + enum spdk_scsi_dev_vhost_status status; + spdk_vhost_event_fn remove_cb; + void *remove_ctx; +}; + +struct spdk_vhost_scsi_dev { + int ref; + bool registered; + struct spdk_vhost_dev vdev; + struct spdk_scsi_dev_vhost_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; +}; + +/** Context for a SCSI target in a vhost session */ +struct spdk_scsi_dev_session_state { + struct spdk_scsi_dev *dev; + enum spdk_scsi_dev_vhost_status status; +}; + +struct spdk_vhost_scsi_session { + struct spdk_vhost_session vsession; + + struct spdk_vhost_scsi_dev *svdev; + /** Local copy of the device state */ + struct spdk_scsi_dev_session_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; + struct spdk_poller *requestq_poller; + struct spdk_poller *mgmt_poller; + struct spdk_poller *stop_poller; +}; + +struct spdk_vhost_scsi_task { + struct spdk_scsi_task scsi; + struct iovec 
iovs[SPDK_VHOST_IOVS_MAX]; + + union { + struct virtio_scsi_cmd_resp *resp; + struct virtio_scsi_ctrl_tmf_resp *tmf_resp; + }; + + struct spdk_vhost_scsi_session *svsession; + struct spdk_scsi_dev *scsi_dev; + + /** Number of bytes that were written. */ + uint32_t used_len; + + int req_idx; + + /* If set, the task is currently used for I/O processing. */ + bool used; + + struct spdk_vhost_virtqueue *vq; +}; + +static int vhost_scsi_start(struct spdk_vhost_session *vsession); +static int vhost_scsi_stop(struct spdk_vhost_session *vsession); +static void vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static void vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static int vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev); + +static const struct spdk_vhost_dev_backend spdk_vhost_scsi_device_backend = { + .session_ctx_size = sizeof(struct spdk_vhost_scsi_session) - sizeof(struct spdk_vhost_session), + .start_session = vhost_scsi_start, + .stop_session = vhost_scsi_stop, + .dump_info_json = vhost_scsi_dump_info_json, + .write_config_json = vhost_scsi_write_config_json, + .remove_device = vhost_scsi_dev_remove, +}; + +static inline void +scsi_task_init(struct spdk_vhost_scsi_task *task) +{ + memset(&task->scsi, 0, sizeof(task->scsi)); + /* Tmf_resp pointer and resp pointer are in a union. + * Here means task->tmf_resp = task->resp = NULL. 
+ */ + task->resp = NULL; + task->used = true; + task->used_len = 0; +} + +static void +vhost_scsi_task_put(struct spdk_vhost_scsi_task *task) +{ + spdk_scsi_task_put(&task->scsi); +} + +static void +vhost_scsi_task_free_cb(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + struct spdk_vhost_session *vsession = &task->svsession->vsession; + + assert(vsession->task_cnt > 0); + vsession->task_cnt--; + task->used = false; +} + +static void +remove_scsi_tgt(struct spdk_vhost_scsi_dev *svdev, + unsigned scsi_tgt_num) +{ + struct spdk_scsi_dev_vhost_state *state; + struct spdk_scsi_dev *dev; + + state = &svdev->scsi_dev_state[scsi_tgt_num]; + dev = state->dev; + state->dev = NULL; + assert(state->status == VHOST_SCSI_DEV_REMOVING); + state->status = VHOST_SCSI_DEV_EMPTY; + spdk_scsi_dev_destruct(dev, NULL, NULL); + if (state->remove_cb) { + state->remove_cb(&svdev->vdev, state->remove_ctx); + state->remove_cb = NULL; + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: removed target 'Target %u'\n", + svdev->vdev.name, scsi_tgt_num); + + if (--svdev->ref == 0 && svdev->registered == false) { + free(svdev); + } +} + +static void +vhost_scsi_dev_process_removed_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev, + struct spdk_vhost_scsi_dev, vdev); + + /* all sessions have already detached the device */ + if (svdev->scsi_dev_state[scsi_tgt_num].status != VHOST_SCSI_DEV_REMOVING) { + /* device was already removed in the meantime */ + return; + } + + remove_scsi_tgt(svdev, scsi_tgt_num); +} + +static int +vhost_scsi_session_process_removed(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession; + struct 
spdk_scsi_dev_session_state *state = &svsession->scsi_dev_state[scsi_tgt_num]; + + if (state->dev != NULL) { + /* there's still a session that references this device, + * so abort our foreach chain here. We'll be called + * again from this session's management poller after it + * is removed in there + */ + return -1; + } + + return 0; +} + +static void +process_removed_devs(struct spdk_vhost_scsi_session *svsession) +{ + struct spdk_scsi_dev *dev; + struct spdk_scsi_dev_session_state *state; + int i; + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + state = &svsession->scsi_dev_state[i]; + dev = state->dev; + + if (dev && state->status == VHOST_SCSI_DEV_REMOVING && + !spdk_scsi_dev_has_pending_tasks(dev, NULL)) { + /* detach the device from this session */ + spdk_scsi_dev_free_io_channels(dev); + state->dev = NULL; + state->status = VHOST_SCSI_DEV_REMOVED; + /* try to detach it globally */ + spdk_vhost_lock(); + vhost_dev_foreach_session(&svsession->svdev->vdev, + vhost_scsi_session_process_removed, + vhost_scsi_dev_process_removed_cpl_cb, + (void *)(uintptr_t)i); + spdk_vhost_unlock(); + } + } +} + +static void +eventq_enqueue(struct spdk_vhost_scsi_session *svsession, unsigned scsi_dev_num, + uint32_t event, uint32_t reason) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_vhost_virtqueue *vq; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_event *desc_ev; + uint32_t desc_table_size, req_size = 0; + uint16_t req; + int rc; + + assert(scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + vq = &vsession->virtqueue[VIRTIO_SCSI_EVENTQ]; + + if (vq->vring.desc == NULL || vhost_vq_avail_ring_get(vq, &req, 1) != 1) { + SPDK_ERRLOG("%s: failed to send virtio event (no avail ring entries?).\n", + vsession->name); + return; + } + + rc = vhost_vq_get_desc(vsession, vq, req, &desc, &desc_table, &desc_table_size); + if (rc != 0 || desc->len < sizeof(*desc_ev)) { + SPDK_ERRLOG("%s: invalid eventq descriptor at index 
%"PRIu16".\n", + vsession->name, req); + goto out; + } + + desc_ev = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*desc_ev)); + if (desc_ev == NULL) { + SPDK_ERRLOG("%s: eventq descriptor at index %"PRIu16" points " + "to unmapped guest memory address %p.\n", + vsession->name, req, (void *)(uintptr_t)desc->addr); + goto out; + } + + desc_ev->event = event; + desc_ev->lun[0] = 1; + desc_ev->lun[1] = scsi_dev_num; + /* virtio LUN id 0 can refer either to the entire device + * or actual LUN 0 (the only supported by vhost for now) + */ + desc_ev->lun[2] = 0 >> 8; + desc_ev->lun[3] = 0 & 0xFF; + /* virtio doesn't specify any strict format for LUN id (bytes 2 and 3) + * current implementation relies on linux kernel sources + */ + memset(&desc_ev->lun[4], 0, 4); + desc_ev->reason = reason; + req_size = sizeof(*desc_ev); + +out: + vhost_vq_used_ring_enqueue(vsession, vq, req, req_size); +} + +static void +submit_completion(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + + vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, + task->used_len); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Finished task (%p) req_idx=%d\n", task, task->req_idx); + + vhost_scsi_task_put(task); +} + +static void +vhost_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + submit_completion(task); +} + +static void +vhost_scsi_task_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + /* The SCSI task has completed. Do final processing and then post + notification to the virtqueue's "used" ring. 
+ */ + task->resp->status = task->scsi.status; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + memcpy(task->resp->sense, task->scsi.sense_data, task->scsi.sense_data_len); + task->resp->sense_len = task->scsi.sense_data_len; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Task (%p) req_idx=%d failed - status=%u\n", task, task->req_idx, + task->scsi.status); + } + assert(task->scsi.transfer_len == task->scsi.length); + task->resp->resid = task->scsi.length - task->scsi.data_transferred; + + submit_completion(task); +} + +static void +task_submit(struct spdk_vhost_scsi_task *task) +{ + task->resp->response = VIRTIO_SCSI_S_OK; + spdk_scsi_dev_queue_task(task->scsi_dev, &task->scsi); +} + +static void +mgmt_task_submit(struct spdk_vhost_scsi_task *task, enum spdk_scsi_task_func func) +{ + task->tmf_resp->response = VIRTIO_SCSI_S_OK; + task->scsi.function = func; + spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, &task->scsi); +} + +static void +invalid_request(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + + vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, + task->used_len); + vhost_scsi_task_put(task); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Invalid request (status=%" PRIu8")\n", + task->resp ? 
task->resp->response : -1); +} + +static int +vhost_scsi_task_init_target(struct spdk_vhost_scsi_task *task, const __u8 *lun) +{ + struct spdk_vhost_scsi_session *svsession = task->svsession; + struct spdk_scsi_dev_session_state *state; + uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF; + + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN", lun, 8); + + /* First byte must be 1 and second is target */ + if (lun[0] != 1 || lun[1] >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + return -1; + } + + state = &svsession->scsi_dev_state[lun[1]]; + task->scsi_dev = state->dev; + if (state->dev == NULL || state->status != VHOST_SCSI_DEV_PRESENT) { + /* If dev has been hotdetached, return 0 to allow sending + * additional hotremove event via sense codes. + */ + return state->status != VHOST_SCSI_DEV_EMPTY ? 0 : -1; + } + + task->scsi.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0); + task->scsi.lun = spdk_scsi_dev_get_lun(state->dev, lun_id); + return 0; +} + +static void +process_ctrl_request(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_ctrl_tmf_req *ctrl_req; + struct virtio_scsi_ctrl_an_resp *an_resp; + uint32_t desc_table_size, used_len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_mgmt_cpl, vhost_scsi_task_free_cb); + rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table, + &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("%s: invalid controlq descriptor at index %d.\n", + vsession->name, task->req_idx); + goto out; + } + + ctrl_req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*ctrl_req)); + if (ctrl_req == NULL) { + SPDK_ERRLOG("%s: invalid task management request at index %d.\n", + vsession->name, task->req_idx); + goto out; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, + "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx 
%d; kickfd %d; size %d\n", + task->req_idx, desc, (void *)desc->addr, desc->len, desc->flags, task->vq->last_used_idx, + task->vq->vring.kickfd, task->vq->vring.size); + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "Request descriptor", (uint8_t *)ctrl_req, desc->len); + + vhost_scsi_task_init_target(task, ctrl_req->lun); + + vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (spdk_unlikely(desc == NULL)) { + SPDK_ERRLOG("%s: no response descriptor for controlq request %d.\n", + vsession->name, task->req_idx); + goto out; + } + + /* Process the TMF request */ + switch (ctrl_req->type) { + case VIRTIO_SCSI_T_TMF: + task->tmf_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->tmf_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_tmf_resp) || task->tmf_resp == NULL)) { + SPDK_ERRLOG("%s: TMF response descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto out; + } + + /* Check if we are processing a valid request */ + if (task->scsi_dev == NULL) { + task->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET; + break; + } + + switch (ctrl_req->subtype) { + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + /* Handle LUN reset */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: LUN reset\n", vsession->name); + + mgmt_task_submit(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + return; + default: + task->tmf_resp->response = VIRTIO_SCSI_S_ABORTED; + /* Unsupported command */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: unsupported TMF command %x\n", + vsession->name, ctrl_req->subtype); + break; + } + break; + case VIRTIO_SCSI_T_AN_QUERY: + case VIRTIO_SCSI_T_AN_SUBSCRIBE: { + an_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*an_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_an_resp) || an_resp == NULL)) { + SPDK_WARNLOG("%s: asynchronous response descriptor points to invalid guest memory region\n", + vsession->name); + goto out; + } + + an_resp->response = 
VIRTIO_SCSI_S_ABORTED; + break; + } + default: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: Unsupported control command %x\n", + vsession->name, ctrl_req->type); + break; + } + + used_len = sizeof(struct virtio_scsi_ctrl_tmf_resp); +out: + vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, used_len); + vhost_scsi_task_put(task); +} + +/* + * Process task's descriptor chain and setup data related fields. + * Return + * -1 if request is invalid and must be aborted, + * 0 if all data are set. + */ +static int +task_data_setup(struct spdk_vhost_scsi_task *task, + struct virtio_scsi_cmd_req **req) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + struct vring_desc *desc, *desc_table; + struct iovec *iovs = task->iovs; + uint16_t iovcnt = 0; + uint32_t desc_table_len, len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_cpl, vhost_scsi_task_free_cb); + + rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table, &desc_table_len); + /* First descriptor must be readable */ + if (spdk_unlikely(rc != 0 || vhost_vring_desc_is_wr(desc) || + desc->len < sizeof(struct virtio_scsi_cmd_req))) { + SPDK_WARNLOG("%s: invalid first request descriptor at index %"PRIu16".\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + *req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(**req)); + if (spdk_unlikely(*req == NULL)) { + SPDK_WARNLOG("%s: request descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + /* Each request must have at least 2 descriptors (e.g. request and response) */ + vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (desc == NULL) { + SPDK_WARNLOG("%s: descriptor chain at index %d contains neither payload nor response buffer.\n", + vsession->name, task->req_idx); + goto invalid_task; + } + task->scsi.dxfer_dir = vhost_vring_desc_is_wr(desc) ? 
SPDK_SCSI_DIR_FROM_DEV : + SPDK_SCSI_DIR_TO_DEV; + task->scsi.iovs = iovs; + + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + /* + * FROM_DEV (READ): [RD_req][WR_resp][WR_buf0]...[WR_bufN] + */ + task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto invalid_task; + } + rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid descriptor chain at request index %d (descriptor id overflow?).\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + if (desc == NULL) { + /* + * TEST UNIT READY command and some others might not contain any payload and this is not an error. + */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, + "No payload descriptors for FROM DEV command req_idx=%"PRIu16".\n", task->req_idx); + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "CDB=", (*req)->cdb, VIRTIO_SCSI_CDB_SIZE); + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + task->scsi.iovcnt = 1; + task->scsi.iovs[0].iov_len = 0; + task->scsi.length = 0; + task->scsi.transfer_len = 0; + return 0; + } + + /* All remaining descriptors are data. 
*/ + while (desc) { + if (spdk_unlikely(!vhost_vring_desc_is_wr(desc))) { + SPDK_WARNLOG("%s: FROM DEV cmd: descriptor nr %" PRIu16" in payload chain is read only.\n", + vsession->name, iovcnt); + goto invalid_task; + } + + if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid payload in descriptor chain starting at index %d.\n", + vsession->name, task->req_idx); + goto invalid_task; + } + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp) + len; + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "TO DEV"); + /* + * TO_DEV (WRITE):[RD_req][RD_buf0]...[RD_bufN][WR_resp] + * No need to check descriptor WR flag as this is done while setting scsi.dxfer_dir. + */ + + /* Process descriptors up to response. */ + while (!vhost_vring_desc_is_wr(desc)) { + if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(desc == NULL)) { + SPDK_WARNLOG("%s: TO_DEV cmd: no response descriptor.\n", vsession->name); + goto invalid_task; + } + } + + task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + } + + task->scsi.iovcnt = iovcnt; + task->scsi.length = len; + task->scsi.transfer_len = len; + return 0; + +invalid_task: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "%s: Invalid task at index %"PRIu16".\n", + vsession->name, task->req_idx); + return -1; +} + +static int +process_request(struct spdk_vhost_scsi_task 
*task) +{ /* Prepares one request-queue task. Returns the nonzero result of task_data_setup() unchanged; -1 after setting the response to VIRTIO_SCSI_S_BAD_TARGET when vhost_scsi_task_init_target() fails; 1 when the task has no LUN and was answered here with VIRTIO_SCSI_S_OK; 0 when the task is ready to be submitted. */ + struct virtio_scsi_cmd_req *req; + int result; + + result = task_data_setup(task, &req); + if (result) { + return result; + } + + result = vhost_scsi_task_init_target(task, req->lun); + if (spdk_unlikely(result != 0)) { + task->resp->response = VIRTIO_SCSI_S_BAD_TARGET; + return -1; + } + + task->scsi.cdb = req->cdb; + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "request CDB", req->cdb, VIRTIO_SCSI_CDB_SIZE); + + if (spdk_unlikely(task->scsi.lun == NULL)) { + spdk_scsi_task_process_null_lun(&task->scsi); + task->resp->response = VIRTIO_SCSI_S_OK; + return 1; + } + + return 0; +} + /* Processes the request at index req_idx of vq. A slot whose task is still marked used is logged and handed straight to vhost_vq_used_ring_enqueue() without touching the task. Otherwise vsession->task_cnt is incremented, the task is initialized and dispatched: control-queue requests go to process_ctrl_request(); all others go through process_request() and are then passed to task_submit() on 0, vhost_scsi_task_cpl() on a positive result, or invalid_request() on a negative one. */ +static void +process_scsi_task(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *vq, + uint16_t req_idx) +{ + struct spdk_vhost_scsi_task *task; + int result; + + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[req_idx]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + vsession->name, req_idx); + vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); + return; + } + + vsession->task_cnt++; + scsi_task_init(task); + + if (spdk_unlikely(vq->vring_idx == VIRTIO_SCSI_CONTROLQ)) { + process_ctrl_request(task); + } else { + result = process_request(task); + if (likely(result == 0)) { + task_submit(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d submitted ======\n", task, + task->req_idx); + } else if (result > 0) { + vhost_scsi_task_cpl(&task->scsi); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d finished early ======\n", task, + task->req_idx); + } else { + invalid_request(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d failed ======\n", task, + task->req_idx); + } + } +} + /* Fetches up to SPDK_COUNTOF(reqs) request indices from vq with vhost_vq_avail_ring_get() and processes each of them; indices that are not below vq->vring.size are logged and handed to vhost_vq_used_ring_enqueue() instead of being processed. */ +static void +process_vq(struct spdk_vhost_scsi_session *svsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + + reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, 
SPDK_COUNTOF(reqs)); + assert(reqs_cnt <= 32); + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + /* process_scsi_task() uses the index to address vq->tasks, so an index outside the ring is rejected before that call. */ + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + vsession->name, reqs[i], vq->vring.size); + vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0); + continue; + } + + process_scsi_task(vsession, vq, reqs[i]); + } +} + /* Management poller body; arg is the vhost-scsi session. Calls process_removed_devs() and signals the event queue, then processes and signals the control queue. Always returns SPDK_POLLER_BUSY. */ +static int +vdev_mgmt_worker(void *arg) +{ + struct spdk_vhost_scsi_session *svsession = arg; + struct spdk_vhost_session *vsession = &svsession->vsession; + + process_removed_devs(svsession); + vhost_vq_used_signal(vsession, &vsession->virtqueue[VIRTIO_SCSI_EVENTQ]); + + process_vq(svsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]); + vhost_vq_used_signal(vsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]); + + return SPDK_POLLER_BUSY; +} + /* Request poller body; arg is the vhost-scsi session. Processes every virtqueue from index VIRTIO_SCSI_REQUESTQ up to max_queues, then calls vhost_session_used_signal(). Always returns SPDK_POLLER_BUSY. */ +static int +vdev_worker(void *arg) +{ + struct spdk_vhost_scsi_session *svsession = arg; + struct spdk_vhost_session *vsession = &svsession->vsession; + uint32_t q_idx; + + for (q_idx = VIRTIO_SCSI_REQUESTQ; q_idx < vsession->max_queues; q_idx++) { + process_vq(svsession, &vsession->virtqueue[q_idx]); + } + + vhost_session_used_signal(vsession); + + return SPDK_POLLER_BUSY; +} + /* Converts a generic vhost device into the vhost-scsi device that embeds it as member vdev. Returns NULL for a NULL argument and, after logging an error, for a device whose backend is not spdk_vhost_scsi_device_backend. */ +static struct spdk_vhost_scsi_dev * +to_scsi_dev(struct spdk_vhost_dev *ctrlr) +{ + if (ctrlr == NULL) { + return NULL; + } + + if (ctrlr->backend != &spdk_vhost_scsi_device_backend) { + SPDK_ERRLOG("%s: not a vhost-scsi device.\n", ctrlr->name); + return NULL; + } + + return SPDK_CONTAINEROF(ctrlr, struct spdk_vhost_scsi_dev, vdev); +} + /* Casts a generic session to a vhost-scsi session; the backend is only checked by an assert. NOTE(review): the plain pointer cast presumably relies on vsession being the first member of struct spdk_vhost_scsi_session - confirm against the struct definition. */ +static struct spdk_vhost_scsi_session * +to_scsi_session(struct spdk_vhost_session *vsession) +{ + assert(vsession->vdev->backend == &spdk_vhost_scsi_device_backend); + return (struct spdk_vhost_scsi_session *)vsession; +} + /* Allocates a vhost-scsi controller and registers it under the given name and cpumask with the vhost-scsi backend. Returns 0 on success, -ENOMEM when the allocation fails, or the nonzero result of vhost_dev_register(), in which case the allocation is freed again. */ +int +spdk_vhost_scsi_dev_construct(const char *name, const char *cpumask) +{ + struct 
spdk_vhost_scsi_dev *svdev = calloc(1, sizeof(*svdev)); + int rc; + + if (svdev == NULL) { + return -ENOMEM; + } + + svdev->vdev.virtio_features = SPDK_VHOST_SCSI_FEATURES; + svdev->vdev.disabled_features = SPDK_VHOST_SCSI_DISABLED_FEATURES; + /* Registration and the update of the registered flag both happen with the global vhost lock held. */ + spdk_vhost_lock(); + rc = vhost_dev_register(&svdev->vdev, name, cpumask, + &spdk_vhost_scsi_device_backend); + + if (rc) { + free(svdev); + spdk_vhost_unlock(); + return rc; + } + + svdev->registered = true; + + spdk_vhost_unlock(); + return rc; +} + /* Removes a vhost-scsi controller. For every slot that still holds a SCSI device: if vdev->registered is set the call fails with -EBUSY, otherwise the target is removed through spdk_vhost_scsi_dev_remove_tgt() and a failure of that call is returned. After vhost_dev_unregister() succeeds the controller memory is freed only when svdev->ref is 0. Returns 0 on success. */ +static int +vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_scsi_dev *svdev = to_scsi_dev(vdev); + int rc, i; + + assert(svdev != NULL); + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + if (svdev->scsi_dev_state[i].dev) { + if (vdev->registered) { + SPDK_ERRLOG("%s: SCSI target %d is still present.\n", vdev->name, i); + return -EBUSY; + } + + rc = spdk_vhost_scsi_dev_remove_tgt(vdev, i, NULL, NULL); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to force-remove target %d\n", vdev->name, i); + return rc; + } + } + } + + rc = vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + svdev->registered = false; + /* NOTE(review): with ref != 0 the memory stays allocated here; presumably whoever drops the last reference frees it - confirm against the code that decrements ref. */ + if (svdev->ref == 0) { + free(svdev); + } + + return 0; +} + /* Returns the SCSI device stored in slot num of the controller, or NULL unless that slot's status is VHOST_SCSI_DEV_PRESENT. The range of num and the type of vdev are checked by asserts only. */ +struct spdk_scsi_dev * +spdk_vhost_scsi_dev_get_tgt(struct spdk_vhost_dev *vdev, uint8_t num) +{ + struct spdk_vhost_scsi_dev *svdev; + + assert(num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + svdev = to_scsi_dev(vdev); + assert(svdev != NULL); + if (svdev->scsi_dev_state[num].status != VHOST_SCSI_DEV_PRESENT) { + return NULL; + } + + assert(svdev->scsi_dev_state[num].dev != NULL); + return svdev->scsi_dev_state[num].dev; +} + /* LUN hot-remove callback; arg is the vhost-scsi controller. Looks up the slot whose SCSI device owns lun and requests removal of that whole target; returns without doing anything when no slot matches. The result of the removal request is not checked. */ +static void +vhost_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + const struct spdk_scsi_dev *scsi_dev; + unsigned scsi_dev_num; + + assert(lun != NULL); + assert(svdev != NULL); + scsi_dev = spdk_scsi_lun_get_dev(lun); + for (scsi_dev_num = 0; scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; 
scsi_dev_num++) { + if (svdev->scsi_dev_state[scsi_dev_num].dev == scsi_dev) { + break; + } + } + + if (scsi_dev_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + /* The entire device has been already removed. */ + return; + } + + /* remove entire device */ + spdk_vhost_scsi_dev_remove_tgt(&svdev->vdev, scsi_dev_num, NULL, NULL); +} + /* Completion callback of the per-session add pass; ctx carries the target slot number. Moves the controller's slot from ADDING to PRESENT and increments svdev->ref. */ +static void +vhost_scsi_dev_add_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev, + struct spdk_vhost_scsi_dev, vdev); + struct spdk_scsi_dev_vhost_state *vhost_sdev; + + vhost_sdev = &svdev->scsi_dev_state[scsi_tgt_num]; + + /* All sessions have added the target */ + assert(vhost_sdev->status == VHOST_SCSI_DEV_ADDING); + vhost_sdev->status = VHOST_SCSI_DEV_PRESENT; + svdev->ref++; +} + /* Per-session step of adding a target; ctx carries the target slot number. Sessions that are not started or already hold a device in that slot are skipped. Otherwise the controller's device is copied into the session slot, its I/O channels are allocated, and a rescan event is queued when the session has VIRTIO_SCSI_F_HOTPLUG (a notice is logged when it has not). Returns 0 on every path, including a failed channel allocation. */ +static int +vhost_scsi_session_add_tgt(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession; + struct spdk_scsi_dev_session_state *session_sdev = &svsession->scsi_dev_state[scsi_tgt_num]; + struct spdk_scsi_dev_vhost_state *vhost_sdev; + int rc; + + if (!vsession->started || session_sdev->dev != NULL) { + /* Nothing to do. */ + return 0; + } + + vhost_sdev = &svsession->svdev->scsi_dev_state[scsi_tgt_num]; + session_sdev->dev = vhost_sdev->dev; + session_sdev->status = VHOST_SCSI_DEV_PRESENT; + + rc = spdk_scsi_dev_allocate_io_channels(svsession->scsi_dev_state[scsi_tgt_num].dev); + if (rc != 0) { + SPDK_ERRLOG("%s: Couldn't allocate io channnel for SCSI target %u.\n", + vsession->name, scsi_tgt_num); + + /* unset the SCSI target so that all I/O to it will be rejected */ + session_sdev->dev = NULL; + /* Set status to EMPTY so that we won't reply with SCSI hotremove + * sense codes - the device hasn't ever been added. 
+ */ + session_sdev->status = VHOST_SCSI_DEV_EMPTY; + + /* Return with no error. We'll continue allocating io_channels for + * other sessions on this device in hopes they succeed. The sessions + * that failed to allocate io_channels simply won't be able to + * detect the SCSI target, nor do any I/O to it. + */ + return 0; + } + + if (vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) { + eventq_enqueue(svsession, scsi_tgt_num, + VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN); + } else { + SPDK_NOTICELOG("%s: driver does not support hotplug. " + "Please restart it or perform a rescan.\n", + vsession->name); + } + + return 0; +} + /* Attaches bdev_name as a single-LUN SCSI target of the controller. A negative scsi_tgt_num selects the first slot whose device pointer is NULL. Returns the slot number that was used, or -ENOSPC (no free slot), -EINVAL (slot number too big, bdev_name NULL, or the SCSI device could not be constructed), -EEXIST (slot already holds a device). The slot is left in ADDING state on return; vhost_scsi_dev_add_tgt_cpl_cb switches it to PRESENT. */ +int +spdk_vhost_scsi_dev_add_tgt(struct spdk_vhost_dev *vdev, int scsi_tgt_num, + const char *bdev_name) +{ + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev_vhost_state *state; + char target_name[SPDK_SCSI_DEV_MAX_NAME]; + int lun_id_list[1]; + const char *bdev_names_list[1]; + + svdev = to_scsi_dev(vdev); + assert(svdev != NULL); + if (scsi_tgt_num < 0) { + for (scsi_tgt_num = 0; scsi_tgt_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_tgt_num++) { + if (svdev->scsi_dev_state[scsi_tgt_num].dev == NULL) { + break; + } + } + + if (scsi_tgt_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: all SCSI target slots are already in use.\n", vdev->name); + return -ENOSPC; + } + } else { + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: SCSI target number is too big (got %d, max %d)\n", + vdev->name, scsi_tgt_num, SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + return -EINVAL; + } + } + + if (bdev_name == NULL) { + SPDK_ERRLOG("No lun name specified\n"); + return -EINVAL; + } + + state = &svdev->scsi_dev_state[scsi_tgt_num]; + if (state->dev != NULL) { + SPDK_ERRLOG("%s: SCSI target %u already occupied\n", vdev->name, scsi_tgt_num); + return -EEXIST; + } + + /* + * At this stage only one LUN per target + */ + snprintf(target_name, sizeof(target_name), "Target %u", scsi_tgt_num); + lun_id_list[0] = 0; + 
bdev_names_list[0] = (char *)bdev_name; + /* The slot is marked ADDING before the device is constructed and is reset to EMPTY if construction fails. */ + state->status = VHOST_SCSI_DEV_ADDING; + state->dev = spdk_scsi_dev_construct(target_name, bdev_names_list, lun_id_list, 1, + SPDK_SPC_PROTOCOL_IDENTIFIER_SAS, + vhost_scsi_lun_hotremove, svdev); + + if (state->dev == NULL) { + state->status = VHOST_SCSI_DEV_EMPTY; + SPDK_ERRLOG("%s: couldn't create SCSI target %u using bdev '%s'\n", + vdev->name, scsi_tgt_num, bdev_name); + return -EINVAL; + } + spdk_scsi_dev_add_port(state->dev, 0, "vhost"); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: added SCSI target %u using bdev '%s'\n", + vdev->name, scsi_tgt_num, bdev_name); + + vhost_dev_foreach_session(vdev, vhost_scsi_session_add_tgt, + vhost_scsi_dev_add_tgt_cpl_cb, + (void *)(uintptr_t)scsi_tgt_num); + return scsi_tgt_num; +} + /* Context shared by the per-session and completion callbacks of a target removal: the slot being removed, and a flag that vhost_scsi_session_remove_tgt sets when a session still has to finish the removal. */ +struct scsi_tgt_hotplug_ctx { + unsigned scsi_tgt_num; + bool async_fini; +}; + /* Completion callback of the per-session removal pass. When no session set async_fini the target is removed right here with remove_scsi_tgt(); the context is freed in either case. */ +static void +vhost_scsi_dev_remove_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *_ctx) +{ + struct scsi_tgt_hotplug_ctx *ctx = _ctx; + struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev, + struct spdk_vhost_scsi_dev, vdev); + + if (!ctx->async_fini) { + /* there aren't any active sessions, so remove the dev and exit */ + remove_scsi_tgt(svdev, ctx->scsi_tgt_num); + } + + free(ctx); +} + /* Per-session step of removing a target; _ctx is a struct scsi_tgt_hotplug_ctx. Sessions that are not started or hold no device in the slot are skipped. Otherwise the session slot is marked REMOVING, a removal event is queued when the session has VIRTIO_SCSI_F_HOTPLUG, and ctx->async_fini is set. Always returns 0. */ +static int +vhost_scsi_session_remove_tgt(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *_ctx) +{ + struct scsi_tgt_hotplug_ctx *ctx = _ctx; + unsigned scsi_tgt_num = ctx->scsi_tgt_num; + struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession; + struct spdk_scsi_dev_session_state *state = &svsession->scsi_dev_state[scsi_tgt_num]; + + if (!vsession->started || state->dev == NULL) { + /* Nothing to do */ + return 0; + } + + /* Mark the target for removal */ + assert(state->status == VHOST_SCSI_DEV_PRESENT); + state->status = VHOST_SCSI_DEV_REMOVING; + + /* Send a hotremove Virtio event */ + if (vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) { + eventq_enqueue(svsession, 
scsi_tgt_num, + VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED); + } + + /* Wait for the session's management poller to remove the target after + * all its pending I/O has finished. + */ + ctx->async_fini = true; + return 0; +} + +int +spdk_vhost_scsi_dev_remove_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num, + spdk_vhost_event_fn cb_fn, void *cb_arg) +{ + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev_vhost_state *scsi_dev_state; + struct scsi_tgt_hotplug_ctx *ctx; + + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: invalid SCSI target number %d\n", vdev->name, scsi_tgt_num); + return -EINVAL; + } + + svdev = to_scsi_dev(vdev); + assert(svdev != NULL); + scsi_dev_state = &svdev->scsi_dev_state[scsi_tgt_num]; + + if (scsi_dev_state->status != VHOST_SCSI_DEV_PRESENT) { + return -EBUSY; + } + + if (scsi_dev_state->dev == NULL || scsi_dev_state->status == VHOST_SCSI_DEV_ADDING) { + SPDK_ERRLOG("%s: SCSI target %u is not occupied\n", vdev->name, scsi_tgt_num); + return -ENODEV; + } + + assert(scsi_dev_state->status != VHOST_SCSI_DEV_EMPTY); + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("calloc failed\n"); + return -ENOMEM; + } + + ctx->scsi_tgt_num = scsi_tgt_num; + ctx->async_fini = false; + + scsi_dev_state->remove_cb = cb_fn; + scsi_dev_state->remove_ctx = cb_arg; + scsi_dev_state->status = VHOST_SCSI_DEV_REMOVING; + + vhost_dev_foreach_session(vdev, vhost_scsi_session_remove_tgt, + vhost_scsi_dev_remove_tgt_cpl_cb, ctx); + return 0; +} + +int +vhost_scsi_controller_construct(void) +{ + struct spdk_conf_section *sp = spdk_conf_first_section(NULL); + struct spdk_vhost_dev *vdev; + int i, dev_num; + unsigned ctrlr_num = 0; + char *bdev_name, *tgt_num_str; + char *cpumask; + char *name; + char *tgt = NULL; + + while (sp != NULL) { + if (!spdk_conf_section_match_prefix(sp, "VhostScsi")) { + sp = spdk_conf_next_section(sp); + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), 
"VhostScsi%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + + if (spdk_vhost_scsi_dev_construct(name, cpumask) < 0) { + return -1; + } + + vdev = spdk_vhost_dev_find(name); + assert(vdev); + + for (i = 0; ; i++) { + + tgt = spdk_conf_section_get_nval(sp, "Target", i); + if (tgt == NULL) { + break; + } + + tgt_num_str = spdk_conf_section_get_nmval(sp, "Target", i, 0); + if (tgt_num_str == NULL) { + SPDK_ERRLOG("%s: invalid or missing SCSI target number\n", name); + return -1; + } + + dev_num = (int)strtol(tgt_num_str, NULL, 10); + bdev_name = spdk_conf_section_get_nmval(sp, "Target", i, 1); + if (bdev_name == NULL) { + SPDK_ERRLOG("%s: invalid or missing bdev name for SCSI target %d\n", name, dev_num); + return -1; + } else if (spdk_conf_section_get_nmval(sp, "Target", i, 2)) { + SPDK_ERRLOG("%s: only one LUN per SCSI target is supported\n", name); + return -1; + } + + if (spdk_vhost_scsi_dev_add_tgt(vdev, dev_num, bdev_name) < 0) { + return -1; + } + } + + sp = spdk_conf_next_section(sp); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_scsi_session *svsession) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_scsi_session *svsession) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_scsi_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; 
+ if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", + vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(svsession); + return -1; + } + vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_scsi_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vq->tasks == NULL) { + SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + vsession->name, task_cnt, i); + free_task_pool(svsession); + return -1; + } + + /* req_idx equals the task's position in the array; process_scsi_task() looks tasks up by that index. */ + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[j]; + task->svsession = svsession; + task->vq = vq; + task->req_idx = j; + } + } + + return 0; +} + /* Session start handler, sent as an event by vhost_scsi_start(). Fails with -1 when an I/O queue has no descriptor ring, or with the result of alloc_task_pool() when the task pools cannot be allocated. Otherwise every target that exists on the controller and is not being removed is attached to the session (a target whose I/O channels cannot be allocated is left out and does not fail the start), the request poller is registered, and the management poller is registered when both the control and the event queue have a descriptor ring. The result is passed to vhost_session_start_done() and returned. */ +static int +vhost_scsi_start_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + struct spdk_vhost_scsi_dev *svdev = svsession->svdev; + struct spdk_scsi_dev_vhost_state *state; + uint32_t i; + int rc; + + /* validate all I/O queues are in a contiguous index range */ + for (i = VIRTIO_SCSI_REQUESTQ; i < vsession->max_queues; i++) { + if (vsession->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(svsession); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); + goto out; + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + state = &svdev->scsi_dev_state[i]; + if (state->dev == NULL || state->status == VHOST_SCSI_DEV_REMOVING) { + continue; + } + + assert(svsession->scsi_dev_state[i].status == VHOST_SCSI_DEV_EMPTY); + svsession->scsi_dev_state[i].dev = state->dev; + svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_PRESENT; + rc = spdk_scsi_dev_allocate_io_channels(state->dev); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc 
io_channel for SCSI target %"PRIu32"\n", + vsession->name, i); + /* unset the SCSI target so that all I/O to it will be rejected */ + svsession->scsi_dev_state[i].dev = NULL; + /* set EMPTY state so that we won't reply with SCSI hotremove + * sense codes - the device hasn't ever been added. + */ + svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_EMPTY; + /* This failure is tolerated. Clear rc so that it cannot reach vhost_session_start_done() below and report a failed start for a session whose pollers are registered. */ + rc = 0; + continue; + } + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); + + svsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, svsession, 0); + if (vsession->virtqueue[VIRTIO_SCSI_CONTROLQ].vring.desc && + vsession->virtqueue[VIRTIO_SCSI_EVENTQ].vring.desc) { + svsession->mgmt_poller = SPDK_POLLER_REGISTER(vdev_mgmt_worker, svsession, + MGMT_POLL_PERIOD_US); + } +out: + vhost_session_start_done(vsession, rc); + return rc; +} + /* Binds the session to its vhost-scsi controller and sends vhost_scsi_start_cb to the session as an event; returns the result of vhost_session_send_event(). NOTE(review): the argument 3 is presumably a timeout in seconds - confirm against vhost_session_send_event(). */ +static int +vhost_scsi_start(struct spdk_vhost_session *vsession) +{ + struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + struct spdk_vhost_scsi_dev *svdev; + + svdev = to_scsi_dev(vsession->vdev); + assert(svdev != NULL); + svsession->svdev = svdev; + + return vhost_session_send_event(vsession, vhost_scsi_start_cb, + 3, "start session"); +} + /* Stop poller body; arg is the vhost-scsi session. Returns without doing anything while the session still has outstanding tasks or while the global vhost lock cannot be taken with spdk_vhost_trylock(). Afterwards it signals every queue, detaches every target from the session (starting the global removal pass for targets that were in REMOVING state), frees the task pools, unregisters itself and calls vhost_session_stop_done(). Always returns SPDK_POLLER_BUSY. */ +static int +destroy_session_poller_cb(void *arg) +{ + struct spdk_vhost_scsi_session *svsession = arg; + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_scsi_dev_session_state *state; + uint32_t i; + + if (vsession->task_cnt > 0) { + return SPDK_POLLER_BUSY; + } + + if (spdk_vhost_trylock() != 0) { + return SPDK_POLLER_BUSY; + } + + for (i = 0; i < vsession->max_queues; i++) { + vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + enum spdk_scsi_dev_vhost_status prev_status; + + state = &svsession->scsi_dev_state[i]; + /* clear the REMOVED status so that we won't send hotremove events anymore */ + prev_status = state->status; + state->status = 
VHOST_SCSI_DEV_EMPTY; + if (state->dev == NULL) { + continue; + } + + spdk_scsi_dev_free_io_channels(state->dev); + + state->dev = NULL; + + if (prev_status == VHOST_SCSI_DEV_REMOVING) { + /* try to detach it globally */ + vhost_dev_foreach_session(vsession->vdev, + vhost_scsi_session_process_removed, + vhost_scsi_dev_process_removed_cpl_cb, + (void *)(uintptr_t)i); + } + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); + + free_task_pool(svsession); + + spdk_poller_unregister(&svsession->stop_poller); + vhost_session_stop_done(vsession, 0); + + spdk_vhost_unlock(); + return SPDK_POLLER_BUSY; +} + /* Session stop handler, sent as an event by vhost_scsi_stop(). Unregisters the request and management pollers and registers destroy_session_poller_cb as the stop poller, which finishes the shutdown. Always returns 0. NOTE(review): the period argument 1000 is presumably in microseconds - confirm against SPDK_POLLER_REGISTER. */ +static int +vhost_scsi_stop_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + + /* Stop receiving new I/O requests */ + spdk_poller_unregister(&svsession->requestq_poller); + + /* Stop receiving controlq requests, also stop processing the + * asynchronous hotremove events. All the remaining events + * will be finalized by the stop_poller below. + */ + spdk_poller_unregister(&svsession->mgmt_poller); + + /* Wait for all pending I/Os to complete, then process all the + * remaining hotremove events one last time. 
+ */ + svsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb, + svsession, 1000); + + return 0; +} + /* Sends vhost_scsi_stop_cb to the session as an event and returns the result of vhost_session_send_event(). NOTE(review): the argument 3 is presumably a timeout in seconds - confirm against vhost_session_send_event(). */ +static int +vhost_scsi_stop(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_scsi_stop_cb, + 3, "stop session"); +} + /* Writes a JSON array named "scsi" with one object per target returned by spdk_vhost_scsi_dev_get_tgt(): slot number, device id, target name, and a "luns" array holding the id and bdev name of every LUN of that target. */ +static void +vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_scsi_dev *sdev; + struct spdk_scsi_lun *lun; + uint32_t dev_idx; + uint32_t lun_idx; + + assert(vdev != NULL); + spdk_json_write_named_array_begin(w, "scsi"); + for (dev_idx = 0; dev_idx < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; dev_idx++) { + sdev = spdk_vhost_scsi_dev_get_tgt(vdev, dev_idx); + if (!sdev) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_uint32(w, "scsi_dev_num", dev_idx); + + spdk_json_write_named_uint32(w, "id", spdk_scsi_dev_get_id(sdev)); + + spdk_json_write_named_string(w, "target_name", spdk_scsi_dev_get_name(sdev)); + + spdk_json_write_named_array_begin(w, "luns"); + + for (lun_idx = 0; lun_idx < SPDK_SCSI_DEV_MAX_LUN; lun_idx++) { + lun = spdk_scsi_dev_get_lun(sdev, lun_idx); + if (!lun) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "id", spdk_scsi_lun_get_id(lun)); + + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + /* Writes the method calls that describe this controller: one "vhost_create_scsi_controller" object with the controller name and cpumask, followed by one "vhost_scsi_controller_add_target" object per target returned by spdk_vhost_scsi_dev_get_tgt(). */ +static void +vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_scsi_dev *scsi_dev; + struct spdk_scsi_lun *lun; + uint32_t i; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_create_scsi_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_string(w, "cpumask", + 
spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + /* One add-target call per target slot that spdk_vhost_scsi_dev_get_tgt() reports; only LUN 0 of each target is written. */ + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + scsi_dev = spdk_vhost_scsi_dev_get_tgt(vdev, i); + if (scsi_dev == NULL) { + continue; + } + + lun = spdk_scsi_dev_get_lun(scsi_dev, 0); + assert(lun != NULL); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_scsi_controller_add_target"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_uint32(w, "scsi_target_num", i); + + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + /* Registers the log flags "vhost_scsi", "vhost_scsi_queue" and "vhost_scsi_data" under the identifiers passed to the logging macros in this file. */ +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi", SPDK_LOG_VHOST_SCSI) +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_queue", SPDK_LOG_VHOST_SCSI_QUEUE) +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_data", SPDK_LOG_VHOST_SCSI_DATA) |