summaryrefslogtreecommitdiffstats
path: root/src/spdk/lib/vhost
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/lib/vhost
parentInitial commit. (diff)
downloadceph-upstream.tar.xz
ceph-upstream.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/vhost')
-rw-r--r--src/spdk/lib/vhost/Makefile54
-rw-r--r--src/spdk/lib/vhost/rte_vhost_compat.c402
-rw-r--r--src/spdk/lib/vhost/spdk_vhost.map27
-rw-r--r--src/spdk/lib/vhost/vhost.c1634
-rw-r--r--src/spdk/lib/vhost/vhost_blk.c1354
-rw-r--r--src/spdk/lib/vhost/vhost_internal.h496
-rw-r--r--src/spdk/lib/vhost/vhost_nvme.c1500
-rw-r--r--src/spdk/lib/vhost/vhost_rpc.c652
-rw-r--r--src/spdk/lib/vhost/vhost_scsi.c1536
9 files changed, 7655 insertions, 0 deletions
diff --git a/src/spdk/lib/vhost/Makefile b/src/spdk/lib/vhost/Makefile
new file mode 100644
index 000000000..1fe9b6e40
--- /dev/null
+++ b/src/spdk/lib/vhost/Makefile
@@ -0,0 +1,54 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 4
+SO_MINOR := 0
+
+CFLAGS += -I.
+CFLAGS += $(ENV_CFLAGS)
+
+# Sources that are always part of the vhost library.
+C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c rte_vhost_compat.c
+
+# vhost_nvme.c is built only when CONFIG_VHOST_INTERNAL_LIB is 'y'; in that
+# case ../rte_vhost is also placed at the front of the include search path.
+ifeq ($(CONFIG_VHOST_INTERNAL_LIB),y)
+C_SRCS += vhost_nvme.c
+CFLAGS := -I../rte_vhost $(CFLAGS)
+endif
+
+LIBNAME = vhost
+
+# Symbol map listing the exported spdk_vhost_* functions.
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_vhost.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/vhost/rte_vhost_compat.c b/src/spdk/lib/vhost/rte_vhost_compat.c
new file mode 100644
index 000000000..53f31bfd7
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost_compat.c
@@ -0,0 +1,402 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * Set of workarounds for rte_vhost to make it work with device types
+ * other than vhost-net.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/memory.h"
+#include "spdk/barrier.h"
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+
+#include "spdk_internal/vhost_user.h"
+
+/*
+ * Compute the address range used to (un)register one guest memory region.
+ *
+ * The range is [FLOOR_2MB(mmap_addr), CEIL_2MB(mmap_addr + mmap_size)). When the
+ * computed start equals *previous_start, the start is moved forward by VALUE_2MB.
+ * *previous_start is updated to the start that was finally chosen.
+ */
+static inline void
+vhost_session_mem_region_calc(uint64_t *previous_start, uint64_t *start, uint64_t *end,
+			      uint64_t *len, struct rte_vhost_mem_region *region)
+{
+	uint64_t aligned_start = FLOOR_2MB(region->mmap_addr);
+	uint64_t aligned_end = CEIL_2MB(region->mmap_addr + region->mmap_size);
+
+	if (aligned_start == *previous_start) {
+		aligned_start += (size_t) VALUE_2MB;
+	}
+
+	*previous_start = aligned_start;
+	*start = aligned_start;
+	*end = aligned_end;
+	*len = aligned_end - aligned_start;
+}
+
+/*
+ * Register every region of the guest memory table with spdk_mem_register().
+ * A region that fails to register is reported with a warning and skipped.
+ */
+void
+vhost_session_mem_register(struct rte_vhost_memory *mem)
+{
+	uint64_t prev_start = UINT64_MAX;
+	uint64_t start, end, len;
+	uint32_t region_idx;
+
+	for (region_idx = 0; region_idx < mem->nregions; region_idx++) {
+		vhost_session_mem_region_calc(&prev_start, &start, &end, &len,
+					      &mem->regions[region_idx]);
+		SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n",
+			     start, len);
+
+		if (spdk_mem_register((void *)start, len) == 0) {
+			continue;
+		}
+
+		SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n",
+			     region_idx);
+	}
+}
+
+/*
+ * Undo vhost_session_mem_register(): unregister every region of the table
+ * whose start address currently translates with spdk_vtophys().
+ */
+void
+vhost_session_mem_unregister(struct rte_vhost_memory *mem)
+{
+	uint64_t prev_start = UINT64_MAX;
+	uint64_t start, end, len;
+	uint32_t region_idx;
+
+	for (region_idx = 0; region_idx < mem->nregions; region_idx++) {
+		vhost_session_mem_region_calc(&prev_start, &start, &end, &len,
+					      &mem->regions[region_idx]);
+
+		/* A region without a translation has not been registered. */
+		if (spdk_vtophys((void *) start, NULL) != SPDK_VTOPHYS_ERROR) {
+			if (spdk_mem_unregister((void *)start, len) != 0) {
+				assert(false);
+			}
+		}
+	}
+}
+
+/*
+ * rte_vhost new_connection callback: look up the interface name for `vid`
+ * and hand both to vhost_new_connection_cb(). Returns -1 when the name
+ * cannot be obtained, otherwise the result of vhost_new_connection_cb().
+ */
+static int
+new_connection(int vid)
+{
+	char name[PATH_MAX];
+	int rc;
+
+	rc = rte_vhost_get_ifname(vid, name, PATH_MAX);
+	if (rc >= 0) {
+		return vhost_new_connection_cb(vid, name);
+	}
+
+	SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid);
+	return -1;
+}
+
+/* .new_device callback of g_spdk_vhost_ops: forwards to vhost_start_device_cb(). */
+static int
+start_device(int vid)
+{
+ return vhost_start_device_cb(vid);
+}
+
+/* .destroy_device callback of g_spdk_vhost_ops: forwards to vhost_stop_device_cb(). */
+static void
+stop_device(int vid)
+{
+ vhost_stop_device_cb(vid);
+}
+
+/* .destroy_connection callback of g_spdk_vhost_ops: forwards to vhost_destroy_connection_cb(). */
+static void
+destroy_connection(int vid)
+{
+ vhost_destroy_connection_cb(vid);
+}
+
+/*
+ * Callback table passed to rte_vhost_driver_callback_register() in
+ * vhost_register_unix_socket(). Note the naming: rte_vhost's new_device /
+ * destroy_device map to SPDK's start_device / stop_device. The config and
+ * vhost-nvme entries exist only when SPDK_CONFIG_VHOST_INTERNAL_LIB is defined.
+ */
+static const struct vhost_device_ops g_spdk_vhost_ops = {
+ .new_device = start_device,
+ .destroy_device = stop_device,
+ .new_connection = new_connection,
+ .destroy_connection = destroy_connection,
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ .get_config = vhost_get_config_cb,
+ .set_config = vhost_set_config_cb,
+ .vhost_nvme_admin_passthrough = vhost_nvme_admin_passthrough,
+ .vhost_nvme_set_cq_call = vhost_nvme_set_cq_call,
+ .vhost_nvme_get_cap = vhost_nvme_get_cap,
+ .vhost_nvme_set_bar_mr = vhost_nvme_set_bar_mr,
+#endif
+};
+
+#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
+
+/*
+ * Pre-processing hook for vhost-user messages on connection `vid`; installed
+ * as .pre_msg_handle of g_spdk_extern_vhost_ops.
+ *
+ * Depending on the request it may stop the session (through
+ * g_spdk_vhost_ops.destroy_device) and mark it for restart, or service the
+ * GET_CONFIG / SET_CONFIG requests through the device backend.
+ *
+ * Returns RTE_VHOST_MSG_RESULT_ERR when no session matches `vid`,
+ * RTE_VHOST_MSG_RESULT_REPLY for GET_CONFIG, RTE_VHOST_MSG_RESULT_OK or
+ * RTE_VHOST_MSG_RESULT_ERR for SET_CONFIG depending on the backend result,
+ * and RTE_VHOST_MSG_RESULT_NOT_HANDLED for every other request.
+ */
+static enum rte_vhost_msg_result
+extern_vhost_pre_msg_handler(int vid, void *_msg)
+{
+ struct vhost_user_msg *msg = _msg;
+ struct spdk_vhost_session *vsession;
+
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid);
+ assert(false);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ switch (msg->request) {
+ case VHOST_USER_GET_VRING_BASE:
+ if (vsession->forced_polling && vsession->started) {
+ /* Our queue is stopped for whatever reason, but we may still
+ * need to poll it after it's initialized again.
+ */
+ g_spdk_vhost_ops.destroy_device(vid);
+ }
+ break;
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_ADDR:
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_KICK:
+ if (vsession->forced_polling && vsession->started) {
+ /* Additional queues are being initialized, so we either processed
+ * enough I/Os and are switching from SeaBIOS to the OS now, or
+ * we were never in SeaBIOS in the first place. Either way, we
+ * don't need our workaround anymore.
+ */
+ g_spdk_vhost_ops.destroy_device(vid);
+ vsession->forced_polling = false;
+ }
+ break;
+ case VHOST_USER_SET_VRING_CALL:
+ /* rte_vhost will close the previous callfd and won't notify
+ * us about any change. This will effectively make SPDK fail
+ * to deliver any subsequent interrupts until a session is
+ * restarted. We stop the session here before closing the previous
+ * fd (so that all interrupts must have been delivered by the
+ * time the descriptor is closed) and start right after (which
+ * will make SPDK retrieve the latest, up-to-date callfd from
+ * rte_vhost.
+ */
+ /* fallthrough - shares the stop-and-restart handling below */
+ case VHOST_USER_SET_MEM_TABLE:
+ /* rte_vhost will unmap previous memory that SPDK may still
+ * have pending DMA operations on. We can't let that happen,
+ * so stop the device before letting rte_vhost unmap anything.
+ * This will block until all pending I/Os are finished.
+ * We will start the device again from the post-processing
+ * message handler.
+ */
+ if (vsession->started) {
+ g_spdk_vhost_ops.destroy_device(vid);
+ vsession->needs_restart = true;
+ }
+ break;
+ case VHOST_USER_GET_CONFIG: {
+ int rc = 0;
+
+ spdk_vhost_lock();
+ if (vsession->vdev->backend->vhost_get_config) {
+ rc = vsession->vdev->backend->vhost_get_config(vsession->vdev,
+ msg->payload.cfg.region, msg->payload.cfg.size);
+ if (rc != 0) {
+ /* Backend failure: the reply is sent with msg->size set to 0. */
+ msg->size = 0;
+ }
+ }
+ spdk_vhost_unlock();
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+ }
+ case VHOST_USER_SET_CONFIG: {
+ int rc = 0;
+
+ spdk_vhost_lock();
+ if (vsession->vdev->backend->vhost_set_config) {
+ rc = vsession->vdev->backend->vhost_set_config(vsession->vdev,
+ msg->payload.cfg.region, msg->payload.cfg.offset,
+ msg->payload.cfg.size, msg->payload.cfg.flags);
+ }
+ spdk_vhost_unlock();
+
+ /* A backend without a vhost_set_config callback leaves rc at 0 (OK). */
+ return rc == 0 ? RTE_VHOST_MSG_RESULT_OK : RTE_VHOST_MSG_RESULT_ERR;
+ }
+ default:
+ break;
+ }
+
+ return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+}
+
+/*
+ * Post-processing hook for vhost-user messages on connection `vid`; installed
+ * as .post_msg_handle of g_spdk_extern_vhost_ops.
+ *
+ * Restarts a session that the pre-processing hook flagged with needs_restart,
+ * and otherwise maintains the forced_polling workaround described below.
+ *
+ * Returns RTE_VHOST_MSG_RESULT_ERR when no session matches `vid` and
+ * RTE_VHOST_MSG_RESULT_NOT_HANDLED in every other case.
+ */
+static enum rte_vhost_msg_result
+extern_vhost_post_msg_handler(int vid, void *_msg)
+{
+ struct vhost_user_msg *msg = _msg;
+ struct spdk_vhost_session *vsession;
+
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid);
+ assert(false);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ /* Restart takes priority: the request-specific switch below is skipped
+ * for the message that triggered the restart.
+ */
+ if (vsession->needs_restart) {
+ g_spdk_vhost_ops.new_device(vid);
+ vsession->needs_restart = false;
+ return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+ }
+
+ switch (msg->request) {
+ case VHOST_USER_SET_FEATURES:
+ /* rte_vhost requires all queues to be fully initialized in order
+ * to start I/O processing. This behavior is not compliant with the
+ * vhost-user specification and doesn't work with QEMU 2.12+, which
+ * will only initialize 1 I/O queue for the SeaBIOS boot.
+ * Theoretically, we should start polling each virtqueue individually
+ * after receiving its SET_VRING_KICK message, but rte_vhost is not
+ * designed to poll individual queues. So here we use a workaround
+ * to detect when the vhost session could be potentially at that SeaBIOS
+ * stage and we mark it to start polling as soon as its first virtqueue
+ * gets initialized. This doesn't hurt any non-QEMU vhost slaves
+ * and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent
+ * at any time, but QEMU will send it at least once on SeaBIOS
+ * initialization - whenever powered-up or rebooted.
+ */
+ vsession->forced_polling = true;
+ break;
+ case VHOST_USER_SET_VRING_KICK:
+ /* vhost-user spec tells us to start polling a queue after receiving
+ * its SET_VRING_KICK message. Let's do it!
+ */
+ if (vsession->forced_polling && !vsession->started) {
+ g_spdk_vhost_ops.new_device(vid);
+ }
+ break;
+ default:
+ break;
+ }
+
+ return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+}
+
+/* Pre/post message hooks registered per session via rte_vhost_extern_callback_register(). */
+struct rte_vhost_user_extern_ops g_spdk_extern_vhost_ops = {
+ .pre_msg_handle = extern_vhost_pre_msg_handler,
+ .post_msg_handle = extern_vhost_post_msg_handler,
+};
+
+/*
+ * Register g_spdk_extern_vhost_ops for the session's vid. A registration
+ * failure is only logged; nothing is reported to the caller.
+ */
+void
+vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession)
+{
+	if (rte_vhost_extern_callback_register(vsession->vid, &g_spdk_extern_vhost_ops, NULL) != 0) {
+		SPDK_ERRLOG("rte_vhost_extern_callback_register() failed for vid = %d\n",
+			    vsession->vid);
+	}
+}
+
+#else /* SPDK_CONFIG_VHOST_INTERNAL_LIB */
+
+/* Variant built when SPDK_CONFIG_VHOST_INTERNAL_LIB is defined: installs no hooks. */
+void
+vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession)
+{
+ /* nothing to do. all the changes are already incorporated into rte_vhost */
+}
+
+#endif
+
+/*
+ * Register and start an rte_vhost driver on the unix socket at `path`.
+ *
+ * Steps, in order: remove a stale socket file at `path` (refusing if the
+ * existing file is not a socket), register the driver, apply
+ * `virtio_features` and `disabled_features`, register g_spdk_vhost_ops,
+ * OR `protocol_features` into the driver's protocol features (only when
+ * SPDK_CONFIG_VHOST_INTERNAL_LIB is not defined), and start the driver.
+ *
+ * `ctrl_name` is used for log messages only.
+ *
+ * Returns 0 on success and -EIO on any failure. After a successful
+ * rte_vhost_driver_register(), every failure path unregisters the driver.
+ */
+int
+vhost_register_unix_socket(const char *path, const char *ctrl_name,
+ uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features)
+{
+ struct stat file_stat;
+#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ uint64_t features = 0;
+#endif
+
+ /* Register vhost driver to handle vhost messages. */
+ if (stat(path, &file_stat) != -1) {
+ if (!S_ISSOCK(file_stat.st_mode)) {
+ SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
+ "The file already exists and is not a socket.\n",
+ path);
+ return -EIO;
+ } else if (unlink(path) != 0) {
+ SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
+ "The socket already exists and failed to unlink.\n",
+ path);
+ return -EIO;
+ }
+ }
+
+ if (rte_vhost_driver_register(path, 0) != 0) {
+ SPDK_ERRLOG("Could not register controller %s with vhost library\n", ctrl_name);
+ SPDK_ERRLOG("Check if domain socket %s already exists\n", path);
+ return -EIO;
+ }
+ if (rte_vhost_driver_set_features(path, virtio_features) ||
+ rte_vhost_driver_disable_features(path, disabled_features)) {
+ SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", ctrl_name);
+
+ rte_vhost_driver_unregister(path);
+ return -EIO;
+ }
+
+ if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) {
+ rte_vhost_driver_unregister(path);
+ SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", ctrl_name);
+ return -EIO;
+ }
+
+#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ /* NOTE(review): the return values of both protocol-feature calls are ignored. */
+ rte_vhost_driver_get_protocol_features(path, &features);
+ features |= protocol_features;
+ rte_vhost_driver_set_protocol_features(path, features);
+#endif
+
+ if (rte_vhost_driver_start(path) != 0) {
+ SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n",
+ ctrl_name, errno, spdk_strerror(errno));
+ rte_vhost_driver_unregister(path);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/* Thin wrapper: returns the result of rte_vhost_get_mem_table() unchanged. */
+int
+vhost_get_mem_table(int vid, struct rte_vhost_memory **mem)
+{
+ return rte_vhost_get_mem_table(vid, mem);
+}
+
+/* Thin wrapper: returns the result of rte_vhost_driver_unregister() unchanged. */
+int
+vhost_driver_unregister(const char *path)
+{
+ return rte_vhost_driver_unregister(path);
+}
+
+/* Thin wrapper: returns the result of rte_vhost_get_negotiated_features() unchanged. */
+int
+vhost_get_negotiated_features(int vid, uint64_t *negotiated_features)
+{
+ return rte_vhost_get_negotiated_features(vid, negotiated_features);
+}
diff --git a/src/spdk/lib/vhost/spdk_vhost.map b/src/spdk/lib/vhost/spdk_vhost.map
new file mode 100644
index 000000000..de38e5a5e
--- /dev/null
+++ b/src/spdk/lib/vhost/spdk_vhost.map
@@ -0,0 +1,27 @@
+{
+ global:
+
+ # public functions
+ spdk_vhost_set_socket_path;
+ spdk_vhost_init;
+ spdk_vhost_fini;
+ spdk_vhost_config_json;
+ spdk_vhost_shutdown_cb;
+ spdk_vhost_lock;
+ spdk_vhost_trylock;
+ spdk_vhost_unlock;
+ spdk_vhost_dev_find;
+ spdk_vhost_dev_next;
+ spdk_vhost_dev_get_name;
+ spdk_vhost_dev_get_cpumask;
+ spdk_vhost_set_coalescing;
+ spdk_vhost_get_coalescing;
+ spdk_vhost_scsi_dev_construct;
+ spdk_vhost_scsi_dev_add_tgt;
+ spdk_vhost_scsi_dev_get_tgt;
+ spdk_vhost_scsi_dev_remove_tgt;
+ spdk_vhost_blk_construct;
+ spdk_vhost_dev_remove;
+
+ # every symbol not listed under global: above is matched by this wildcard
+ local: *;
+};
diff --git a/src/spdk/lib/vhost/vhost.c b/src/spdk/lib/vhost/vhost.c
new file mode 100644
index 000000000..b904d8bf9
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost.c
@@ -0,0 +1,1634 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/memory.h"
+#include "spdk/barrier.h"
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+
+static struct spdk_cpuset g_vhost_core_mask;
+
+/* Path to folder where character device will be created. Can be set by user. */
+static char dev_dirname[PATH_MAX] = "";
+
+/* Thread performing all vhost management operations */
+static struct spdk_thread *g_vhost_init_thread;
+
+static spdk_vhost_fini_cb g_fini_cpl_cb;
+
+/**
+ * DPDK calls our callbacks synchronously but the work those callbacks
+ * perform needs to be async. Luckily, all DPDK callbacks are called on
+ * a DPDK-internal pthread, so we'll just wait on a semaphore in there.
+ */
+static sem_t g_dpdk_sem;
+
+/** Return code for the current DPDK callback */
+static int g_dpdk_response;
+
+/** Context carried along with an event that is sent to a vhost session. */
+struct vhost_session_fn_ctx {
+ /** Device pointer obtained before enqueuing the event */
+ struct spdk_vhost_dev *vdev;
+
+ /** ID of the session to send event to. */
+ uint32_t vsession_id;
+
+ /** User provided function to be executed on session's thread. */
+ spdk_vhost_session_fn cb_fn;
+
+ /**
+ * User provided function to be called on the init thread
+ * after iterating through all sessions.
+ */
+ spdk_vhost_dev_fn cpl_fn;
+
+ /** Custom user context */
+ void *user_ctx;
+};
+
+static TAILQ_HEAD(, spdk_vhost_dev) g_vhost_devices = TAILQ_HEAD_INITIALIZER(
+ g_vhost_devices);
+static pthread_mutex_t g_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Translate guest physical address `addr` into a host virtual address using
+ * the session's memory table. Returns NULL unless the whole `len` bytes
+ * could be translated as one range.
+ */
+void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len)
+{
+	uint64_t mapped_len = len;
+	void *vva = (void *)rte_vhost_va_from_guest_pa(vsession->mem, addr, &mapped_len);
+
+	return (mapped_len == len) ? vva : NULL;
+}
+
+/*
+ * When VHOST_F_LOG_ALL is negotiated, report every device-writable descriptor
+ * of request `req_id` to rte_vhost_log_write(). Does nothing otherwise.
+ */
+static void
+vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t req_id)
+{
+ struct vring_desc *desc, *desc_table;
+ uint32_t desc_table_size;
+ int rc;
+
+ if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
+ return;
+ }
+
+ rc = vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Can't log used ring descriptors!\n");
+ return;
+ }
+
+ do {
+ if (vhost_vring_desc_is_wr(desc)) {
+ /* Strictly speaking, only pages really touched should be logged, but
+ * doing so would require tracking those changes in each backend.
+ * The backend will most likely touch all/most of those pages anyway, so
+ * for now let's assume we touched every page passed to us as a writable buffer. */
+ rte_vhost_log_write(vsession->vid, desc->addr, desc->len);
+ }
+ /* The return value is not checked; the walk ends once desc becomes NULL. */
+ vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+ } while (desc);
+}
+
+/*
+ * When VHOST_F_LOG_ALL is negotiated, log the used-ring element at `idx`
+ * (or the packed descriptor at `idx` for a packed ring) as modified.
+ */
+static void
+vhost_log_used_vring_elem(struct spdk_vhost_session *vsession,
+			  struct spdk_vhost_virtqueue *virtqueue,
+			  uint16_t idx)
+{
+	uint64_t entry_off, entry_len;
+
+	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
+		return;
+	}
+
+	/* Split-ring layout by default, overridden below for packed rings. */
+	entry_off = offsetof(struct vring_used, ring[idx]);
+	entry_len = sizeof(virtqueue->vring.used->ring[idx]);
+
+	if (spdk_unlikely(virtqueue->packed.packed_ring)) {
+		entry_off = idx * sizeof(struct vring_packed_desc);
+		entry_len = sizeof(struct vring_packed_desc);
+	}
+
+	rte_vhost_log_used_vring(vsession->vid, virtqueue->vring_idx, entry_off, entry_len);
+}
+
+/*
+ * When VHOST_F_LOG_ALL is negotiated, log the `idx` field of the used ring
+ * as modified. The queue number is derived from the virtqueue's position in
+ * the session's virtqueue array.
+ */
+static void
+vhost_log_used_vring_idx(struct spdk_vhost_session *vsession,
+			 struct spdk_vhost_virtqueue *virtqueue)
+{
+	uint64_t field_off = offsetof(struct vring_used, idx);
+	uint64_t field_len = sizeof(virtqueue->vring.used->idx);
+	uint16_t queue_no;
+
+	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
+		return;
+	}
+
+	queue_no = virtqueue - vsession->virtqueue;
+	rte_vhost_log_used_vring(vsession->vid, queue_no, field_off, field_len);
+}
+
+/*
+ * Get available requests from avail ring.
+ *
+ * Copies up to `reqs_len` new head descriptor indices from the split avail
+ * ring into `reqs` and advances virtqueue->last_avail_idx by the number
+ * copied.
+ *
+ * Returns the number of entries written to `reqs`. Returns 0 when there is
+ * nothing new, or when the driver-provided index is more than one ring size
+ * ahead of last_avail_idx (the queue is then considered broken).
+ */
+uint16_t
+vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
+			uint16_t reqs_len)
+{
+	struct rte_vhost_vring *vring = &virtqueue->vring;
+	struct vring_avail *avail = vring->avail;
+	uint16_t size_mask = vring->size - 1;
+	uint16_t last_idx = virtqueue->last_avail_idx, avail_idx = avail->idx;
+	uint16_t count, i;
+
+	/* uint16_t arithmetic keeps the difference correct across index wrap-around. */
+	count = avail_idx - last_idx;
+	if (spdk_likely(count == 0)) {
+		return 0;
+	}
+
+	if (spdk_unlikely(count > vring->size)) {
+		/* TODO: the queue is unrecoverably broken and should be marked so.
+		 * For now we will fail silently and report there are no new avail entries.
+		 */
+		return 0;
+	}
+
+	/* The ring entries must not be loaded before avail->idx was loaded,
+	 * otherwise a slot the driver has not finished publishing could be read.
+	 */
+	spdk_smp_rmb();
+
+	count = spdk_min(count, reqs_len);
+	virtqueue->last_avail_idx += count;
+	for (i = 0; i < count; i++) {
+		reqs[i] = avail->ring[(last_idx + i) & size_mask];
+	}
+
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+		      "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n",
+		      last_idx, avail_idx, count);
+
+	return count;
+}
+
+/* True when the split-ring descriptor has VRING_DESC_F_INDIRECT set. */
+static bool
+vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
+{
+	return (cur_desc->flags & VRING_DESC_F_INDIRECT) != 0;
+}
+
+/* True when the packed-ring descriptor has VRING_DESC_F_INDIRECT set. */
+static bool
+vhost_vring_packed_desc_is_indirect(struct vring_packed_desc *cur_desc)
+{
+	return !!(cur_desc->flags & VRING_DESC_F_INDIRECT);
+}
+
+/*
+ * Resolve split-ring request `req_idx` to its first descriptor and the table
+ * that descriptor chain lives in.
+ *
+ * For a direct descriptor the table is the virtqueue's own descriptor ring.
+ * For an indirect descriptor the table is the guest buffer it points to,
+ * translated with vhost_gpa_to_vva(), and *desc is the first entry of it.
+ *
+ * Returns 0 on success, -1 when `req_idx` is out of range or the indirect
+ * table cannot be translated.
+ */
+int
+vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
+		  uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
+		  uint32_t *desc_table_size)
+{
+	struct vring_desc *head;
+
+	if (spdk_unlikely(req_idx >= virtqueue->vring.size)) {
+		return -1;
+	}
+
+	head = &virtqueue->vring.desc[req_idx];
+	*desc = head;
+
+	if (!vhost_vring_desc_is_indirect(head)) {
+		*desc_table = virtqueue->vring.desc;
+		*desc_table_size = virtqueue->vring.size;
+		return 0;
+	}
+
+	*desc_table_size = head->len / sizeof(*head);
+	*desc_table = vhost_gpa_to_vva(vsession, head->addr,
+				       sizeof(*head) * *desc_table_size);
+	*desc = *desc_table;
+
+	return (*desc == NULL) ? -1 : 0;
+}
+
+/*
+ * Resolve packed-ring request `req_idx` to its first descriptor.
+ *
+ * In packed ring when the desc is non-indirect we get next desc
+ * by judging (desc->flag & VRING_DESC_F_NEXT) != 0, so no table is
+ * reported (*desc_table = NULL, *desc_table_size = 0). When the desc
+ * is indirect we get next desc by idx and desc_table_size. It's
+ * different from split ring.
+ *
+ * Returns 0 on success, -1 when the indirect table cannot be translated.
+ */
+int
+vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession,
+			 struct spdk_vhost_virtqueue *virtqueue,
+			 uint16_t req_idx, struct vring_packed_desc **desc,
+			 struct vring_packed_desc **desc_table, uint32_t *desc_table_size)
+{
+	struct vring_packed_desc *head = &virtqueue->vring.desc_packed[req_idx];
+
+	*desc = head;
+
+	if (!vhost_vring_packed_desc_is_indirect(head)) {
+		*desc_table = NULL;
+		*desc_table_size = 0;
+		return 0;
+	}
+
+	*desc_table_size = head->len / sizeof(struct vring_packed_desc);
+	*desc_table = vhost_gpa_to_vva(vsession, head->addr, head->len);
+	*desc = *desc_table;
+	if (spdk_unlikely(*desc == NULL)) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Notify the guest about completed requests on `virtqueue`, if there are any.
+ *
+ * Moves used_req_cnt into req_cnt and then calls rte_vhost_vring_call().
+ * Returns 1 when the interrupt was signalled, 0 when there was nothing to
+ * signal or the call failed.
+ */
+int
+vhost_vq_used_signal(struct spdk_vhost_session *vsession,
+		     struct spdk_vhost_virtqueue *virtqueue)
+{
+	int call_rc;
+
+	if (virtqueue->used_req_cnt == 0) {
+		return 0;
+	}
+
+	virtqueue->req_cnt += virtqueue->used_req_cnt;
+	virtqueue->used_req_cnt = 0;
+
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+		      "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n",
+		      virtqueue - vsession->virtqueue, virtqueue->last_used_idx);
+
+	call_rc = rte_vhost_vring_call(vsession->vid, virtqueue->vring_idx);
+
+	/* 1: interrupt signalled, 0: interrupt not signalled */
+	return (call_rc == 0) ? 1 : 0;
+}
+
+
+/*
+ * Once per stats interval, recompute the interrupt delay of every virtqueue
+ * whose request count since the last check exceeds the session's I/O rate
+ * threshold. Queues at or below the threshold are left untouched.
+ */
+static void
+check_session_io_stats(struct spdk_vhost_session *vsession, uint64_t now)
+{
+	const uint32_t delay_base = vsession->coalescing_delay_time_base;
+	const uint32_t threshold = vsession->coalescing_io_rate_threshold;
+	struct spdk_vhost_virtqueue *vq;
+	uint32_t total_reqs;
+	int32_t delay;
+	uint16_t i;
+
+	if (now < vsession->next_stats_check_time) {
+		return;
+	}
+
+	vsession->next_stats_check_time = now + vsession->stats_check_interval;
+	for (i = 0; i < vsession->max_queues; i++) {
+		vq = &vsession->virtqueue[i];
+
+		total_reqs = vq->req_cnt + vq->used_req_cnt;
+		if (total_reqs > threshold) {
+			/* Delay grows linearly with the excess over the threshold. */
+			delay = (delay_base * (total_reqs - threshold)) / threshold;
+			vq->irq_delay_time = (uint32_t) spdk_max(0, delay);
+
+			vq->req_cnt = 0;
+			vq->next_event_time = now;
+		}
+	}
+}
+
+/*
+ * True when the driver asked not to be notified: VRING_PACKED_EVENT_FLAG_DISABLE
+ * in driver_event->flags for a packed ring, VRING_AVAIL_F_NO_INTERRUPT in
+ * avail->flags for a split ring.
+ */
+static inline bool
+vhost_vq_event_is_suppressed(struct spdk_vhost_virtqueue *vq)
+{
+	if (spdk_unlikely(vq->packed.packed_ring)) {
+		return (vq->vring.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE) != 0;
+	}
+
+	return (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) != 0;
+}
+
+/*
+ * Send used-ring notifications for all virtqueues of a session.
+ *
+ * With coalescing disabled (coalescing_delay_time_base == 0) every
+ * initialized, non-suppressed queue is signalled right away. With coalescing
+ * enabled, the per-queue delay is refreshed through check_session_io_stats()
+ * and a queue is signalled only once its next_event_time has been reached.
+ */
+void
+vhost_session_used_signal(struct spdk_vhost_session *vsession)
+{
+ struct spdk_vhost_virtqueue *virtqueue;
+ uint64_t now;
+ uint16_t q_idx;
+
+ if (vsession->coalescing_delay_time_base == 0) {
+ for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+ virtqueue = &vsession->virtqueue[q_idx];
+
+ /* Skip queues without a descriptor ring. */
+ if (virtqueue->vring.desc == NULL) {
+ continue;
+ }
+
+ if (vhost_vq_event_is_suppressed(virtqueue)) {
+ continue;
+ }
+
+ vhost_vq_used_signal(vsession, virtqueue);
+ }
+ } else {
+ now = spdk_get_ticks();
+ check_session_io_stats(vsession, now);
+
+ /* NOTE(review): unlike the branch above, this loop has no
+ * vring.desc == NULL check before touching the queue - confirm
+ * that is intended.
+ */
+ for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+ virtqueue = &vsession->virtqueue[q_idx];
+
+ /* No need for event right now */
+ if (now < virtqueue->next_event_time) {
+ continue;
+ }
+
+ if (vhost_vq_event_is_suppressed(virtqueue)) {
+ continue;
+ }
+
+ if (!vhost_vq_used_signal(vsession, virtqueue)) {
+ continue;
+ }
+
+ /* Syscall is quite long so update time */
+ now = spdk_get_ticks();
+ virtqueue->next_event_time = now + virtqueue->irq_delay_time;
+ }
+ }
+}
+
+/*
+ * Per-session callback handed to vhost_dev_foreach_session() by
+ * spdk_vhost_set_coalescing(): derive the session's coalescing parameters
+ * from the device-level settings. `ctx` is unused. Always returns 0.
+ */
+static int
+vhost_session_set_coalescing(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *ctx)
+{
+ /* Microseconds -> ticks. */
+ vsession->coalescing_delay_time_base =
+ vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL;
+ /* I/Os per second -> I/Os per stats-check interval. */
+ vsession->coalescing_io_rate_threshold =
+ vdev->coalescing_iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
+ return 0;
+}
+
+/*
+ * Validate and store the device-level interrupt coalescing settings.
+ *
+ * delay_base_us:  base delay in microseconds; rejected when its value in
+ *                 ticks does not fit below UINT32_MAX.
+ * iops_threshold: I/Os per second; rejected when it scales to zero I/Os per
+ *                 stats-check interval.
+ *
+ * Returns 0 on success, -EINVAL when either value is rejected (in which case
+ * the device settings are left unchanged).
+ */
+static int
+vhost_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
+			 uint32_t iops_threshold)
+{
+	uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL;
+	uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
+
+	if (delay_time_base >= UINT32_MAX) {
+		SPDK_ERRLOG("Delay time of %"PRIu32" is to big\n", delay_base_us);
+		return -EINVAL;
+	} else if (io_rate == 0) {
+		/* Report the value the caller passed in; io_rate is always 0 here. */
+		SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", iops_threshold,
+			    1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS);
+		return -EINVAL;
+	}
+
+	vdev->coalescing_delay_us = delay_base_us;
+	vdev->coalescing_iops_threshold = iops_threshold;
+	return 0;
+}
+
+/*
+ * Store new coalescing settings on the device and, when they are accepted,
+ * propagate them to every session of the device.
+ * Returns 0 on success or the error from vhost_dev_set_coalescing().
+ */
+int
+spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
+			  uint32_t iops_threshold)
+{
+	int rc = vhost_dev_set_coalescing(vdev, delay_base_us, iops_threshold);
+
+	if (rc == 0) {
+		vhost_dev_foreach_session(vdev, vhost_session_set_coalescing, NULL, NULL);
+	}
+
+	return rc;
+}
+
+/*
+ * Read back the device-level coalescing settings. Either output pointer may
+ * be NULL, in which case that value is not reported.
+ */
+void
+spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
+			  uint32_t *iops_threshold)
+{
+	if (delay_base_us != NULL) {
+		*delay_base_us = vdev->coalescing_delay_us;
+	}
+
+	if (iops_threshold != NULL) {
+		*iops_threshold = vdev->coalescing_iops_threshold;
+	}
+}
+
+/*
+ * Enqueue id and len to used ring.
+ */
+void
+vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t id, uint32_t len)
+{
+ struct rte_vhost_vring *vring = &virtqueue->vring;
+ struct vring_used *used = vring->used;
+ /* Slot in the used ring: last_used_idx masked with (size - 1). */
+ uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1);
+ uint16_t vq_idx = virtqueue->vring_idx;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+ "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
+ virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len);
+
+ vhost_log_req_desc(vsession, virtqueue, id);
+
+ /* Fill the slot first; used->idx is only published further below. */
+ virtqueue->last_used_idx++;
+ used->ring[last_idx].id = id;
+ used->ring[last_idx].len = len;
+
+ /* Ensure the used ring is updated before we log it or increment used->idx. */
+ spdk_smp_wmb();
+
+ rte_vhost_set_last_inflight_io_split(vsession->vid, vq_idx, id);
+
+ vhost_log_used_vring_elem(vsession, virtqueue, last_idx);
+ /* Publish the new used index with a volatile store. */
+ * (volatile uint16_t *) &used->idx = virtqueue->last_used_idx;
+ vhost_log_used_vring_idx(vsession, virtqueue);
+
+ rte_vhost_clr_inflight_desc_split(vsession->vid, vq_idx, virtqueue->last_used_idx, id);
+
+ /* Counted here, consumed by vhost_vq_used_signal(). */
+ virtqueue->used_req_cnt++;
+}
+
+/*
+ * Mark a request as used in a packed ring.
+ *
+ * Writes `buffer_id` and `length` into the descriptor at last_used_idx, flips
+ * its AVAIL/USED flags to match the used wrap counter, then advances
+ * last_used_idx by `num_descs` (toggling the wrap counter when the index
+ * passes the end of the ring).
+ *
+ * If the descriptor already looks used, an error is logged and nothing is
+ * modified.
+ */
+void
+vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t num_descs, uint16_t buffer_id,
+ uint32_t length)
+{
+ struct vring_packed_desc *desc = &virtqueue->vring.desc_packed[virtqueue->last_used_idx];
+ bool used, avail;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+ "Queue %td - RING: buffer_id=%"PRIu16"\n",
+ virtqueue - vsession->virtqueue, buffer_id);
+
+ /* When the descriptor is used, two flags in descriptor
+ * avail flag and used flag are set to equal
+ * and used flag value == used_wrap_counter.
+ */
+ used = !!(desc->flags & VRING_DESC_F_USED);
+ avail = !!(desc->flags & VRING_DESC_F_AVAIL);
+ if (spdk_unlikely(used == virtqueue->packed.used_phase && used == avail)) {
+ SPDK_ERRLOG("descriptor has been used before\n");
+ return;
+ }
+
+ /* In used desc addr is unused and len specifies the buffer length
+ * that has been written to by the device.
+ */
+ desc->addr = 0;
+ desc->len = length;
+
+ /* This bit specifies whether any data has been written by the device */
+ if (length != 0) {
+ desc->flags |= VRING_DESC_F_WRITE;
+ }
+
+ /* Buffer ID is included in the last descriptor in the list.
+ * The driver needs to keep track of the size of the list corresponding
+ * to each buffer ID.
+ */
+ desc->id = buffer_id;
+
+ /* A device MUST NOT make the descriptor used before buffer_id is
+ * written to the descriptor.
+ */
+ spdk_smp_wmb();
+ /* To mark a desc as used, the device sets the F_USED bit in flags to match
+ * the internal Device ring wrap counter. It also sets the F_AVAIL bit to
+ * match the same value.
+ */
+ if (virtqueue->packed.used_phase) {
+ desc->flags |= VRING_DESC_F_AVAIL_USED;
+ } else {
+ desc->flags &= ~VRING_DESC_F_AVAIL_USED;
+ }
+
+ vhost_log_used_vring_elem(vsession, virtqueue, virtqueue->last_used_idx);
+ /* Advance past all descriptors of this request; wrap and flip the phase at the ring end. */
+ virtqueue->last_used_idx += num_descs;
+ if (virtqueue->last_used_idx >= virtqueue->vring.size) {
+ virtqueue->last_used_idx -= virtqueue->vring.size;
+ virtqueue->packed.used_phase = !virtqueue->packed.used_phase;
+ }
+
+ virtqueue->used_req_cnt++;
+}
+
+/* Check whether the descriptor at last_avail_idx has been published by the
+ * driver, i.e. its F_AVAIL bit equals the current avail phase.
+ */
+bool
+vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue)
+{
+	uint16_t desc_flags;
+	bool avail_bit;
+
+	desc_flags = virtqueue->vring.desc_packed[virtqueue->last_avail_idx].flags;
+
+	/* The driver makes a descriptor available by setting F_AVAIL to its own
+	 * wrap counter value. F_USED is normally set to the inverse, but that is
+	 * not mandatory, so only F_AVAIL is examined.
+	 */
+	avail_bit = !!(desc_flags & VRING_DESC_F_AVAIL);
+
+	return avail_bit == virtqueue->packed.avail_phase;
+}
+
+/* Return true when the packed descriptor has the F_WRITE flag set. */
+bool
+vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc)
+{
+	uint16_t write_bit = cur_desc->flags & VRING_DESC_F_WRITE;
+
+	return write_bit ? true : false;
+}
+
+/* Step to the next descriptor of a packed-ring request.
+ *
+ * desc            - in: current descriptor; out: next descriptor, or NULL
+ *                   when the current one was the last of the request.
+ * req_idx         - in/out index matching *desc (ring index for a direct
+ *                   chain, table index for an indirect one).
+ * vq              - virtqueue, consulted only for direct chains.
+ * desc_table      - indirect descriptor table, or NULL for a direct chain.
+ * desc_table_size - number of entries in desc_table.
+ *
+ * Always returns 0.
+ */
+int
+vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx,
+				 struct spdk_vhost_virtqueue *vq,
+				 struct vring_packed_desc *desc_table,
+				 uint32_t desc_table_size)
+{
+	uint16_t next_idx;
+
+	if (desc_table == NULL) {
+		/* Direct chain: it continues only while F_NEXT is set. */
+		if (((*desc)->flags & VRING_DESC_F_NEXT) == 0) {
+			*desc = NULL;
+			return 0;
+		}
+
+		/* Advance the ring index (with wrap-around) together with the descriptor. */
+		next_idx = (*req_idx + 1) % vq->vring.size;
+		*req_idx = next_idx;
+		*desc = &vq->vring.desc_packed[next_idx];
+		return 0;
+	}
+
+	/* Indirect table: entries are consecutive, and the request ends when the
+	 * index runs off the end of the table.
+	 */
+	next_idx = *req_idx + 1;
+	*req_idx = next_idx;
+	*desc = (next_idx < desc_table_size) ? &desc_table[next_idx] : NULL;
+
+	return 0;
+}
+
+/* Translate a guest-physical payload range into host iovec entries.
+ *
+ * Each pass asks rte_vhost_va_from_guest_pa() for the host address of the
+ * remaining range; the helper may shorten the length, in which case further
+ * iovec entries are appended until the whole range is covered.
+ *
+ * iov_index is advanced once per appended entry. Returns 0 on success, or -1
+ * when SPDK_VHOST_IOVS_MAX entries are already in use or the translation
+ * fails.
+ */
+static int
+vhost_vring_desc_payload_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+				uint16_t *iov_index, uintptr_t payload, uint64_t remaining)
+{
+	struct iovec *slot;
+	uintptr_t host_addr;
+	uint64_t chunk;
+
+	for (;;) {
+		if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
+			SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX);
+			return -1;
+		}
+
+		chunk = remaining;
+		host_addr = (uintptr_t)rte_vhost_va_from_guest_pa(vsession->mem, payload, &chunk);
+		if (host_addr == 0 || chunk == 0) {
+			SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload);
+			return -1;
+		}
+
+		slot = &iov[*iov_index];
+		slot->iov_base = (void *)host_addr;
+		slot->iov_len = chunk;
+		(*iov_index)++;
+
+		payload += chunk;
+		remaining -= chunk;
+		if (remaining == 0) {
+			return 0;
+		}
+	}
+}
+
+/* Append the buffer described by a packed descriptor to the iovec array.
+ * Returns the result of vhost_vring_desc_payload_to_iov().
+ */
+int
+vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+			       uint16_t *iov_index, const struct vring_packed_desc *desc)
+{
+	uintptr_t guest_addr = desc->addr;
+	uint64_t byte_count = desc->len;
+
+	return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index, guest_addr, byte_count);
+}
+
+/* Consume one available request from a packed virtqueue.
+ *
+ * 1. Walk the descriptor chain starting at req_idx (a single descriptor if it
+ *    is indirect) and count its ring slots into *num_descs.
+ * 2. Point vq->last_avail_idx at the slot following the chain.
+ * 3. Toggle vq->packed.avail_phase if the index wrapped around the ring.
+ *
+ * Returns the buffer ID stored in the last descriptor of the chain.
+ */
+uint16_t
+vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
+				      uint16_t *num_descs)
+{
+	struct vring_packed_desc *desc;
+	uint16_t desc_head = req_idx;
+
+	*num_descs = 1;
+
+	desc = &vq->vring.desc_packed[req_idx];
+	if (!vhost_vring_packed_desc_is_indirect(desc)) {
+		while ((desc->flags & VRING_DESC_F_NEXT) != 0) {
+			req_idx = (req_idx + 1) % vq->vring.size;
+			desc = &vq->vring.desc_packed[req_idx];
+			(*num_descs)++;
+		}
+	}
+
+	/* Queue Size doesn't have to be a power of 2.
+	 * Device maintains last_avail_idx so we can make sure
+	 * the value is valid (0 ~ vring.size - 1).
+	 */
+	vq->last_avail_idx = (req_idx + 1) % vq->vring.size;
+
+	/* The index wrapped if it landed at or before the chain head. Equality
+	 * happens when the chain occupies the entire ring (including a ring of
+	 * size 1); that is still one full wrap, so the phase must flip.
+	 */
+	if (vq->last_avail_idx <= desc_head) {
+		vq->packed.avail_phase = !vq->packed.avail_phase;
+	}
+
+	return desc->id;
+}
+
+/* Step to the next descriptor of a split-ring chain.
+ *
+ * On return *desc is the next descriptor, or NULL if the chain ended or the
+ * link was invalid. Returns 0 normally and -1 when the `next` index lies
+ * outside desc_table.
+ */
+int
+vhost_vring_desc_get_next(struct vring_desc **desc,
+			  struct vring_desc *desc_table, uint32_t desc_table_size)
+{
+	struct vring_desc *cur = *desc;
+	uint16_t link;
+
+	/* Assume there is nothing to follow; overwritten below on success. */
+	*desc = NULL;
+
+	if ((cur->flags & VRING_DESC_F_NEXT) == 0) {
+		return 0;
+	}
+
+	/* Read the link once before validating and using it. */
+	link = cur->next;
+	if (spdk_unlikely(link >= desc_table_size)) {
+		return -1;
+	}
+
+	*desc = &desc_table[link];
+	return 0;
+}
+
+/* Append the buffer described by a split-ring descriptor to the iovec array.
+ * Returns the result of vhost_vring_desc_payload_to_iov().
+ */
+int
+vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+			uint16_t *iov_index, const struct vring_desc *desc)
+{
+	uintptr_t guest_addr = desc->addr;
+	uint64_t byte_count = desc->len;
+
+	return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index, guest_addr, byte_count);
+}
+
+/* Look up a session of vdev by its per-device id; NULL if there is none. */
+static struct spdk_vhost_session *
+vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id)
+{
+	struct spdk_vhost_session *cur = TAILQ_FIRST(&vdev->vsessions);
+
+	while (cur != NULL && cur->id != id) {
+		cur = TAILQ_NEXT(cur, tailq);
+	}
+
+	return cur;
+}
+
+/* Search every registered device for the session with the given vid.
+ * Returns NULL when no session matches.
+ */
+struct spdk_vhost_session *
+vhost_session_find_by_vid(int vid)
+{
+	struct spdk_vhost_dev *dev;
+	struct spdk_vhost_session *cur;
+
+	for (dev = TAILQ_FIRST(&g_vhost_devices); dev != NULL; dev = TAILQ_NEXT(dev, tailq)) {
+		for (cur = TAILQ_FIRST(&dev->vsessions); cur != NULL; cur = TAILQ_NEXT(cur, tailq)) {
+			if (cur->vid == vid) {
+				return cur;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/* Iterate over registered devices: pass NULL to get the first device, or a
+ * device to get the one after it. Returns NULL past the end of the list.
+ */
+struct spdk_vhost_dev *
+spdk_vhost_dev_next(struct spdk_vhost_dev *vdev)
+{
+	return (vdev != NULL) ? TAILQ_NEXT(vdev, tailq) : TAILQ_FIRST(&g_vhost_devices);
+}
+
+/* Find a registered device by controller name.
+ *
+ * ctrlr_name may be a bare name or a path that begins with dev_dirname; in
+ * the latter case the directory prefix is skipped before comparing.
+ * Returns NULL when no device has that name.
+ */
+struct spdk_vhost_dev *
+spdk_vhost_dev_find(const char *ctrlr_name)
+{
+	const char *short_name = ctrlr_name;
+	size_t prefix_len = strlen(dev_dirname);
+	struct spdk_vhost_dev *cur;
+
+	if (strncmp(short_name, dev_dirname, prefix_len) == 0) {
+		short_name += prefix_len;
+	}
+
+	for (cur = TAILQ_FIRST(&g_vhost_devices); cur != NULL; cur = TAILQ_NEXT(cur, tailq)) {
+		if (strcmp(cur->name, short_name) == 0) {
+			break;
+		}
+	}
+
+	return cur;
+}
+
+/* Build the cpumask for a new controller.
+ *
+ * With mask == NULL the whole g_vhost_core_mask is copied. Otherwise mask is
+ * parsed and intersected with g_vhost_core_mask.
+ *
+ * Returns 0 on success, -1 if cpumask is NULL, the string cannot be parsed,
+ * or the intersection selects no CPU.
+ */
+static int
+vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
+{
+	if (cpumask == NULL) {
+		return -1;
+	}
+
+	if (mask == NULL) {
+		spdk_cpuset_copy(cpumask, &g_vhost_core_mask);
+		return 0;
+	}
+
+	if (spdk_cpuset_parse(cpumask, mask) < 0) {
+		SPDK_ERRLOG("invalid cpumask %s\n", mask);
+		return -1;
+	}
+
+	/* Restrict the request to the cores vhost is allowed to use. */
+	spdk_cpuset_and(cpumask, &g_vhost_core_mask);
+	if (spdk_cpuset_count(cpumask) != 0) {
+		return 0;
+	}
+
+	SPDK_ERRLOG("no cpu is selected among core mask(=%s)\n",
+		    spdk_cpuset_fmt(&g_vhost_core_mask));
+	return -1;
+}
+
+/* Per-thread step of spdk_for_each_thread(): OR the calling thread's cpumask
+ * into g_vhost_core_mask.
+ */
+static void
+vhost_setup_core_mask(void *ctx)
+{
+	spdk_cpuset_or(&g_vhost_core_mask, spdk_thread_get_cpumask(spdk_get_thread()));
+}
+
+/* Completion step after every thread contributed to g_vhost_core_mask.
+ * Calls the init callback passed in ctx with 0, or with -ECHILD when the
+ * resulting mask is empty.
+ */
+static void
+vhost_setup_core_mask_done(void *ctx)
+{
+	spdk_vhost_init_cb init_cb = ctx;
+	int status = 0;
+
+	if (spdk_cpuset_count(&g_vhost_core_mask) == 0) {
+		status = -ECHILD;
+	}
+
+	init_cb(status);
+}
+
+/* Message handler that makes the SPDK thread it runs on exit. */
+static void
+vhost_dev_thread_exit(void *arg1)
+{
+	struct spdk_thread *self = spdk_get_thread();
+
+	spdk_thread_exit(self);
+}
+
+/* Register a vhost controller and expose its unix domain socket.
+ *
+ * vdev     - caller-allocated device structure to initialize.
+ * name     - controller name; also the socket file name under dev_dirname.
+ * mask_str - cpumask string for the controller thread, or NULL to use the
+ *            whole vhost core mask.
+ * backend  - backend operations for this device.
+ *
+ * Returns 0 on success; -EINVAL for a missing name, an invalid cpumask or a
+ * socket path that does not fit in PATH_MAX; -EEXIST if the name is taken;
+ * -EIO if string duplication, thread creation or socket registration fails.
+ *
+ * On failure vdev->name and vdev->path are freed and reset to NULL so the
+ * caller-owned structure holds no dangling pointers.
+ */
+int
+vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
+		   const struct spdk_vhost_dev_backend *backend)
+{
+	char path[PATH_MAX];
+	struct spdk_cpuset cpumask = {};
+	int rc;
+
+	assert(vdev);
+	if (name == NULL) {
+		SPDK_ERRLOG("Can't register controller with no name\n");
+		return -EINVAL;
+	}
+
+	if (vhost_parse_core_mask(mask_str, &cpumask) != 0) {
+		SPDK_ERRLOG("cpumask %s is invalid (core mask is 0x%s)\n",
+			    mask_str, spdk_cpuset_fmt(&g_vhost_core_mask));
+		return -EINVAL;
+	}
+
+	if (spdk_vhost_dev_find(name)) {
+		SPDK_ERRLOG("vhost controller %s already exists.\n", name);
+		return -EEXIST;
+	}
+
+	if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
+		SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname,
+			    name);
+		return -EINVAL;
+	}
+
+	vdev->name = strdup(name);
+	vdev->path = strdup(path);
+	if (vdev->name == NULL || vdev->path == NULL) {
+		rc = -EIO;
+		goto out;
+	}
+
+	vdev->thread = spdk_thread_create(vdev->name, &cpumask);
+	if (vdev->thread == NULL) {
+		SPDK_ERRLOG("Failed to create thread for vhost controller %s.\n", name);
+		rc = -EIO;
+		goto out;
+	}
+
+	vdev->registered = true;
+	vdev->backend = backend;
+	TAILQ_INIT(&vdev->vsessions);
+
+	vhost_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
+				 SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);
+
+	if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features,
+				       vdev->protocol_features)) {
+		/* The controller thread was already created; ask it to exit. */
+		spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);
+		vdev->registered = false;
+		rc = -EIO;
+		goto out;
+	}
+
+	TAILQ_INSERT_TAIL(&g_vhost_devices, vdev, tailq);
+
+	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name);
+	return 0;
+
+out:
+	free(vdev->name);
+	free(vdev->path);
+	/* Do not leave freed pointers behind in the caller's structure. */
+	vdev->name = NULL;
+	vdev->path = NULL;
+	return rc;
+}
+
+/* Tear down a controller registered with vhost_dev_register().
+ *
+ * Returns -EBUSY while any session is still attached, -EIO if the socket
+ * cannot be unregistered from the vhost library, and 0 once the controller
+ * thread has been asked to exit and the device is off the global list.
+ */
+int
+vhost_dev_unregister(struct spdk_vhost_dev *vdev)
+{
+	if (!TAILQ_EMPTY(&vdev->vsessions)) {
+		SPDK_ERRLOG("Controller %s has still valid connection.\n", vdev->name);
+		return -EBUSY;
+	}
+
+	if (vdev->registered) {
+		if (vhost_driver_unregister(vdev->path) != 0) {
+			SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
+				    "Check if domain socket %s still exists\n",
+				    vdev->name, vdev->path);
+			return -EIO;
+		}
+	}
+
+	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name);
+
+	spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);
+
+	/* Unlink first, then release the strings owned by the device. */
+	TAILQ_REMOVE(&g_vhost_devices, vdev, tailq);
+	free(vdev->name);
+	free(vdev->path);
+
+	return 0;
+}
+
+/* Return the controller name stored in vdev. vdev must not be NULL. */
+const char *
+spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
+{
+	const char *ctrlr_name;
+
+	assert(vdev != NULL);
+	ctrlr_name = vdev->name;
+
+	return ctrlr_name;
+}
+
+/* Return the cpumask of the thread that serves vdev. vdev must not be NULL. */
+const struct spdk_cpuset *
+spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
+{
+	struct spdk_thread *dev_thread;
+
+	assert(vdev != NULL);
+	dev_thread = vdev->thread;
+
+	return spdk_thread_get_cpumask(dev_thread);
+}
+
+/* Block until g_dpdk_sem is posted.
+ *
+ * Waits up to timeout_sec seconds first; if that wait fails, logs errmsg and
+ * then keeps waiting without a deadline. A wait interrupted by a signal
+ * (EINTR) is restarted rather than being treated as a timeout or as
+ * completion, so the function returns only after the semaphore was taken.
+ */
+static void
+wait_for_semaphore(int timeout_sec, const char *errmsg)
+{
+	struct timespec timeout;
+	int rc;
+
+	clock_gettime(CLOCK_REALTIME, &timeout);
+	timeout.tv_sec += timeout_sec;
+
+	/* The deadline is absolute, so retrying after EINTR does not extend it. */
+	do {
+		rc = sem_timedwait(&g_dpdk_sem, &timeout);
+	} while (rc != 0 && errno == EINTR);
+
+	if (rc != 0) {
+		SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
+		do {
+			rc = sem_wait(&g_dpdk_sem);
+		} while (rc != 0 && errno == EINTR);
+	}
+}
+/* Publish the result of a session callback and wake the waiter.
+ * rc is stored in g_dpdk_response before g_dpdk_sem is posted, so the
+ * thread returning from wait_for_semaphore() reads the stored value.
+ */
+static void
+vhost_session_cb_done(int rc)
+{
+ g_dpdk_response = rc;
+ sem_post(&g_dpdk_sem);
+}
+
+/* Finish a session start request.
+ * When response is 0 the session is marked started and the device's active
+ * session count is incremented. The response is then handed to the waiter.
+ */
+void
+vhost_session_start_done(struct spdk_vhost_session *vsession, int response)
+{
+	struct spdk_vhost_dev *vdev;
+
+	if (response == 0) {
+		vdev = vsession->vdev;
+		vsession->started = true;
+
+		assert(vdev->active_session_num < UINT32_MAX);
+		vdev->active_session_num++;
+	}
+
+	vhost_session_cb_done(response);
+}
+
+/* Finish a session stop request.
+ * When response is 0 the session is marked stopped and the device's active
+ * session count is decremented. The response is then handed to the waiter.
+ */
+void
+vhost_session_stop_done(struct spdk_vhost_session *vsession, int response)
+{
+	struct spdk_vhost_dev *vdev;
+
+	if (response == 0) {
+		vdev = vsession->vdev;
+		vsession->started = false;
+
+		assert(vdev->active_session_num > 0);
+		vdev->active_session_num--;
+	}
+
+	vhost_session_cb_done(response);
+}
+
+/* Runs a session callback on the device thread.
+ * If g_vhost_mutex cannot be taken right now, the message is re-queued to
+ * the current thread instead of blocking. Otherwise the session is looked up
+ * by id and passed to the callback with the mutex held.
+ */
+static void
+vhost_event_cb(void *arg1)
+{
+	struct vhost_session_fn_ctx *ev = arg1;
+
+	if (pthread_mutex_trylock(&g_vhost_mutex) == 0) {
+		ev->cb_fn(ev->vdev, vhost_session_find_by_id(ev->vdev, ev->vsession_id), NULL);
+		pthread_mutex_unlock(&g_vhost_mutex);
+		return;
+	}
+
+	spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1);
+}
+
+/* Run cb_fn for vsession on its device thread and wait for the result.
+ *
+ * The request context lives on this function's stack. g_vhost_mutex is
+ * released while waiting on the semaphore and re-acquired before returning.
+ * Returns the value stored in g_dpdk_response.
+ */
+int
+vhost_session_send_event(struct spdk_vhost_session *vsession,
+			 spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
+			 const char *errmsg)
+{
+	struct spdk_vhost_dev *vdev = vsession->vdev;
+	struct vhost_session_fn_ctx ev_ctx = {
+		.vdev = vdev,
+		.vsession_id = vsession->id,
+		.cb_fn = cb_fn,
+	};
+
+	spdk_thread_send_msg(vdev->thread, vhost_event_cb, &ev_ctx);
+
+	/* vhost_event_cb() needs the mutex, so drop it for the duration of the wait. */
+	pthread_mutex_unlock(&g_vhost_mutex);
+	wait_for_semaphore(timeout_sec, errmsg);
+	pthread_mutex_lock(&g_vhost_mutex);
+
+	return g_dpdk_response;
+}
+
+/* Final step of vhost_dev_foreach_session().
+ * Re-queues itself if g_vhost_mutex is busy. Otherwise it drops the pending
+ * async operation count, calls the optional completion callback with the
+ * mutex held, and frees the context.
+ */
+static void
+foreach_session_finish_cb(void *arg1)
+{
+	struct vhost_session_fn_ctx *ctx = arg1;
+	struct spdk_vhost_dev *vdev = ctx->vdev;
+
+	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
+		spdk_thread_send_msg(spdk_get_thread(), foreach_session_finish_cb, arg1);
+		return;
+	}
+
+	assert(vdev->pending_async_op_num > 0);
+	vdev->pending_async_op_num--;
+
+	if (ctx->cpl_fn != NULL) {
+		ctx->cpl_fn(vdev, ctx->user_ctx);
+	}
+	pthread_mutex_unlock(&g_vhost_mutex);
+
+	free(ctx);
+}
+
+/* Device-thread step of vhost_dev_foreach_session().
+ * Re-queues itself if g_vhost_mutex is busy. Otherwise it calls cb_fn for
+ * every initialized session, stopping at the first negative return, and then
+ * sends the finish step to g_vhost_init_thread.
+ */
+static void
+foreach_session(void *arg1)
+{
+	struct vhost_session_fn_ctx *ctx = arg1;
+	struct spdk_vhost_dev *vdev = ctx->vdev;
+	struct spdk_vhost_session *vsession;
+
+	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
+		spdk_thread_send_msg(spdk_get_thread(), foreach_session, arg1);
+		return;
+	}
+
+	TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
+		if (!vsession->initialized) {
+			continue;
+		}
+
+		if (ctx->cb_fn(vdev, vsession, ctx->user_ctx) < 0) {
+			break;
+		}
+	}
+
+	pthread_mutex_unlock(&g_vhost_mutex);
+
+	spdk_thread_send_msg(g_vhost_init_thread, foreach_session_finish_cb, arg1);
+}
+
+/* Asynchronously call fn for each initialized session of vdev.
+ *
+ * The iteration runs on vdev->thread; cpl_fn (may be NULL) is called
+ * afterwards with arg. vdev->pending_async_op_num is incremented here and
+ * decremented by the finish step. If the context cannot be allocated, an
+ * error is logged and nothing is scheduled.
+ */
+void
+vhost_dev_foreach_session(struct spdk_vhost_dev *vdev,
+			  spdk_vhost_session_fn fn,
+			  spdk_vhost_dev_fn cpl_fn,
+			  void *arg)
+{
+	struct vhost_session_fn_ctx *ctx = calloc(1, sizeof(*ctx));
+
+	if (ctx == NULL) {
+		SPDK_ERRLOG("Failed to alloc vhost event.\n");
+		assert(false);
+		return;
+	}
+
+	ctx->user_ctx = arg;
+	ctx->cpl_fn = cpl_fn;
+	ctx->cb_fn = fn;
+	ctx->vdev = vdev;
+
+	assert(vdev->pending_async_op_num < UINT32_MAX);
+	vdev->pending_async_op_num++;
+
+	spdk_thread_send_msg(vdev->thread, foreach_session, ctx);
+}
+
+/* Stop a started session and hand its ring state back to rte_vhost.
+ *
+ * Calls the backend's stop_session(), stores the last avail/used indexes of
+ * every configured virtqueue through rte_vhost_set_vring_base(), then
+ * unregisters and frees the session's memory table.
+ *
+ * Both callers in this file hold g_vhost_mutex around this call and release
+ * it themselves, so this function never unlocks it, not even on failure.
+ *
+ * Returns 0 on success or the backend's non-zero error code.
+ */
+static int
+_stop_session(struct spdk_vhost_session *vsession)
+{
+	struct spdk_vhost_dev *vdev = vsession->vdev;
+	struct spdk_vhost_virtqueue *q;
+	int rc;
+	uint16_t i;
+
+	rc = vdev->backend->stop_session(vsession);
+	if (rc != 0) {
+		SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid);
+		/* The mutex stays locked: unlocking here made the caller's own
+		 * unlock a second unlock of the same mutex.
+		 */
+		return rc;
+	}
+
+	for (i = 0; i < vsession->max_queues; i++) {
+		q = &vsession->virtqueue[i];
+
+		/* vring.desc and vring.desc_packed are in a union struct
+		 * so q->vring.desc can replace q->vring.desc_packed.
+		 */
+		if (q->vring.desc == NULL) {
+			continue;
+		}
+
+		/* Packed virtqueues support up to 2^15 entries each
+		 * so the remaining bit can be used as the wrap counter.
+		 */
+		if (q->packed.packed_ring) {
+			q->last_avail_idx = q->last_avail_idx |
+					    ((uint16_t)q->packed.avail_phase << 15);
+			q->last_used_idx = q->last_used_idx |
+					   ((uint16_t)q->packed.used_phase << 15);
+		}
+
+		rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx);
+	}
+
+	vhost_session_mem_unregister(vsession->mem);
+	free(vsession->mem);
+
+	return 0;
+}
+
+/* rte_vhost callback: stop the session identified by vid.
+ * Returns -EINVAL if no such session exists, -EALREADY if it is not started,
+ * otherwise the result of _stop_session(). g_vhost_mutex is held throughout.
+ */
+int
+vhost_stop_device_cb(int vid)
+{
+	struct spdk_vhost_session *vsession;
+	int rc;
+
+	pthread_mutex_lock(&g_vhost_mutex);
+
+	vsession = vhost_session_find_by_vid(vid);
+	if (vsession == NULL) {
+		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+		rc = -EINVAL;
+	} else if (!vsession->started) {
+		/* Nothing to stop. */
+		rc = -EALREADY;
+	} else {
+		rc = _stop_session(vsession);
+	}
+
+	pthread_mutex_unlock(&g_vhost_mutex);
+	return rc;
+}
+/* rte_vhost callback: start the session identified by vid.
+ *
+ * With g_vhost_mutex held it reads the negotiated features, rebuilds the
+ * session's virtqueue array from rte_vhost (restoring last avail/used
+ * indexes and disabling submission notifications), fetches and registers the
+ * guest memory table, kicks every configured queue, and finally calls the
+ * backend's start_session().
+ *
+ * Returns 0 if the session was already started or the backend start
+ * succeeded, the backend's error code if start_session() fails, and -1 for
+ * the other failures (unknown vid, features or memory table unavailable).
+ */
+int
+vhost_start_device_cb(int vid)
+{
+ struct spdk_vhost_dev *vdev;
+ struct spdk_vhost_session *vsession;
+ int rc = -1;
+ uint16_t i;
+ bool packed_ring;
+
+ pthread_mutex_lock(&g_vhost_mutex);
+
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+ goto out;
+ }
+
+ vdev = vsession->vdev;
+ if (vsession->started) {
+ /* already started, nothing to do */
+ rc = 0;
+ goto out;
+ }
+
+ if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) {
+ SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
+ goto out;
+ }
+
+ packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0);
+ /* Start from a clean virtqueue array; max_queues ends up one past the
+ * highest queue index that is fully configured.
+ */
+ vsession->max_queues = 0;
+ memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue));
+ for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
+ struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];
+
+ q->vring_idx = -1;
+ if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) {
+ continue;
+ }
+ q->vring_idx = i;
+ rte_vhost_get_vhost_ring_inflight(vid, i, &q->vring_inflight);
+
+ /* vring.desc and vring.desc_packed are in a union struct
+ * so q->vring.desc can replace q->vring.desc_packed.
+ */
+ if (q->vring.desc == NULL || q->vring.size == 0) {
+ continue;
+ }
+
+ if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) {
+ q->vring.desc = NULL;
+ continue;
+ }
+
+ if (packed_ring) {
+ /* Packed virtqueues support up to 2^15 entries each
+ * so the remaining bit can be used as the wrap counter.
+ */
+ q->packed.avail_phase = q->last_avail_idx >> 15;
+ q->last_avail_idx = q->last_avail_idx & 0x7FFF;
+ q->packed.used_phase = q->last_used_idx >> 15;
+ q->last_used_idx = q->last_used_idx & 0x7FFF;
+
+ /* Disable I/O submission notifications, we'll be polling. */
+ q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE;
+ } else {
+ /* Disable I/O submission notifications, we'll be polling. */
+ q->vring.used->flags = VRING_USED_F_NO_NOTIFY;
+ }
+
+ q->packed.packed_ring = packed_ring;
+ vsession->max_queues = i + 1;
+ }
+
+ if (vhost_get_mem_table(vid, &vsession->mem) != 0) {
+ SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
+ goto out;
+ }
+
+ /*
+ * This looks like a QEMU bug: guest I/O might freeze after live migration
+ * unless all queues are kicked. It appears the previous vhost instance
+ * failed to deliver all interrupts before the GET_VRING_BASE message.
+ * Kicking shouldn't harm the guest since spurious interrupts should be
+ * ignored by the guest virtio driver.
+ *
+ * Tested on QEMU 2.10.91 and 2.11.50.
+ */
+ for (i = 0; i < vsession->max_queues; i++) {
+ struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];
+
+ /* vring.desc and vring.desc_packed are in a union struct
+ * so q->vring.desc can replace q->vring.desc_packed.
+ */
+ if (q->vring.desc != NULL && q->vring.size > 0) {
+ rte_vhost_vring_call(vsession->vid, q->vring_idx);
+ }
+ }
+
+ vhost_session_set_coalescing(vdev, vsession, NULL);
+ vhost_session_mem_register(vsession->mem);
+ vsession->initialized = true;
+ rc = vdev->backend->start_session(vsession);
+ if (rc != 0) {
+ vhost_session_mem_unregister(vsession->mem);
+ free(vsession->mem);
+ goto out;
+ }
+
+out:
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return rc;
+}
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+/* rte_vhost callback: read the device configuration space for vid.
+ * Returns the backend's vhost_get_config() result, or -1 when the session is
+ * unknown or the backend has no such operation.
+ */
+int
+vhost_get_config_cb(int vid, uint8_t *config, uint32_t len)
+{
+	struct spdk_vhost_session *vsession;
+	struct spdk_vhost_dev *vdev;
+	int rc = -1;
+
+	pthread_mutex_lock(&g_vhost_mutex);
+
+	vsession = vhost_session_find_by_vid(vid);
+	if (vsession != NULL) {
+		vdev = vsession->vdev;
+		if (vdev->backend->vhost_get_config) {
+			rc = vdev->backend->vhost_get_config(vdev, config, len);
+		}
+	} else {
+		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+	}
+
+	pthread_mutex_unlock(&g_vhost_mutex);
+	return rc;
+}
+
+/* rte_vhost callback: write to the device configuration space for vid.
+ * Returns the backend's vhost_set_config() result, or -1 when the session is
+ * unknown or the backend has no such operation.
+ */
+int
+vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags)
+{
+	struct spdk_vhost_session *vsession;
+	struct spdk_vhost_dev *vdev;
+	int rc = -1;
+
+	pthread_mutex_lock(&g_vhost_mutex);
+
+	vsession = vhost_session_find_by_vid(vid);
+	if (vsession != NULL) {
+		vdev = vsession->vdev;
+		if (vdev->backend->vhost_set_config) {
+			rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags);
+		}
+	} else {
+		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+	}
+
+	pthread_mutex_unlock(&g_vhost_mutex);
+	return rc;
+}
+#endif
+
+/* Set the directory in which vhost sockets are created.
+ *
+ * A NULL or empty basename leaves the current setting unchanged and returns
+ * 0. Otherwise basename is copied into dev_dirname and a trailing '/' is
+ * appended if it is missing.
+ *
+ * Returns -EINVAL when basename, plus the optional '/' and the terminator,
+ * does not fit in dev_dirname. The length is checked before anything is
+ * written, so a rejected path leaves the previous directory intact.
+ */
+int
+spdk_vhost_set_socket_path(const char *basename)
+{
+	size_t len;
+
+	if (basename == NULL || basename[0] == '\0') {
+		return 0;
+	}
+
+	len = strlen(basename);
+	/* Two bytes are reserved: one for an appended '/', one for the NUL. */
+	if (len >= sizeof(dev_dirname) - 2) {
+		SPDK_ERRLOG("Char dev dir path length %d is too long\n", (int)len);
+		return -EINVAL;
+	}
+
+	memcpy(dev_dirname, basename, len + 1);
+	if (dev_dirname[len - 1] != '/') {
+		dev_dirname[len] = '/';
+		dev_dirname[len + 1] = '\0';
+	}
+
+	return 0;
+}
+
+/* Write device information to the JSON context via the backend's
+ * dump_info_json(), which must be set.
+ */
+void
+vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	const struct spdk_vhost_dev_backend *backend = vdev->backend;
+
+	assert(backend->dump_info_json != NULL);
+	backend->dump_info_json(vdev, w);
+}
+
+/* Remove a device through its backend.
+ * Returns -EBUSY while asynchronous operations are still pending on the
+ * device, otherwise the backend's remove_device() result.
+ */
+int
+spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
+{
+	if (vdev->pending_async_op_num != 0) {
+		return -EBUSY;
+	}
+
+	return vdev->backend->remove_device(vdev);
+}
+
+/* rte_vhost callback: a new connection (vid) was made on socket ifname.
+ *
+ * Allocates a cache-line aligned, zeroed session (plus the backend's private
+ * context), gives it the next per-device id and appends it to the device's
+ * session list.
+ *
+ * Returns 0 on success, -EINVAL if the device ran out of session ids, and -1
+ * if the device is unknown or an allocation fails. g_vhost_mutex is released
+ * on every path, including the id-overflow path.
+ */
+int
+vhost_new_connection_cb(int vid, const char *ifname)
+{
+	struct spdk_vhost_dev *vdev;
+	struct spdk_vhost_session *vsession;
+	size_t session_size;
+	int rc = -1;
+
+	pthread_mutex_lock(&g_vhost_mutex);
+
+	vdev = spdk_vhost_dev_find(ifname);
+	if (vdev == NULL) {
+		SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid);
+		goto out;
+	}
+
+	/* We expect sessions inside vdev->vsessions to be sorted in ascending
+	 * order in regard of vsession->id. For now we always set id = vsessions_cnt++
+	 * and append each session to the very end of the vsessions list.
+	 * This is required for spdk_vhost_dev_foreach_session() to work.
+	 */
+	if (vdev->vsessions_num == UINT_MAX) {
+		assert(false);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	session_size = sizeof(*vsession) + vdev->backend->session_ctx_size;
+	if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, session_size)) {
+		SPDK_ERRLOG("vsession alloc failed\n");
+		goto out;
+	}
+	memset(vsession, 0, session_size);
+
+	vsession->vdev = vdev;
+	vsession->vid = vid;
+	vsession->id = vdev->vsessions_num++;
+	vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid);
+	if (vsession->name == NULL) {
+		SPDK_ERRLOG("vsession alloc failed\n");
+		free(vsession);
+		goto out;
+	}
+	vsession->started = false;
+	vsession->initialized = false;
+	vsession->next_stats_check_time = 0;
+	vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS *
+					 spdk_get_ticks_hz() / 1000UL;
+	TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq);
+
+	vhost_session_install_rte_compat_hooks(vsession);
+	rc = 0;
+
+out:
+	pthread_mutex_unlock(&g_vhost_mutex);
+	return rc;
+}
+
+/* rte_vhost callback: the connection identified by vid is gone.
+ * Stops the session if it is still started, then unlinks and frees it.
+ * Returns -EINVAL for an unknown vid, otherwise 0 or the result of
+ * _stop_session().
+ */
+int
+vhost_destroy_connection_cb(int vid)
+{
+	struct spdk_vhost_session *vsession;
+	int rc;
+
+	pthread_mutex_lock(&g_vhost_mutex);
+
+	vsession = vhost_session_find_by_vid(vid);
+	if (vsession == NULL) {
+		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+		pthread_mutex_unlock(&g_vhost_mutex);
+		return -EINVAL;
+	}
+
+	rc = vsession->started ? _stop_session(vsession) : 0;
+
+	/* The session is released regardless of the stop result. */
+	TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq);
+	free(vsession->name);
+	free(vsession);
+
+	pthread_mutex_unlock(&g_vhost_mutex);
+	return rc;
+}
+
+/* Acquire the global vhost mutex, blocking until it is available. */
+void
+spdk_vhost_lock(void)
+{
+	(void)pthread_mutex_lock(&g_vhost_mutex);
+}
+
+/* Try to acquire the global vhost mutex without blocking.
+ * Returns 0 on success or the negated pthread_mutex_trylock() error number.
+ */
+int
+spdk_vhost_trylock(void)
+{
+	int err = pthread_mutex_trylock(&g_vhost_mutex);
+
+	return -err;
+}
+
+/* Release the global vhost mutex. */
+void
+spdk_vhost_unlock(void)
+{
+	(void)pthread_mutex_unlock(&g_vhost_mutex);
+}
+/* Initialize the vhost library on the calling SPDK thread.
+ *
+ * Remembers the calling thread as g_vhost_init_thread, defaults the socket
+ * directory to the current working directory when none was set, initializes
+ * g_dpdk_sem, constructs the configured controllers and finally collects the
+ * cpumasks of all SPDK threads into g_vhost_core_mask.
+ *
+ * init_cb is called with a non-zero value on failure. On success it is
+ * called from vhost_setup_core_mask_done() after all threads were visited.
+ */
+void
+spdk_vhost_init(spdk_vhost_init_cb init_cb)
+{
+ size_t len;
+ int ret;
+
+ g_vhost_init_thread = spdk_get_thread();
+ assert(g_vhost_init_thread != NULL);
+
+ if (dev_dirname[0] == '\0') {
+ if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
+ SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
+ ret = -1;
+ goto out;
+ }
+ /* getcwd() was given one byte less than the buffer, leaving room for '/'. */
+ len = strlen(dev_dirname);
+ if (dev_dirname[len - 1] != '/') {
+ dev_dirname[len] = '/';
+ dev_dirname[len + 1] = '\0';
+ }
+ }
+
+ ret = sem_init(&g_dpdk_sem, 0, 0);
+ if (ret != 0) {
+ SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n");
+ ret = -1;
+ goto out;
+ }
+
+ ret = vhost_scsi_controller_construct();
+ if (ret != 0) {
+ SPDK_ERRLOG("Cannot construct vhost controllers\n");
+ goto out;
+ }
+
+ ret = vhost_blk_controller_construct();
+ if (ret != 0) {
+ SPDK_ERRLOG("Cannot construct vhost block controllers\n");
+ goto out;
+ }
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ ret = vhost_nvme_controller_construct();
+ if (ret != 0) {
+ SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n");
+ goto out;
+ }
+#endif
+
+ spdk_cpuset_zero(&g_vhost_core_mask);
+
+ /* iterate threads instead of using SPDK_ENV_FOREACH_CORE to ensure that threads are really
+ * created.
+ */
+ spdk_for_each_thread(vhost_setup_core_mask, init_cb, vhost_setup_core_mask_done);
+ return;
+out:
+ init_cb(ret);
+}
+
+/* Second stage of shutdown, run on the init thread.
+ * Removes every device, clears the core mask, destroys g_dpdk_sem and calls
+ * the user's completion callback.
+ */
+static void
+vhost_fini(void *arg1)
+{
+	struct spdk_vhost_dev *vdev, *next;
+
+	spdk_vhost_lock();
+	for (vdev = spdk_vhost_dev_next(NULL); vdev != NULL; vdev = next) {
+		/* Fetch the successor first; removal may unlink vdev from the list. */
+		next = spdk_vhost_dev_next(vdev);
+
+		/* don't care if it fails, there's nothing we can do for now */
+		spdk_vhost_dev_remove(vdev);
+	}
+	spdk_vhost_unlock();
+
+	spdk_cpuset_zero(&g_vhost_core_mask);
+
+	/* All devices are removed now. */
+	sem_destroy(&g_dpdk_sem);
+
+	g_fini_cpl_cb();
+}
+
+/* Body of the helper pthread started by spdk_vhost_fini().
+ * Unregisters the socket of every device, marks the devices unregistered and
+ * then schedules vhost_fini() on the init thread. Always returns NULL.
+ */
+static void *
+session_shutdown(void *arg)
+{
+	struct spdk_vhost_dev *cur;
+
+	for (cur = TAILQ_FIRST(&g_vhost_devices); cur != NULL; cur = TAILQ_NEXT(cur, tailq)) {
+		vhost_driver_unregister(cur->path);
+		cur->registered = false;
+	}
+
+	SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n");
+	spdk_thread_send_msg(g_vhost_init_thread, vhost_fini, NULL);
+
+	return NULL;
+}
+
+/* Begin shutting down the vhost library.
+ *
+ * Must be called on the thread that ran spdk_vhost_init(). fini_cb is stored
+ * and called once teardown has completed. The socket removal is done on a
+ * detached helper pthread; the process aborts if that thread cannot be
+ * created.
+ */
+void
+spdk_vhost_fini(spdk_vhost_fini_cb fini_cb)
+{
+	pthread_t tid;
+	int rc;
+
+	assert(spdk_get_thread() == g_vhost_init_thread);
+	g_fini_cpl_cb = fini_cb;
+
+	/* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK
+	 * ops for stopping a device or removing a connection, we need to call it from
+	 * a separate thread to avoid deadlock.
+	 */
+	rc = pthread_create(&tid, NULL, &session_shutdown, NULL);
+	/* pthread_create() reports failure with a positive error number, never a
+	 * negative value, so any non-zero result must be treated as an error.
+	 */
+	if (rc != 0) {
+		SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc));
+		abort();
+	}
+	pthread_detach(tid);
+}
+
+/* Write the vhost configuration as a JSON array.
+ * Under the vhost lock, each device's backend writes its own entry; a
+ * "vhost_controller_set_coalescing" entry follows for devices whose
+ * coalescing delay is non-zero.
+ */
+void
+spdk_vhost_config_json(struct spdk_json_write_ctx *w)
+{
+	struct spdk_vhost_dev *vdev;
+	uint32_t delay_base_us;
+	uint32_t iops_threshold;
+
+	spdk_json_write_array_begin(w);
+
+	spdk_vhost_lock();
+	for (vdev = spdk_vhost_dev_next(NULL); vdev != NULL; vdev = spdk_vhost_dev_next(vdev)) {
+		vdev->backend->write_config_json(vdev, w);
+
+		spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
+		if (delay_base_us == 0) {
+			/* No coalescing entry is written for this device. */
+			continue;
+		}
+
+		spdk_json_write_object_begin(w);
+		spdk_json_write_named_string(w, "method", "vhost_controller_set_coalescing");
+
+		spdk_json_write_named_object_begin(w, "params");
+		spdk_json_write_named_string(w, "ctrlr", vdev->name);
+		spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us);
+		spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold);
+		spdk_json_write_object_end(w);
+
+		spdk_json_write_object_end(w);
+	}
+	spdk_vhost_unlock();
+
+	spdk_json_write_array_end(w);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST)
+SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING)
diff --git a/src/spdk/lib/vhost/vhost_blk.c b/src/spdk/lib/vhost/vhost_blk.c
new file mode 100644
index 000000000..d387cb27d
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_blk.c
@@ -0,0 +1,1354 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/virtio_blk.h>
+
+#include "spdk/env.h"
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+#include "spdk/conf.h"
+#include "spdk/thread.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/vhost.h"
+
+#include "vhost_internal.h"
+#include <rte_version.h>
+
+/* Minimal set of features supported by every SPDK VHOST-BLK device */
+#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
+ (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
+ (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
+ (1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \
+ (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
+ (1ULL << VIRTIO_BLK_F_MQ))
+
+/* Not supported features */
+#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
+ (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
+ (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI))
+
+/* Protocol features supported by vhost-blk */
+#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
+#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
+#else
+#define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
+#endif
+
+struct spdk_vhost_blk_task { /* state of one virtio-blk request; allocated as a per-virtqueue array in alloc_task_pool() */
+ struct spdk_bdev_io *bdev_io; /* NOTE(review): not referenced in this part of the file - confirm it is still needed */
+ struct spdk_vhost_blk_session *bvsession; /* owning session; assigned in alloc_task_pool() */
+ struct spdk_vhost_virtqueue *vq; /* virtqueue this slot belongs to; assigned in alloc_task_pool() */
+
+ volatile uint8_t *status; /* request status byte (base of the last iov); NULL until the request has been parsed */
+
+ uint16_t req_idx; /* descriptor index handed to the descriptor lookup; on split rings also the used-ring id */
+ uint16_t num_descs; /* packed ring only: filled in by process_blk_task() */
+ uint16_t buffer_id; /* packed ring only: equals this slot's index in the task array */
+
+ /* Wait entry queued through spdk_bdev_queue_io_wait() when a submission returns -ENOMEM. */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+
+ /* If set, the task is currently used for I/O processing. */
+ bool used;
+
+ /** Number of bytes that were written; passed to the used-ring enqueue helpers. */
+ uint32_t used_len;
+ uint16_t iovcnt; /* reset to the capacity of iovs[] by blk_task_init(), then narrowed while parsing */
+ struct iovec iovs[SPDK_VHOST_IOVS_MAX]; /* translated descriptor chain: header, payload..., status */
+};
+
+struct spdk_vhost_blk_dev { /* vhost-blk controller: generic vhost device plus the bdev it exposes */
+ struct spdk_vhost_dev vdev; /* embedded generic device; to_blk_dev() recovers the container from it */
+ struct spdk_bdev *bdev; /* backing bdev; set to NULL once a hot-remove has completed */
+ struct spdk_bdev_desc *bdev_desc; /* open descriptor for bdev; closed and cleared on hot-remove */
+ /* dummy_io_channel is used to hold a bdev reference */
+ struct spdk_io_channel *dummy_io_channel;
+ bool readonly; /* when true, VIRTIO_BLK_T_OUT requests are failed instead of written */
+};
+
+struct spdk_vhost_blk_session { /* one connection to a vhost-blk controller */
+ /* The parent session must be the very first field in this struct (to_blk_session() casts the pointer directly) */
+ struct spdk_vhost_session vsession;
+ struct spdk_vhost_blk_dev *bvdev; /* controller this session is attached to; set in vhost_blk_start_cb() */
+ struct spdk_poller *requestq_poller; /* runs vdev_worker or no_bdev_vdev_worker while the session is started */
+ struct spdk_io_channel *io_channel; /* bdev I/O channel of this session; NULL when there is no bdev */
+ struct spdk_poller *stop_poller; /* runs destroy_session_poller_cb while the session is being stopped */
+};
+
+/* forward declaration */
+static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
+
+static int
+process_blk_request(struct spdk_vhost_blk_task *task,
+ struct spdk_vhost_blk_session *bvsession,
+ struct spdk_vhost_virtqueue *vq);
+
+/* Retire a task: decrement the session's outstanding-task counter and free the slot. */
+static void
+blk_task_finish(struct spdk_vhost_blk_task *task)
+{
+	struct spdk_vhost_blk_session *owner = task->bvsession;
+
+	assert(owner->vsession.task_cnt > 0);
+	owner->vsession.task_cnt -= 1;
+	task->used = false;
+}
+
+/* Prepare a task slot for a new request and mark it as in use. */
+static void
+blk_task_init(struct spdk_vhost_blk_task *task)
+{
+	task->status = NULL;
+	task->used_len = 0;
+	/* Start from the full capacity of iovs[]; descriptor parsing replaces it with the real count. */
+	task->iovcnt = SPDK_COUNTOF(task->iovs);
+	task->used = true;
+}
+
+/* Hand a finished task back to the guest through the used ring matching the virtqueue layout. */
+static void
+blk_task_enqueue(struct spdk_vhost_blk_task *task)
+{
+	struct spdk_vhost_session *vsession = &task->bvsession->vsession;
+	struct spdk_vhost_virtqueue *vq = task->vq;
+
+	if (!vq->packed.packed_ring) {
+		/* Split ring: the request is identified by its head descriptor index. */
+		vhost_vq_used_ring_enqueue(vsession, vq, task->req_idx, task->used_len);
+		return;
+	}
+
+	/* Packed ring: the request is identified by buffer id plus descriptor count. */
+	vhost_vq_packed_ring_enqueue(vsession, vq, task->num_descs,
+				     task->buffer_id, task->used_len);
+}
+
+/*
+ * Complete a request that could not be processed.
+ *
+ * The given status is stored in the request's status byte if one has already
+ * been located; the task is then returned to the guest and released.
+ */
+static void
+invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
+{
+	volatile uint8_t *status_byte = task->status;
+
+	if (status_byte != NULL) {
+		*status_byte = status;
+	}
+
+	blk_task_enqueue(task);
+	blk_task_finish(task);
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
+}
+
+/*
+ * Walk the split-ring descriptor chain starting at req_idx and translate it
+ * into iovs[].
+ *
+ * On entry *iovs_cnt holds the capacity of iovs[]. On success it is replaced
+ * with the number of entries filled in, and *length receives the sum of the
+ * len fields of all descriptors in the chain. Neither output is written on
+ * failure.
+ *
+ * Return
+ *   0 on success, -1 if the chain is invalid (bad descriptor, more entries
+ *   than iovs[] can hold, broken or cyclic chain, no writable descriptor, or
+ *   fewer than two iovs).
+ *
+ * FIXME: Make this function return to rd_cnt and wr_cnt
+ */
+static int
+blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession,
+			   struct spdk_vhost_virtqueue *vq,
+			   uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
+{
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	struct spdk_vhost_dev *vdev = vsession->vdev;
+	struct vring_desc *desc, *desc_table;
+	uint16_t out_cnt = 0, cnt = 0;
+	uint32_t desc_table_size, len = 0;
+	uint32_t desc_handled_cnt = 0;
+	int rc;
+
+	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
+	if (rc != 0) {
+		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
+		return -1;
+	}
+
+	while (1) {
+		/*
+		 * Maximum cnt reached?
+		 * Should not happen if request is well formatted, otherwise this is a BUG.
+		 */
+		if (spdk_unlikely(cnt == *iovs_cnt)) {
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
+				      vsession->name, req_idx);
+			return -1;
+		}
+
+		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
+			/* Arguments follow the order of the format string: iov position first, then req_idx. */
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
+				      vsession->name, cnt, req_idx);
+			return -1;
+		}
+
+		len += desc->len;
+
+		out_cnt += vhost_vring_desc_is_wr(desc);
+
+		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+		if (rc != 0) {
+			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
+				    vsession->name, req_idx);
+			return -1;
+		} else if (desc == NULL) {
+			/* End of chain. */
+			break;
+		}
+
+		desc_handled_cnt++;
+		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
+			/* Break a cycle and report an error, if any. */
+			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %"PRIu32
+				    ", desc_handled_cnt = %"PRIu32".\n",
+				    vsession->name, desc_table_size, desc_handled_cnt);
+			return -1;
+		}
+	}
+
+	/*
+	 * There must be at least two descriptors.
+	 * The first contains the request, so it must be readable.
+	 * The last contains the buffer for the response, so it must be writable.
+	 */
+	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
+		return -1;
+	}
+
+	*length = len;
+	*iovs_cnt = cnt;
+	return 0;
+}
+
+/*
+ * Packed-ring counterpart of blk_iovs_split_queue_setup(): translate the
+ * descriptors of the request at req_idx into iovs[].
+ *
+ * On entry *iovs_cnt holds the capacity of iovs[]. On success it is replaced
+ * with the number of entries filled in, and *length receives the sum of the
+ * len fields of all descriptors of the request. Neither output is written on
+ * failure.
+ *
+ * Return 0 on success; otherwise the non-zero code from
+ * vhost_vq_get_desc_packed(), or -EINVAL for a malformed request.
+ */
+static int
+blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession,
+			    struct spdk_vhost_virtqueue *vq,
+			    uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
+{
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	struct spdk_vhost_dev *vdev = vsession->vdev;
+	struct vring_packed_desc *desc = NULL, *desc_table;
+	uint16_t out_cnt = 0, cnt = 0;
+	uint32_t desc_table_size, len = 0;
+	int rc = 0;
+
+	rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc,
+				      &desc_table, &desc_table_size);
+	if (spdk_unlikely(rc != 0)) {
+		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
+		return rc;
+	}
+
+	/* When a separate descriptor table was returned, iteration restarts at its first entry. */
+	if (desc_table != NULL) {
+		req_idx = 0;
+	}
+
+	while (1) {
+		/*
+		 * Maximum cnt reached?
+		 * Should not happen if request is well formatted, otherwise this is a BUG.
+		 */
+		if (spdk_unlikely(cnt == *iovs_cnt)) {
+			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
+				    vsession->name, req_idx);
+			return -EINVAL;
+		}
+
+		if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) {
+			/* Arguments follow the order of the format string: iov position first, then req_idx. */
+			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
+				    vsession->name, cnt, req_idx);
+			return -EINVAL;
+		}
+
+		len += desc->len;
+		out_cnt += vhost_vring_packed_desc_is_wr(desc);
+
+		/* desc is NULL means we reach the last desc of this request */
+		vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size);
+		if (desc == NULL) {
+			break;
+		}
+	}
+
+	/*
+	 * There must be at least two descriptors.
+	 * The first contains the request, so it must be readable.
+	 * The last contains the buffer for the response, so it must be writable.
+	 */
+	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
+		return -EINVAL;
+	}
+
+	*length = len;
+	*iovs_cnt = cnt;
+
+	return 0;
+}
+
+/*
+ * Complete a parsed request: store OK or IOERR in its status byte, return it
+ * to the guest and release the task. task->status must already be set.
+ */
+static void
+blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
+{
+	const char *outcome = "FAIL";
+	uint8_t code = VIRTIO_BLK_S_IOERR;
+
+	if (success) {
+		outcome = "OK";
+		code = VIRTIO_BLK_S_OK;
+	}
+
+	*task->status = code;
+	blk_task_enqueue(task);
+
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
+		      task->req_idx, outcome);
+	blk_task_finish(task);
+}
+
+/* bdev completion callback: release the bdev I/O, then finish the task passed as cb_arg. */
+static void
+blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	spdk_bdev_free_io(bdev_io);
+	blk_request_finish(success, (struct spdk_vhost_blk_task *)cb_arg);
+}
+
+/*
+ * io_wait callback: re-initialise the task and run it through
+ * process_blk_request() again from the start.
+ */
+static void
+blk_request_resubmit(void *arg)
+{
+	struct spdk_vhost_blk_task *task = arg;
+
+	blk_task_init(task);
+
+	if (process_blk_request(task, task->bvsession, task->vq) != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
+		return;
+	}
+
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
+}
+
+/*
+ * Park a task on the bdev's io_wait queue so that blk_request_resubmit() is
+ * called for it later. If queuing fails, the request is completed with IOERR.
+ */
+static inline void
+blk_request_queue_io(struct spdk_vhost_blk_task *task)
+{
+	struct spdk_vhost_blk_session *bvsession = task->bvsession;
+	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
+	struct spdk_bdev_io_wait_entry *entry = &task->bdev_io_wait;
+	int rc;
+
+	entry->bdev = bdev;
+	entry->cb_fn = blk_request_resubmit;
+	entry->cb_arg = task;
+
+	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, entry);
+	if (rc == 0) {
+		return;
+	}
+
+	SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
+	invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+}
+
+/*
+ * Common tail for every bdev submission made by process_blk_request().
+ *
+ * rc == 0        : submitted; returns 0.
+ * rc == -ENOMEM  : the task is parked on the io_wait queue; returns 0.
+ * anything else  : the request is completed with IOERR; returns -1.
+ */
+static int
+blk_request_handle_submit_rc(struct spdk_vhost_blk_task *task, int rc)
+{
+	if (rc == 0) {
+		return 0;
+	}
+
+	if (rc == -ENOMEM) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+		blk_request_queue_io(task);
+		return 0;
+	}
+
+	invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+	return -1;
+}
+
+/*
+ * Parse the descriptor chain of a task and submit the request to the bdev.
+ *
+ * The chain must consist of a struct virtio_blk_outhdr, an optional payload
+ * and a one-byte status buffer. Every failure path completes the request
+ * towards the guest (via invalid_blk_request()) before returning.
+ *
+ * Return 0 if the request was submitted, queued for a later retry, or
+ * (GET_ID) completed inline; -1 if it was rejected.
+ */
+static int
+process_blk_request(struct spdk_vhost_blk_task *task,
+		    struct spdk_vhost_blk_session *bvsession,
+		    struct spdk_vhost_virtqueue *vq)
+{
+	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
+	const struct virtio_blk_outhdr *req;
+	struct virtio_blk_discard_write_zeroes *desc;
+	struct iovec *iov;
+	uint32_t type;
+	uint32_t payload_len;
+	uint64_t flush_bytes;
+	int rc;
+
+	if (vq->packed.packed_ring) {
+		rc = blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
+						 &payload_len);
+	} else {
+		rc = blk_iovs_split_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
+						&payload_len);
+	}
+
+	if (rc) {
+		/* The descriptor chain itself is malformed; task->status is still NULL here. */
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
+		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+		return -1;
+	}
+
+	/* First iov: request header. */
+	iov = &task->iovs[0];
+	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
+			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
+			      iov->iov_len, sizeof(*req), task->req_idx);
+		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+		return -1;
+	}
+
+	req = iov->iov_base;
+
+	/* Last iov: single status byte. */
+	iov = &task->iovs[task->iovcnt - 1];
+	if (spdk_unlikely(iov->iov_len != 1)) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
+			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
+			      iov->iov_len, 1, task->req_idx);
+		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+		return -1;
+	}
+
+	/* From here on only the payload iovs (task->iovs[1..]) are counted. */
+	task->status = iov->iov_base;
+	payload_len -= sizeof(*req) + sizeof(*task->status);
+	task->iovcnt -= 2;
+
+	type = req->type;
+#ifdef VIRTIO_BLK_T_BARRIER
+	/* Don't care about barrier for now (as QEMU's virtio-blk does). */
+	type &= ~VIRTIO_BLK_T_BARRIER;
+#endif
+
+	switch (type) {
+	case VIRTIO_BLK_T_IN:
+	case VIRTIO_BLK_T_OUT:
+		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
+			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
+				    type ? "WRITE" : "READ", task->req_idx);
+			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+			return -1;
+		}
+
+		if (type == VIRTIO_BLK_T_IN) {
+			/* A read writes the payload plus the status byte into guest memory. */
+			task->used_len = payload_len + sizeof(*task->status);
+			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
+					     &task->iovs[1], task->iovcnt, req->sector * 512,
+					     payload_len, blk_request_complete_cb, task);
+		} else if (!bvdev->readonly) {
+			task->used_len = sizeof(*task->status);
+			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
+					      &task->iovs[1], task->iovcnt, req->sector * 512,
+					      payload_len, blk_request_complete_cb, task);
+		} else {
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
+			rc = -1;
+		}
+
+		return blk_request_handle_submit_rc(task, rc);
+	case VIRTIO_BLK_T_DISCARD:
+		if (payload_len != sizeof(*desc)) {
+			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
+			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+			return -1;
+		}
+		desc = task->iovs[1].iov_base;
+
+		/* num_sectors is a 32-bit field: widen it before scaling to bytes so
+		 * ranges of 4 GiB and above do not wrap around.
+		 */
+		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
+				     desc->sector * 512, (uint64_t)desc->num_sectors * 512,
+				     blk_request_complete_cb, task);
+		return blk_request_handle_submit_rc(task, rc);
+	case VIRTIO_BLK_T_WRITE_ZEROES:
+		if (payload_len != sizeof(*desc)) {
+			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
+			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+			return -1;
+		}
+		desc = task->iovs[1].iov_base;
+
+		/* Zeroing combined with unmapping the range is not supported by SPDK. */
+		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
+			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+			return -1;
+		}
+
+		/* Same 32-bit widening as for discard. */
+		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
+					    desc->sector * 512, (uint64_t)desc->num_sectors * 512,
+					    blk_request_complete_cb, task);
+		return blk_request_handle_submit_rc(task, rc);
+	case VIRTIO_BLK_T_FLUSH:
+		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
+		if (req->sector != 0) {
+			SPDK_NOTICELOG("sector must be zero for flush command\n");
+			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+			return -1;
+		}
+		/* Flush always covers the whole bdev. */
+		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
+				     0, flush_bytes,
+				     blk_request_complete_cb, task);
+		return blk_request_handle_submit_rc(task, rc);
+	case VIRTIO_BLK_T_GET_ID:
+		if (!task->iovcnt || !payload_len) {
+			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+			return -1;
+		}
+		/* Answered inline: the bdev product name, space padded, at most VIRTIO_BLK_ID_BYTES long. */
+		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
+		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
+				task->used_len, ' ');
+		blk_request_finish(true, task);
+		break;
+	default:
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
+		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Pick the task slot for the request at req_idx, claim it and run the
+ * request through process_blk_request().
+ */
+static void
+process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
+{
+	struct spdk_vhost_blk_task *slot;
+	uint16_t slot_idx, desc_cnt = 0;
+
+	if (vq->packed.packed_ring) {
+		/* On a packed ring the task array is indexed by the descriptor's buffer id,
+		 * not by req_idx. Per the original author's note: the kernel driver takes
+		 * the buffer id from vq->free_head, so it lies in 0 ~ vring.size and is
+		 * unique among outstanding requests. req_idx is not suitable because a
+		 * descriptor position can be reused in the next wrap phase while the
+		 * request that used it in the previous phase is still outstanding; the
+		 * slot at that position would then still be marked used.
+		 * A split ring does not have this problem: a descriptor goes back to the
+		 * driver's free list only when the device completes the request, so
+		 * req_idx itself is unique among outstanding requests.
+		 */
+		slot_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &desc_cnt);
+	} else {
+		slot_idx = req_idx;
+	}
+
+	slot = (struct spdk_vhost_blk_task *)vq->tasks + slot_idx;
+	if (spdk_unlikely(slot->used)) {
+		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
+			    slot->bvsession->vsession.name, slot_idx);
+		slot->used_len = 0;
+		blk_task_enqueue(slot);
+		return;
+	}
+
+	if (vq->packed.packed_ring) {
+		slot->req_idx = req_idx;
+		slot->num_descs = desc_cnt;
+		slot->buffer_id = slot_idx;
+	}
+
+	slot->bvsession->vsession.task_cnt++;
+
+	blk_task_init(slot);
+
+	if (process_blk_request(slot, slot->bvsession, vq) != 0) {
+		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", slot, slot_idx);
+		return;
+	}
+
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", slot,
+		      slot_idx);
+}
+
+/*
+ * Replay the requests recorded in the virtqueue's resubmit list, walking the
+ * list from its last entry to its first, then free the list.
+ *
+ * Does nothing when there is no resubmit info or no list.
+ */
+static void
+submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
+		     struct spdk_vhost_virtqueue *vq)
+{
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	spdk_vhost_resubmit_info *info = vq->vring_inflight.resubmit_inflight;
+	spdk_vhost_resubmit_desc *entries;
+	uint16_t idx;
+
+	if (spdk_likely(info == NULL || info->resubmit_list == NULL)) {
+		return;
+	}
+
+	entries = info->resubmit_list;
+	/* NOTE(review): the post-decrement also runs on the final, failing test, so
+	 * resubmit_num ends one step below zero; the list pointer is cleared below,
+	 * which makes later calls return early.
+	 */
+	while (info->resubmit_num-- > 0) {
+		idx = entries[info->resubmit_num].index;
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n",
+			      idx);
+
+		if (spdk_likely(idx < vq->vring.size)) {
+			process_blk_task(vq, idx);
+			continue;
+		}
+
+		SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
+			    vsession->name, idx, vq->vring.size);
+		vhost_vq_used_ring_enqueue(vsession, vq, idx, 0);
+	}
+
+	free(entries);
+	info->resubmit_list = NULL;
+}
+
+/*
+ * Service one split virtqueue: replay pending in-flight requests, then fetch
+ * up to SPDK_VHOST_VQ_MAX_SUBMISSIONS new requests from the avail ring and
+ * process each of them.
+ */
+static void
+process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
+{
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	uint16_t avail[SPDK_VHOST_VQ_MAX_SUBMISSIONS];
+	uint16_t fetched, k, idx;
+
+	submit_inflight_desc(bvsession, vq);
+
+	fetched = vhost_vq_avail_ring_get(vq, avail, SPDK_COUNTOF(avail));
+
+	for (k = 0; k < fetched; k++) {
+		idx = avail[k];
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
+			      idx);
+
+		if (spdk_unlikely(idx >= vq->vring.size)) {
+			/* Index outside the ring: return it to the guest unprocessed. */
+			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
+				    vsession->name, idx, vq->vring.size);
+			vhost_vq_used_ring_enqueue(vsession, vq, idx, 0);
+			continue;
+		}
+
+		rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, idx);
+
+		process_blk_task(vq, idx);
+	}
+}
+
+/*
+ * Service one packed virtqueue: process available requests, starting at
+ * vq->last_avail_idx each time, up to SPDK_VHOST_VQ_MAX_SUBMISSIONS per call.
+ */
+static void
+process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
+{
+	uint16_t handled;
+
+	for (handled = 0; handled < SPDK_VHOST_VQ_MAX_SUBMISSIONS; handled++) {
+		if (!vhost_vq_packed_ring_is_avail(vq)) {
+			break;
+		}
+
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
+			      vq->last_avail_idx);
+
+		process_blk_task(vq, vq->last_avail_idx);
+	}
+}
+
+/*
+ * Request poller used while the session has a bdev: service every virtqueue
+ * of the session, then signal used-ring updates. Always returns
+ * SPDK_POLLER_BUSY.
+ */
+static int
+vdev_worker(void *arg)
+{
+	struct spdk_vhost_blk_session *bvsession = arg;
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	struct spdk_vhost_virtqueue *vq;
+	/* In a session, every vq supports the same format */
+	bool is_packed = vsession->virtqueue[0].packed.packed_ring;
+	uint16_t q;
+
+	for (q = 0; q < vsession->max_queues; q++) {
+		vq = &vsession->virtqueue[q];
+		if (!is_packed) {
+			process_vq(bvsession, vq);
+		} else {
+			process_packed_vq(bvsession, vq);
+		}
+	}
+
+	vhost_session_used_signal(vsession);
+
+	return SPDK_POLLER_BUSY;
+}
+
+/*
+ * Split-ring handler used when no bdev is attached: take at most one request
+ * from the avail ring, mark it as IOERR if its descriptor chain can be parsed,
+ * and return it to the guest with a used length of 0.
+ */
+static void
+no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
+{
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
+	volatile uint8_t *status;
+	uint16_t iovcnt = SPDK_COUNTOF(iovs);
+	uint16_t req_idx;
+	uint32_t length;
+	int rc;
+
+	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
+		return;
+	}
+
+	rc = blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length);
+	if (rc == 0) {
+		/* The status byte is the last iov of a well-formed request. */
+		status = iovs[iovcnt - 1].iov_base;
+		*status = VIRTIO_BLK_S_IOERR;
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
+	}
+
+	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
+}
+
+/*
+ * Packed-ring handler used when no bdev is attached: take at most one
+ * available request, mark it as IOERR if its descriptor chain can be parsed,
+ * and return it to the guest.
+ */
+static void
+no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
+{
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	struct spdk_vhost_blk_task *task;
+	uint32_t length;
+	uint16_t req_idx = vq->last_avail_idx;
+	uint16_t task_idx, num_descs;
+
+	if (!vhost_vq_packed_ring_is_avail(vq)) {
+		return;
+	}
+
+	task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
+	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
+	if (spdk_unlikely(task->used)) {
+		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
+			    vsession->name, req_idx);
+		vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
+					     task->buffer_id, task->used_len);
+		return;
+	}
+
+	task->req_idx = req_idx;
+	task->num_descs = num_descs;
+	task->buffer_id = task_idx;
+	blk_task_init(task);
+
+	/* Only a successfully parsed chain has a valid iovcnt and a populated iovs[];
+	 * on failure iovcnt is still the array capacity, so the status byte must not
+	 * be touched. This matches the split-ring handler, no_bdev_process_vq().
+	 */
+	if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
+					&length) == 0) {
+		*(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR;
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
+	}
+
+	task->used = false;
+	vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
+				     task->buffer_id, task->used_len);
+}
+
+/*
+ * Request poller used when the session has no bdev: fail incoming requests on
+ * every virtqueue, signal the guest, and drop the session's I/O channel once
+ * no tasks are outstanding. Always returns SPDK_POLLER_BUSY.
+ */
+static int
+no_bdev_vdev_worker(void *arg)
+{
+	struct spdk_vhost_blk_session *bvsession = arg;
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	struct spdk_vhost_virtqueue *vq;
+	/* In a session, every vq supports the same format */
+	bool is_packed = vsession->virtqueue[0].packed.packed_ring;
+	uint16_t q;
+
+	for (q = 0; q < vsession->max_queues; q++) {
+		vq = &vsession->virtqueue[q];
+		if (!is_packed) {
+			no_bdev_process_vq(bvsession, vq);
+		} else {
+			no_bdev_process_packed_vq(bvsession, vq);
+		}
+	}
+
+	vhost_session_used_signal(vsession);
+
+	if (bvsession->io_channel && vsession->task_cnt == 0) {
+		spdk_put_io_channel(bvsession->io_channel);
+		bvsession->io_channel = NULL;
+	}
+
+	return SPDK_POLLER_BUSY;
+}
+
+/*
+ * Downcast a generic session to a vhost-blk session. The cast relies on
+ * vsession being the first member of struct spdk_vhost_blk_session; the
+ * backend of the owning device is asserted to be the blk backend.
+ */
+static struct spdk_vhost_blk_session *
+to_blk_session(struct spdk_vhost_session *vsession)
+{
+	struct spdk_vhost_blk_session *blk_session = (struct spdk_vhost_blk_session *)vsession;
+
+	assert(vsession->vdev->backend == &vhost_blk_device_backend);
+	return blk_session;
+}
+
+/*
+ * Downcast a generic vhost device to a vhost-blk device.
+ *
+ * Returns NULL for a NULL argument (silently) and for a device owned by a
+ * different backend (after logging an error).
+ */
+static struct spdk_vhost_blk_dev *
+to_blk_dev(struct spdk_vhost_dev *vdev)
+{
+	if (vdev != NULL && vdev->backend == &vhost_blk_device_backend) {
+		return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
+	}
+
+	if (vdev != NULL) {
+		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
+	}
+
+	return NULL;
+}
+
+/*
+ * Per-session resize handler. When built against DPDK 20.02 or newer it asks
+ * rte_vhost to send a config-change message for this session's vid; with an
+ * older DPDK it only logs that resize is unavailable. vdev and ctx are unused.
+ * Always returns 0.
+ */
+static int
+vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev,
+			     struct spdk_vhost_session *vsession,
+			     void *ctx)
+{
+#if RTE_VERSION < RTE_VERSION_NUM(20, 02, 0, 0)
+	SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n");
+#else
+	SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid);
+	rte_vhost_slave_config_change(vsession->vid, false);
+#endif
+
+	return 0;
+}
+
+/*
+ * Handle a bdev resize: with the vhost lock held, run
+ * vhost_session_bdev_resize_cb() for every session of the controller passed
+ * as resize_ctx. No completion callback is registered.
+ */
+static void
+blk_resize_cb(void *resize_ctx)
+{
+	struct spdk_vhost_blk_dev *resized = resize_ctx;
+	struct spdk_vhost_dev *vdev = &resized->vdev;
+
+	spdk_vhost_lock();
+	vhost_dev_foreach_session(vdev, vhost_session_bdev_resize_cb, NULL, NULL);
+	spdk_vhost_unlock();
+}
+
+/*
+ * Completion step of a bdev hot-remove, run after every session has been
+ * notified: release the dummy I/O channel, close the bdev descriptor and
+ * clear the controller's bdev pointers. ctx is unused.
+ */
+static void
+vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
+{
+	struct spdk_vhost_blk_dev *bvdev;
+
+	/* All sessions have been notified, time to close the bdev */
+	bvdev = to_blk_dev(vdev);
+	assert(bvdev != NULL);
+
+	spdk_put_io_channel(bvdev->dummy_io_channel);
+	spdk_bdev_close(bvdev->bdev_desc);
+
+	bvdev->bdev = NULL;
+	bvdev->bdev_desc = NULL;
+}
+
+/*
+ * Per-session step of a bdev hot-remove: if the session currently has a
+ * request poller, replace it with the no-bdev worker. vdev and ctx are
+ * unused. Always returns 0.
+ */
+static int
+vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
+			     struct spdk_vhost_session *vsession,
+			     void *ctx)
+{
+	struct spdk_vhost_blk_session *blk_session = (struct spdk_vhost_blk_session *)vsession;
+
+	if (blk_session->requestq_poller == NULL) {
+		/* Session is not running; nothing to swap. */
+		return 0;
+	}
+
+	spdk_poller_unregister(&blk_session->requestq_poller);
+	blk_session->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, blk_session, 0);
+
+	return 0;
+}
+
+/*
+ * Handle a bdev hot-remove for the controller passed as remove_ctx: warn,
+ * then, with the vhost lock held, switch every session over via
+ * vhost_session_bdev_remove_cb() and register
+ * vhost_dev_bdev_remove_cpl_cb() as the completion step.
+ */
+static void
+bdev_remove_cb(void *remove_ctx)
+{
+	struct spdk_vhost_blk_dev *removed = remove_ctx;
+	struct spdk_vhost_dev *vdev = &removed->vdev;
+
+	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
+		     vdev->name);
+
+	spdk_vhost_lock();
+	vhost_dev_foreach_session(vdev, vhost_session_bdev_remove_cb,
+				  vhost_dev_bdev_remove_cpl_cb, NULL);
+	spdk_vhost_unlock();
+}
+
+/*
+ * bdev event callback: dispatch REMOVE and RESIZE events to their handlers,
+ * passing event_ctx through, and log any other event type as unsupported.
+ */
+static void
+bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
+	      void *event_ctx)
+{
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Bdev event: type %d, name %s\n",
+		      type,
+		      bdev->name);
+
+	if (type == SPDK_BDEV_EVENT_REMOVE) {
+		SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name);
+		bdev_remove_cb(event_ctx);
+	} else if (type == SPDK_BDEV_EVENT_RESIZE) {
+		SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name);
+		blk_resize_cb(event_ctx);
+	} else {
+		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
+	}
+}
+
+/* Free the task array of every virtqueue of the session that has one, and clear the pointer. */
+static void
+free_task_pool(struct spdk_vhost_blk_session *bvsession)
+{
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	struct spdk_vhost_virtqueue *vq;
+	uint16_t q;
+
+	for (q = 0; q < vsession->max_queues; q++) {
+		vq = &vsession->virtqueue[q];
+		if (vq->tasks != NULL) {
+			spdk_free(vq->tasks);
+			vq->tasks = NULL;
+		}
+	}
+}
+
+/*
+ * Allocate one zeroed task array per virtqueue that has a descriptor ring,
+ * with one task per ring entry, and bind every task to its session,
+ * virtqueue and slot index.
+ *
+ * Returns 0 on success. On an oversized ring or an allocation failure all
+ * arrays allocated so far are freed and -1 is returned.
+ */
+static int
+alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
+{
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	struct spdk_vhost_virtqueue *vq;
+	struct spdk_vhost_blk_task *pool;
+	uint32_t entries;
+	uint32_t slot;
+	uint16_t q;
+
+	for (q = 0; q < vsession->max_queues; q++) {
+		vq = &vsession->virtqueue[q];
+		if (vq->vring.desc == NULL) {
+			/* Queue has no ring; it gets no task array. */
+			continue;
+		}
+
+		entries = vq->vring.size;
+		if (entries > SPDK_VHOST_MAX_VQ_SIZE) {
+			/* sanity check */
+			SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
+				    vsession->name, q, entries, SPDK_VHOST_MAX_VQ_SIZE);
+			free_task_pool(bvsession);
+			return -1;
+		}
+
+		pool = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * entries,
+				    SPDK_CACHE_LINE_SIZE, NULL,
+				    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+		vq->tasks = pool;
+		if (pool == NULL) {
+			SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
+				    vsession->name, entries, q);
+			free_task_pool(bvsession);
+			return -1;
+		}
+
+		for (slot = 0; slot < entries; slot++) {
+			pool[slot].bvsession = bvsession;
+			pool[slot].req_idx = slot;
+			pool[slot].vq = vq;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Start a vhost-blk session: verify that every queue has a ring, allocate
+ * the task arrays, obtain a bdev I/O channel if a bdev is attached, and
+ * register the request poller (the no-bdev worker when there is no bdev).
+ *
+ * The outcome is always reported through vhost_session_start_done() and is
+ * also returned: 0 on success, non-zero on failure.
+ */
+static int
+vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
+		   struct spdk_vhost_session *vsession, void *unused)
+{
+	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
+	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
+	int rc = 0;
+	int i;
+
+	assert(bvdev != NULL);
+	bvsession->bvdev = bvdev;
+
+	/* validate all I/O queues are in a contiguous index range */
+	for (i = 0; i < vsession->max_queues; i++) {
+		/* vring.desc and vring.desc_packed are in a union struct
+		 * so q->vring.desc can replace q->vring.desc_packed.
+		 */
+		if (vsession->virtqueue[i].vring.desc != NULL) {
+			continue;
+		}
+
+		SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
+		rc = -1;
+		goto out;
+	}
+
+	rc = alloc_task_pool(bvsession);
+	if (rc != 0) {
+		SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
+		goto out;
+	}
+
+	if (bvdev->bdev) {
+		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
+		if (bvsession->io_channel == NULL) {
+			/* Undo the task allocation before reporting the failure. */
+			free_task_pool(bvsession);
+			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
+			rc = -1;
+			goto out;
+		}
+	}
+
+	bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
+				     bvsession, 0);
+	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n",
+		     vsession->name, spdk_env_get_current_core());
+out:
+	vhost_session_start_done(vsession, rc);
+	return rc;
+}
+
+/*
+ * Backend start hook: dispatch vhost_blk_start_cb() for the session through
+ * vhost_session_send_event() and return that call's result.
+ */
+static int
+vhost_blk_start(struct spdk_vhost_session *vsession)
+{
+	int rc;
+
+	rc = vhost_session_send_event(vsession, vhost_blk_start_cb,
+				      3, "start session");
+	return rc;
+}
+
+/*
+ * Stop poller: once no tasks are outstanding and the vhost lock can be taken
+ * without blocking, send a final used-ring signal on every queue, release the
+ * I/O channel and task arrays, unregister this poller and report the stop as
+ * done. Until then it simply retries on its next run.
+ * Always returns SPDK_POLLER_BUSY.
+ */
+static int
+destroy_session_poller_cb(void *arg)
+{
+	struct spdk_vhost_blk_session *bvsession = arg;
+	struct spdk_vhost_session *vsession = &bvsession->vsession;
+	struct spdk_vhost_virtqueue *vq;
+	int q;
+
+	/* Wait for outstanding I/O first; only then try to take the lock. */
+	if (vsession->task_cnt > 0) {
+		return SPDK_POLLER_BUSY;
+	}
+
+	if (spdk_vhost_trylock() != 0) {
+		return SPDK_POLLER_BUSY;
+	}
+
+	for (q = 0; q < vsession->max_queues; q++) {
+		vq = &vsession->virtqueue[q];
+		vq->next_event_time = 0;
+		vhost_vq_used_signal(vsession, vq);
+	}
+
+	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n",
+		     vsession->name, spdk_env_get_current_core());
+
+	if (bvsession->io_channel != NULL) {
+		spdk_put_io_channel(bvsession->io_channel);
+		bvsession->io_channel = NULL;
+	}
+
+	free_task_pool(bvsession);
+	spdk_poller_unregister(&bvsession->stop_poller);
+	vhost_session_stop_done(vsession, 0);
+
+	spdk_vhost_unlock();
+	return SPDK_POLLER_BUSY;
+}
+
+/*
+ * Begin stopping a session: unregister the request poller and register
+ * destroy_session_poller_cb() with a period argument of 1000 to finish the
+ * teardown. vdev and unused are not used. Always returns 0.
+ */
+static int
+vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
+		  struct spdk_vhost_session *vsession, void *unused)
+{
+	struct spdk_vhost_blk_session *blk_session = to_blk_session(vsession);
+
+	spdk_poller_unregister(&blk_session->requestq_poller);
+	blk_session->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
+				   blk_session, 1000);
+
+	return 0;
+}
+
+/*
+ * Backend stop hook: dispatch vhost_blk_stop_cb() for the session through
+ * vhost_session_send_event() and return that call's result.
+ */
+static int
+vhost_blk_stop(struct spdk_vhost_session *vsession)
+{
+	int rc;
+
+	rc = vhost_session_send_event(vsession, vhost_blk_stop_cb,
+				      3, "stop session");
+	return rc;
+}
+
+/*
+ * Write a "block" JSON object describing the controller: its readonly flag
+ * and the name of the attached bdev, or null when no bdev is attached.
+ */
+static void
+vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
+
+	assert(bvdev != NULL);
+
+	spdk_json_write_named_object_begin(w, "block");
+	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
+
+	spdk_json_write_name(w, "bdev");
+	if (bvdev->bdev == NULL) {
+		spdk_json_write_null(w);
+	} else {
+		spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev));
+	}
+
+	spdk_json_write_object_end(w);
+}
+
+/*
+ * Emit a "vhost_create_blk_controller" JSON object for this device with its
+ * controller name, bdev name, cpumask and readonly flag as params.
+ * Nothing is written when the device has no bdev attached.
+ */
+static void
+vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	struct spdk_vhost_blk_dev *blk_dev = to_blk_dev(vdev);
+
+	assert(blk_dev != NULL);
+
+	if (blk_dev->bdev != NULL) {
+		spdk_json_write_object_begin(w);
+		spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
+
+		spdk_json_write_named_object_begin(w, "params");
+		spdk_json_write_named_string(w, "ctrlr", vdev->name);
+		spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(blk_dev->bdev));
+		spdk_json_write_named_string(w, "cpumask",
+					     spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
+		spdk_json_write_named_bool(w, "readonly", blk_dev->readonly);
+		spdk_json_write_object_end(w);
+
+		spdk_json_write_object_end(w);
+	}
+}
+
+static int vhost_blk_destroy(struct spdk_vhost_dev *dev); /* forward declaration: vhost_blk_device_backend refers to it before its definition */
+
+/*
+ * Fill in a virtio_blk_config for the device and copy at most \c len bytes
+ * of it into \c config.
+ *
+ * \param vdev vhost device; must be a blk device.
+ * \param config destination buffer.
+ * \param len size of \c config; the copy is truncated to
+ * sizeof(struct virtio_blk_config) when \c len is larger.
+ * \return always 0.
+ */
+static int
+vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
+		     uint32_t len)
+{
+	struct virtio_blk_config blkcfg;
+	struct spdk_vhost_blk_dev *bvdev;
+	struct spdk_bdev *bdev;
+	/* We can't just return -1 when there is no bdev, as this GET_CONFIG
+	 * message might be caused by a QEMU VM reboot. Returning -1 will
+	 * indicate an error to QEMU, who might then decide to terminate itself.
+	 * We don't want that. A simple reboot shouldn't break the system.
+	 *
+	 * Presenting a block device with block size 0 and block count 0
+	 * doesn't cause any problems on QEMU side and the virtio-pci
+	 * device is even still available inside the VM, but there will
+	 * be no block device created for it - the kernel drivers will
+	 * silently reject it.
+	 */
+	uint32_t blk_size = 0;
+	uint64_t blkcnt = 0;
+
+	memset(&blkcfg, 0, sizeof(blkcfg));
+	bvdev = to_blk_dev(vdev);
+	assert(bvdev != NULL);
+	bdev = bvdev->bdev;
+
+	if (bdev != NULL) {
+		blk_size = spdk_bdev_get_block_size(bdev);
+		blkcnt = spdk_bdev_get_num_blocks(bdev);
+		if (spdk_bdev_get_buf_align(bdev) > 1) {
+			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
+			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
+		} else {
+			blkcfg.size_max = 131072;
+			/* -2 for REQ and RESP and -1 for region boundary splitting */
+			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
+		}
+	}
+
+	blkcfg.blk_size = blk_size;
+	/* minimum I/O size in blocks */
+	blkcfg.min_io_size = 1;
+	/* expressed in 512 Bytes sectors */
+	blkcfg.capacity = (blkcnt * blk_size) / 512;
+	/* QEMU can overwrite this value when started */
+	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
+
+	if (bdev != NULL) {
+		if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+			/* 16MiB, expressed in 512 Bytes */
+			blkcfg.max_discard_sectors = 32768;
+			blkcfg.max_discard_seg = 1;
+			blkcfg.discard_sector_alignment = blk_size / 512;
+		}
+		if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
+			blkcfg.max_write_zeroes_sectors = 32768;
+			blkcfg.max_write_zeroes_seg = 1;
+		}
+	}
+
+	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
+
+	return 0;
+}
+
+static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { /* callback table passed to vhost_dev_register() by spdk_vhost_blk_construct() */
+ .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session), /* only the blk-specific part beyond the embedded generic session */
+ .start_session = vhost_blk_start,
+ .stop_session = vhost_blk_stop,
+ .vhost_get_config = vhost_blk_get_config,
+ .dump_info_json = vhost_blk_dump_info_json,
+ .write_config_json = vhost_blk_write_config_json,
+ .remove_device = vhost_blk_destroy,
+}; /* .vhost_set_config is not set, so it stays NULL */
+
+/*
+ * Create a vhost blk controller for each "VhostBlk<N>" section of the
+ * configuration file.
+ *
+ * A section must have a numeric suffix and a "Name" entry. Sections without
+ * a "Dev" entry are skipped without an error. "Cpumask", "ReadOnly" and
+ * "PackedRing" are optional.
+ *
+ * \return 0 on success, -1 on the first malformed section or failed
+ * controller construction.
+ */
+int
+vhost_blk_controller_construct(void)
+{
+	struct spdk_conf_section *sp;
+	const char *section_name;
+	unsigned ctrlr_num;
+	char *bdev_name;
+	char *cpumask;
+	char *name;
+	bool readonly;
+	bool packed_ring;
+
+	for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
+		if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
+			continue;
+		}
+
+		section_name = spdk_conf_section_get_name(sp);
+		if (sscanf(section_name, "VhostBlk%u", &ctrlr_num) != 1) {
+			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+				    section_name);
+			return -1;
+		}
+
+		name = spdk_conf_section_get_val(sp, "Name");
+		if (name == NULL) {
+			SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
+			return -1;
+		}
+
+		/* Without a backing device there is nothing to construct. */
+		bdev_name = spdk_conf_section_get_val(sp, "Dev");
+		if (bdev_name == NULL) {
+			continue;
+		}
+
+		cpumask = spdk_conf_section_get_val(sp, "Cpumask");
+		readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);
+		packed_ring = spdk_conf_section_get_boolval(sp, "PackedRing", false);
+
+		if (spdk_vhost_blk_construct(name, cpumask, bdev_name,
+					     readonly, packed_ring) < 0) {
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Create and register a vhost blk controller backed by an existing bdev.
+ *
+ * The whole operation runs under the global vhost lock.
+ *
+ * \param name controller name.
+ * \param cpumask cpumask string handed to vhost_dev_register().
+ * \param dev_name name of the bdev to expose.
+ * \param readonly advertise VIRTIO_BLK_F_RO when true.
+ * \param packed_ring advertise VIRTIO_F_RING_PACKED when true.
+ * \return 0 on success; -ENODEV if the bdev does not exist; -ENOMEM if the
+ * controller or its I/O channel cannot be allocated; otherwise the error
+ * returned by spdk_bdev_open_ext() or vhost_dev_register().
+ */
+int
+spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name,
+			 bool readonly, bool packed_ring)
+{
+	struct spdk_vhost_blk_dev *bvdev = NULL;
+	struct spdk_vhost_dev *vdev;
+	struct spdk_bdev *bdev;
+	int ret = 0;
+
+	spdk_vhost_lock();
+	bdev = spdk_bdev_get_by_name(dev_name);
+	if (bdev == NULL) {
+		SPDK_ERRLOG("%s: bdev '%s' not found\n",
+			    name, dev_name);
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bvdev = calloc(1, sizeof(*bvdev));
+	if (bvdev == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	vdev = &bvdev->vdev;
+	vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
+	vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
+	vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
+
+	vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED;
+
+	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
+	}
+	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
+		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
+	}
+	if (readonly) {
+		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
+	}
+	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
+		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
+	}
+
+	ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc);
+	if (ret != 0) {
+		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
+			    name, dev_name, ret);
+		goto out;
+	}
+
+	/*
+	 * When starting qemu with vhost-user-blk multiqueue, the vhost device will
+	 * be started/stopped many times, depending on the number of queues, as the
+	 * vhost-user backend doesn't know the exact number of queues used for this
+	 * device. The target has to stop and start the device each time it gets a
+	 * valid IO queue.
+	 * When stopping and starting the vhost device, the backend bdev io device
+	 * would be deleted and created repeatedly.
+	 * Hold an I/O channel reference in the struct spdk_vhost_blk_dev so that
+	 * the io device will not be deleted.
+	 */
+	bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
+	if (bvdev->dummy_io_channel == NULL) {
+		/* Never register a device whose channel would later be put as NULL. */
+		SPDK_ERRLOG("%s: could not get I/O channel for bdev '%s'\n",
+			    name, dev_name);
+		spdk_bdev_close(bvdev->bdev_desc);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	bvdev->bdev = bdev;
+	bvdev->readonly = readonly;
+	ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
+	if (ret != 0) {
+		spdk_put_io_channel(bvdev->dummy_io_channel);
+		spdk_bdev_close(bvdev->bdev_desc);
+		goto out;
+	}
+
+	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name);
+out:
+	if (ret != 0) {
+		/* bvdev may still be NULL here; free(NULL) is a no-op. */
+		free(bvdev);
+	}
+	spdk_vhost_unlock();
+	return ret;
+}
+
+/*
+ * Unregister a blk controller and release what it holds: the dummy I/O
+ * channel (only while a bdev is still attached), the bdev descriptor and the
+ * controller structure itself.
+ *
+ * \return 0 on success, or the error from vhost_dev_unregister(), in which
+ * case nothing is released.
+ */
+static int
+vhost_blk_destroy(struct spdk_vhost_dev *vdev)
+{
+	struct spdk_vhost_blk_dev *blk_dev = to_blk_dev(vdev);
+	int status;
+
+	assert(blk_dev != NULL);
+
+	status = vhost_dev_unregister(&blk_dev->vdev);
+	if (status == 0) {
+		/* if the bdev is removed, don't need call spdk_put_io_channel. */
+		if (blk_dev->bdev != NULL) {
+			spdk_put_io_channel(blk_dev->dummy_io_channel);
+		}
+
+		if (blk_dev->bdev_desc != NULL) {
+			spdk_bdev_close(blk_dev->bdev_desc);
+			blk_dev->bdev_desc = NULL;
+		}
+		blk_dev->bdev = NULL;
+
+		free(blk_dev);
+	}
+
+	return status;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
+SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
diff --git a/src/spdk/lib/vhost/vhost_internal.h b/src/spdk/lib/vhost/vhost_internal.h
new file mode 100644
index 000000000..3aa89768a
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_internal.h
@@ -0,0 +1,496 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VHOST_INTERNAL_H
+#define SPDK_VHOST_INTERNAL_H
+#include <linux/virtio_config.h>
+
+#include "spdk/stdinc.h"
+
+#include <rte_vhost.h>
+
+#include "spdk_internal/vhost_user.h"
+#include "spdk_internal/log.h"
+#include "spdk/util.h"
+#include "spdk/rpc.h"
+#include "spdk/config.h"
+
+#define SPDK_VHOST_MAX_VQUEUES 256
+#define SPDK_VHOST_MAX_VQ_SIZE 1024
+
+#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8
+
+#define SPDK_VHOST_IOVS_MAX 129
+
+#define SPDK_VHOST_VQ_MAX_SUBMISSIONS 32
+
+/*
+ * Rate at which stats are checked for interrupt coalescing.
+ */
+#define SPDK_VHOST_STATS_CHECK_INTERVAL_MS 10
+/*
+ * Default threshold at which interrupts start to be coalesced.
+ */
+#define SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD 60000
+
+/*
+ * Currently coalescing is not used by default.
+ * Setting this to value > 0 here or by RPC will enable coalescing.
+ */
+#define SPDK_VHOST_COALESCING_DELAY_BASE_US 0
+
+#define SPDK_VHOST_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
+ (1ULL << VIRTIO_RING_F_EVENT_IDX) | \
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
+ (1ULL << VIRTIO_F_RING_PACKED))
+
+#define SPDK_VHOST_DISABLED_FEATURES ((1ULL << VIRTIO_RING_F_EVENT_IDX) | \
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY))
+
+#define VRING_DESC_F_AVAIL (1ULL << VRING_PACKED_DESC_F_AVAIL)
+#define VRING_DESC_F_USED (1ULL << VRING_PACKED_DESC_F_USED)
+#define VRING_DESC_F_AVAIL_USED (VRING_DESC_F_AVAIL | VRING_DESC_F_USED)
+
+typedef struct rte_vhost_resubmit_desc spdk_vhost_resubmit_desc;
+typedef struct rte_vhost_resubmit_info spdk_vhost_resubmit_info;
+
+struct spdk_vhost_virtqueue {
+ struct rte_vhost_vring vring;
+ struct rte_vhost_ring_inflight vring_inflight;
+ uint16_t last_avail_idx;
+ uint16_t last_used_idx;
+
+ struct {
+ /* To mark a descriptor as available in packed ring
+ * Equal to avail_wrap_counter in spec.
+ */
+ uint8_t avail_phase : 1;
+ /* To mark a descriptor as used in packed ring
+ * Equal to used_wrap_counter in spec.
+ */
+ uint8_t used_phase : 1;
+ uint8_t padding : 5; /* the four bit-fields here add up to 8 bits */
+ bool packed_ring : 1;
+ } packed;
+
+ void *tasks;
+
+ /* Request count from last stats check */
+ uint32_t req_cnt;
+
+ /* Request count from last event */
+ uint16_t used_req_cnt;
+
+ /* How long interrupt is delayed */
+ uint32_t irq_delay_time;
+
+ /* Next time when we need to send event */
+ uint64_t next_event_time;
+
+ /* Associated vhost_virtqueue in the virtio device's virtqueue list */
+ uint32_t vring_idx;
+} __attribute((aligned(SPDK_CACHE_LINE_SIZE))); /* every virtqueue is aligned to SPDK_CACHE_LINE_SIZE */
+
+struct spdk_vhost_session {
+ struct spdk_vhost_dev *vdev;
+
+ /* rte_vhost connection ID. */
+ int vid;
+
+ /* Unique session ID. */
+ uint64_t id;
+ /* Unique session name. */
+ char *name;
+
+ bool initialized;
+ bool started;
+ bool needs_restart;
+ bool forced_polling;
+
+ struct rte_vhost_memory *mem;
+
+ int task_cnt; /* the vhost_blk stop poller does not tear the session down while this is > 0 */
+
+ uint16_t max_queues; /* upper bound used when iterating over virtqueue[] */
+
+ uint64_t negotiated_features; /* bitmask tested by vhost_dev_has_feature() */
+
+ /* Local copy of device coalescing settings. */
+ uint32_t coalescing_delay_time_base;
+ uint32_t coalescing_io_rate_threshold;
+
+ /* Next time when stats for event coalescing will be checked. */
+ uint64_t next_stats_check_time;
+
+ /* Interval used for event coalescing checking. */
+ uint64_t stats_check_interval;
+
+ struct spdk_vhost_virtqueue virtqueue[SPDK_VHOST_MAX_VQUEUES];
+
+ TAILQ_ENTRY(spdk_vhost_session) tailq;
+};
+
+struct spdk_vhost_dev {
+ char *name;
+ char *path;
+
+ struct spdk_thread *thread;
+ bool registered;
+
+ uint64_t virtio_features;
+ uint64_t disabled_features;
+ uint64_t protocol_features;
+
+ const struct spdk_vhost_dev_backend *backend;
+
+ /* Saved original values used to set up coalescing, to avoid integer
+ * rounding issues during save/load config.
+ */
+ uint32_t coalescing_delay_us;
+ uint32_t coalescing_iops_threshold;
+
+ /* Current connections to the device */
+ TAILQ_HEAD(, spdk_vhost_session) vsessions;
+
+ /* Increment-only session counter */
+ uint64_t vsessions_num;
+
+ /* Number of started and actively polled sessions */
+ uint32_t active_session_num;
+
+ /* Number of pending asynchronous operations */
+ uint32_t pending_async_op_num;
+
+ TAILQ_ENTRY(spdk_vhost_dev) tailq;
+};
+
+/**
+ * \param vdev vhost device.
+ * \param vsession vhost session.
+ * \param arg user-provided parameter.
+ *
+ * \return negative values will break the foreach call, meaning
+ * the function won't be called again. Return codes zero and
+ * positive don't have any effect.
+ */
+typedef int (*spdk_vhost_session_fn)(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession,
+ void *arg);
+
+/**
+ * \param vdev vhost device.
+ * \param arg user-provided parameter.
+ */
+typedef void (*spdk_vhost_dev_fn)(struct spdk_vhost_dev *vdev, void *arg);
+
+struct spdk_vhost_dev_backend {
+ /**
+ * Size of additional per-session context data
+ * allocated whenever a new client connects.
+ */
+ size_t session_ctx_size;
+
+ int (*start_session)(struct spdk_vhost_session *vsession);
+ int (*stop_session)(struct spdk_vhost_session *vsession);
+
+ int (*vhost_get_config)(struct spdk_vhost_dev *vdev, uint8_t *config, uint32_t len);
+ int (*vhost_set_config)(struct spdk_vhost_dev *vdev, uint8_t *config,
+ uint32_t offset, uint32_t size, uint32_t flags);
+
+ void (*dump_info_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
+ void (*write_config_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
+ int (*remove_device)(struct spdk_vhost_dev *vdev);
+};
+
+void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len);
+
+uint16_t vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *vq, uint16_t *reqs,
+ uint16_t reqs_len);
+
+/**
+ * Get a virtio split descriptor at given index in given virtqueue.
+ * The descriptor will provide access to the entire descriptor
+ * chain. The subsequent descriptors are accessible via
+ * \c vhost_vring_desc_get_next.
+ * \param vsession vhost session
+ * \param vq virtqueue
+ * \param req_idx descriptor index
+ * \param desc pointer to be set to the descriptor
+ * \param desc_table descriptor table to be used with
+ * \c vhost_vring_desc_get_next. This might be either
+ * default virtqueue descriptor table or per-chain indirect
+ * table.
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid.
+ * If -1 is returned, the content of params is undefined.
+ */
+int vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq,
+ uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
+ uint32_t *desc_table_size);
+
+/**
+ * Get a virtio packed descriptor at given index in given virtqueue.
+ * The descriptor will provide access to the entire descriptor
+ * chain. The subsequent descriptors are accessible via
+ * \c vhost_vring_packed_desc_get_next.
+ * \param vsession vhost session
+ * \param virtqueue virtqueue
+ * \param req_idx descriptor index
+ * \param desc pointer to be set to the descriptor
+ * \param desc_table descriptor table to be used with
+ * \c vhost_vring_packed_desc_get_next. This might be either
+ * \c NULL or per-chain indirect table.
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid.
+ * If -1 is returned, the content of params is undefined.
+ */
+int vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t req_idx, struct vring_packed_desc **desc,
+ struct vring_packed_desc **desc_table, uint32_t *desc_table_size);
+
+/**
+ * Send IRQ/call client (if pending) for \c vq.
+ * \param vsession vhost session
+ * \param vq virtqueue
+ * \return
+ * 0 - if no interrupt was signalled
+ * 1 - if interrupt was signalled
+ */
+int vhost_vq_used_signal(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq);
+
+
+/**
+ * Send IRQs for all queues that need to be signaled.
+ * \param vsession vhost session
+ */
+void vhost_session_used_signal(struct spdk_vhost_session *vsession);
+
+void vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *vq,
+ uint16_t id, uint32_t len);
+
+/**
+ * Enqueue the entry to the used ring when the device completes the request.
+ * \param vsession vhost session
+ * \param virtqueue virtqueue
+ * \param num_descs descriptor count. It's the count of the number of buffers in the chain.
+ * \param buffer_id descriptor buffer ID.
+ * \param length device write length. Specify the length of the buffer that has been initialized
+ * (written to) by the device
+ */
+void vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t num_descs, uint16_t buffer_id,
+ uint32_t length);
+
+/**
+ * Get subsequent descriptor from given table.
+ * \param desc current descriptor, will be set to the
+ * next descriptor (NULL in case this is the last
+ * descriptor in the chain or the next desc is invalid)
+ * \param desc_table descriptor table
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid
+ * The *desc* param will be set regardless of the
+ * return value.
+ */
+int vhost_vring_desc_get_next(struct vring_desc **desc,
+ struct vring_desc *desc_table, uint32_t desc_table_size);
+/* Tell whether a split-ring descriptor has the VRING_DESC_F_WRITE flag set. */
+static inline bool
+vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
+{
+	return (cur_desc->flags & VRING_DESC_F_WRITE) != 0;
+}
+
+int vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+ uint16_t *iov_index, const struct vring_desc *desc);
+
+bool vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue);
+
+/**
+ * Get subsequent descriptor from vq or desc table.
+ * \param desc current descriptor, will be set to the
+ * next descriptor (NULL in case this is the last
+ * descriptor in the chain or the next desc is invalid)
+ * \param req_idx index of current desc, will be set to the next
+ * index. If desc_table != NULL the req_idx is the vring index
+ * or the req_idx is the desc_table index.
+ * NOTE(review): the condition above may be stated the wrong way
+ * round - confirm against the implementation.
+ * \param vq virtqueue
+ * \param desc_table descriptor table
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid
+ * The *desc* param will be set regardless of the
+ * return value.
+ */
+int vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx,
+ struct spdk_vhost_virtqueue *vq,
+ struct vring_packed_desc *desc_table,
+ uint32_t desc_table_size);
+
+bool vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc);
+
+int vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+ uint16_t *iov_index, const struct vring_packed_desc *desc);
+
+uint16_t vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
+ uint16_t *num_descs);
+
+/* Tell whether bit \c feature_id is set in the session's negotiated features. */
+static inline bool __attribute__((always_inline))
+vhost_dev_has_feature(struct spdk_vhost_session *vsession, unsigned feature_id)
+{
+	const uint64_t feature_bit = 1ULL << feature_id;
+
+	return (vsession->negotiated_features & feature_bit) != 0;
+}
+
+int vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
+ const struct spdk_vhost_dev_backend *backend);
+int vhost_dev_unregister(struct spdk_vhost_dev *vdev);
+
+int vhost_scsi_controller_construct(void);
+int vhost_blk_controller_construct(void);
+void vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
+
+/*
+ * Vhost callbacks for vhost_device_ops interface
+ */
+
+int vhost_new_connection_cb(int vid, const char *ifname);
+int vhost_start_device_cb(int vid);
+int vhost_stop_device_cb(int vid);
+int vhost_destroy_connection_cb(int vid);
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+int vhost_get_config_cb(int vid, uint8_t *config, uint32_t len);
+int vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset,
+ uint32_t size, uint32_t flags);
+#endif
+
+/*
+ * Memory registration functions used in start/stop device callbacks
+ */
+void vhost_session_mem_register(struct rte_vhost_memory *mem);
+void vhost_session_mem_unregister(struct rte_vhost_memory *mem);
+
+/*
+ * Call a function for each session of the provided vhost device.
+ * The function will be called one-by-one on each session's thread.
+ *
+ * \param dev vhost device
+ * \param fn function to call on each session's thread
+ * \param cpl_fn function to be called at the end of the iteration on
+ * the vhost management thread.
+ * Optional, can be NULL.
+ * \param arg additional argument passed to both callbacks
+ */
+void vhost_dev_foreach_session(struct spdk_vhost_dev *dev,
+ spdk_vhost_session_fn fn,
+ spdk_vhost_dev_fn cpl_fn,
+ void *arg);
+
+/**
+ * Call a function on the provided lcore and block until either
+ * vhost_session_start_done() or vhost_session_stop_done()
+ * is called.
+ *
+ * This must be called under the global vhost mutex, which this function
+ * will unlock for the time it's waiting. It's meant to be called only
+ * from start/stop session callbacks.
+ *
+ * \param vsession vhost session
+ * \param cb_fn the function to call. The void *arg parameter in cb_fn
+ * is always NULL.
+ * \param timeout_sec timeout in seconds. This function will still
+ * block after the timeout expires, but will print the provided errmsg.
+ * \param errmsg error message to print once the timeout expires
+ * \return the response code passed to vhost_session_start_done() or
+ * vhost_session_stop_done().
+ */
+int vhost_session_send_event(struct spdk_vhost_session *vsession,
+ spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
+ const char *errmsg);
+
+/**
+ * Finish a blocking vhost_session_send_event() call and finally
+ * start the session. This must be called on the target lcore, which
+ * will now receive all session-related messages (e.g. from
+ * vhost_dev_foreach_session()).
+ *
+ * Must be called under the global vhost lock.
+ *
+ * \param vsession vhost session
+ * \param response return code
+ */
+void vhost_session_start_done(struct spdk_vhost_session *vsession, int response);
+
+/**
+ * Finish a blocking vhost_session_send_event() call and finally
+ * stop the session. This must be called on the session's lcore which
+ * used to receive all session-related messages (e.g. from
+ * vhost_dev_foreach_session()). After this call, the session-
+ * related messages will be once again processed by any arbitrary thread.
+ *
+ * Must be called under the global vhost lock.
+ *
+ * \param vsession vhost session
+ * \param response return code
+ */
+void vhost_session_stop_done(struct spdk_vhost_session *vsession, int response);
+
+struct spdk_vhost_session *vhost_session_find_by_vid(int vid);
+void vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession);
+int vhost_register_unix_socket(const char *path, const char *ctrl_name,
+ uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features);
+int vhost_driver_unregister(const char *path);
+int vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
+int vhost_get_negotiated_features(int vid, uint64_t *negotiated_features);
+
+int remove_vhost_controller(struct spdk_vhost_dev *vdev);
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+int vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf);
+int vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd);
+int vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size);
+int vhost_nvme_get_cap(int vid, uint64_t *cap);
+int vhost_nvme_controller_construct(void);
+int vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues);
+int vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev);
+int vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev,
+ const char *bdev_name);
+#endif
+
+#endif /* SPDK_VHOST_INTERNAL_H */
diff --git a/src/spdk/lib/vhost/vhost_nvme.c b/src/spdk/lib/vhost/vhost_nvme.c
new file mode 100644
index 000000000..10f53baf9
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_nvme.c
@@ -0,0 +1,1500 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/nvme.h"
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/barrier.h"
+#include "spdk/vhost.h"
+#include "spdk/bdev.h"
+#include "spdk/version.h"
+#include "spdk/nvme_spec.h"
+#include "spdk/likely.h"
+
+#include "vhost_internal.h"
+
+/* Highest I/O queue ID accepted; the SQ/CQ arrays hold MAX_IO_QUEUES + 1 slots. */
+#define MAX_IO_QUEUES 31
+/* Capacity of the per-task iovec array. */
+#define MAX_IOVS 64
+/* Number of namespace slots per controller. */
+#define MAX_NAMESPACE 8
+/* Reported (zero-based) through CAP.MQES and used to size the task pool. */
+#define MAX_QUEUE_ENTRIES_SUPPORTED 256
+/* Per-queue batch threshold checked by the request poller. */
+#define MAX_BATCH_IO 8
+
+/* State of one emulated NVMe submission queue. */
+struct spdk_vhost_nvme_sq {
+ uint16_t sqid;
+ /* Number of entries in the ring. */
+ uint16_t size;
+ /* ID of the completion queue that receives this queue's completions. */
+ uint16_t cqid;
+ /* Set when the queue is created, cleared when it is deleted. */
+ bool valid;
+ /* Guest command ring, translated to a host virtual address. */
+ struct spdk_nvme_cmd *sq_cmd;
+ /* Index of the next entry to consume. */
+ uint16_t sq_head;
+ /* Last tail value read from the guest's doorbell. */
+ uint16_t sq_tail;
+};
+
+/* State of one emulated NVMe completion queue. */
+struct spdk_vhost_nvme_cq {
+ /* Phase tag written into posted entries; flipped whenever cq_head wraps. */
+ uint8_t phase;
+ /* Number of entries in the ring. */
+ uint16_t size;
+ uint16_t cqid;
+ /* Set when the queue is created, cleared when it is deleted. */
+ bool valid;
+ /* Guest completion ring, translated to a host virtual address. */
+ volatile struct spdk_nvme_cpl *cq_cqe;
+ /* Slot the next completion will be written to. */
+ uint16_t cq_head;
+ /* Most recent head value read from the guest's CQ doorbell. */
+ uint16_t guest_signaled_cq_head;
+ /* Completions posted since the interrupt was last raised. */
+ uint32_t need_signaled_cnt;
+ /* Finished tasks parked because the ring was full when they completed. */
+ STAILQ_HEAD(, spdk_vhost_nvme_task) cq_full_waited_tasks;
+ /* Taken from the Create I/O CQ command (bit 1 of cdw11). */
+ bool irq_enabled;
+ /* eventfd written to interrupt the guest; -1 until one is supplied. */
+ int virq;
+};
+
+/* One namespace exposed by the controller, backed by a bdev. */
+struct spdk_vhost_nvme_ns {
+ struct spdk_bdev *bdev;
+ /* Block size in bytes, cached from the bdev. */
+ uint32_t block_size;
+ /* Capacity in bytes (number of blocks times block size). */
+ uint64_t capacity;
+ uint32_t nsid;
+ /* Non-zero while the namespace is usable; I/O to an inactive one is rejected. */
+ uint32_t active_ns;
+ struct spdk_bdev_desc *bdev_desc;
+ /* Acquired when the session starts, released when the session is torn down. */
+ struct spdk_io_channel *bdev_io_channel;
+ /* Identify Namespace payload copied to the guest on request. */
+ struct spdk_nvme_ns_data nsdata;
+};
+
+/*
+ * Per-command context. Tasks are taken from the device's free_tasks list
+ * when a command is fetched and put back once its completion is posted.
+ */
+struct spdk_vhost_nvme_task {
+ /* Private copy of the guest's submission queue entry. */
+ struct spdk_nvme_cmd cmd;
+ struct spdk_vhost_nvme_dev *nvme;
+ /* Queue IDs used to route the completion entry. */
+ uint16_t sqid;
+ uint16_t cqid;
+
+ /** array of iovecs to transfer. */
+ struct iovec iovs[MAX_IOVS];
+
+ /** Number of iovecs in iovs array. */
+ int iovcnt;
+
+ /** Current iovec position. */
+ int iovpos;
+
+ /** Offset in current iovec. */
+ uint32_t iov_offset;
+
+ /* for bdev_io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+ /* Submission queue to replay the command on after a bdev_io_wait wakeup. */
+ struct spdk_vhost_nvme_sq *sq;
+ struct spdk_vhost_nvme_ns *ns;
+
+ /* parent pointer. */
+ struct spdk_vhost_nvme_task *parent;
+ /* Status fields copied into the completion queue entry. */
+ uint8_t dnr;
+ uint8_t sct;
+ uint8_t sc;
+ /* Outstanding child requests (one per Dataset Management range). */
+ uint32_t num_children;
+ STAILQ_ENTRY(spdk_vhost_nvme_task) stailq;
+};
+
+/* A vhost-nvme controller: the generic vhost device plus the emulated NVMe state. */
+struct spdk_vhost_nvme_dev {
+ /* Embedded generic device; to_nvme_dev() recovers the container from it. */
+ struct spdk_vhost_dev vdev;
+
+ uint32_t num_io_queues;
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_cc_register cc;
+ union spdk_nvme_csts_register csts;
+ /* Identify Controller payload copied to the guest on request. */
+ struct spdk_nvme_ctrlr_data cdata;
+
+ /* Counts of currently created I/O submission / completion queues. */
+ uint32_t num_sqs;
+ uint32_t num_cqs;
+
+ uint32_t num_ns;
+ struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE];
+
+ /* Start of the BAR region supplied through vhost_nvme_set_bar_mr(). */
+ volatile uint32_t *bar;
+ /* Doorbell registers inside the BAR (bar + 0x1000 bytes). */
+ volatile uint32_t *bar_db;
+ uint64_t bar_size;
+ /* True once the guest has configured doorbell buffers; they then replace bar_db. */
+ bool dataplane_started;
+
+ /* Doorbell buffer and event-index buffer set up by Doorbell Buffer Config. */
+ volatile uint32_t *dbbuf_dbs;
+ volatile uint32_t *dbbuf_eis;
+ /* Indexed by queue ID; only IDs 1..MAX_IO_QUEUES are handed out by the lookup helpers. */
+ struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1];
+ struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1];
+
+ /* The one and only session associated with this device */
+ struct spdk_vhost_session *vsession;
+
+ TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq;
+ /* Pool of idle task contexts. */
+ STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks;
+ /* Runs nvme_worker() while the session is active. */
+ struct spdk_poller *requestq_poller;
+ /* Runs destroy_device_poller_cb() while the session is being stopped. */
+ struct spdk_poller *stop_poller;
+};
+
+static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend;
+
+/*
+ * Report the SPDK version as the firmware revision.
+ * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts.
+ */
+#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING
+
+static int
+nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq,
+ struct spdk_vhost_nvme_task *task);
+
+/*
+ * Downcast a generic vhost device to the vhost-nvme device that embeds it.
+ * Logs an error and returns NULL when the device uses a different backend.
+ */
+static struct spdk_vhost_nvme_dev *
+to_nvme_dev(struct spdk_vhost_dev *vdev)
+{
+	if (vdev->backend == &spdk_vhost_nvme_device_backend) {
+		return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev);
+	}
+
+	SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name);
+	return NULL;
+}
+
+static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs);
+
+/* Index, in 32-bit doorbell words, of the submission queue doorbell for qid. */
+static inline unsigned int sq_offset(unsigned int qid, uint32_t db_stride)
+{
+	unsigned int slot = qid * 2;
+
+	return slot * db_stride;
+}
+
+/* Index, in 32-bit doorbell words, of the completion queue doorbell for qid. */
+static inline unsigned int cq_offset(unsigned int qid, uint32_t db_stride)
+{
+	unsigned int slot = qid * 2 + 1;
+
+	return slot * db_stride;
+}
+
+/* Advance the completion queue head by one slot, flipping the phase tag on wrap-around. */
+static void
+nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq)
+{
+	cq->cq_head++;
+	if (cq->cq_head < cq->size) {
+		return;
+	}
+
+	/* Wrapped past the last slot: restart at 0 with the opposite phase. */
+	cq->cq_head = 0;
+	cq->phase = !cq->phase;
+}
+
+/* True when writing one more entry would make the head catch up with the guest's head. */
+static bool
+nvme_cq_is_full(struct spdk_vhost_nvme_cq *cq)
+{
+	unsigned int next_slot = (cq->cq_head + 1) % cq->size;
+
+	return next_slot == cq->guest_signaled_cq_head;
+}
+
+/* Advance the submission queue head by one slot, wrapping at the queue size. */
+static void
+nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq)
+{
+	unsigned int next_slot = sq->sq_head + 1;
+
+	sq->sq_head = next_slot % sq->size;
+}
+
+/*
+ * Look up the submission queue slot for an I/O queue ID.
+ * Returns NULL for ID 0 and for IDs above MAX_IO_QUEUES.
+ */
+static struct spdk_vhost_nvme_sq *
+vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid)
+{
+	if (spdk_likely(qid != 0 && qid <= MAX_IO_QUEUES)) {
+		return &dev->sq_queue[qid];
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up the completion queue slot for an I/O queue ID.
+ * Returns NULL for ID 0 and for IDs above MAX_IO_QUEUES.
+ */
+static struct spdk_vhost_nvme_cq *
+vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid)
+{
+	if (spdk_likely(qid != 0 && qid <= MAX_IO_QUEUES)) {
+		return &dev->cq_queue[qid];
+	}
+
+	return NULL;
+}
+
+/*
+ * Read one doorbell word. The doorbell buffer is used once the data plane
+ * has started, otherwise the doorbell area of the BAR. Callers use this for
+ * both SQ tail and CQ head doorbells. Having neither source is a programming
+ * error (asserts; returns 0 when asserts are compiled out).
+ */
+static inline uint32_t
+vhost_nvme_get_queue_head(struct spdk_vhost_nvme_dev *nvme, uint32_t offset)
+{
+	if (nvme->dataplane_started) {
+		return nvme->dbbuf_dbs[offset];
+	}
+
+	if (nvme->bar) {
+		return nvme->bar_db[offset];
+	}
+
+	assert(0);
+
+	return 0;
+}
+
+/* Address translation callback for spdk_nvme_map_prps(); priv is the vhost session. */
+static void *
+vhost_nvme_gpa_to_vva(void *priv, uint64_t addr, uint64_t len)
+{
+	return vhost_gpa_to_vva((struct spdk_vhost_session *)priv, addr, len);
+}
+
+/*
+ * Translate the command's PRP entries for a transfer of len bytes into
+ * task->iovs and record the iovec count in task->iovcnt.
+ *
+ * Returns 0 on success or the negative value reported by spdk_nvme_map_prps().
+ */
+static int
+vhost_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd,
+		    struct spdk_vhost_nvme_task *task, uint32_t len)
+{
+	int iovcnt = spdk_nvme_map_prps(nvme->vsession, cmd, task->iovs, len, 4096,
+					vhost_nvme_gpa_to_vva);
+
+	if (spdk_unlikely(iovcnt < 0)) {
+		return iovcnt;
+	}
+
+	task->iovcnt = iovcnt;
+	return 0;
+}
+
+/*
+ * Walk every I/O completion queue and write to its interrupt eventfd when
+ * interrupts are enabled, completions have been posted since the last
+ * signal, and the guest's head doorbell differs from our head.
+ */
+static void
+nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme)
+{
+	struct spdk_vhost_nvme_cq *cq;
+	uint32_t qid, guest_head;
+
+	assert(nvme != NULL);
+
+	for (qid = 1; qid <= MAX_IO_QUEUES; qid++) {
+		cq = vhost_nvme_get_cq_from_qid(nvme, qid);
+		if (cq == NULL || !cq->valid) {
+			continue;
+		}
+
+		guest_head = vhost_nvme_get_queue_head(nvme, cq_offset(qid, 1));
+		if (!cq->irq_enabled || cq->need_signaled_cnt == 0 || cq->cq_head == guest_head) {
+			continue;
+		}
+
+		eventfd_write(cq->virq, (eventfd_t)1);
+		cq->need_signaled_cnt = 0;
+	}
+}
+
+/*
+ * Post the completion entry for a finished task and return the task to the
+ * free pool.
+ *
+ * If the completion queue is currently full the task is parked on the
+ * queue's cq_full_waited_tasks list; the request poller retries it later.
+ * If the completion cannot be delivered at all (unknown queue IDs, or a
+ * completion queue that is not created / has no ring / has size 0) the task
+ * is recycled without posting anything. Checking this up front also keeps
+ * nvme_cq_is_full() from dividing by a zero queue size.
+ */
+static void
+vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task)
+{
+	struct spdk_vhost_nvme_dev *nvme = task->nvme;
+	struct spdk_nvme_cpl cqe = {0};
+	struct spdk_vhost_nvme_cq *cq;
+	struct spdk_vhost_nvme_sq *sq;
+	struct spdk_nvme_cmd *cmd = &task->cmd;
+	uint16_t cqid = task->cqid;
+	uint16_t sqid = task->sqid;
+
+	cq = vhost_nvme_get_cq_from_qid(nvme, cqid);
+	sq = vhost_nvme_get_sq_from_qid(nvme, sqid);
+	if (spdk_unlikely(!cq || !sq || !cq->valid || !cq->cq_cqe || !cq->size)) {
+		/* Nowhere to post the completion: recycle the task instead of leaking it. */
+		STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+		return;
+	}
+
+	cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(cqid, 1));
+	if (spdk_unlikely(nvme_cq_is_full(cq))) {
+		STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq);
+		return;
+	}
+
+	cqe.sqid = sqid;
+	cqe.sqhd = sq->sq_head;
+	cqe.cid = cmd->cid;
+	cqe.status.dnr = task->dnr;
+	cqe.status.sct = task->sct;
+	cqe.status.sc = task->sc;
+	/* Write the entry with the stale phase first, then publish it by
+	 * setting the current phase after the write barrier.
+	 */
+	cqe.status.p = !cq->phase;
+	cq->cq_cqe[cq->cq_head] = cqe;
+	spdk_smp_wmb();
+	cq->cq_cqe[cq->cq_head].status.p = cq->phase;
+
+	nvme_inc_cq_head(cq);
+	cq->need_signaled_cnt++;
+
+	/* MMIO Control */
+	if (nvme->dataplane_started) {
+		nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1);
+	}
+
+	STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+}
+
+/*
+ * bdev completion callback for read, write and flush requests: copy the
+ * NVMe status out of the bdev I/O, release it, and complete the task.
+ */
+static void
+blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct spdk_vhost_nvme_task *task = cb_arg;
+	uint32_t cdw0;
+	int sct, sc;
+
+	assert(bdev_io != NULL);
+
+	spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
+	spdk_bdev_free_io(bdev_io);
+
+	task->sc = sc;
+	task->sct = sct;
+	task->dnr = !success;
+
+	if (spdk_unlikely(!success)) {
+		SPDK_ERRLOG("I/O error, sector %u\n", task->cmd.cdw10);
+	}
+
+	vhost_nvme_task_complete(task);
+}
+
+/*
+ * bdev completion callback for one unmap child of a Dataset Management
+ * command. A failed child records its status on the parent; the parent is
+ * completed once its last child has finished. The child task always goes
+ * back to the free pool.
+ */
+static void
+blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct spdk_vhost_nvme_task *child = cb_arg;
+	struct spdk_vhost_nvme_task *parent = child->parent;
+	struct spdk_vhost_nvme_dev *nvme = parent->nvme;
+	uint32_t cdw0;
+	int sct, sc;
+
+	assert(bdev_io != NULL);
+
+	parent->num_children--;
+	if (!success) {
+		parent->dnr = 1;
+		spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
+		parent->sct = sct;
+		parent->sc = sc;
+	}
+
+	spdk_bdev_free_io(bdev_io);
+
+	if (parent->num_children == 0) {
+		vhost_nvme_task_complete(parent);
+	}
+
+	STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq);
+}
+
+/*
+ * Look up a namespace by its 1-based namespace ID.
+ * Returns NULL for ID 0 and for IDs above the controller's namespace count.
+ */
+static struct spdk_vhost_nvme_ns *
+vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid)
+{
+	if (spdk_likely(nsid != 0 && nsid <= dev->num_ns)) {
+		return &dev->ns[nsid - 1];
+	}
+
+	return NULL;
+}
+
+/* bdev_io_wait callback: replay a task that earlier failed with -ENOMEM. */
+static void
+vhost_nvme_resubmit_task(void *arg)
+{
+	struct spdk_vhost_nvme_task *task = arg;
+	int rc = nvme_process_sq(task->nvme, task->sq, task);
+
+	if (rc != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "vhost_nvme: task resubmit failed, rc = %d.\n", rc);
+	}
+}
+
+/*
+ * Register the task to be resubmitted once the bdev has a free I/O.
+ * If registration fails the task is completed with an internal device
+ * error. Returns the result of spdk_bdev_queue_io_wait().
+ */
+static int
+vhost_nvme_queue_task(struct spdk_vhost_nvme_task *task)
+{
+	struct spdk_vhost_nvme_ns *ns = task->ns;
+	struct spdk_bdev_io_wait_entry *entry = &task->bdev_io_wait;
+	int rc;
+
+	entry->bdev = ns->bdev;
+	entry->cb_fn = vhost_nvme_resubmit_task;
+	entry->cb_arg = task;
+
+	rc = spdk_bdev_queue_io_wait(ns->bdev, ns->bdev_io_channel, entry);
+	if (rc == 0) {
+		return rc;
+	}
+
+	SPDK_ERRLOG("Queue io failed in vhost_nvme_queue_task, rc=%d.\n", rc);
+	task->dnr = 1;
+	task->sct = SPDK_NVME_SCT_GENERIC;
+	task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+	vhost_nvme_task_complete(task);
+
+	return rc;
+}
+
+/*
+ * Execute one I/O command that was copied into task->cmd.
+ *
+ * Read, Write, Flush and Dataset Management are translated into bdev
+ * requests; every other opcode is completed with an internal device error.
+ * A submission that fails with -ENOMEM is queued for retry through
+ * bdev_io_wait. On every other failure an error completion is posted.
+ *
+ * \param nvme controller the command was submitted to
+ * \param sq submission queue the command was fetched from
+ * \param task task context holding the command
+ * \return 0 when the command was submitted (or queued for retry), non-zero
+ * when it failed; in the failure case the completion has been or will be
+ * posted by this code path.
+ */
+static int
+nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq,
+		struct spdk_vhost_nvme_task *task)
+{
+	struct spdk_vhost_nvme_task *child;
+	struct spdk_nvme_cmd *cmd = &task->cmd;
+	struct spdk_vhost_nvme_ns *ns;
+	int ret = -1;
+	uint32_t len, nlba, block_size;
+	uint64_t slba;
+	struct spdk_nvme_dsm_range *range;
+	uint16_t i, num_ranges = 0;
+
+	task->nvme = nvme;
+	task->dnr = 0;
+	task->sct = 0;
+	task->sc = 0;
+	/* The completion path routes by these IDs, so they must be set before
+	 * the first possible vhost_nvme_task_complete() call below; otherwise an
+	 * invalid-namespace error would be posted using whatever IDs the task
+	 * carried from its previous use.
+	 */
+	task->num_children = 0;
+	task->cqid = sq->cqid;
+	task->sqid = sq->sqid;
+
+	ns = vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid);
+	if (spdk_unlikely(!ns)) {
+		task->dnr = 1;
+		task->sct = SPDK_NVME_SCT_GENERIC;
+		task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+		vhost_nvme_task_complete(task);
+		return -1;
+	}
+
+	block_size = ns->block_size;
+	task->ns = ns;
+
+	if (spdk_unlikely(!ns->active_ns)) {
+		task->dnr = 1;
+		task->sct = SPDK_NVME_SCT_GENERIC;
+		task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+		vhost_nvme_task_complete(task);
+		return -1;
+	}
+
+	/* valid only for Read/Write commands */
+	nlba = (cmd->cdw12 & 0xffff) + 1;
+	slba = cmd->cdw11;
+	slba = (slba << 32) | cmd->cdw10;
+
+	if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE ||
+	    cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
+		if (cmd->psdt != SPDK_NVME_PSDT_PRP) {
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PSDT %u%ub in command\n",
+				      cmd->psdt >> 1, cmd->psdt & 1u);
+			task->dnr = 1;
+			task->sct = SPDK_NVME_SCT_GENERIC;
+			task->sc = SPDK_NVME_SC_INVALID_FIELD;
+			vhost_nvme_task_complete(task);
+			return -1;
+		}
+
+		if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
+			num_ranges = (cmd->cdw10 & 0xff) + 1;
+			len = num_ranges * sizeof(struct spdk_nvme_dsm_range);
+		} else {
+			len = nlba * block_size;
+		}
+
+		ret = vhost_nvme_map_prps(nvme, cmd, task, len);
+		if (spdk_unlikely(ret != 0)) {
+			SPDK_ERRLOG("nvme command map prps failed\n");
+			task->dnr = 1;
+			task->sct = SPDK_NVME_SCT_GENERIC;
+			task->sc = SPDK_NVME_SC_INVALID_FIELD;
+			vhost_nvme_task_complete(task);
+			return -1;
+		}
+	}
+
+	switch (cmd->opc) {
+	case SPDK_NVME_OPC_READ:
+		ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel,
+				      task->iovs, task->iovcnt, slba * block_size,
+				      nlba * block_size, blk_request_complete_cb, task);
+		break;
+	case SPDK_NVME_OPC_WRITE:
+		ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel,
+				       task->iovs, task->iovcnt, slba * block_size,
+				       nlba * block_size, blk_request_complete_cb, task);
+		break;
+	case SPDK_NVME_OPC_FLUSH:
+		ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel,
+				      0, ns->capacity,
+				      blk_request_complete_cb, task);
+		break;
+	case SPDK_NVME_OPC_DATASET_MANAGEMENT:
+		/* NOTE(review): the range list is read through iovs[0] only, which
+		 * assumes the mapped buffer is contiguous in host memory - confirm.
+		 */
+		range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base;
+		for (i = 0; i < num_ranges; i++) {
+			if (STAILQ_EMPTY(&nvme->free_tasks)) {
+				SPDK_ERRLOG("No free task now\n");
+				ret = -1;
+				break;
+			}
+			child = STAILQ_FIRST(&nvme->free_tasks);
+			STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+
+			task->num_children++;
+			child->parent = task;
+			ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel,
+					      range[i].starting_lba * block_size,
+					      range[i].length * block_size,
+					      blk_unmap_complete_cb, child);
+			if (ret) {
+				/* This child was never submitted: undo its accounting. */
+				task->num_children--;
+				STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq);
+				break;
+			}
+		}
+		if (spdk_unlikely(ret && task->num_children)) {
+			/* Earlier ranges are already in flight and their last
+			 * completion callback will complete the parent. Record the
+			 * failure here, but neither complete nor requeue the parent
+			 * now, as that would complete the same task twice.
+			 */
+			SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret);
+			task->dnr = 1;
+			task->sct = SPDK_NVME_SCT_GENERIC;
+			task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+			return ret;
+		}
+		break;
+	default:
+		ret = -1;
+		break;
+	}
+
+	if (spdk_unlikely(ret)) {
+		if (ret == -ENOMEM) {
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "No memory, start to queue io.\n");
+			task->sq = sq;
+			ret = vhost_nvme_queue_task(task);
+		} else {
+			/* post error status to cqe */
+			SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret);
+			task->dnr = 1;
+			task->sct = SPDK_NVME_SCT_GENERIC;
+			task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+			vhost_nvme_task_complete(task);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Request poller. For every valid submission queue it first retries one
+ * completion that was parked because the completion queue was full, then
+ * fetches and executes new commands up to the batch threshold. Finally it
+ * raises completion interrupts.
+ *
+ * A tail doorbell value outside the ring is ignored: the head advances
+ * modulo the queue size and would never reach such a tail, so the queue
+ * would otherwise be drained of stale entries on every poll.
+ *
+ * \param arg the struct spdk_vhost_nvme_dev being polled
+ * \return SPDK_POLLER_BUSY when at least one command was fetched in this
+ * call (or when polling was cut short), SPDK_POLLER_IDLE otherwise.
+ */
+static int
+nvme_worker(void *arg)
+{
+	struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg;
+	struct spdk_vhost_nvme_sq *sq;
+	struct spdk_vhost_nvme_cq *cq;
+	struct spdk_vhost_nvme_task *task;
+	uint32_t qid, dbbuf_sq;
+	int ret;
+	int batch;
+	int fetched = 0;
+
+	if (spdk_unlikely(!nvme->num_sqs)) {
+		return SPDK_POLLER_IDLE;
+	}
+
+	if (spdk_unlikely(!nvme->dataplane_started && !nvme->bar)) {
+		return SPDK_POLLER_IDLE;
+	}
+
+	for (qid = 1; qid <= MAX_IO_QUEUES; qid++) {
+
+		sq = vhost_nvme_get_sq_from_qid(nvme, qid);
+		if (!sq->valid) {
+			continue;
+		}
+		cq = vhost_nvme_get_cq_from_qid(nvme, sq->cqid);
+		if (spdk_unlikely(!cq)) {
+			return SPDK_POLLER_BUSY;
+		}
+		cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(sq->cqid, 1));
+		if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) &&
+				  !nvme_cq_is_full(cq))) {
+			task = STAILQ_FIRST(&cq->cq_full_waited_tasks);
+			STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq);
+			vhost_nvme_task_complete(task);
+		}
+
+		dbbuf_sq = vhost_nvme_get_queue_head(nvme, sq_offset(qid, 1));
+		if (spdk_unlikely(dbbuf_sq >= sq->size)) {
+			/* Tail points outside the ring; skip this queue for now. */
+			continue;
+		}
+		sq->sq_tail = (uint16_t)dbbuf_sq;
+		batch = 0;
+
+		while (sq->sq_head != sq->sq_tail) {
+			if (spdk_unlikely(!sq->sq_cmd)) {
+				break;
+			}
+			if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) {
+				task = STAILQ_FIRST(&nvme->free_tasks);
+				STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+			} else {
+				return SPDK_POLLER_BUSY;
+			}
+
+			task->cmd = sq->sq_cmd[sq->sq_head];
+			nvme_inc_sq_head(sq);
+			fetched++;
+
+			/* processing IO */
+			ret = nvme_process_sq(nvme, sq, task);
+			if (spdk_unlikely(ret)) {
+				SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, task->cmd.cid, sq->sq_head,
+					    sq->sq_tail);
+			}
+
+			/* MMIO Control */
+			if (nvme->dataplane_started) {
+				nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1);
+			}
+
+			/* Maximum batch I/Os to pick up at once */
+			if (batch++ == MAX_BATCH_IO) {
+				break;
+			}
+		}
+	}
+
+	/* Completion Queue */
+	nvme_cq_signal_fd(nvme);
+
+	return fetched > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
+}
+
+/*
+ * Handle the Doorbell Buffer Config admin command: map the doorbell buffer
+ * (PRP1) and the event-index buffer (PRP2), zero both, and switch the
+ * controller over to them.
+ *
+ * The device's buffer pointers are only replaced after both translations
+ * succeeded, so a failed command leaves a previously working configuration
+ * intact. Every failure path reports Invalid Field in the completion.
+ *
+ * \return 0 on success, -1 on failure.
+ */
+static int
+vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme,
+				  struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	struct spdk_vhost_session *vsession = nvme->vsession;
+	uint64_t dbs_dma_addr, eis_dma_addr;
+	volatile uint32_t *dbs, *eis;
+
+	dbs_dma_addr = cmd->dptr.prp.prp1;
+	eis_dma_addr = cmd->dptr.prp.prp2;
+
+	if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) {
+		cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+		return -1;
+	}
+	/* Guest Physical Address to Host Virtual Address */
+	dbs = vhost_gpa_to_vva(vsession, dbs_dma_addr, 4096);
+	eis = vhost_gpa_to_vva(vsession, eis_dma_addr, 4096);
+	if (!dbs || !eis) {
+		cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+		return -1;
+	}
+	/* Zero the doorbell buffer memory */
+	memset((void *)dbs, 0, 4096);
+	memset((void *)eis, 0, 4096);
+
+	nvme->dbbuf_dbs = dbs;
+	nvme->dbbuf_eis = eis;
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+
+	/* Data plane started */
+	nvme->dataplane_started = true;
+
+	return 0;
+}
+
+/*
+ * Handle the Create I/O Submission Queue admin command.
+ *
+ * The request is rejected when the queue is not physically contiguous, the
+ * ring address is zero or not 4 KiB aligned, the zero-based queue size is 0
+ * or exceeds what CAP.MQES advertises, either queue ID is out of range, or
+ * the target completion queue has not been created. Queue state is only
+ * modified after the guest ring was mapped successfully, and every failure
+ * path fills in the completion status.
+ *
+ * \return 0 on success, -1 on failure.
+ */
+static int
+vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme,
+			struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	uint16_t qid, qsize, cqid;
+	uint64_t dma_addr;
+	uint64_t requested_len;
+	struct spdk_vhost_nvme_cq *cq;
+	struct spdk_vhost_nvme_sq *sq;
+	struct spdk_nvme_cmd *sq_cmd;
+
+	cqid = (cmd->cdw11 >> 16) & 0xffff;
+	qid = cmd->cdw10 & 0xffff;
+	qsize = (cmd->cdw10 >> 16) & 0xffff;
+	dma_addr = cmd->dptr.prp.prp1;
+
+	/* Only physically contiguous, page aligned rings of a supported size.
+	 * qsize is zero-based; bounding it also keeps "qsize + 1" from wrapping
+	 * to 0 in the 16-bit size field.
+	 */
+	if (!(cmd->cdw11 & 0x1) || !dma_addr || dma_addr % 4096 ||
+	    qsize == 0 || qsize >= MAX_QUEUE_ENTRIES_SUPPORTED) {
+		cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+		return -1;
+	}
+
+	sq = vhost_nvme_get_sq_from_qid(nvme, qid);
+	cq = vhost_nvme_get_cq_from_qid(nvme, cqid);
+	if (!sq || !cq || !cq->valid) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u or CQID %u\n",
+			      qid, cqid);
+		cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+		return -1;
+	}
+
+	requested_len = sizeof(struct spdk_nvme_cmd) * ((uint64_t)qsize + 1);
+	sq_cmd = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len);
+	if (!sq_cmd) {
+		cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+		return -1;
+	}
+
+	sq->sqid = qid;
+	sq->cqid = cqid;
+	sq->size = qsize + 1;
+	sq->sq_head = sq->sq_tail = 0;
+	sq->sq_cmd = sq_cmd;
+	nvme->num_sqs++;
+	sq->valid = true;
+	if (nvme->bar) {
+		nvme->bar_db[sq_offset(qid, 1)] = 0;
+	}
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+	return 0;
+}
+
+/*
+ * Handle the Delete I/O Submission Queue admin command.
+ *
+ * Deleting a queue that is out of range or was never created fails with
+ * Invalid Queue Identifier instead of decrementing the queue counter, which
+ * would otherwise wrap below zero.
+ *
+ * \return 0 on success, -1 on failure.
+ */
+static int
+vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme,
+			struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	uint16_t qid;
+	struct spdk_vhost_nvme_sq *sq;
+
+	qid = cmd->cdw10 & 0xffff;
+	sq = vhost_nvme_get_sq_from_qid(nvme, qid);
+	if (!sq || !sq->valid) {
+		cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+		return -1;
+	}
+
+	/* We didn't see scenarios when deleting submission
+	 * queue while I/O is running against the submission
+	 * queue for now, otherwise, we must ensure the poller
+	 * will not run with this submission queue.
+	 */
+	if (nvme->num_sqs > 0) {
+		nvme->num_sqs--;
+	}
+
+	/* Clears every field, including valid and sq_cmd. */
+	memset(sq, 0, sizeof(*sq));
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+
+	return 0;
+}
+
+/*
+ * Handle the Create I/O Completion Queue admin command.
+ *
+ * The request is rejected when the queue is not physically contiguous, the
+ * ring address is zero or not 4 KiB aligned, the zero-based queue size is 0
+ * or exceeds what CAP.MQES advertises, or the queue ID is out of range.
+ * Queue state is only modified after the guest ring was mapped successfully,
+ * the wait list is initialised before the queue is marked valid, and every
+ * failure path fills in the completion status.
+ *
+ * \return 0 on success, -1 on failure.
+ */
+static int
+vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme,
+			struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	uint16_t qsize, qid;
+	uint64_t dma_addr;
+	struct spdk_vhost_nvme_cq *cq;
+	uint64_t requested_len;
+	volatile struct spdk_nvme_cpl *cq_cqe;
+
+	qid = cmd->cdw10 & 0xffff;
+	qsize = (cmd->cdw10 >> 16) & 0xffff;
+	dma_addr = cmd->dptr.prp.prp1;
+
+	/* Only physically contiguous, page aligned rings of a supported size.
+	 * qsize is zero-based; bounding it also keeps "qsize + 1" from wrapping
+	 * to 0 in the 16-bit size field.
+	 */
+	if (!(cmd->cdw11 & 0x1) || !dma_addr || dma_addr % 4096 ||
+	    qsize == 0 || qsize >= MAX_QUEUE_ENTRIES_SUPPORTED) {
+		cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+		return -1;
+	}
+
+	cq = vhost_nvme_get_cq_from_qid(nvme, qid);
+	if (!cq) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid);
+		cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+		return -1;
+	}
+
+	requested_len = sizeof(struct spdk_nvme_cpl) * ((uint64_t)qsize + 1);
+	cq_cqe = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len);
+	if (!cq_cqe) {
+		cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+		return -1;
+	}
+
+	cq->cqid = qid;
+	cq->size = qsize + 1;
+	cq->phase = 1;
+	cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1;
+	/* Setup virq through vhost messages */
+	cq->virq = -1;
+	cq->cq_head = 0;
+	cq->guest_signaled_cq_head = 0;
+	cq->need_signaled_cnt = 0;
+	cq->cq_cqe = cq_cqe;
+	STAILQ_INIT(&cq->cq_full_waited_tasks);
+	nvme->num_cqs++;
+	cq->valid = true;
+	if (nvme->bar) {
+		nvme->bar_db[cq_offset(qid, 1)] = 0;
+	}
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+	return 0;
+}
+
+/*
+ * Handle the Delete I/O Completion Queue admin command.
+ *
+ * Deleting a queue that is out of range or was never created fails with
+ * Invalid Queue Identifier instead of decrementing the queue counter, which
+ * would otherwise wrap below zero. Tasks still parked on the queue's wait
+ * list are returned to the free pool before the queue state is wiped, so
+ * they are not lost.
+ *
+ * \return 0 on success, -1 on failure.
+ */
+static int
+vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme,
+			struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	uint16_t qid;
+	struct spdk_vhost_nvme_cq *cq;
+	struct spdk_vhost_nvme_task *task;
+
+	qid = cmd->cdw10 & 0xffff;
+	cq = vhost_nvme_get_cq_from_qid(nvme, qid);
+	if (!cq || !cq->valid) {
+		cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+		return -1;
+	}
+
+	/* Their completions can no longer be delivered; recycle the tasks. */
+	while (!STAILQ_EMPTY(&cq->cq_full_waited_tasks)) {
+		task = STAILQ_FIRST(&cq->cq_full_waited_tasks);
+		STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq);
+		STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+	}
+
+	if (nvme->num_cqs > 0) {
+		nvme->num_cqs--;
+	}
+
+	/* Clears every field, including valid and cq_cqe. */
+	memset(cq, 0, sizeof(*cq));
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+	return 0;
+}
+
+/*
+ * Find the controller that owns the session with the given vhost device ID.
+ * Despite the function's name the lookup key is the vid. Returns NULL when
+ * no controller has such a session.
+ */
+static struct spdk_vhost_nvme_dev *
+vhost_nvme_get_by_name(int vid)
+{
+	struct spdk_vhost_nvme_dev *ctrlr;
+	struct spdk_vhost_session *session;
+
+	TAILQ_FOREACH(ctrlr, &g_nvme_ctrlrs, tailq) {
+		TAILQ_FOREACH(session, &ctrlr->vdev.vsessions, tailq) {
+			if (session->vid != vid) {
+				continue;
+			}
+			return ctrlr;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Report the raw CAP register value of the controller behind vid.
+ * Returns 0 on success, -1 when no controller owns that vid.
+ */
+int
+vhost_nvme_get_cap(int vid, uint64_t *cap)
+{
+	struct spdk_vhost_nvme_dev *ctrlr = vhost_nvme_get_by_name(vid);
+
+	if (ctrlr == NULL) {
+		return -1;
+	}
+
+	*cap = ctrlr->cap.raw;
+	return 0;
+}
+
+/*
+ * Execute one admin command on behalf of the controller behind vid.
+ *
+ * cmd is read as a struct spdk_nvme_cmd, cqe is filled in as a
+ * struct spdk_nvme_cpl and buf receives Identify data. Returns -1 when no
+ * controller owns vid. Otherwise the return value is always 0: a failing
+ * handler is only logged, and whatever status the handler stored in cqe is
+ * what the caller sees. Opcodes without a case below leave cqe untouched.
+ */
+int
+vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf)
+{
+ struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd;
+ struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe;
+ struct spdk_vhost_nvme_ns *ns;
+ int ret = 0;
+ struct spdk_vhost_nvme_dev *nvme;
+
+ nvme = vhost_nvme_get_by_name(vid);
+ if (!nvme) {
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc);
+ switch (req->opc) {
+ case SPDK_NVME_OPC_IDENTIFY:
+ if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) {
+ memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data));
+
+ } else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) {
+ ns = vhost_nvme_get_ns_from_nsid(nvme, req->nsid);
+ if (!ns) {
+ cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE;
+ cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ /* Leaves the switch, skipping the success status below. */
+ break;
+ }
+ memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data));
+ }
+ /* Success; also reached for cdw10 values not handled above, with buf untouched. */
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ break;
+ case SPDK_NVME_OPC_CREATE_IO_CQ:
+ ret = vhost_nvme_create_io_cq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_DELETE_IO_CQ:
+ ret = vhost_nvme_delete_io_cq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_CREATE_IO_SQ:
+ ret = vhost_nvme_create_io_sq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_DELETE_IO_SQ:
+ ret = vhost_nvme_delete_io_sq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_GET_FEATURES:
+ case SPDK_NVME_OPC_SET_FEATURES:
+ /* Only Number of Queues is supported; Get and Set return the same fixed value. */
+ if (req->cdw10 == SPDK_NVME_FEAT_NUMBER_OF_QUEUES) {
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ /* Zero-based queue count in both the low and the high 16 bits. */
+ cpl->cdw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16);
+ } else {
+ cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+ }
+ break;
+ case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG:
+ ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_ABORT:
+ /* TODO: ABORT failed for now */
+ cpl->cdw0 = 1;
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ break;
+ }
+
+ if (ret) {
+ SPDK_ERRLOG("Admin Passthrough Failed with %u\n", req->opc);
+ }
+
+ return 0;
+}
+
+/*
+ * Record the BAR memory region of the controller behind vid.
+ *
+ * \param vid vhost device ID
+ * \param bar_addr start of the BAR region
+ * \param bar_size size of the BAR region in bytes
+ * \return 0 on success, -1 when no controller owns that vid.
+ */
+int
+vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size)
+{
+	struct spdk_vhost_nvme_dev *nvme;
+
+	nvme = vhost_nvme_get_by_name(vid);
+	if (!nvme) {
+		return -1;
+	}
+
+	nvme->bar = (volatile uint32_t *)bar_addr;
+	/* BAR0 SQ/CQ doorbell registers start from offset 0x1000. The offset is
+	 * added on an integer because arithmetic on void * is not standard C.
+	 */
+	nvme->bar_db = (volatile uint32_t *)((uintptr_t)bar_addr + 0x1000ull);
+	nvme->bar_size = bar_size;
+
+	return 0;
+}
+
+/*
+ * Install the interrupt eventfd for completion queue qid of the controller
+ * behind vid. The fd is stored only when interrupts are enabled on the
+ * queue; otherwise an error is logged. Returns -1 when the controller or
+ * the queue ID is unknown, 0 in every other case.
+ */
+int
+vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd)
+{
+	struct spdk_vhost_nvme_dev *ctrlr = vhost_nvme_get_by_name(vid);
+	struct spdk_vhost_nvme_cq *cq;
+
+	if (ctrlr == NULL) {
+		return -1;
+	}
+
+	cq = vhost_nvme_get_cq_from_qid(ctrlr, qid);
+	if (cq == NULL) {
+		return -1;
+	}
+
+	if (!cq->irq_enabled) {
+		SPDK_ERRLOG("NVMe Qid %d Disabled IRQ\n", qid);
+		return 0;
+	}
+
+	cq->virq = fd;
+	return 0;
+}
+
+/* Release every task that is currently on the controller's free list. */
+static void
+free_task_pool(struct spdk_vhost_nvme_dev *nvme)
+{
+	struct spdk_vhost_nvme_task *task;
+
+	while ((task = STAILQ_FIRST(&nvme->free_tasks)) != NULL) {
+		STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+		spdk_free(task);
+	}
+}
+
+/*
+ * Fill the free list with num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED
+ * zeroed tasks. If an allocation fails, everything on the free list is
+ * released again and -1 is returned; otherwise returns 0.
+ */
+static int
+alloc_task_pool(struct spdk_vhost_nvme_dev *nvme)
+{
+	struct spdk_vhost_nvme_task *task;
+	uint32_t remaining = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED;
+
+	while (remaining > 0) {
+		task = spdk_zmalloc(sizeof(struct spdk_vhost_nvme_task),
+				    SPDK_CACHE_LINE_SIZE, NULL,
+				    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+		if (task == NULL) {
+			SPDK_ERRLOG("Controller %s alloc task pool failed\n",
+				    nvme->vdev.name);
+			free_task_pool(nvme);
+			return -1;
+		}
+		STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+		remaining--;
+	}
+
+	return 0;
+}
+
+/*
+ * Session start handler: allocate the task pool, acquire an I/O channel
+ * for every active namespace and register the request poller.
+ *
+ * Namespaces that have been deactivated no longer have a bdev descriptor
+ * and are skipped. When a channel cannot be acquired, the channels obtained
+ * so far and the task pool are released again, so a failed start leaves no
+ * resources behind. The outcome is always reported through
+ * vhost_session_start_done().
+ *
+ * \return 0 on success, -1 on failure.
+ */
+static int
+vhost_nvme_start_cb(struct spdk_vhost_dev *vdev,
+		    struct spdk_vhost_session *vsession, void *unused)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+	struct spdk_vhost_nvme_ns *ns_dev;
+	uint32_t i;
+	int rc = 0;
+
+	if (nvme == NULL) {
+		rc = -1;
+		goto out;
+	}
+
+	rc = alloc_task_pool(nvme);
+	if (rc) {
+		goto out;
+	}
+
+	SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vsession->vid,
+		       vdev->path, spdk_env_get_current_core());
+
+	for (i = 0; i < nvme->num_ns; i++) {
+		ns_dev = &nvme->ns[i];
+		if (!ns_dev->active_ns) {
+			/* Deactivated namespace: its descriptor has been closed. */
+			continue;
+		}
+		ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc);
+		if (!ns_dev->bdev_io_channel) {
+			rc = -1;
+			break;
+		}
+	}
+
+	if (rc) {
+		/* Roll back: drop the channels acquired so far and the task pool. */
+		for (i = 0; i < nvme->num_ns; i++) {
+			ns_dev = &nvme->ns[i];
+			if (ns_dev->bdev_io_channel) {
+				spdk_put_io_channel(ns_dev->bdev_io_channel);
+				ns_dev->bdev_io_channel = NULL;
+			}
+		}
+		free_task_pool(nvme);
+		goto out;
+	}
+
+	nvme->vsession = vsession;
+	/* Start the NVMe Poller */
+	nvme->requestq_poller = SPDK_POLLER_REGISTER(nvme_worker, nvme, 0);
+
+out:
+	vhost_session_start_done(vsession, rc);
+	return rc;
+}
+
+/*
+ * Backend start_session hook. Refuses to start while the device already has
+ * an active session; otherwise dispatches vhost_nvme_start_cb() through
+ * vhost_session_send_event().
+ */
+static int
+vhost_nvme_start(struct spdk_vhost_session *vsession)
+{
+	bool already_active = vsession->vdev->active_session_num > 0;
+
+	if (!already_active) {
+		return vhost_session_send_event(vsession, vhost_nvme_start_cb,
+						3, "start session");
+	}
+
+	/* We're trying to start a second session */
+	SPDK_ERRLOG("Vhost-NVMe devices can support only one simultaneous connection.\n");
+	return -1;
+}
+
+/* Mark a namespace inactive, close its bdev descriptor and clear its bdev references. */
+static void
+vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns)
+{
+	struct spdk_bdev_desc *desc = ns->bdev_desc;
+
+	ns->active_ns = 0;
+	spdk_bdev_close(desc);
+	ns->bdev = NULL;
+	ns->bdev_desc = NULL;
+}
+
+/* Removal callback for a namespace's bdev: log the event and deactivate the namespace. */
+static void
+bdev_remove_cb(void *remove_ctx)
+{
+	struct spdk_vhost_nvme_ns *removed_ns = (struct spdk_vhost_nvme_ns *)remove_ctx;
+
+	SPDK_NOTICELOG("Removing NS %u, Block Device %s\n",
+		       removed_ns->nsid, spdk_bdev_get_name(removed_ns->bdev));
+
+	vhost_nvme_deactive_ns(removed_ns);
+}
+
+/*
+ * Stop poller. Once the global vhost lock can be taken it releases the
+ * namespaces' I/O channels, clears the BAR and the queue bookkeeping,
+ * unregisters itself and reports the stop through vhost_session_stop_done().
+ * While the lock is busy it simply returns and is called again later.
+ *
+ * Besides resetting the queue counters, every queue is marked invalid.
+ * Otherwise the rings of this session would still look live to the request
+ * poller as soon as the next session creates its first queue.
+ */
+static int
+destroy_device_poller_cb(void *arg)
+{
+	struct spdk_vhost_nvme_dev *nvme = arg;
+	struct spdk_vhost_nvme_ns *ns_dev;
+	uint32_t i;
+
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n");
+
+	/* FIXME wait for pending I/Os to complete */
+
+	if (spdk_vhost_trylock() != 0) {
+		return SPDK_POLLER_BUSY;
+	}
+
+	for (i = 0; i < nvme->num_ns; i++) {
+		ns_dev = &nvme->ns[i];
+		if (ns_dev->bdev_io_channel) {
+			spdk_put_io_channel(ns_dev->bdev_io_channel);
+			ns_dev->bdev_io_channel = NULL;
+		}
+	}
+	/* Clear BAR space */
+	if (nvme->bar) {
+		memset((void *)nvme->bar, 0, nvme->bar_size);
+	}
+	/* The guest has to create its queues again in the next session. */
+	for (i = 1; i <= MAX_IO_QUEUES; i++) {
+		nvme->sq_queue[i].valid = false;
+		nvme->cq_queue[i].valid = false;
+	}
+	nvme->num_sqs = 0;
+	nvme->num_cqs = 0;
+	nvme->dbbuf_dbs = NULL;
+	nvme->dbbuf_eis = NULL;
+	nvme->dataplane_started = false;
+
+	spdk_poller_unregister(&nvme->stop_poller);
+	vhost_session_stop_done(nvme->vsession, 0);
+
+	spdk_vhost_unlock();
+	return SPDK_POLLER_BUSY;
+}
+
+/*
+ * Session stop handler: free the task pool, stop the request poller and
+ * hand the rest of the teardown to destroy_device_poller_cb(), which runs
+ * as a poller with a period of 1000. For a device that is not vhost-nvme
+ * the stop is reported as failed right away.
+ */
+static int
+vhost_nvme_stop_cb(struct spdk_vhost_dev *vdev,
+		   struct spdk_vhost_session *vsession, void *unused)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+
+	if (nvme != NULL) {
+		free_task_pool(nvme);
+		SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vsession->vid, vdev->path);
+
+		spdk_poller_unregister(&nvme->requestq_poller);
+		nvme->stop_poller = SPDK_POLLER_REGISTER(destroy_device_poller_cb, nvme, 1000);
+
+		return 0;
+	}
+
+	vhost_session_stop_done(vsession, -1);
+	return -1;
+}
+
+/*
+ * Backend stop_session hook: dispatch vhost_nvme_stop_cb() through
+ * vhost_session_send_event(). The event label names the stop operation;
+ * it previously read "start session", copied from the start path.
+ */
+static int
+vhost_nvme_stop(struct spdk_vhost_session *vsession)
+{
+	return vhost_session_send_event(vsession, vhost_nvme_stop_cb,
+					3, "stop session");
+}
+
+/*
+ * Backend dump_info_json hook: write a "namespaces" array holding the nsid
+ * and bdev name of every active namespace. Writes nothing for a device
+ * that is not vhost-nvme.
+ */
+static void
+vhost_nvme_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+	struct spdk_vhost_nvme_ns *ns;
+	uint32_t idx;
+
+	if (nvme == NULL) {
+		return;
+	}
+
+	spdk_json_write_named_array_begin(w, "namespaces");
+
+	for (idx = 0; idx < nvme->num_ns; idx++) {
+		ns = &nvme->ns[idx];
+		if (ns->active_ns) {
+			spdk_json_write_object_begin(w);
+			spdk_json_write_named_uint32(w, "nsid", ns->nsid);
+			spdk_json_write_named_string(w, "bdev", spdk_bdev_get_name(ns->bdev));
+			spdk_json_write_object_end(w);
+		}
+	}
+
+	spdk_json_write_array_end(w);
+}
+
+/*
+ * Backend write_config_json hook: emit one "vhost_create_nvme_controller"
+ * call for the controller, followed by one "vhost_nvme_controller_add_ns"
+ * call per active namespace. Writes nothing for a device that is not
+ * vhost-nvme.
+ */
+static void
+vhost_nvme_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+	struct spdk_vhost_nvme_ns *ns;
+	uint32_t idx;
+
+	if (nvme == NULL) {
+		return;
+	}
+
+	/* Controller creation call. */
+	spdk_json_write_object_begin(w);
+	spdk_json_write_named_string(w, "method", "vhost_create_nvme_controller");
+
+	spdk_json_write_named_object_begin(w, "params");
+	spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name);
+	spdk_json_write_named_uint32(w, "io_queues", nvme->num_io_queues);
+	spdk_json_write_named_string(w, "cpumask",
+				     spdk_cpuset_fmt(spdk_thread_get_cpumask(nvme->vdev.thread)));
+	spdk_json_write_object_end(w);
+
+	spdk_json_write_object_end(w);
+
+	/* One namespace attachment call per active namespace. */
+	for (idx = 0; idx < nvme->num_ns; idx++) {
+		ns = &nvme->ns[idx];
+		if (ns->active_ns) {
+			spdk_json_write_object_begin(w);
+			spdk_json_write_named_string(w, "method", "vhost_nvme_controller_add_ns");
+
+			spdk_json_write_named_object_begin(w, "params");
+			spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name);
+			spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(ns->bdev));
+			spdk_json_write_object_end(w);
+
+			spdk_json_write_object_end(w);
+		}
+	}
+}
+
+/*
+ * Backend operations of vhost-nvme devices. to_nvme_dev() compares a
+ * device's backend pointer against this object to recognise them.
+ */
+static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = {
+ .session_ctx_size = 0,
+ .start_session = vhost_nvme_start,
+ .stop_session = vhost_nvme_stop,
+ .dump_info_json = vhost_nvme_dump_info_json,
+ .write_config_json = vhost_nvme_write_config_json,
+ .remove_device = vhost_nvme_dev_remove,
+};
+
+/*
+ * Refresh the Identify data derived from the namespaces: the controller's
+ * namespace count, and for every namespace either the sizes and LBA format
+ * taken from its bdev (active) or an all-zero Identify structure
+ * (inactive). Always returns 0.
+ */
+static int
+vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev)
+{
+	struct spdk_vhost_nvme_ns *ns;
+	struct spdk_nvme_ns_data *nsdata;
+	uint64_t num_blocks;
+	uint32_t idx;
+
+	/* Identify Namespace */
+	dev->cdata.nn = dev->num_ns;
+
+	for (idx = 0; idx < dev->num_ns; idx++) {
+		ns = &dev->ns[idx];
+		nsdata = &ns->nsdata;
+
+		if (!ns->active_ns) {
+			memset(nsdata, 0, sizeof(*nsdata));
+			continue;
+		}
+
+		num_blocks = spdk_bdev_get_num_blocks(ns->bdev);
+		nsdata->nsze = num_blocks;
+		/* ncap must be non-zero for active Namespace */
+		nsdata->ncap = num_blocks;
+		nsdata->nuse = num_blocks;
+		/* A single LBA format, format 0. */
+		nsdata->nlbaf = 0;
+		nsdata->flbas.format = 0;
+		nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(ns->bdev));
+		nsdata->noiob = spdk_bdev_get_optimal_io_boundary(ns->bdev);
+		ns->block_size = spdk_bdev_get_block_size(ns->bdev);
+		ns->capacity = num_blocks * ns->block_size;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill in the emulated controller's initial CAP/CC/CSTS register values and
+ * its Identify Controller data, then refresh the Identify data of every
+ * namespace slot.
+ *
+ * Always returns 0.
+ */
+static int
+vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev)
+{
+	struct spdk_nvme_ctrlr_data *id = &dev->cdata;
+	char serial[20];
+
+	/* Controller Capabilities (MQES is a 0 based value). */
+	dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1;
+	dev->cap.bits.cqr = 1;
+	dev->cap.bits.to = 1;
+	dev->cap.bits.dstrd = 0;
+	dev->cap.bits.css = SPDK_NVME_CAP_CSS_NVM;
+	dev->cap.bits.mpsmin = 0;
+	dev->cap.bits.mpsmax = 0;
+
+	/* The controller starts out disabled and not ready. */
+	dev->cc.bits.en = 0;
+	dev->csts.bits.rdy = 0;
+
+	/* Identify Controller: space-padded identification strings. */
+	snprintf(serial, sizeof(serial), "NVMe_%s", dev->vdev.name);
+	spdk_strcpy_pad(id->sn, serial, sizeof(id->sn), ' ');
+	spdk_strcpy_pad(id->mn, "SPDK Virtual NVMe Controller", sizeof(id->mn), ' ');
+	spdk_strcpy_pad(id->fr, FW_VERSION, sizeof(id->fr), ' ');
+
+	/* Vendor identification. */
+	id->vid = 0x8086;
+	id->ssvid = 0x8086;
+	id->ieee[0] = 0xe4;
+	id->ieee[1] = 0xd2;
+	id->ieee[2] = 0x5c;
+
+	/* Reported specification version: 1.0. */
+	id->ver.bits.mjr = 1;
+	id->ver.bits.mnr = 0;
+
+	id->mdts = 5; /* 128 KiB */
+	id->rab = 6;
+
+	/* Queue entry sizes are fixed: min and max are the same value. */
+	id->sqes.min = 6;
+	id->sqes.max = 6;
+	id->cqes.min = 4;
+	id->cqes.max = 4;
+
+	id->oncs.dsm = 1;
+	/* Emulated NVMe controller */
+	id->oacs.doorbell_buffer_config = 1;
+
+	(void)vhost_nvme_ns_identify_update(dev);
+
+	return 0;
+}
+
+/*
+ * Create a vhost-nvme controller and register it with the vhost library.
+ *
+ * \param name controller name, forwarded to vhost_dev_register().
+ * \param cpumask CPU mask string, forwarded to vhost_dev_register().
+ * \param num_io_queues number of I/O queues; must be in [1, MAX_IO_QUEUES].
+ *
+ * \return 0 on success, -EINVAL when num_io_queues is out of range, -ENOMEM
+ * when the device structure cannot be allocated, or the non-zero value
+ * returned by vhost_dev_register().
+ */
+int
+vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues)
+{
+	struct spdk_vhost_nvme_dev *dev;
+	void *mem = NULL;
+	int rc;
+
+	/* Validate the arguments before allocating anything, so a bad request
+	 * never costs an allocation and is never reported as -ENOMEM.
+	 */
+	if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) {
+		return -EINVAL;
+	}
+
+	/* Go through a void * so no type-punned (void **) cast is needed. */
+	if (posix_memalign(&mem, SPDK_CACHE_LINE_SIZE, sizeof(*dev))) {
+		return -ENOMEM;
+	}
+	dev = mem;
+	memset(dev, 0, sizeof(*dev));
+
+	spdk_vhost_lock();
+	rc = vhost_dev_register(&dev->vdev, name, cpumask,
+				&spdk_vhost_nvme_device_backend);
+	if (rc) {
+		spdk_vhost_unlock();
+		free(dev);
+		return rc;
+	}
+
+	dev->num_io_queues = num_io_queues;
+	STAILQ_INIT(&dev->free_tasks);
+	TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq);
+
+	vhost_nvme_ctrlr_identify_update(dev);
+
+	SPDK_NOTICELOG("Controller %s: Constructed\n", name);
+	spdk_vhost_unlock();
+	return rc;
+}
+
+/*
+ * Unregister and destroy a vhost-nvme controller.
+ *
+ * \param vdev vhost device to remove; must be a vhost-nvme device.
+ *
+ * \return 0 on success, -EINVAL if vdev is not a vhost-nvme device, or the
+ * error returned by vhost_dev_unregister(). On failure the controller is left
+ * untouched.
+ */
+int
+vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+	uint32_t i;
+	int rc;
+
+	if (nvme == NULL) {
+		return -EINVAL;
+	}
+
+	/* Unregister first. If this fails the controller stays registered, so
+	 * it must keep its namespaces and its g_nvme_ctrlrs entry; tearing
+	 * those down beforehand would leave a half-destroyed controller behind.
+	 */
+	rc = vhost_dev_unregister(vdev);
+	if (rc != 0) {
+		return rc;
+	}
+
+	TAILQ_REMOVE(&g_nvme_ctrlrs, nvme, tailq);
+	for (i = 0; i < nvme->num_ns; i++) {
+		if (nvme->ns[i].active_ns) {
+			vhost_nvme_deactive_ns(&nvme->ns[i]);
+		}
+	}
+
+	free(nvme);
+	return 0;
+}
+
+/*
+ * Attach the bdev called bdev_name to the controller as its next namespace.
+ *
+ * The bdev is opened for writing with bdev_remove_cb as its removal callback,
+ * the next free slot is marked active with nsid = slot index + 1, and the
+ * Identify Namespace data is refreshed.
+ *
+ * \return 0 on success, -ENODEV if vdev is not a vhost-nvme device or the bdev
+ * does not exist, -ENOSPC when MAX_NAMESPACE namespaces are already present,
+ * or the error returned by spdk_bdev_open().
+ */
+int
+vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+	struct spdk_vhost_nvme_ns *slot;
+	struct spdk_bdev *bdev;
+	int rc;
+
+	if (nvme == NULL) {
+		return -ENODEV;
+	}
+
+	if (nvme->num_ns == MAX_NAMESPACE) {
+		SPDK_ERRLOG("Can't support %d Namespaces\n", nvme->num_ns);
+		return -ENOSPC;
+	}
+
+	bdev = spdk_bdev_get_by_name(bdev_name);
+	if (bdev == NULL) {
+		SPDK_ERRLOG("could not find bdev %s\n", bdev_name);
+		return -ENODEV;
+	}
+
+	/* Namespaces are appended: the new one takes the first unused slot. */
+	slot = &nvme->ns[nvme->num_ns];
+
+	rc = spdk_bdev_open(bdev, true, bdev_remove_cb, slot, &slot->bdev_desc);
+	if (rc != 0) {
+		SPDK_ERRLOG("Could not open bdev '%s', error=%d\n",
+			    bdev_name, rc);
+		return rc;
+	}
+
+	slot->bdev = bdev;
+	slot->active_ns = 1;
+	slot->nsid = nvme->num_ns + 1;
+	nvme->num_ns++;
+
+	vhost_nvme_ns_identify_update(nvme);
+
+	return 0;
+}
+
+/*
+ * Create the vhost-nvme controllers described by the "VhostNvme<N>" sections
+ * of the configuration file.
+ *
+ * Each section must carry a "Name"; "Cpumask" is passed through as read and
+ * "NumberOfQueues" falls back to 1 when it is not a positive value. Every
+ * "Namespace" entry names a bdev to attach; the first namespace that fails
+ * stops processing of that section's namespaces with a warning only.
+ *
+ * \return 0 on success, -1 as soon as a section is malformed or its controller
+ * cannot be constructed or found.
+ */
+int
+vhost_nvme_controller_construct(void)
+{
+	struct spdk_conf_section *section;
+	struct spdk_vhost_dev *vdev;
+	const char *name, *cpumask, *bdev_name;
+	uint32_t ctrlr_num, io_queues;
+	int queues, i;
+
+	for (section = spdk_conf_first_section(NULL); section != NULL;
+	     section = spdk_conf_next_section(section)) {
+		if (!spdk_conf_section_match_prefix(section, "VhostNvme")) {
+			continue;
+		}
+
+		if (sscanf(spdk_conf_section_get_name(section), "VhostNvme%u", &ctrlr_num) != 1) {
+			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+				    spdk_conf_section_get_name(section));
+			return -1;
+		}
+
+		name = spdk_conf_section_get_val(section, "Name");
+		if (name == NULL) {
+			SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num);
+			return -1;
+		}
+
+		cpumask = spdk_conf_section_get_val(section, "Cpumask");
+
+		/* Any non-positive result selects a single I/O queue. */
+		queues = spdk_conf_section_get_intval(section, "NumberOfQueues");
+		io_queues = (queues > 0) ? queues : 1;
+
+		if (vhost_nvme_dev_construct(name, cpumask, io_queues) < 0) {
+			SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num);
+			return -1;
+		}
+
+		vdev = spdk_vhost_dev_find(name);
+		if (vdev == NULL) {
+			return -1;
+		}
+
+		/* Attach namespaces in order until the entries run out or one fails. */
+		i = 0;
+		while (spdk_conf_section_get_nval(section, "Namespace", i) != NULL) {
+			bdev_name = spdk_conf_section_get_nmval(section, "Namespace", i, 0);
+			if (bdev_name == NULL) {
+				SPDK_ERRLOG("namespace configuration missing bdev name\n");
+				break;
+			}
+
+			if (vhost_nvme_dev_add_ns(vdev, bdev_name) < 0) {
+				SPDK_WARNLOG("VhostNvme%u: Construct Namespace with %s failed\n",
+					     ctrlr_num, bdev_name);
+				break;
+			}
+
+			i++;
+		}
+	}
+
+	return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME) /* NOTE(review): presumably registers the "vhost_nvme" log flag behind SPDK_LOG_VHOST_NVME - confirm in spdk_internal/log.h */
diff --git a/src/spdk/lib/vhost/vhost_rpc.c b/src/spdk/lib/vhost/vhost_rpc.c
new file mode 100644
index 000000000..196d75918
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_rpc.c
@@ -0,0 +1,652 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/log.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/env.h"
+
+#include "spdk/scsi.h"
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+#include "spdk/bdev.h"
+
+struct rpc_vhost_scsi_ctrlr { /* Parameters of the "vhost_create_scsi_controller" RPC, filled in through the rpc_vhost_create_scsi_ctrlr decoders. */
+ char *ctrlr; /* controller name; released by free_rpc_vhost_scsi_ctrlr() */
+ char *cpumask; /* CPU mask string passed to spdk_vhost_scsi_dev_construct(); released by free_rpc_vhost_scsi_ctrlr() */
+};
+
+/* Release the strings held by a decoded "vhost_create_scsi_controller" request. */
+static void
+free_rpc_vhost_scsi_ctrlr(struct rpc_vhost_scsi_ctrlr *req)
+{
+	/* free(NULL) is a no-op, so fields that were never filled in are fine. */
+	free(req->cpumask);
+	free(req->ctrlr);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_create_scsi_ctrlr[] = { /* JSON key -> struct rpc_vhost_scsi_ctrlr field mapping */
+ {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr, ctrlr), spdk_json_decode_string },
+ {"cpumask", offsetof(struct rpc_vhost_scsi_ctrlr, cpumask), spdk_json_decode_string, true}, /* NOTE(review): trailing `true` presumably marks the key as optional - confirm in spdk/json.h */
+};
+
+/*
+ * JSON-RPC handler for "vhost_create_scsi_controller".
+ *
+ * Decodes the request, creates the controller through
+ * spdk_vhost_scsi_dev_construct() and replies `true`. A decode failure
+ * (-EINVAL) or a negative construct result is reported as an
+ * INVALID_PARAMS error carrying the matching strerror text.
+ */
+static void
+rpc_vhost_create_scsi_controller(struct spdk_jsonrpc_request *request,
+				 const struct spdk_json_val *params)
+{
+	struct rpc_vhost_scsi_ctrlr req = {0};
+	struct spdk_json_write_ctx *w;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_vhost_create_scsi_ctrlr,
+				    SPDK_COUNTOF(rpc_vhost_create_scsi_ctrlr),
+				    &req) != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+	} else {
+		rc = spdk_vhost_scsi_dev_construct(req.ctrlr, req.cpumask);
+	}
+
+	/* The decoded strings are not needed past this point on either path. */
+	free_rpc_vhost_scsi_ctrlr(&req);
+
+	if (rc < 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 spdk_strerror(-rc));
+		return;
+	}
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("vhost_create_scsi_controller", rpc_vhost_create_scsi_controller,
+		  SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_scsi_controller, construct_vhost_scsi_controller)
+
+struct rpc_vhost_scsi_ctrlr_add_target { /* Parameters of the "vhost_scsi_controller_add_target" RPC. */
+ char *ctrlr; /* name of the controller to look up with spdk_vhost_dev_find() */
+ int32_t scsi_target_num; /* target number forwarded to spdk_vhost_scsi_dev_add_tgt() */
+ char *bdev_name; /* bdev name forwarded to spdk_vhost_scsi_dev_add_tgt() */
+};
+
+/* Release the strings held by a decoded "vhost_scsi_controller_add_target" request. */
+static void
+free_rpc_vhost_scsi_ctrlr_add_target(struct rpc_vhost_scsi_ctrlr_add_target *req)
+{
+	/* free(NULL) is a no-op, so fields that were never filled in are fine. */
+	free(req->bdev_name);
+	free(req->ctrlr);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_scsi_ctrlr_add_target[] = { /* JSON key -> struct rpc_vhost_scsi_ctrlr_add_target field mapping */
+ {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, ctrlr), spdk_json_decode_string },
+ {"scsi_target_num", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, scsi_target_num), spdk_json_decode_int32}, /* signed, unlike the uint32 used by the remove-target request */
+ {"bdev_name", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, bdev_name), spdk_json_decode_string },
+};
+
+/*
+ * JSON-RPC handler for "vhost_scsi_controller_add_target".
+ *
+ * Looks the controller up and adds the target while holding the vhost lock.
+ * On success the non-negative value returned by
+ * spdk_vhost_scsi_dev_add_tgt() is sent back as an int32 result. Decode
+ * failure (-EINVAL), unknown controller (-ENODEV) or a negative add result is
+ * reported as an INVALID_PARAMS error.
+ */
+static void
+rpc_vhost_scsi_controller_add_target(struct spdk_jsonrpc_request *request,
+				     const struct spdk_json_val *params)
+{
+	struct rpc_vhost_scsi_ctrlr_add_target req = {0};
+	struct spdk_json_write_ctx *w;
+	struct spdk_vhost_dev *vdev;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_vhost_scsi_ctrlr_add_target,
+				    SPDK_COUNTOF(rpc_vhost_scsi_ctrlr_add_target),
+				    &req) != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+	} else {
+		spdk_vhost_lock();
+		vdev = spdk_vhost_dev_find(req.ctrlr);
+		if (vdev != NULL) {
+			rc = spdk_vhost_scsi_dev_add_tgt(vdev, req.scsi_target_num, req.bdev_name);
+		} else {
+			rc = -ENODEV;
+		}
+		spdk_vhost_unlock();
+	}
+
+	free_rpc_vhost_scsi_ctrlr_add_target(&req);
+
+	if (rc < 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 spdk_strerror(-rc));
+		return;
+	}
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_int32(w, rc);
+	spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("vhost_scsi_controller_add_target", rpc_vhost_scsi_controller_add_target,
+		  SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_add_target, add_vhost_scsi_lun)
+
+struct rpc_remove_vhost_scsi_ctrlr_target { /* Parameters of the "vhost_scsi_controller_remove_target" RPC. */
+ char *ctrlr; /* name of the controller to look up with spdk_vhost_dev_find() */
+ uint32_t scsi_target_num; /* target number forwarded to spdk_vhost_scsi_dev_remove_tgt() */
+};
+
+static void
+free_rpc_remove_vhost_scsi_ctrlr_target(struct rpc_remove_vhost_scsi_ctrlr_target *req) /* Release the controller-name string of a decoded remove-target request. */
+{
+ free(req->ctrlr);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_remove_target[] = { /* JSON key -> struct rpc_remove_vhost_scsi_ctrlr_target field mapping */
+ {"ctrlr", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, ctrlr), spdk_json_decode_string },
+ {"scsi_target_num", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, scsi_target_num), spdk_json_decode_uint32},
+};
+
+/*
+ * Completion callback handed to spdk_vhost_scsi_dev_remove_tgt().
+ *
+ * arg is the pending JSON-RPC request; it is answered with `true`. vdev is
+ * not used. Always returns 0.
+ */
+static int
+rpc_vhost_scsi_controller_remove_target_finish_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+	struct spdk_jsonrpc_request *request = arg;
+	struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+
+	return 0;
+}
+
+/*
+ * JSON-RPC handler for "vhost_scsi_controller_remove_target".
+ *
+ * Starts the target removal while holding the vhost lock. When the removal is
+ * accepted, no reply is sent here: the request is handed to
+ * rpc_vhost_scsi_controller_remove_target_finish_cb(), which answers it.
+ * Decode failure (-EINVAL), unknown controller (-ENODEV) or a negative
+ * removal result is reported as an INVALID_PARAMS error.
+ */
+static void
+rpc_vhost_scsi_controller_remove_target(struct spdk_jsonrpc_request *request,
+					const struct spdk_json_val *params)
+{
+	struct rpc_remove_vhost_scsi_ctrlr_target req = {0};
+	struct spdk_vhost_dev *vdev;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_vhost_remove_target,
+				    SPDK_COUNTOF(rpc_vhost_remove_target),
+				    &req) != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+	} else {
+		spdk_vhost_lock();
+		vdev = spdk_vhost_dev_find(req.ctrlr);
+		if (vdev != NULL) {
+			rc = spdk_vhost_scsi_dev_remove_tgt(vdev, req.scsi_target_num,
+							    rpc_vhost_scsi_controller_remove_target_finish_cb,
+							    request);
+		} else {
+			rc = -ENODEV;
+		}
+		spdk_vhost_unlock();
+	}
+
+	free_rpc_remove_vhost_scsi_ctrlr_target(&req);
+
+	if (rc < 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 spdk_strerror(-rc));
+	}
+}
+
+SPDK_RPC_REGISTER("vhost_scsi_controller_remove_target",
+		  rpc_vhost_scsi_controller_remove_target, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_remove_target, remove_vhost_scsi_target)
+
+struct rpc_vhost_blk_ctrlr { /* Parameters of the "vhost_create_blk_controller" RPC. */
+ char *ctrlr; /* controller name */
+ char *dev_name; /* passed to spdk_vhost_blk_construct() as its device-name argument */
+ char *cpumask; /* CPU mask string */
+ bool readonly; /* stays false (zero-initialized request) unless the decoder sets it */
+ bool packed_ring; /* stays false (zero-initialized request) unless the decoder sets it */
+};
+
+static const struct spdk_json_object_decoder rpc_construct_vhost_blk_ctrlr[] = { /* JSON key -> struct rpc_vhost_blk_ctrlr field mapping */
+ {"ctrlr", offsetof(struct rpc_vhost_blk_ctrlr, ctrlr), spdk_json_decode_string },
+ {"dev_name", offsetof(struct rpc_vhost_blk_ctrlr, dev_name), spdk_json_decode_string },
+ {"cpumask", offsetof(struct rpc_vhost_blk_ctrlr, cpumask), spdk_json_decode_string, true}, /* NOTE(review): trailing `true` presumably marks the key as optional - confirm in spdk/json.h */
+ {"readonly", offsetof(struct rpc_vhost_blk_ctrlr, readonly), spdk_json_decode_bool, true},
+ {"packed_ring", offsetof(struct rpc_vhost_blk_ctrlr, packed_ring), spdk_json_decode_bool, true},
+};
+
+/* Release the strings held by a decoded "vhost_create_blk_controller" request. */
+static void
+free_rpc_vhost_blk_ctrlr(struct rpc_vhost_blk_ctrlr *req)
+{
+	/* free(NULL) is a no-op, so fields that were never filled in are fine. */
+	free(req->cpumask);
+	free(req->dev_name);
+	free(req->ctrlr);
+}
+
+/*
+ * JSON-RPC handler for "vhost_create_blk_controller".
+ *
+ * Decodes the request, creates the controller through
+ * spdk_vhost_blk_construct() and replies `true`. A decode failure (-EINVAL)
+ * or a negative construct result is reported as an INVALID_PARAMS error.
+ */
+static void
+rpc_vhost_create_blk_controller(struct spdk_jsonrpc_request *request,
+				const struct spdk_json_val *params)
+{
+	struct rpc_vhost_blk_ctrlr req = {0};
+	struct spdk_json_write_ctx *w;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_construct_vhost_blk_ctrlr,
+				    SPDK_COUNTOF(rpc_construct_vhost_blk_ctrlr),
+				    &req) != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+	} else {
+		rc = spdk_vhost_blk_construct(req.ctrlr, req.cpumask, req.dev_name,
+					      req.readonly, req.packed_ring);
+	}
+
+	/* The decoded strings are not needed past this point on either path. */
+	free_rpc_vhost_blk_ctrlr(&req);
+
+	if (rc < 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 spdk_strerror(-rc));
+		return;
+	}
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("vhost_create_blk_controller", rpc_vhost_create_blk_controller,
+		  SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_blk_controller, construct_vhost_blk_controller)
+
+struct rpc_delete_vhost_ctrlr { /* Parameters of the "vhost_delete_controller" RPC. */
+ char *ctrlr; /* name of the controller to remove; released by free_rpc_delete_vhost_ctrlr() */
+};
+
+static const struct spdk_json_object_decoder rpc_delete_vhost_ctrlr_decoder[] = { /* JSON key -> struct rpc_delete_vhost_ctrlr field mapping */
+ {"ctrlr", offsetof(struct rpc_delete_vhost_ctrlr, ctrlr), spdk_json_decode_string },
+};
+
+static void
+free_rpc_delete_vhost_ctrlr(struct rpc_delete_vhost_ctrlr *req) /* Release the controller-name string of a decoded delete request. */
+{
+ free(req->ctrlr);
+}
+
+/*
+ * JSON-RPC handler for "vhost_delete_controller".
+ *
+ * Looks the controller up and removes it through spdk_vhost_dev_remove()
+ * while holding the vhost lock, then replies `true`. Decode failure
+ * (-EINVAL), unknown controller (-ENODEV) or a negative removal result is
+ * reported as an INVALID_PARAMS error.
+ */
+static void
+rpc_vhost_delete_controller(struct spdk_jsonrpc_request *request,
+			    const struct spdk_json_val *params)
+{
+	struct rpc_delete_vhost_ctrlr req = {0};
+	struct spdk_json_write_ctx *w;
+	struct spdk_vhost_dev *vdev;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_delete_vhost_ctrlr_decoder,
+				    SPDK_COUNTOF(rpc_delete_vhost_ctrlr_decoder), &req) != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+	} else {
+		spdk_vhost_lock();
+		vdev = spdk_vhost_dev_find(req.ctrlr);
+		if (vdev != NULL) {
+			rc = spdk_vhost_dev_remove(vdev);
+		} else {
+			rc = -ENODEV;
+		}
+		spdk_vhost_unlock();
+	}
+
+	free_rpc_delete_vhost_ctrlr(&req);
+
+	if (rc < 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 spdk_strerror(-rc));
+		return;
+	}
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("vhost_delete_controller", rpc_vhost_delete_controller, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_delete_controller, remove_vhost_controller)
+
+struct rpc_get_vhost_ctrlrs { /* Parameters of the "vhost_get_controllers" RPC. */
+ char *name; /* controller to report; when left NULL the handler lists every controller */
+};
+
+/*
+ * Append one JSON object describing vdev to w.
+ *
+ * The object holds the controller name, its cpumask (hex, "0x" prefixed), the
+ * coalescing settings, the socket path and a "backend_specific" sub-object
+ * filled in by vhost_dump_info_json().
+ */
+static void
+_rpc_get_vhost_controller(struct spdk_json_write_ctx *w, struct spdk_vhost_dev *vdev)
+{
+	uint32_t coalesce_delay_us;
+	uint32_t coalesce_iops;
+
+	spdk_vhost_get_coalescing(vdev, &coalesce_delay_us, &coalesce_iops);
+
+	spdk_json_write_object_begin(w);
+
+	/* Generic controller properties. */
+	spdk_json_write_named_string(w, "ctrlr", spdk_vhost_dev_get_name(vdev));
+	spdk_json_write_named_string_fmt(w, "cpumask", "0x%s",
+					 spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
+	spdk_json_write_named_uint32(w, "delay_base_us", coalesce_delay_us);
+	spdk_json_write_named_uint32(w, "iops_threshold", coalesce_iops);
+	spdk_json_write_named_string(w, "socket", vdev->path);
+
+	/* Backend-provided details live in their own sub-object. */
+	spdk_json_write_named_object_begin(w, "backend_specific");
+	vhost_dump_info_json(vdev, w);
+	spdk_json_write_object_end(w);
+
+	spdk_json_write_object_end(w);
+}
+
+static const struct spdk_json_object_decoder rpc_get_vhost_ctrlrs_decoders[] = { /* JSON key -> struct rpc_get_vhost_ctrlrs field mapping */
+ {"name", offsetof(struct rpc_get_vhost_ctrlrs, name), spdk_json_decode_string, true}, /* NOTE(review): trailing `true` presumably marks the key as optional - confirm in spdk/json.h */
+};
+
+static void
+free_rpc_get_vhost_ctrlrs(struct rpc_get_vhost_ctrlrs *req) /* Release the name string of a decoded get-controllers request. */
+{
+ free(req->name);
+}
+
+/*
+ * JSON-RPC handler for "vhost_get_controllers".
+ *
+ * Replies with a JSON array of controller descriptions: only the controller
+ * called "name" when that parameter is given, otherwise every controller
+ * returned by spdk_vhost_dev_next(). params may be NULL. The controllers are
+ * dumped while the vhost lock is held.
+ *
+ * Decode failure (-EINVAL) or an unknown name (-ENODEV) is reported as an
+ * INTERNAL_ERROR response.
+ */
+static void
+rpc_vhost_get_controllers(struct spdk_jsonrpc_request *request,
+			  const struct spdk_json_val *params)
+{
+	struct rpc_get_vhost_ctrlrs req = {0};
+	struct spdk_json_write_ctx *w;
+	struct spdk_vhost_dev *vdev = NULL;
+	int rc = 0;
+
+	if (params && spdk_json_decode_object(params, rpc_get_vhost_ctrlrs_decoders,
+					      SPDK_COUNTOF(rpc_get_vhost_ctrlrs_decoders), &req)) {
+		SPDK_ERRLOG("spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+	} else {
+		spdk_vhost_lock();
+		if (req.name != NULL) {
+			vdev = spdk_vhost_dev_find(req.name);
+			if (vdev == NULL) {
+				spdk_vhost_unlock();
+				rc = -ENODEV;
+			}
+		}
+	}
+
+	free_rpc_get_vhost_ctrlrs(&req);
+
+	if (rc != 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 spdk_strerror(-rc));
+		return;
+	}
+
+	/* The vhost lock is still held here and is dropped once the dump is done. */
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_array_begin(w);
+
+	if (vdev != NULL) {
+		/* A specific controller was requested and found. */
+		_rpc_get_vhost_controller(w, vdev);
+	} else {
+		for (vdev = spdk_vhost_dev_next(NULL); vdev != NULL;
+		     vdev = spdk_vhost_dev_next(vdev)) {
+			_rpc_get_vhost_controller(w, vdev);
+		}
+	}
+	spdk_vhost_unlock();
+
+	spdk_json_write_array_end(w);
+	spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("vhost_get_controllers", rpc_vhost_get_controllers, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_get_controllers, get_vhost_controllers)
+
+
+struct rpc_vhost_ctrlr_coalescing { /* Parameters of the "vhost_controller_set_coalescing" RPC. */
+ char *ctrlr; /* name of the controller to look up with spdk_vhost_dev_find() */
+ uint32_t delay_base_us; /* forwarded to spdk_vhost_set_coalescing() */
+ uint32_t iops_threshold; /* forwarded to spdk_vhost_set_coalescing() */
+};
+
+static const struct spdk_json_object_decoder rpc_set_vhost_ctrlr_coalescing[] = { /* JSON key -> struct rpc_vhost_ctrlr_coalescing field mapping */
+ {"ctrlr", offsetof(struct rpc_vhost_ctrlr_coalescing, ctrlr), spdk_json_decode_string },
+ {"delay_base_us", offsetof(struct rpc_vhost_ctrlr_coalescing, delay_base_us), spdk_json_decode_uint32},
+ {"iops_threshold", offsetof(struct rpc_vhost_ctrlr_coalescing, iops_threshold), spdk_json_decode_uint32},
+};
+
+static void
+free_rpc_set_vhost_controllers_event_coalescing(struct rpc_vhost_ctrlr_coalescing *req) /* Release the controller-name string of a decoded set-coalescing request. */
+{
+ free(req->ctrlr);
+}
+
+/*
+ * JSON-RPC handler for "vhost_controller_set_coalescing".
+ *
+ * Looks the controller up and applies the requested settings through
+ * spdk_vhost_set_coalescing() while holding the vhost lock, then replies
+ * `true`. Decode failure (-EINVAL), unknown controller (-ENODEV) or any
+ * non-zero result from spdk_vhost_set_coalescing() is reported as an
+ * INVALID_PARAMS error.
+ */
+static void
+rpc_vhost_controller_set_coalescing(struct spdk_jsonrpc_request *request,
+				    const struct spdk_json_val *params)
+{
+	struct rpc_vhost_ctrlr_coalescing req = {0};
+	struct spdk_json_write_ctx *w;
+	struct spdk_vhost_dev *vdev;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_set_vhost_ctrlr_coalescing,
+				    SPDK_COUNTOF(rpc_set_vhost_ctrlr_coalescing), &req) != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+	} else {
+		spdk_vhost_lock();
+		vdev = spdk_vhost_dev_find(req.ctrlr);
+		if (vdev != NULL) {
+			rc = spdk_vhost_set_coalescing(vdev, req.delay_base_us, req.iops_threshold);
+		} else {
+			rc = -ENODEV;
+		}
+		spdk_vhost_unlock();
+	}
+
+	free_rpc_set_vhost_controllers_event_coalescing(&req);
+
+	/* Unlike the other handlers, any non-zero result counts as a failure here. */
+	if (rc != 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 spdk_strerror(-rc));
+		return;
+	}
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("vhost_controller_set_coalescing", rpc_vhost_controller_set_coalescing,
+		  SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_controller_set_coalescing, set_vhost_controller_coalescing)
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+
+struct rpc_vhost_nvme_ctrlr { /* Parameters of the "vhost_create_nvme_controller" RPC. */
+ char *ctrlr; /* controller name */
+ uint32_t io_queues; /* passed to vhost_nvme_dev_construct() as num_io_queues */
+ char *cpumask; /* CPU mask string */
+};
+
+static const struct spdk_json_object_decoder rpc_construct_vhost_nvme_ctrlr[] = { /* JSON key -> struct rpc_vhost_nvme_ctrlr field mapping */
+ {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr, ctrlr), spdk_json_decode_string },
+ {"io_queues", offsetof(struct rpc_vhost_nvme_ctrlr, io_queues), spdk_json_decode_uint32},
+ {"cpumask", offsetof(struct rpc_vhost_nvme_ctrlr, cpumask), spdk_json_decode_string, true}, /* NOTE(review): trailing `true` presumably marks the key as optional - confirm in spdk/json.h */
+};
+
+/* Release the strings held by a decoded "vhost_create_nvme_controller" request. */
+static void
+free_rpc_vhost_nvme_ctrlr(struct rpc_vhost_nvme_ctrlr *req)
+{
+	/* free(NULL) is a no-op, so fields that were never filled in are fine. */
+	free(req->cpumask);
+	free(req->ctrlr);
+}
+
+/*
+ * JSON-RPC handler for "vhost_create_nvme_controller".
+ *
+ * Decodes the request, creates the controller through
+ * vhost_nvme_dev_construct() and replies `true`. A decode failure (-EINVAL)
+ * or a negative construct result is reported as an INVALID_PARAMS error
+ * carrying the matching strerror text.
+ */
+static void
+rpc_vhost_create_nvme_controller(struct spdk_jsonrpc_request *request,
+				 const struct spdk_json_val *params)
+{
+	/* {0} rather than the empty initializer {}: the latter is not valid
+	 * C11, and every other handler in this file uses {0}.
+	 */
+	struct rpc_vhost_nvme_ctrlr req = {0};
+	struct spdk_json_write_ctx *w;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_construct_vhost_nvme_ctrlr,
+				    SPDK_COUNTOF(rpc_construct_vhost_nvme_ctrlr),
+				    &req)) {
+		/* Same diagnostic as the sibling handlers emit on a decode failure. */
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+		goto invalid;
+	}
+
+	rc = vhost_nvme_dev_construct(req.ctrlr, req.cpumask, req.io_queues);
+	if (rc < 0) {
+		goto invalid;
+	}
+
+	free_rpc_vhost_nvme_ctrlr(&req);
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+	return;
+
+invalid:
+	free_rpc_vhost_nvme_ctrlr(&req);
+	spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+					 spdk_strerror(-rc));
+
+}
+SPDK_RPC_REGISTER("vhost_create_nvme_controller", rpc_vhost_create_nvme_controller,
+		  SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_nvme_controller, construct_vhost_nvme_controller)
+
+struct rpc_vhost_nvme_ctrlr_add_ns { /* Parameters of the "vhost_nvme_controller_add_ns" RPC. */
+ char *ctrlr; /* name of the controller to look up with spdk_vhost_dev_find() */
+ char *bdev_name; /* bdev name forwarded to vhost_nvme_dev_add_ns() */
+};
+
+/* Release the strings held by a decoded "vhost_nvme_controller_add_ns" request. */
+static void
+free_rpc_vhost_nvme_ctrlr_add_ns(struct rpc_vhost_nvme_ctrlr_add_ns *req)
+{
+	/* free(NULL) is a no-op, so fields that were never filled in are fine. */
+	free(req->bdev_name);
+	free(req->ctrlr);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_nvme_add_ns[] = { /* JSON key -> struct rpc_vhost_nvme_ctrlr_add_ns field mapping */
+ {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, ctrlr), spdk_json_decode_string },
+ {"bdev_name", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, bdev_name), spdk_json_decode_string },
+};
+
+/*
+ * JSON-RPC handler for "vhost_nvme_controller_add_ns".
+ *
+ * Looks the controller up and attaches the bdev as a namespace through
+ * vhost_nvme_dev_add_ns() while holding the vhost lock, then replies `true`.
+ * Decode failure (-EINVAL), unknown controller (-ENODEV) or a negative add
+ * result is reported as an INVALID_PARAMS error.
+ */
+static void
+rpc_vhost_nvme_controller_add_ns(struct spdk_jsonrpc_request *request,
+				 const struct spdk_json_val *params)
+{
+	struct rpc_vhost_nvme_ctrlr_add_ns req = {0};
+	struct spdk_json_write_ctx *w;
+	struct spdk_vhost_dev *vdev;
+	int rc;
+
+	if (spdk_json_decode_object(params, rpc_vhost_nvme_add_ns,
+				    SPDK_COUNTOF(rpc_vhost_nvme_add_ns),
+				    &req) != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+		rc = -EINVAL;
+	} else {
+		spdk_vhost_lock();
+		vdev = spdk_vhost_dev_find(req.ctrlr);
+		if (vdev != NULL) {
+			rc = vhost_nvme_dev_add_ns(vdev, req.bdev_name);
+		} else {
+			rc = -ENODEV;
+		}
+		spdk_vhost_unlock();
+	}
+
+	free_rpc_vhost_nvme_ctrlr_add_ns(&req);
+
+	if (rc < 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 spdk_strerror(-rc));
+		return;
+	}
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("vhost_nvme_controller_add_ns", rpc_vhost_nvme_controller_add_ns,
+		  SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_nvme_controller_add_ns, add_vhost_nvme_ns)
+
+#endif /* SPDK_CONFIG_VHOST_INTERNAL_LIB */
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_rpc", SPDK_LOG_VHOST_RPC) /* NOTE(review): presumably defines the "vhost_rpc" log flag that the SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, ...) calls in this file refer to - confirm in spdk_internal/log.h */
diff --git a/src/spdk/lib/vhost/vhost_scsi.c b/src/spdk/lib/vhost/vhost_scsi.c
new file mode 100644
index 000000000..49e49dc76
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_scsi.c
@@ -0,0 +1,1536 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include <linux/virtio_scsi.h>
+
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/scsi.h"
+#include "spdk/scsi_spec.h"
+#include "spdk/conf.h"
+#include "spdk/util.h"
+#include "spdk/likely.h"
+
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+
+/* Virtio feature bits of the vhost-scsi backend: the generic SPDK_VHOST_FEATURES plus the SCSI-specific bits below. Stored in vdev.virtio_features by spdk_vhost_scsi_dev_construct(). Note that VIRTIO_SCSI_F_T10_PI is listed here and in the disabled set below. */
+#define SPDK_VHOST_SCSI_FEATURES (SPDK_VHOST_FEATURES | \
+ (1ULL << VIRTIO_SCSI_F_INOUT) | \
+ (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \
+ (1ULL << VIRTIO_SCSI_F_CHANGE ) | \
+ (1ULL << VIRTIO_SCSI_F_T10_PI ))
+
+/* Features that are specified in VIRTIO SCSI but currently not supported
+ * (stored in vdev.disabled_features by spdk_vhost_scsi_dev_construct()):
+ * - Live migration not supported yet (presumably via SPDK_VHOST_DISABLED_FEATURES - TODO confirm)
+ * - T10 PI
+ */
+#define SPDK_VHOST_SCSI_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
+ (1ULL << VIRTIO_SCSI_F_T10_PI ))
+
+#define MGMT_POLL_PERIOD_US (1000 * 5) /* 5000; presumably the management poller period in microseconds - its use is outside this chunk, TODO confirm */
+
+#define VIRTIO_SCSI_CONTROLQ 0 /* index of the control queue in vsession->virtqueue[]; polled by vdev_mgmt_worker() */
+#define VIRTIO_SCSI_EVENTQ 1 /* index of the event queue in vsession->virtqueue[]; filled by eventq_enqueue() */
+#define VIRTIO_SCSI_REQUESTQ 2 /* index of the first request queue; vdev_worker() polls this one and every queue after it */
+
+enum spdk_scsi_dev_vhost_status { /* State of one SCSI target slot; used both per device and per session. */
+ /* Target ID is empty. */
+ VHOST_SCSI_DEV_EMPTY,
+
+ /* Target is still being added: set by spdk_vhost_scsi_dev_add_tgt() and turned into PRESENT by vhost_scsi_dev_add_tgt_cpl_cb(). */
+ VHOST_SCSI_DEV_ADDING,
+
+ /* Target ID occupied. */
+ VHOST_SCSI_DEV_PRESENT,
+
+ /* Target ID is occupied but removal is in progress. */
+ VHOST_SCSI_DEV_REMOVING,
+
+ /* In session - device (SCSI target) seen but removed; set by process_removed_devs() once the session has detached it. */
+ VHOST_SCSI_DEV_REMOVED,
+};
+
+/** Context for a SCSI target in a vhost device (one entry per target slot of struct spdk_vhost_scsi_dev) */
+struct spdk_scsi_dev_vhost_state {
+ struct spdk_scsi_dev *dev; /* SCSI device occupying the slot, NULL when the slot is free */
+ enum spdk_scsi_dev_vhost_status status; /* device-wide state of the slot */
+ spdk_vhost_event_fn remove_cb; /* optional callback invoked once by remove_scsi_tgt() after the target is destructed, then cleared */
+ void *remove_ctx; /* argument passed to remove_cb */
+};
+
+struct spdk_vhost_scsi_dev { /* vhost-scsi controller: the generic vhost device plus its SCSI target slots. */
+ int ref; /* incremented in vhost_scsi_dev_add_tgt_cpl_cb(), decremented in remove_scsi_tgt(); the struct is freed when it is 0 and registered is false */
+ bool registered; /* true between a successful vhost_dev_register() and a successful vhost_dev_unregister() */
+ struct spdk_vhost_dev vdev; /* embedded generic device; SPDK_CONTAINEROF() recovers this struct from it */
+ struct spdk_scsi_dev_vhost_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; /* indexed by SCSI target number */
+};
+
+/** Context for a SCSI target in a vhost session (the session's own view of one target slot) */
+struct spdk_scsi_dev_session_state {
+ struct spdk_scsi_dev *dev; /* target as attached to this session, NULL when not attached */
+ enum spdk_scsi_dev_vhost_status status; /* session-local state; may differ from the device-wide slot state */
+};
+
+struct spdk_vhost_scsi_session { /* Per-connection state of a vhost-scsi controller. */
+ struct spdk_vhost_session vsession; /* must stay the first member: code casts between the two session pointer types */
+
+ struct spdk_vhost_scsi_dev *svdev; /* controller this session belongs to */
+ /** Local copy of the device state */
+ struct spdk_scsi_dev_session_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS];
+ struct spdk_poller *requestq_poller; /* presumably runs vdev_worker() - registration is outside this chunk, TODO confirm */
+ struct spdk_poller *mgmt_poller; /* presumably runs vdev_mgmt_worker() - registration is outside this chunk, TODO confirm */
+ struct spdk_poller *stop_poller; /* not used in this chunk; presumably drives session shutdown - TODO confirm */
+};
+
+struct spdk_vhost_scsi_task { /* One in-flight virtqueue request; slots live in vq->tasks[] and are indexed by request index. */
+ struct spdk_scsi_task scsi; /* embedded SCSI task; completion callbacks use SPDK_CONTAINEROF() to get back to this struct */
+ struct iovec iovs[SPDK_VHOST_IOVS_MAX]; /* payload buffers; task_data_setup() points scsi.iovs at this array */
+
+ union {
+ struct virtio_scsi_cmd_resp *resp; /* response buffer of a requestq command */
+ struct virtio_scsi_ctrl_tmf_resp *tmf_resp; /* response buffer of a controlq TMF request */
+ };
+
+ struct spdk_vhost_scsi_session *svsession; /* owning session */
+ struct spdk_scsi_dev *scsi_dev; /* target resolved by vhost_scsi_task_init_target(); may be NULL */
+
+ /** Number of bytes that were written; reported to the used ring on completion. */
+ uint32_t used_len;
+
+ int req_idx; /* descriptor index passed back to vhost_vq_used_ring_enqueue() */
+
+ /* If set, the task is currently used for I/O processing. */
+ bool used;
+
+ struct spdk_vhost_virtqueue *vq; /* virtqueue the request came from */
+};
+
+static int vhost_scsi_start(struct spdk_vhost_session *vsession);
+static int vhost_scsi_stop(struct spdk_vhost_session *vsession);
+static void vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev,
+ struct spdk_json_write_ctx *w);
+static void vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev,
+ struct spdk_json_write_ctx *w);
+static int vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev);
+
+static const struct spdk_vhost_dev_backend spdk_vhost_scsi_device_backend = { /* Callback table passed to vhost_dev_register(); its address also identifies vhost-scsi devices in to_scsi_dev(). */
+ .session_ctx_size = sizeof(struct spdk_vhost_scsi_session) - sizeof(struct spdk_vhost_session), /* SCSI-specific bytes that follow the generic session */
+ .start_session = vhost_scsi_start,
+ .stop_session = vhost_scsi_stop,
+ .dump_info_json = vhost_scsi_dump_info_json,
+ .write_config_json = vhost_scsi_write_config_json,
+ .remove_device = vhost_scsi_dev_remove,
+};
+
+/*
+ * Reset a task slot for a new request and mark it as in use.
+ * Fields not touched here (svsession, vq, req_idx, scsi_dev, iovs) keep
+ * whatever value they had.
+ */
+static inline void
+scsi_task_init(struct spdk_vhost_scsi_task *task)
+{
+	task->used_len = 0;
+	task->used = true;
+	/* resp shares a union with tmf_resp, so this clears both pointers. */
+	task->resp = NULL;
+	memset(&task->scsi, 0, sizeof(task->scsi));
+}
+
+/* Drop one reference on the SCSI task embedded in a vhost-scsi task. */
+static void
+vhost_scsi_task_put(struct spdk_vhost_scsi_task *task)
+{
+	struct spdk_scsi_task *scsi_task = &task->scsi;
+
+	spdk_scsi_task_put(scsi_task);
+}
+
+/*
+ * Free callback handed to spdk_scsi_task_construct().
+ * Decrements the session's count of outstanding tasks and marks the task
+ * slot as no longer in use.
+ */
+static void
+vhost_scsi_task_free_cb(struct spdk_scsi_task *scsi_task)
+{
+	struct spdk_vhost_scsi_task *vtask = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task,
+					     scsi);
+	struct spdk_vhost_scsi_session *svsession = vtask->svsession;
+
+	assert(svsession->vsession.task_cnt > 0);
+	svsession->vsession.task_cnt -= 1;
+	vtask->used = false;
+}
+
+/*
+ * Finish the removal of a SCSI target from the controller.
+ *
+ * The slot must be in the REMOVING state.  It is emptied, the SCSI device is
+ * destructed, the optional removal callback is invoked once, and the
+ * controller reference taken when the target was added is dropped.  If that
+ * was the last reference and the controller is already unregistered, the
+ * controller itself is freed, so svdev must not be used after this call.
+ */
+static void
+remove_scsi_tgt(struct spdk_vhost_scsi_dev *svdev,
+		unsigned scsi_tgt_num)
+{
+	struct spdk_scsi_dev_vhost_state *slot = &svdev->scsi_dev_state[scsi_tgt_num];
+	struct spdk_scsi_dev *scsi_dev = slot->dev;
+
+	slot->dev = NULL;
+	assert(slot->status == VHOST_SCSI_DEV_REMOVING);
+	slot->status = VHOST_SCSI_DEV_EMPTY;
+	spdk_scsi_dev_destruct(scsi_dev, NULL, NULL);
+
+	if (slot->remove_cb != NULL) {
+		slot->remove_cb(&svdev->vdev, slot->remove_ctx);
+		slot->remove_cb = NULL;
+	}
+
+	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: removed target 'Target %u'\n",
+		     svdev->vdev.name, scsi_tgt_num);
+
+	svdev->ref--;
+	if (svdev->ref == 0 && !svdev->registered) {
+		free(svdev);
+	}
+}
+
+/*
+ * Completion callback of the "has every session detached the target?" walk
+ * started by process_removed_devs().  ctx carries the target number.
+ */
+static void
+vhost_scsi_dev_process_removed_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
+{
+	struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev,
+					    struct spdk_vhost_scsi_dev, vdev);
+	unsigned tgt_num = (unsigned)(uintptr_t)ctx;
+
+	/* All sessions have detached the device.  If the slot is no longer in
+	 * the REMOVING state, the device was already removed in the meantime.
+	 */
+	if (svdev->scsi_dev_state[tgt_num].status == VHOST_SCSI_DEV_REMOVING) {
+		remove_scsi_tgt(svdev, tgt_num);
+	}
+}
+
+/*
+ * Per-session step of the removal walk; ctx carries the target number.
+ *
+ * Returns -1 while this session still references the target, which aborts
+ * the foreach chain.  The walk is started again from that session's
+ * management poller once it has detached the target.  Returns 0 otherwise.
+ */
+static int
+vhost_scsi_session_process_removed(struct spdk_vhost_dev *vdev,
+				   struct spdk_vhost_session *vsession, void *ctx)
+{
+	struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession;
+	unsigned tgt_num = (unsigned)(uintptr_t)ctx;
+
+	return (svsession->scsi_dev_state[tgt_num].dev != NULL) ? -1 : 0;
+}
+
+/*
+ * Detach from this session every target that is marked REMOVING and has no
+ * pending tasks left, then start a walk over all sessions to check whether
+ * the target can be removed from the controller as well.
+ */
+static void
+process_removed_devs(struct spdk_vhost_scsi_session *svsession)
+{
+	struct spdk_scsi_dev_session_state *slot;
+	struct spdk_scsi_dev *scsi_dev;
+	int tgt;
+
+	for (tgt = 0; tgt < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++tgt) {
+		slot = &svsession->scsi_dev_state[tgt];
+		scsi_dev = slot->dev;
+
+		if (scsi_dev == NULL || slot->status != VHOST_SCSI_DEV_REMOVING) {
+			continue;
+		}
+
+		if (spdk_scsi_dev_has_pending_tasks(scsi_dev, NULL)) {
+			/* Still busy; a later call will look at it again. */
+			continue;
+		}
+
+		/* Detach the device from this session. */
+		spdk_scsi_dev_free_io_channels(scsi_dev);
+		slot->dev = NULL;
+		slot->status = VHOST_SCSI_DEV_REMOVED;
+
+		/* Try to detach it globally. */
+		spdk_vhost_lock();
+		vhost_dev_foreach_session(&svsession->svdev->vdev,
+					  vhost_scsi_session_process_removed,
+					  vhost_scsi_dev_process_removed_cpl_cb,
+					  (void *)(uintptr_t)tgt);
+		spdk_vhost_unlock();
+	}
+}
+
+/*
+ * Post a virtio-scsi event for the given target on the session's event queue.
+ *
+ * One buffer is taken from the event queue's avail ring; if none is available
+ * the event is dropped with an error log.  If the buffer turns out to be
+ * unusable it is still returned to the used ring, with a length of zero.
+ */
+static void
+eventq_enqueue(struct spdk_vhost_scsi_session *svsession, unsigned scsi_dev_num,
+	       uint32_t event, uint32_t reason)
+{
+	struct spdk_vhost_session *vsession = &svsession->vsession;
+	struct spdk_vhost_virtqueue *vq = &vsession->virtqueue[VIRTIO_SCSI_EVENTQ];
+	struct vring_desc *desc, *desc_table;
+	struct virtio_scsi_event *desc_ev;
+	uint32_t desc_table_size;
+	uint32_t written = 0;
+	uint16_t req;
+
+	assert(scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS);
+
+	if (vq->vring.desc == NULL || vhost_vq_avail_ring_get(vq, &req, 1) != 1) {
+		SPDK_ERRLOG("%s: failed to send virtio event (no avail ring entries?).\n",
+			    vsession->name);
+		return;
+	}
+
+	if (vhost_vq_get_desc(vsession, vq, req, &desc, &desc_table, &desc_table_size) != 0 ||
+	    desc->len < sizeof(*desc_ev)) {
+		SPDK_ERRLOG("%s: invalid eventq descriptor at index %"PRIu16".\n",
+			    vsession->name, req);
+		goto out;
+	}
+
+	desc_ev = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*desc_ev));
+	if (desc_ev == NULL) {
+		SPDK_ERRLOG("%s: eventq descriptor at index %"PRIu16" points "
+			    "to unmapped guest memory address %p.\n",
+			    vsession->name, req, (void *)(uintptr_t)desc->addr);
+		goto out;
+	}
+
+	desc_ev->event = event;
+	desc_ev->reason = reason;
+
+	/* LUN address: byte 0 is fixed to 1, byte 1 is the target number. */
+	desc_ev->lun[0] = 1;
+	desc_ev->lun[1] = scsi_dev_num;
+	/* Bytes 2-3 hold the LUN id, which is always 0 here: virtio LUN id 0 can
+	 * refer either to the entire device or to actual LUN 0 (the only one
+	 * supported by vhost for now).  Virtio doesn't specify a strict format
+	 * for these bytes; the layout follows the Linux kernel sources.
+	 * Bytes 4-7 are zeroed as well.
+	 */
+	memset(&desc_ev->lun[2], 0, 6);
+
+	written = sizeof(*desc_ev);
+
+out:
+	vhost_vq_used_ring_enqueue(vsession, vq, req, written);
+}
+
+/*
+ * Return a finished task's descriptor to the used ring of its virtqueue,
+ * reporting task->used_len bytes, and drop the task reference.
+ */
+static void
+submit_completion(struct spdk_vhost_scsi_task *task)
+{
+	struct spdk_vhost_scsi_session *svsession = task->svsession;
+
+	vhost_vq_used_ring_enqueue(&svsession->vsession, task->vq, task->req_idx,
+				   task->used_len);
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Finished task (%p) req_idx=%d\n", task, task->req_idx);
+
+	vhost_scsi_task_put(task);
+}
+
+/* Completion callback for management (TMF) tasks: just complete the request. */
+static void
+vhost_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task)
+{
+	submit_completion(SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi));
+}
+
+/*
+ * Completion callback for regular SCSI commands.
+ *
+ * Copies the SCSI status, the sense data (only when the status is not GOOD)
+ * and the residual byte count into the guest's response buffer, then posts
+ * the request to the virtqueue's "used" ring.
+ */
+static void
+vhost_scsi_task_cpl(struct spdk_scsi_task *scsi_task)
+{
+	struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi);
+	struct virtio_scsi_cmd_resp *resp = task->resp;
+
+	resp->status = task->scsi.status;
+
+	if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) {
+		memcpy(resp->sense, task->scsi.sense_data, task->scsi.sense_data_len);
+		resp->sense_len = task->scsi.sense_data_len;
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Task (%p) req_idx=%d failed - status=%u\n", task, task->req_idx,
+			      task->scsi.status);
+	}
+
+	assert(task->scsi.transfer_len == task->scsi.length);
+	resp->resid = task->scsi.length - task->scsi.data_transferred;
+
+	submit_completion(task);
+}
+
+/*
+ * Queue a fully prepared command on its SCSI device.  The virtio response
+ * code is set to OK before the task is queued.
+ */
+static void
+task_submit(struct spdk_vhost_scsi_task *task)
+{
+	struct spdk_scsi_task *scsi_task = &task->scsi;
+
+	task->resp->response = VIRTIO_SCSI_S_OK;
+	spdk_scsi_dev_queue_task(task->scsi_dev, scsi_task);
+}
+
+/*
+ * Queue a task-management function on the task's SCSI device.  The TMF
+ * response code is set to OK before the task is queued.
+ */
+static void
+mgmt_task_submit(struct spdk_vhost_scsi_task *task, enum spdk_scsi_task_func func)
+{
+	struct spdk_scsi_task *scsi_task = &task->scsi;
+
+	task->tmf_resp->response = VIRTIO_SCSI_S_OK;
+	scsi_task->function = func;
+	spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, scsi_task);
+}
+
+/*
+ * Complete a request that could not be processed.
+ *
+ * The descriptor is returned to the used ring with task->used_len and the
+ * task reference is dropped.  The response code, if a response buffer was
+ * mapped, has already been filled in by the caller.
+ */
+static void
+invalid_request(struct spdk_vhost_scsi_task *task)
+{
+	struct spdk_vhost_session *vsession = &task->svsession->vsession;
+
+	/* Log while the task is still ours: vhost_scsi_task_put() below hands
+	 * it back, so its fields must not be read afterwards.
+	 */
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Invalid request (status=%" PRIu8")\n",
+		      task->resp ? task->resp->response : -1);
+
+	vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx,
+				   task->used_len);
+	vhost_scsi_task_put(task);
+}
+
+/*
+ * Resolve the 8-byte virtio LUN address of a request to a SCSI device and LUN.
+ *
+ * task->scsi_dev is set to the session's device for the addressed target
+ * (possibly NULL).  task->scsi.target_port and task->scsi.lun are filled in
+ * only when the target is present in this session.
+ *
+ * Returns 0 when the request may proceed, -1 when the address is malformed
+ * or the target slot is empty.
+ */
+static int
+vhost_scsi_task_init_target(struct spdk_vhost_scsi_task *task, const __u8 *lun)
+{
+	struct spdk_scsi_dev_session_state *tgt;
+	uint16_t lun_id;
+
+	SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN", lun, 8);
+
+	/* First byte must be 1 and second is target */
+	if (lun[0] != 1 || lun[1] >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+		return -1;
+	}
+
+	tgt = &task->svsession->scsi_dev_state[lun[1]];
+	task->scsi_dev = tgt->dev;
+
+	if (tgt->dev != NULL && tgt->status == VHOST_SCSI_DEV_PRESENT) {
+		/* Bytes 2-3 carry the LUN id in their low 14 bits. */
+		lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF;
+
+		task->scsi.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0);
+		task->scsi.lun = spdk_scsi_dev_get_lun(tgt->dev, lun_id);
+		return 0;
+	}
+
+	/* If dev has been hotdetached, return 0 to allow sending
+	 * additional hotremove event via sense codes.
+	 */
+	return (tgt->status == VHOST_SCSI_DEV_EMPTY) ? -1 : 0;
+}
+
+/*
+ * Handle a single request taken from the control virtqueue.
+ *
+ * The first descriptor of the chain carries the control request, the next
+ * one the response buffer.  Only VIRTIO_SCSI_T_TMF requests with the
+ * VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET subtype are forwarded to the SCSI
+ * layer; those complete later through vhost_scsi_task_mgmt_cpl().  Every
+ * other request is answered (or rejected) right away: the descriptor is
+ * returned to the used ring and the task is released before returning.
+ */
+static void
+process_ctrl_request(struct spdk_vhost_scsi_task *task)
+{
+	struct spdk_vhost_session *vsession = &task->svsession->vsession;
+	struct vring_desc *desc, *desc_table;
+	struct virtio_scsi_ctrl_tmf_req *ctrl_req;
+	struct virtio_scsi_ctrl_an_resp *an_resp;
+	uint32_t desc_table_size, used_len = 0;
+	int rc;
+
+	spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_mgmt_cpl, vhost_scsi_task_free_cb);
+	rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table,
+			       &desc_table_size);
+	if (spdk_unlikely(rc != 0)) {
+		SPDK_ERRLOG("%s: invalid controlq descriptor at index %d.\n",
+			    vsession->name, task->req_idx);
+		goto out;
+	}
+
+	ctrl_req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*ctrl_req));
+	if (ctrl_req == NULL) {
+		SPDK_ERRLOG("%s: invalid task management request at index %d.\n",
+			    vsession->name, task->req_idx);
+		goto out;
+	}
+
+	/* desc->addr is a 64-bit guest address; go through uintptr_t so the
+	 * conversion to a pointer is well-formed on every target, as
+	 * eventq_enqueue() does.
+	 */
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE,
+		      "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; kickfd %d; size %d\n",
+		      task->req_idx, desc, (void *)(uintptr_t)desc->addr, desc->len, desc->flags,
+		      task->vq->last_used_idx, task->vq->vring.kickfd, task->vq->vring.size);
+	SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "Request descriptor", (uint8_t *)ctrl_req, desc->len);
+
+	/* The return value is not needed here: an unresolved target leaves
+	 * task->scsi_dev NULL, which is checked in the TMF branch below.
+	 */
+	vhost_scsi_task_init_target(task, ctrl_req->lun);
+
+	vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+	if (spdk_unlikely(desc == NULL)) {
+		SPDK_ERRLOG("%s: no response descriptor for controlq request %d.\n",
+			    vsession->name, task->req_idx);
+		goto out;
+	}
+
+	/* Process the TMF request */
+	switch (ctrl_req->type) {
+	case VIRTIO_SCSI_T_TMF:
+		task->tmf_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->tmf_resp));
+		if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_tmf_resp) || task->tmf_resp == NULL)) {
+			SPDK_ERRLOG("%s: TMF response descriptor at index %d points to invalid guest memory region\n",
+				    vsession->name, task->req_idx);
+			goto out;
+		}
+
+		/* Check if we are processing a valid request */
+		if (task->scsi_dev == NULL) {
+			task->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET;
+			break;
+		}
+
+		switch (ctrl_req->subtype) {
+		case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
+			/* Handle LUN reset */
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: LUN reset\n", vsession->name);
+
+			/* Completed asynchronously by vhost_scsi_task_mgmt_cpl(). */
+			mgmt_task_submit(task, SPDK_SCSI_TASK_FUNC_LUN_RESET);
+			return;
+		default:
+			task->tmf_resp->response = VIRTIO_SCSI_S_ABORTED;
+			/* Unsupported command */
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: unsupported TMF command %x\n",
+				      vsession->name, ctrl_req->subtype);
+			break;
+		}
+		break;
+	case VIRTIO_SCSI_T_AN_QUERY:
+	case VIRTIO_SCSI_T_AN_SUBSCRIBE: {
+		an_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*an_resp));
+		if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_an_resp) || an_resp == NULL)) {
+			SPDK_WARNLOG("%s: asynchronous response descriptor points to invalid guest memory region\n",
+				     vsession->name);
+			goto out;
+		}
+
+		/* Asynchronous notifications are always rejected. */
+		an_resp->response = VIRTIO_SCSI_S_ABORTED;
+		break;
+	}
+	default:
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: Unsupported control command %x\n",
+			      vsession->name, ctrl_req->type);
+		break;
+	}
+
+	used_len = sizeof(struct virtio_scsi_ctrl_tmf_resp);
+out:
+	vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, used_len);
+	vhost_scsi_task_put(task);
+}
+
+/*
+ * Process task's descriptor chain and setup data related fields.
+ *
+ * On success *req points at the mapped request header, task->resp at the
+ * mapped response buffer, task->iovs / task->scsi.iovcnt describe the payload,
+ * task->scsi.length and task->scsi.transfer_len hold the payload size and
+ * task->used_len the number of bytes reported as written (the response, plus
+ * the payload for FROM_DEV transfers). The transfer direction is taken from
+ * the second descriptor of the chain: writable means FROM_DEV, read-only
+ * means TO_DEV.
+ *
+ * Return
+ * -1 if request is invalid and must be aborted,
+ * 0 if all data are set.
+ */
+static int
+task_data_setup(struct spdk_vhost_scsi_task *task,
+ struct virtio_scsi_cmd_req **req)
+{
+ struct spdk_vhost_session *vsession = &task->svsession->vsession;
+ struct vring_desc *desc, *desc_table;
+ struct iovec *iovs = task->iovs;
+ uint16_t iovcnt = 0;
+ uint32_t desc_table_len, len = 0;
+ int rc;
+
+ spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_cpl, vhost_scsi_task_free_cb);
+
+ rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table, &desc_table_len);
+ /* First descriptor must be readable and large enough to hold the request header */
+ if (spdk_unlikely(rc != 0 || vhost_vring_desc_is_wr(desc) ||
+ desc->len < sizeof(struct virtio_scsi_cmd_req))) {
+ SPDK_WARNLOG("%s: invalid first request descriptor at index %"PRIu16".\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ *req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(**req));
+ if (spdk_unlikely(*req == NULL)) {
+ SPDK_WARNLOG("%s: request descriptor at index %d points to invalid guest memory region\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ /* Each request must have at least 2 descriptors (e.g. request and response) */
+ vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (desc == NULL) {
+ SPDK_WARNLOG("%s: descriptor chain at index %d contains neither payload nor response buffer.\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+ task->scsi.dxfer_dir = vhost_vring_desc_is_wr(desc) ? SPDK_SCSI_DIR_FROM_DEV :
+ SPDK_SCSI_DIR_TO_DEV;
+ task->scsi.iovs = iovs;
+
+ if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) {
+ /*
+ * FROM_DEV (READ): [RD_req][WR_resp][WR_buf0]...[WR_bufN]
+ * The current descriptor is therefore the response buffer.
+ */
+ task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) {
+ SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+ rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_WARNLOG("%s: invalid descriptor chain at request index %d (descriptor id overflow?).\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ if (desc == NULL) {
+ /*
+ * TEST UNIT READY command and some others might not contain any payload and this is not an error.
+ * The task is set up with a single zero-length iovec.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA,
+ "No payload descriptors for FROM DEV command req_idx=%"PRIu16".\n", task->req_idx);
+ SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "CDB=", (*req)->cdb, VIRTIO_SCSI_CDB_SIZE);
+ task->used_len = sizeof(struct virtio_scsi_cmd_resp);
+ task->scsi.iovcnt = 1;
+ task->scsi.iovs[0].iov_len = 0;
+ task->scsi.length = 0;
+ task->scsi.transfer_len = 0;
+ return 0;
+ }
+
+ /* All remaining descriptors are data and must be writable. */
+ while (desc) {
+ if (spdk_unlikely(!vhost_vring_desc_is_wr(desc))) {
+ SPDK_WARNLOG("%s: FROM DEV cmd: descriptor nr %" PRIu16" in payload chain is read only.\n",
+ vsession->name, iovcnt);
+ goto invalid_task;
+ }
+
+ if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) {
+ goto invalid_task;
+ }
+ len += desc->len;
+
+ rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_WARNLOG("%s: invalid payload in descriptor chain starting at index %d.\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+ }
+
+ task->used_len = sizeof(struct virtio_scsi_cmd_resp) + len; /* response plus the whole payload */
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "TO DEV");
+ /*
+ * TO_DEV (WRITE):[RD_req][RD_buf0]...[RD_bufN][WR_resp]
+ * No need to check descriptor WR flag as this is done while setting scsi.dxfer_dir.
+ */
+
+ /* Process descriptors up to response (the first writable descriptor). */
+ while (!vhost_vring_desc_is_wr(desc)) {
+ if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) {
+ goto invalid_task;
+ }
+ len += desc->len;
+
+ vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (spdk_unlikely(desc == NULL)) {
+ SPDK_WARNLOG("%s: TO_DEV cmd: no response descriptor.\n", vsession->name);
+ goto invalid_task;
+ }
+ }
+
+ task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) {
+ SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ task->used_len = sizeof(struct virtio_scsi_cmd_resp); /* only the response is written for TO_DEV */
+ }
+
+ task->scsi.iovcnt = iovcnt;
+ task->scsi.length = len;
+ task->scsi.transfer_len = len;
+ return 0;
+
+invalid_task:
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "%s: Invalid task at index %"PRIu16".\n",
+ vsession->name, task->req_idx);
+ return -1;
+}
+
+/*
+ * Prepare a requestq task for submission.
+ *
+ * Returns
+ *   0  the task is ready to be queued on its SCSI device,
+ *   1  the addressed LUN does not exist; the task has been completed
+ *      locally by spdk_scsi_task_process_null_lun() and only needs its
+ *      completion to be posted,
+ *  <0  the request is invalid and must be rejected.
+ */
+static int
+process_request(struct spdk_vhost_scsi_task *task)
+{
+	struct virtio_scsi_cmd_req *req;
+	int rc;
+
+	rc = task_data_setup(task, &req);
+	if (rc != 0) {
+		return rc;
+	}
+
+	if (spdk_unlikely(vhost_scsi_task_init_target(task, req->lun) != 0)) {
+		task->resp->response = VIRTIO_SCSI_S_BAD_TARGET;
+		return -1;
+	}
+
+	task->scsi.cdb = req->cdb;
+	SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "request CDB", req->cdb, VIRTIO_SCSI_CDB_SIZE);
+
+	if (spdk_likely(task->scsi.lun != NULL)) {
+		return 0;
+	}
+
+	spdk_scsi_task_process_null_lun(&task->scsi);
+	task->resp->response = VIRTIO_SCSI_S_OK;
+	return 1;
+}
+
+/*
+ * Start processing the request whose descriptor chain begins at req_idx.
+ *
+ * The task slot is taken from vq->tasks by index.  If that slot is still in
+ * use, the guest has resubmitted an index that is already pending; the
+ * request is then returned to the used ring with a length of zero.
+ * Requests from the control queue go to process_ctrl_request(); all others
+ * are prepared by process_request() and then submitted, completed early or
+ * rejected according to its result.
+ */
+static void
+process_scsi_task(struct spdk_vhost_session *vsession,
+		  struct spdk_vhost_virtqueue *vq,
+		  uint16_t req_idx)
+{
+	struct spdk_vhost_scsi_task *task;
+	int result;
+
+	task = &((struct spdk_vhost_scsi_task *)vq->tasks)[req_idx];
+	if (spdk_unlikely(task->used)) {
+		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
+			    vsession->name, req_idx);
+		vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
+		return;
+	}
+
+	vsession->task_cnt++;
+	scsi_task_init(task);
+
+	if (spdk_unlikely(vq->vring_idx == VIRTIO_SCSI_CONTROLQ)) {
+		process_ctrl_request(task);
+	} else {
+		result = process_request(task);
+		/* spdk_likely() comes from spdk/likely.h, which this file
+		 * includes, and matches the spdk_unlikely() used elsewhere.
+		 */
+		if (spdk_likely(result == 0)) {
+			task_submit(task);
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d submitted ======\n", task,
+				      task->req_idx);
+		} else if (result > 0) {
+			vhost_scsi_task_cpl(&task->scsi);
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d finished early ======\n", task,
+				      task->req_idx);
+		} else {
+			invalid_request(task);
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d failed ======\n", task,
+				      task->req_idx);
+		}
+	}
+}
+
+/*
+ * Fetch up to 32 new request indexes from a virtqueue's avail ring and
+ * process them in order.  Indexes outside the ring are returned to the used
+ * ring with a length of zero.
+ */
+static void
+process_vq(struct spdk_vhost_scsi_session *svsession, struct spdk_vhost_virtqueue *vq)
+{
+	struct spdk_vhost_session *vsession = &svsession->vsession;
+	uint16_t reqs[32];
+	uint16_t reqs_cnt, i, idx;
+
+	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
+	assert(reqs_cnt <= 32);
+
+	for (i = 0; i < reqs_cnt; i++) {
+		idx = reqs[i];
+
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Starting processing request idx %"PRIu16"======\n",
+			      idx);
+
+		if (spdk_likely(idx < vq->vring.size)) {
+			process_scsi_task(vsession, vq, idx);
+			continue;
+		}
+
+		SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
+			    vsession->name, idx, vq->vring.size);
+		vhost_vq_used_ring_enqueue(vsession, vq, idx, 0);
+	}
+}
+
+/*
+ * Management poller body for a session (arg is the spdk_vhost_scsi_session).
+ * Finishes pending target removals, signals the event queue, then processes
+ * and signals the control queue.  Always reports SPDK_POLLER_BUSY.
+ */
+static int
+vdev_mgmt_worker(void *arg)
+{
+	struct spdk_vhost_scsi_session *svsession = arg;
+	struct spdk_vhost_session *vsession = &svsession->vsession;
+	struct spdk_vhost_virtqueue *eventq = &vsession->virtqueue[VIRTIO_SCSI_EVENTQ];
+	struct spdk_vhost_virtqueue *controlq = &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ];
+
+	process_removed_devs(svsession);
+	vhost_vq_used_signal(vsession, eventq);
+
+	process_vq(svsession, controlq);
+	vhost_vq_used_signal(vsession, controlq);
+
+	return SPDK_POLLER_BUSY;
+}
+
+/*
+ * Request poller body for a session (arg is the spdk_vhost_scsi_session).
+ * Processes every request queue, i.e. all virtqueues from
+ * VIRTIO_SCSI_REQUESTQ up to max_queues, then signals the session's used
+ * rings.  Always reports SPDK_POLLER_BUSY.
+ */
+static int
+vdev_worker(void *arg)
+{
+	struct spdk_vhost_scsi_session *svsession = arg;
+	struct spdk_vhost_session *vsession = &svsession->vsession;
+	uint32_t q_idx = VIRTIO_SCSI_REQUESTQ;
+
+	while (q_idx < vsession->max_queues) {
+		process_vq(svsession, &vsession->virtqueue[q_idx]);
+		q_idx++;
+	}
+
+	vhost_session_used_signal(vsession);
+
+	return SPDK_POLLER_BUSY;
+}
+
+/*
+ * Convert a generic vhost device to its vhost-scsi container.
+ * Returns NULL for a NULL argument and, after logging an error, for a device
+ * whose backend is not the vhost-scsi backend.
+ */
+static struct spdk_vhost_scsi_dev *
+to_scsi_dev(struct spdk_vhost_dev *ctrlr)
+{
+	if (ctrlr == NULL) {
+		return NULL;
+	}
+
+	if (ctrlr->backend == &spdk_vhost_scsi_device_backend) {
+		return SPDK_CONTAINEROF(ctrlr, struct spdk_vhost_scsi_dev, vdev);
+	}
+
+	SPDK_ERRLOG("%s: not a vhost-scsi device.\n", ctrlr->name);
+	return NULL;
+}
+
+/*
+ * Convert a generic vhost session to its vhost-scsi session.  The generic
+ * session is the first member of the SCSI one, so a cast is sufficient; the
+ * backend type is only checked by an assertion.
+ */
+static struct spdk_vhost_scsi_session *
+to_scsi_session(struct spdk_vhost_session *vsession)
+{
+	struct spdk_vhost_scsi_session *svsession;
+
+	assert(vsession->vdev->backend == &spdk_vhost_scsi_device_backend);
+	svsession = (struct spdk_vhost_scsi_session *)vsession;
+	return svsession;
+}
+
+/*
+ * Create a vhost-scsi controller and register it under the given name.
+ *
+ * Returns 0 on success, -ENOMEM when the controller cannot be allocated, or
+ * the error returned by vhost_dev_register(); on failure nothing is left
+ * allocated.
+ */
+int
+spdk_vhost_scsi_dev_construct(const char *name, const char *cpumask)
+{
+	struct spdk_vhost_scsi_dev *svdev;
+	int rc;
+
+	svdev = calloc(1, sizeof(*svdev));
+	if (svdev == NULL) {
+		return -ENOMEM;
+	}
+
+	svdev->vdev.disabled_features = SPDK_VHOST_SCSI_DISABLED_FEATURES;
+	svdev->vdev.virtio_features = SPDK_VHOST_SCSI_FEATURES;
+
+	spdk_vhost_lock();
+	rc = vhost_dev_register(&svdev->vdev, name, cpumask,
+				&spdk_vhost_scsi_device_backend);
+	if (rc == 0) {
+		svdev->registered = true;
+	} else {
+		free(svdev);
+	}
+	spdk_vhost_unlock();
+
+	return rc;
+}
+
+/*
+ * Backend "remove_device" callback.
+ *
+ * If a target slot is still occupied while vdev->registered is set, the
+ * removal is refused with -EBUSY; otherwise each remaining target is
+ * force-removed.  The device is then unregistered and, if no target holds a
+ * reference any more, freed.  Returns 0 on success or a negative errno.
+ */
+static int
+vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev)
+{
+	struct spdk_vhost_scsi_dev *svdev = to_scsi_dev(vdev);
+	int tgt, rc;
+
+	assert(svdev != NULL);
+
+	for (tgt = 0; tgt < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++tgt) {
+		if (svdev->scsi_dev_state[tgt].dev == NULL) {
+			continue;
+		}
+
+		if (vdev->registered) {
+			SPDK_ERRLOG("%s: SCSI target %d is still present.\n", vdev->name, tgt);
+			return -EBUSY;
+		}
+
+		rc = spdk_vhost_scsi_dev_remove_tgt(vdev, tgt, NULL, NULL);
+		if (rc != 0) {
+			SPDK_ERRLOG("%s: failed to force-remove target %d\n", vdev->name, tgt);
+			return rc;
+		}
+	}
+
+	rc = vhost_dev_unregister(vdev);
+	if (rc != 0) {
+		return rc;
+	}
+
+	svdev->registered = false;
+	if (svdev->ref == 0) {
+		/* No target is keeping the controller alive. */
+		free(svdev);
+	}
+
+	return 0;
+}
+
+/*
+ * Get the SCSI device attached to target slot `num` of a vhost-scsi
+ * controller.
+ *
+ * Returns the device, or NULL when the slot is not in the PRESENT state.
+ * Passing a non-SCSI device or an out-of-range target number is a caller bug
+ * and trips an assertion in debug builds; builds without assertions return
+ * NULL instead of dereferencing a NULL pointer or reading past the slot
+ * array.
+ */
+struct spdk_scsi_dev *
+spdk_vhost_scsi_dev_get_tgt(struct spdk_vhost_dev *vdev, uint8_t num)
+{
+	struct spdk_vhost_scsi_dev *svdev;
+
+	assert(num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS);
+	svdev = to_scsi_dev(vdev);
+	assert(svdev != NULL);
+
+	/* Runtime guards for builds compiled with NDEBUG. */
+	if (spdk_unlikely(svdev == NULL || num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS)) {
+		return NULL;
+	}
+
+	if (svdev->scsi_dev_state[num].status != VHOST_SCSI_DEV_PRESENT) {
+		return NULL;
+	}
+
+	assert(svdev->scsi_dev_state[num].dev != NULL);
+	return svdev->scsi_dev_state[num].dev;
+}
+
+/*
+ * Hot-remove callback registered with spdk_scsi_dev_construct(); arg is the
+ * owning vhost-scsi controller.  Finds the target slot holding the LUN's
+ * SCSI device and removes the whole target.  Does nothing if no slot holds
+ * that device, i.e. the entire device has already been removed.
+ */
+static void
+vhost_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg)
+{
+	struct spdk_vhost_scsi_dev *svdev = arg;
+	const struct spdk_scsi_dev *scsi_dev;
+	unsigned tgt;
+
+	assert(lun != NULL);
+	assert(svdev != NULL);
+	scsi_dev = spdk_scsi_lun_get_dev(lun);
+
+	for (tgt = 0; tgt < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; tgt++) {
+		if (svdev->scsi_dev_state[tgt].dev != scsi_dev) {
+			continue;
+		}
+
+		/* remove entire device */
+		spdk_vhost_scsi_dev_remove_tgt(&svdev->vdev, tgt, NULL, NULL);
+		return;
+	}
+}
+
+/*
+ * Completion callback of the "add target" session walk; ctx carries the
+ * target number.  Once all sessions have added the target, the slot moves
+ * from ADDING to PRESENT and the target takes a reference on the controller.
+ */
+static void
+vhost_scsi_dev_add_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
+{
+	struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev,
+					    struct spdk_vhost_scsi_dev, vdev);
+	unsigned tgt_num = (unsigned)(uintptr_t)ctx;
+	struct spdk_scsi_dev_vhost_state *slot = &svdev->scsi_dev_state[tgt_num];
+
+	assert(slot->status == VHOST_SCSI_DEV_ADDING);
+	slot->status = VHOST_SCSI_DEV_PRESENT;
+	svdev->ref += 1;
+}
+
+/*
+ * Per-session step of the "add target" walk; ctx carries the target number.
+ *
+ * Attaches the controller's SCSI device to a started session, allocates its
+ * I/O channels and, if the driver negotiated hotplug, sends a rescan event.
+ * Always returns 0 so that the walk continues with the remaining sessions.
+ */
+static int
+vhost_scsi_session_add_tgt(struct spdk_vhost_dev *vdev,
+			   struct spdk_vhost_session *vsession, void *ctx)
+{
+	struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession;
+	unsigned tgt_num = (unsigned)(uintptr_t)ctx;
+	struct spdk_scsi_dev_session_state *session_slot = &svsession->scsi_dev_state[tgt_num];
+
+	if (!vsession->started || session_slot->dev != NULL) {
+		/* Nothing to do. */
+		return 0;
+	}
+
+	session_slot->dev = svsession->svdev->scsi_dev_state[tgt_num].dev;
+	session_slot->status = VHOST_SCSI_DEV_PRESENT;
+
+	if (spdk_scsi_dev_allocate_io_channels(session_slot->dev) != 0) {
+		SPDK_ERRLOG("%s: Couldn't allocate io channnel for SCSI target %u.\n",
+			    vsession->name, tgt_num);
+
+		/* Unset the SCSI target so that all I/O to it will be rejected,
+		 * and mark the slot EMPTY so that we won't reply with SCSI
+		 * hotremove sense codes - the device hasn't ever been added.
+		 */
+		session_slot->dev = NULL;
+		session_slot->status = VHOST_SCSI_DEV_EMPTY;
+
+		/* Return with no error. We'll continue allocating io_channels for
+		 * other sessions on this device in hopes they succeed. The sessions
+		 * that failed to allocate io_channels simply won't be able to
+		 * detect the SCSI target, nor do any I/O to it.
+		 */
+		return 0;
+	}
+
+	if (!vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) {
+		SPDK_NOTICELOG("%s: driver does not support hotplug. "
+			       "Please restart it or perform a rescan.\n",
+			       vsession->name);
+		return 0;
+	}
+
+	eventq_enqueue(svsession, tgt_num,
+		       VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN);
+	return 0;
+}
+
+/*
+ * Attach a bdev to a vhost-scsi controller as a new single-LUN SCSI target.
+ *
+ * A negative scsi_tgt_num selects the first free target slot.  Returns the
+ * target number that was used, or
+ *   -ENOSPC  no free slot was found,
+ *   -EINVAL  the target number is too big, bdev_name is NULL, or the SCSI
+ *            device could not be created,
+ *   -EEXIST  the requested slot is already occupied.
+ * The sessions are notified through vhost_dev_foreach_session(); the slot
+ * becomes PRESENT in vhost_scsi_dev_add_tgt_cpl_cb().
+ */
+int
+spdk_vhost_scsi_dev_add_tgt(struct spdk_vhost_dev *vdev, int scsi_tgt_num,
+			    const char *bdev_name)
+{
+	struct spdk_vhost_scsi_dev *svdev = to_scsi_dev(vdev);
+	struct spdk_scsi_dev_vhost_state *slot;
+	char target_name[SPDK_SCSI_DEV_MAX_NAME];
+	const char *bdev_names_list[1];
+	int lun_id_list[1];
+
+	assert(svdev != NULL);
+
+	if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+		SPDK_ERRLOG("%s: SCSI target number is too big (got %d, max %d)\n",
+			    vdev->name, scsi_tgt_num, SPDK_VHOST_SCSI_CTRLR_MAX_DEVS);
+		return -EINVAL;
+	}
+
+	if (scsi_tgt_num < 0) {
+		/* Pick the first slot that holds no device. */
+		scsi_tgt_num = 0;
+		while (scsi_tgt_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS &&
+		       svdev->scsi_dev_state[scsi_tgt_num].dev != NULL) {
+			scsi_tgt_num++;
+		}
+
+		if (scsi_tgt_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+			SPDK_ERRLOG("%s: all SCSI target slots are already in use.\n", vdev->name);
+			return -ENOSPC;
+		}
+	}
+
+	if (bdev_name == NULL) {
+		SPDK_ERRLOG("No lun name specified\n");
+		return -EINVAL;
+	}
+
+	slot = &svdev->scsi_dev_state[scsi_tgt_num];
+	if (slot->dev != NULL) {
+		SPDK_ERRLOG("%s: SCSI target %u already occupied\n", vdev->name, scsi_tgt_num);
+		return -EEXIST;
+	}
+
+	/* At this stage only one LUN per target: LUN 0 backed by bdev_name. */
+	snprintf(target_name, sizeof(target_name), "Target %u", scsi_tgt_num);
+	bdev_names_list[0] = bdev_name;
+	lun_id_list[0] = 0;
+
+	slot->status = VHOST_SCSI_DEV_ADDING;
+	slot->dev = spdk_scsi_dev_construct(target_name, bdev_names_list, lun_id_list, 1,
+					    SPDK_SPC_PROTOCOL_IDENTIFIER_SAS,
+					    vhost_scsi_lun_hotremove, svdev);
+	if (slot->dev == NULL) {
+		slot->status = VHOST_SCSI_DEV_EMPTY;
+		SPDK_ERRLOG("%s: couldn't create SCSI target %u using bdev '%s'\n",
+			    vdev->name, scsi_tgt_num, bdev_name);
+		return -EINVAL;
+	}
+	spdk_scsi_dev_add_port(slot->dev, 0, "vhost");
+
+	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: added SCSI target %u using bdev '%s'\n",
+		     vdev->name, scsi_tgt_num, bdev_name);
+
+	vhost_dev_foreach_session(vdev, vhost_scsi_session_add_tgt,
+				  vhost_scsi_dev_add_tgt_cpl_cb,
+				  (void *)(uintptr_t)scsi_tgt_num);
+	return scsi_tgt_num;
+}
+
+/* Context handed to the per-session and completion callbacks of a target hot-remove. */
+struct scsi_tgt_hotplug_ctx {
+	/* Slot index of the SCSI target being removed. */
+	unsigned int scsi_tgt_num;
+	/* Set to true by vhost_scsi_session_remove_tgt() when a session still holds the target. */
+	bool async_fini;
+};
+
+/*
+ * Completion callback for the hot-remove session walk.
+ *
+ * When no session flagged the removal as asynchronous, the target is removed
+ * from the controller right here. The context is freed in either case.
+ */
+static void
+vhost_scsi_dev_remove_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *_ctx)
+{
+	struct scsi_tgt_hotplug_ctx *hotplug_ctx = _ctx;
+	struct spdk_vhost_scsi_dev *scsi_ctrlr;
+
+	if (!hotplug_ctx->async_fini) {
+		/* No active session took over the removal, so finish it now. */
+		scsi_ctrlr = SPDK_CONTAINEROF(vdev, struct spdk_vhost_scsi_dev, vdev);
+		remove_scsi_tgt(scsi_ctrlr, hotplug_ctx->scsi_tgt_num);
+	}
+
+	free(hotplug_ctx);
+}
+
+/*
+ * Per-session step of a target hot-remove.
+ *
+ * A session that is started and still holds the target marks it as
+ * VHOST_SCSI_DEV_REMOVING, notifies the guest if it negotiated
+ * VIRTIO_SCSI_F_HOTPLUG, and sets ctx->async_fini. Always returns 0.
+ */
+static int
+vhost_scsi_session_remove_tgt(struct spdk_vhost_dev *vdev,
+			      struct spdk_vhost_session *vsession, void *_ctx)
+{
+	struct scsi_tgt_hotplug_ctx *hotplug_ctx = _ctx;
+	unsigned tgt_idx = hotplug_ctx->scsi_tgt_num;
+	struct spdk_vhost_scsi_session *scsi_session = (struct spdk_vhost_scsi_session *)vsession;
+	struct spdk_scsi_dev_session_state *tgt_state = &scsi_session->scsi_dev_state[tgt_idx];
+
+	if (vsession->started && tgt_state->dev != NULL) {
+		/* Flag the target so it gets torn down later. */
+		assert(tgt_state->status == VHOST_SCSI_DEV_PRESENT);
+		tgt_state->status = VHOST_SCSI_DEV_REMOVING;
+
+		/* Tell the guest about the removal when it can handle hotplug events. */
+		if (vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) {
+			eventq_enqueue(scsi_session, tgt_idx,
+				       VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED);
+		}
+
+		/* The session's management poller removes the target once all of
+		 * its pending I/O has finished, so the completion callback must
+		 * not remove it synchronously.
+		 */
+		hotplug_ctx->async_fini = true;
+	}
+
+	return 0;
+}
+
+/*
+ * Begin removing the SCSI target in slot `scsi_tgt_num` of controller `vdev`.
+ *
+ * On success the slot is marked VHOST_SCSI_DEV_REMOVING, cb_fn/cb_arg are
+ * stored in the slot's remove_cb/remove_ctx (presumably invoked once the
+ * removal completes - confirm in remove_scsi_tgt()), and every session is
+ * asked to drop the target.
+ *
+ * Returns 0 when the removal was initiated, or a negative errno:
+ *   -EINVAL  slot number out of range
+ *   -ENODEV  slot is empty or its target is still being added
+ *   -EBUSY   target is not in the PRESENT state (e.g. already being removed)
+ *   -ENOMEM  context allocation failed
+ */
+int
+spdk_vhost_scsi_dev_remove_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num,
+			       spdk_vhost_event_fn cb_fn, void *cb_arg)
+{
+	struct spdk_vhost_scsi_dev *svdev;
+	struct spdk_scsi_dev_vhost_state *scsi_dev_state;
+	struct scsi_tgt_hotplug_ctx *ctx;
+
+	if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+		SPDK_ERRLOG("%s: invalid SCSI target number %u\n", vdev->name, scsi_tgt_num);
+		return -EINVAL;
+	}
+
+	svdev = to_scsi_dev(vdev);
+	assert(svdev != NULL);
+	scsi_dev_state = &svdev->scsi_dev_state[scsi_tgt_num];
+
+	/* Check for an unoccupied slot first; testing "status != PRESENT" before
+	 * this made the -ENODEV path unreachable and reported empty slots as busy.
+	 */
+	if (scsi_dev_state->dev == NULL || scsi_dev_state->status == VHOST_SCSI_DEV_ADDING) {
+		SPDK_ERRLOG("%s: SCSI target %u is not occupied\n", vdev->name, scsi_tgt_num);
+		return -ENODEV;
+	}
+
+	assert(scsi_dev_state->status != VHOST_SCSI_DEV_EMPTY);
+	if (scsi_dev_state->status != VHOST_SCSI_DEV_PRESENT) {
+		/* Most likely a removal is already in flight for this slot. */
+		return -EBUSY;
+	}
+
+	ctx = calloc(1, sizeof(*ctx));
+	if (ctx == NULL) {
+		SPDK_ERRLOG("calloc failed\n");
+		return -ENOMEM;
+	}
+
+	ctx->scsi_tgt_num = scsi_tgt_num;
+	ctx->async_fini = false;
+
+	scsi_dev_state->remove_cb = cb_fn;
+	scsi_dev_state->remove_ctx = cb_arg;
+	scsi_dev_state->status = VHOST_SCSI_DEV_REMOVING;
+
+	/* ctx is freed by vhost_scsi_dev_remove_tgt_cpl_cb(). */
+	vhost_dev_foreach_session(vdev, vhost_scsi_session_remove_tgt,
+				  vhost_scsi_dev_remove_tgt_cpl_cb, ctx);
+	return 0;
+}
+
+/*
+ * Create vhost SCSI controllers from the configuration file.
+ *
+ * Every section whose name starts with "VhostScsi" must carry a numeric
+ * suffix. Its "Name" and "Cpumask" values are used to construct the
+ * controller, and each "Target <num> <bdev>" entry adds one single-LUN
+ * target. A negative <num> requests automatic slot selection.
+ *
+ * Returns 0 on success and -1 on the first error encountered.
+ */
+int
+vhost_scsi_controller_construct(void)
+{
+	struct spdk_conf_section *sp = spdk_conf_first_section(NULL);
+	struct spdk_vhost_dev *vdev;
+	int i, dev_num;
+	unsigned ctrlr_num = 0;
+	char *bdev_name, *tgt_num_str, *end;
+	char *cpumask;
+	char *name;
+	long parsed_num;
+
+	while (sp != NULL) {
+		if (!spdk_conf_section_match_prefix(sp, "VhostScsi")) {
+			sp = spdk_conf_next_section(sp);
+			continue;
+		}
+
+		if (sscanf(spdk_conf_section_get_name(sp), "VhostScsi%u", &ctrlr_num) != 1) {
+			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+				    spdk_conf_section_get_name(sp));
+			return -1;
+		}
+
+		name = spdk_conf_section_get_val(sp, "Name");
+		cpumask = spdk_conf_section_get_val(sp, "Cpumask");
+
+		if (spdk_vhost_scsi_dev_construct(name, cpumask) < 0) {
+			return -1;
+		}
+
+		vdev = spdk_vhost_dev_find(name);
+		assert(vdev);
+
+		for (i = 0; ; i++) {
+			/* Stop at the first missing "Target" entry. */
+			if (spdk_conf_section_get_nval(sp, "Target", i) == NULL) {
+				break;
+			}
+
+			tgt_num_str = spdk_conf_section_get_nmval(sp, "Target", i, 0);
+			if (tgt_num_str == NULL) {
+				SPDK_ERRLOG("%s: invalid or missing SCSI target number\n", name);
+				return -1;
+			}
+
+			/* Reject non-numeric input and trailing garbage instead of
+			 * silently treating it as target 0.
+			 */
+			parsed_num = strtol(tgt_num_str, &end, 10);
+			if (end == tgt_num_str || *end != '\0') {
+				SPDK_ERRLOG("%s: invalid or missing SCSI target number\n", name);
+				return -1;
+			}
+
+			/* Range-check while still a long so the int conversion below
+			 * can never truncate.
+			 */
+			if (parsed_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+				SPDK_ERRLOG("%s: SCSI target number '%s' is too big (max %d)\n",
+					    name, tgt_num_str, SPDK_VHOST_SCSI_CTRLR_MAX_DEVS - 1);
+				return -1;
+			}
+
+			/* Any negative number means "pick a free slot". */
+			dev_num = parsed_num < 0 ? -1 : (int)parsed_num;
+
+			bdev_name = spdk_conf_section_get_nmval(sp, "Target", i, 1);
+			if (bdev_name == NULL) {
+				SPDK_ERRLOG("%s: invalid or missing bdev name for SCSI target %d\n", name, dev_num);
+				return -1;
+			} else if (spdk_conf_section_get_nmval(sp, "Target", i, 2)) {
+				SPDK_ERRLOG("%s: only one LUN per SCSI target is supported\n", name);
+				return -1;
+			}
+
+			if (spdk_vhost_scsi_dev_add_tgt(vdev, dev_num, bdev_name) < 0) {
+				return -1;
+			}
+		}
+
+		sp = spdk_conf_next_section(sp);
+	}
+
+	return 0;
+}
+
+/* Release the task array of every virtqueue in the session and clear the pointers. */
+static void
+free_task_pool(struct spdk_vhost_scsi_session *svsession)
+{
+	struct spdk_vhost_session *vsession = &svsession->vsession;
+	uint16_t qid;
+
+	for (qid = 0; qid < vsession->max_queues; qid++) {
+		struct spdk_vhost_virtqueue *queue = &vsession->virtqueue[qid];
+
+		/* Queues that never got a task array are left untouched. */
+		if (queue->tasks != NULL) {
+			spdk_free(queue->tasks);
+			queue->tasks = NULL;
+		}
+	}
+}
+
+/*
+ * Allocate one task array per virtqueue that has a descriptor ring.
+ *
+ * Each array holds vring.size zero-initialised tasks; every task is bound to
+ * its session and queue and gets its own index as req_idx.
+ *
+ * Returns 0 on success. On an oversized queue or allocation failure all task
+ * arrays of the session are released and -1 is returned.
+ */
+static int
+alloc_task_pool(struct spdk_vhost_scsi_session *svsession)
+{
+	struct spdk_vhost_session *vsession = &svsession->vsession;
+	struct spdk_vhost_virtqueue *vq;
+	struct spdk_vhost_scsi_task *tasks;
+	uint32_t task_cnt;
+	uint16_t i;
+	uint32_t j;
+
+	for (i = 0; i < vsession->max_queues; i++) {
+		vq = &vsession->virtqueue[i];
+		if (vq->vring.desc == NULL) {
+			/* Queue was not set up; nothing to allocate. */
+			continue;
+		}
+
+		task_cnt = vq->vring.size;
+		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
+			/* sanity check */
+			SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
+				    vsession->name, i, task_cnt, (uint32_t)SPDK_VHOST_MAX_VQ_SIZE);
+			free_task_pool(svsession);
+			return -1;
+		}
+		vq->tasks = spdk_zmalloc(sizeof(*tasks) * task_cnt,
+					 SPDK_CACHE_LINE_SIZE, NULL,
+					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+		if (vq->tasks == NULL) {
+			SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
+				    vsession->name, task_cnt, i);
+			free_task_pool(svsession);
+			return -1;
+		}
+
+		/* Bind every task to its owning session and queue. */
+		tasks = vq->tasks;
+		for (j = 0; j < task_cnt; j++) {
+			tasks[j].svsession = svsession;
+			tasks[j].vq = vq;
+			tasks[j].req_idx = j;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Session-start event handler.
+ *
+ * Verifies that every queue from VIRTIO_SCSI_REQUESTQ upwards has a
+ * descriptor ring, allocates the task pool, copies each populated controller
+ * target into the session and allocates its I/O channels, then registers the
+ * request-queue poller and, when both control and event queues exist, the
+ * management poller.
+ *
+ * A target whose I/O channels cannot be allocated is dropped from this
+ * session only; it does not fail the session start.
+ *
+ * The outcome is always reported through vhost_session_start_done() and also
+ * returned: 0 on success, non-zero when queue validation or task pool
+ * allocation failed.
+ */
+static int
+vhost_scsi_start_cb(struct spdk_vhost_dev *vdev,
+		    struct spdk_vhost_session *vsession, void *unused)
+{
+	struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession);
+	struct spdk_vhost_scsi_dev *svdev = svsession->svdev;
+	struct spdk_scsi_dev_vhost_state *state;
+	uint32_t i;
+	int rc;
+
+	/* validate all I/O queues are in a contiguous index range */
+	for (i = VIRTIO_SCSI_REQUESTQ; i < vsession->max_queues; i++) {
+		if (vsession->virtqueue[i].vring.desc == NULL) {
+			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
+			rc = -1;
+			goto out;
+		}
+	}
+
+	rc = alloc_task_pool(svsession);
+	if (rc != 0) {
+		SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
+		goto out;
+	}
+
+	for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
+		state = &svdev->scsi_dev_state[i];
+		if (state->dev == NULL || state->status == VHOST_SCSI_DEV_REMOVING) {
+			continue;
+		}
+
+		assert(svsession->scsi_dev_state[i].status == VHOST_SCSI_DEV_EMPTY);
+		svsession->scsi_dev_state[i].dev = state->dev;
+		svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_PRESENT;
+		rc = spdk_scsi_dev_allocate_io_channels(state->dev);
+		if (rc != 0) {
+			SPDK_ERRLOG("%s: failed to alloc io_channel for SCSI target %"PRIu32"\n",
+				    vsession->name, i);
+			/* unset the SCSI target so that all I/O to it will be rejected */
+			svsession->scsi_dev_state[i].dev = NULL;
+			/* set EMPTY state so that we won't reply with SCSI hotremove
+			 * sense codes - the device hasn't ever been added.
+			 */
+			svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_EMPTY;
+			/* The failure is tolerated: the session starts without this
+			 * target. Clear rc so that a failure on the last populated
+			 * slot is not reported as a failed session start after the
+			 * pollers below have been registered.
+			 */
+			rc = 0;
+		}
+	}
+	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n",
+		     vsession->name, spdk_env_get_current_core());
+
+	svsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, svsession, 0);
+	if (vsession->virtqueue[VIRTIO_SCSI_CONTROLQ].vring.desc &&
+	    vsession->virtqueue[VIRTIO_SCSI_EVENTQ].vring.desc) {
+		svsession->mgmt_poller = SPDK_POLLER_REGISTER(vdev_mgmt_worker, svsession,
+					 MGMT_POLL_PERIOD_US);
+	}
+out:
+	vhost_session_start_done(vsession, rc);
+	return rc;
+}
+
+/*
+ * Bind the session to its SCSI controller and send it the start event
+ * handled by vhost_scsi_start_cb(). Returns the result of
+ * vhost_session_send_event().
+ */
+static int
+vhost_scsi_start(struct spdk_vhost_session *vsession)
+{
+	struct spdk_vhost_scsi_session *scsi_session = to_scsi_session(vsession);
+	struct spdk_vhost_scsi_dev *scsi_ctrlr = to_scsi_dev(vsession->vdev);
+	int rc;
+
+	assert(scsi_ctrlr != NULL);
+	scsi_session->svdev = scsi_ctrlr;
+
+	/* NOTE(review): the literal 3 is presumably a timeout - confirm against
+	 * vhost_session_send_event().
+	 */
+	rc = vhost_session_send_event(vsession, vhost_scsi_start_cb,
+				      3, "start session");
+	return rc;
+}
+
+/*
+ * Stop-poller body that finishes tearing down a session.
+ *
+ * It does nothing until the session has no outstanding tasks and the vhost
+ * lock can be taken. Then it signals every virtqueue, detaches all targets
+ * from the session (freeing their I/O channels), frees the task pool,
+ * unregisters itself and reports the stop as done.
+ *
+ * Every path returns SPDK_POLLER_BUSY.
+ */
+static int
+destroy_session_poller_cb(void *arg)
+{
+ struct spdk_vhost_scsi_session *svsession = arg;
+ struct spdk_vhost_session *vsession = &svsession->vsession;
+ struct spdk_scsi_dev_session_state *state;
+ uint32_t i;
+
+ /* I/O is still in flight; try again on the next poll. */
+ if (vsession->task_cnt > 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ /* Could not take the vhost lock right now; try again on the next poll. */
+ if (spdk_vhost_trylock() != 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ for (i = 0; i < vsession->max_queues; i++) {
+ vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
+ }
+
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
+ enum spdk_scsi_dev_vhost_status prev_status;
+
+ state = &svsession->scsi_dev_state[i];
+ /* reset the status to EMPTY so that we won't send hotremove events anymore;
+ * the previous value is kept to detect a removal that was in progress
+ */
+ prev_status = state->status;
+ state->status = VHOST_SCSI_DEV_EMPTY;
+ if (state->dev == NULL) {
+ continue;
+ }
+
+ spdk_scsi_dev_free_io_channels(state->dev);
+
+ state->dev = NULL;
+
+ if (prev_status == VHOST_SCSI_DEV_REMOVING) {
+ /* try to detach it globally */
+ vhost_dev_foreach_session(vsession->vdev,
+ vhost_scsi_session_process_removed,
+ vhost_scsi_dev_process_removed_cpl_cb,
+ (void *)(uintptr_t)i);
+ }
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n",
+ vsession->name, spdk_env_get_current_core());
+
+ free_task_pool(svsession);
+
+ /* The teardown is complete: this poller removes itself and reports the stop
+ * before the vhost lock is released.
+ */
+ spdk_poller_unregister(&svsession->stop_poller);
+ vhost_session_stop_done(vsession, 0);
+
+ spdk_vhost_unlock();
+ return SPDK_POLLER_BUSY;
+}
+
+/*
+ * Session-stop event handler: unregisters the request-queue and management
+ * pollers and hands the rest of the teardown to destroy_session_poller_cb().
+ * Always returns 0.
+ */
+static int
+vhost_scsi_stop_cb(struct spdk_vhost_dev *vdev,
+		   struct spdk_vhost_session *vsession, void *unused)
+{
+	struct spdk_vhost_scsi_session *scsi_session = to_scsi_session(vsession);
+
+	/* No new I/O requests are picked up from here on. */
+	spdk_poller_unregister(&scsi_session->requestq_poller);
+
+	/* Likewise for controlq requests and asynchronous hotremove events;
+	 * whatever is left over gets finalized by the stop_poller registered
+	 * below.
+	 */
+	spdk_poller_unregister(&scsi_session->mgmt_poller);
+
+	/* The stop_poller waits for pending I/O to drain and then handles the
+	 * remaining hotremove events one final time.
+	 */
+	scsi_session->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
+				    scsi_session, 1000);
+
+	return 0;
+}
+
+/*
+ * Send the session the stop event handled by vhost_scsi_stop_cb().
+ * Returns the result of vhost_session_send_event().
+ */
+static int
+vhost_scsi_stop(struct spdk_vhost_session *vsession)
+{
+	int rc;
+
+	/* NOTE(review): the literal 3 is presumably a timeout - confirm against
+	 * vhost_session_send_event().
+	 */
+	rc = vhost_session_send_event(vsession, vhost_scsi_stop_cb,
+				      3, "stop session");
+	return rc;
+}
+
+/*
+ * Write a "scsi" JSON array describing every populated target of the
+ * controller: slot number, SCSI device id, target name and, per LUN, its id
+ * and bdev name.
+ */
+static void
+vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	struct spdk_scsi_dev *target;
+	struct spdk_scsi_lun *unit;
+	uint32_t tgt_slot;
+	uint32_t lun_slot;
+
+	assert(vdev != NULL);
+	spdk_json_write_named_array_begin(w, "scsi");
+
+	for (tgt_slot = 0; tgt_slot < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; tgt_slot++) {
+		target = spdk_vhost_scsi_dev_get_tgt(vdev, tgt_slot);
+		if (target == NULL) {
+			/* Empty slot, nothing to report. */
+			continue;
+		}
+
+		spdk_json_write_object_begin(w);
+		spdk_json_write_named_uint32(w, "scsi_dev_num", tgt_slot);
+		spdk_json_write_named_uint32(w, "id", spdk_scsi_dev_get_id(target));
+		spdk_json_write_named_string(w, "target_name", spdk_scsi_dev_get_name(target));
+
+		spdk_json_write_named_array_begin(w, "luns");
+		for (lun_slot = 0; lun_slot < SPDK_SCSI_DEV_MAX_LUN; lun_slot++) {
+			unit = spdk_scsi_dev_get_lun(target, lun_slot);
+			if (unit != NULL) {
+				spdk_json_write_object_begin(w);
+				spdk_json_write_named_int32(w, "id", spdk_scsi_lun_get_id(unit));
+				spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(unit));
+				spdk_json_write_object_end(w);
+			}
+		}
+		spdk_json_write_array_end(w);
+
+		spdk_json_write_object_end(w);
+	}
+
+	spdk_json_write_array_end(w);
+}
+
+/*
+ * Emit the JSON objects that describe this controller as RPC calls: one
+ * "vhost_create_scsi_controller" object, followed by one
+ * "vhost_scsi_controller_add_target" object per populated target slot
+ * (using the bdev behind LUN 0).
+ */
+static void
+vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	struct spdk_scsi_dev *target;
+	struct spdk_scsi_lun *first_lun;
+	uint32_t slot;
+
+	/* Controller creation call. */
+	spdk_json_write_object_begin(w);
+	spdk_json_write_named_string(w, "method", "vhost_create_scsi_controller");
+	spdk_json_write_named_object_begin(w, "params");
+	spdk_json_write_named_string(w, "ctrlr", vdev->name);
+	spdk_json_write_named_string(w, "cpumask",
+				     spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
+	spdk_json_write_object_end(w);
+	spdk_json_write_object_end(w);
+
+	/* One add-target call for each occupied slot. */
+	for (slot = 0; slot < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; slot++) {
+		target = spdk_vhost_scsi_dev_get_tgt(vdev, slot);
+		if (target == NULL) {
+			continue;
+		}
+
+		first_lun = spdk_scsi_dev_get_lun(target, 0);
+		assert(first_lun != NULL);
+
+		spdk_json_write_object_begin(w);
+		spdk_json_write_named_string(w, "method", "vhost_scsi_controller_add_target");
+		spdk_json_write_named_object_begin(w, "params");
+		spdk_json_write_named_string(w, "ctrlr", vdev->name);
+		spdk_json_write_named_uint32(w, "scsi_target_num", slot);
+		spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(first_lun));
+		spdk_json_write_object_end(w);
+		spdk_json_write_object_end(w);
+	}
+}
+
+/* Log component registrations: each pairs a component name string with its flag identifier. */
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi", SPDK_LOG_VHOST_SCSI)
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_queue", SPDK_LOG_VHOST_SCSI_QUEUE)
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_data", SPDK_LOG_VHOST_SCSI_DATA)