author      Daniel Baumann <daniel.baumann@progress-linux.org>    2024-04-27 18:24:20 +0000
committer   Daniel Baumann <daniel.baumann@progress-linux.org>    2024-04-27 18:24:20 +0000
commit      483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree        e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/spdk/lib/vhost
parent      Initial commit. (diff)
download    ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
            ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip

Adding upstream version 14.2.21. (upstream/14.2.21, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/vhost')
-rw-r--r--  src/spdk/lib/vhost/Makefile                      47
-rw-r--r--  src/spdk/lib/vhost/rte_vhost/Makefile            46
-rw-r--r--  src/spdk/lib/vhost/rte_vhost/fd_man.c           300
-rw-r--r--  src/spdk/lib/vhost/rte_vhost/fd_man.h            69
-rw-r--r--  src/spdk/lib/vhost/rte_vhost/rte_vhost.h        474
-rw-r--r--  src/spdk/lib/vhost/rte_vhost/socket.c           819
-rw-r--r--  src/spdk/lib/vhost/rte_vhost/vhost.c            482
-rw-r--r--  src/spdk/lib/vhost/rte_vhost/vhost.h            321
-rw-r--r--  src/spdk/lib/vhost/rte_vhost/vhost_user.c      1360
-rw-r--r--  src/spdk/lib/vhost/rte_vhost/vhost_user.h       182
-rw-r--r--  src/spdk/lib/vhost/vhost.c                     1503
-rw-r--r--  src/spdk/lib/vhost/vhost_blk.c                  901
-rw-r--r--  src/spdk/lib/vhost/vhost_internal.h             277
-rw-r--r--  src/spdk/lib/vhost/vhost_nvme.c                1465
-rw-r--r--  src/spdk/lib/vhost/vhost_rpc.c                  814
-rw-r--r--  src/spdk/lib/vhost/vhost_scsi.c                1271
16 files changed, 10331 insertions, 0 deletions
diff --git a/src/spdk/lib/vhost/Makefile b/src/spdk/lib/vhost/Makefile
new file mode 100644
index 00000000..b46978e2
--- /dev/null
+++ b/src/spdk/lib/vhost/Makefile
@@ -0,0 +1,47 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+CFLAGS += -I.
+CFLAGS += -Irte_vhost
+CFLAGS += $(ENV_CFLAGS)
+
+C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c vhost_nvme.c
+
+LIBNAME = vhost
+
+DIRS-y += rte_vhost
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/vhost/rte_vhost/Makefile b/src/spdk/lib/vhost/rte_vhost/Makefile
new file mode 100644
index 00000000..b0ae6335
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+CFLAGS += -I.
+CFLAGS += $(ENV_CFLAGS)
+CFLAGS += -include rte_config.h
+
+# These are the DPDK vhost files copied (for now) into SPDK
+C_SRCS += fd_man.c socket.c vhost_user.c vhost.c
+
+LIBNAME = rte_vhost
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/vhost/rte_vhost/fd_man.c b/src/spdk/lib/vhost/rte_vhost/fd_man.c
new file mode 100644
index 00000000..2ceacc9a
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost/fd_man.c
@@ -0,0 +1,300 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "fd_man.h"
+
+#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL)
+
+static int
+get_last_valid_idx(struct fdset *pfdset, int last_valid_idx)
+{
+ int i;
+
+ for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--)
+ ;
+
+ return i;
+}
+
+static void
+fdset_move(struct fdset *pfdset, int dst, int src)
+{
+ pfdset->fd[dst] = pfdset->fd[src];
+ pfdset->rwfds[dst] = pfdset->rwfds[src];
+}
+
+static void
+fdset_shrink_nolock(struct fdset *pfdset)
+{
+ int i;
+ int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1);
+
+ for (i = 0; i < last_valid_idx; i++) {
+ if (pfdset->fd[i].fd != -1)
+ continue;
+
+ fdset_move(pfdset, i, last_valid_idx);
+ last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1);
+ }
+ pfdset->num = last_valid_idx + 1;
+}
+
+/*
+ * Find deleted fd entries and remove them
+ */
+static void
+fdset_shrink(struct fdset *pfdset)
+{
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ fdset_shrink_nolock(pfdset);
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+}
+
+/**
+ * Returns the index in the fdset for a given fd.
+ * @return
+ * index for the fd, or -1 if fd isn't in the fdset.
+ */
+static int
+fdset_find_fd(struct fdset *pfdset, int fd)
+{
+ int i;
+
+ for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++)
+ ;
+
+ return i == pfdset->num ? -1 : i;
+}
+
+static void
+fdset_add_fd(struct fdset *pfdset, int idx, int fd,
+ fd_cb rcb, fd_cb wcb, void *dat)
+{
+ struct fdentry *pfdentry = &pfdset->fd[idx];
+ struct pollfd *pfd = &pfdset->rwfds[idx];
+
+ pfdentry->fd = fd;
+ pfdentry->rcb = rcb;
+ pfdentry->wcb = wcb;
+ pfdentry->dat = dat;
+
+ pfd->fd = fd;
+ pfd->events = rcb ? POLLIN : 0;
+ pfd->events |= wcb ? POLLOUT : 0;
+ pfd->revents = 0;
+}
+
+void
+fdset_init(struct fdset *pfdset)
+{
+ int i;
+
+ if (pfdset == NULL)
+ return;
+
+ for (i = 0; i < MAX_FDS; i++) {
+ pfdset->fd[i].fd = -1;
+ pfdset->fd[i].dat = NULL;
+ }
+ pfdset->num = 0;
+}
+
+/**
+ * Register the fd in the fdset with read/write handler and context.
+ */
+int
+fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat)
+{
+ int i;
+
+ if (pfdset == NULL || fd == -1)
+ return -1;
+
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ i = pfdset->num < MAX_FDS ? pfdset->num++ : -1;
+ if (i == -1) {
+ fdset_shrink_nolock(pfdset);
+ i = pfdset->num < MAX_FDS ? pfdset->num++ : -1;
+ if (i == -1) {
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ return -2;
+ }
+ }
+
+ fdset_add_fd(pfdset, i, fd, rcb, wcb, dat);
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+
+ return 0;
+}
+
+/**
+ * Unregister the fd from the fdset.
+ * Returns the context of the given fd, or NULL.
+ */
+void *
+fdset_del(struct fdset *pfdset, int fd)
+{
+ int i;
+ void *dat = NULL;
+
+ if (pfdset == NULL || fd == -1)
+ return NULL;
+
+ do {
+ pthread_mutex_lock(&pfdset->fd_mutex);
+
+ i = fdset_find_fd(pfdset, fd);
+ if (i != -1 && pfdset->fd[i].busy == 0) {
+ /* busy indicates r/wcb is executing! */
+ dat = pfdset->fd[i].dat;
+ pfdset->fd[i].fd = -1;
+ pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL;
+ pfdset->fd[i].dat = NULL;
+ i = -1;
+ }
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ } while (i != -1);
+
+ return dat;
+}
+
+
+/**
+ * This function runs in an infinite blocking loop, polling the fds in
+ * pfdset. It calls the corresponding r/w handler when there is an event
+ * on an fd.
+ *
+ * Before a callback is invoked, the entry's flag is set to busy. If
+ * another thread (currently rte_vhost_driver_unregister) calls fdset_del
+ * concurrently, it waits until the flag is reset to zero (which indicates
+ * the callback has finished) and may then free the context returned by
+ * fdset_del.
+ */
+void *
+fdset_event_dispatch(void *arg)
+{
+ int i;
+ struct pollfd *pfd;
+ struct fdentry *pfdentry;
+ fd_cb rcb, wcb;
+ void *dat;
+ int fd, numfds;
+ int remove1, remove2;
+ int need_shrink;
+ struct fdset *pfdset = arg;
+
+ if (pfdset == NULL)
+ return NULL;
+
+ while (1) {
+
+ /*
+ * While poll() is blocked, other threads might unregister listen
+ * fds from the fdset and register new ones into it. When poll()
+ * returns, the entries for listen fds in the fdset might have been
+ * updated; a spurious callback for a new listen fd is harmless.
+ */
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ numfds = pfdset->num;
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+
+ poll(pfdset->rwfds, numfds, 1000 /* millisecs */);
+
+ need_shrink = 0;
+ for (i = 0; i < numfds; i++) {
+ pthread_mutex_lock(&pfdset->fd_mutex);
+
+ pfdentry = &pfdset->fd[i];
+ fd = pfdentry->fd;
+ pfd = &pfdset->rwfds[i];
+
+ if (fd < 0) {
+ need_shrink = 1;
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ continue;
+ }
+
+ if (!pfd->revents) {
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ continue;
+ }
+
+ remove1 = remove2 = 0;
+
+ rcb = pfdentry->rcb;
+ wcb = pfdentry->wcb;
+ dat = pfdentry->dat;
+ pfdentry->busy = 1;
+
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+
+ if (rcb && pfd->revents & (POLLIN | FDPOLLERR))
+ rcb(fd, dat, &remove1);
+ if (wcb && pfd->revents & (POLLOUT | FDPOLLERR))
+ wcb(fd, dat, &remove2);
+ pfdentry->busy = 0;
+ /*
+ * fdset_del needs to check the busy flag, so we do not allow
+ * fdset_del to be called from within a callback directly.
+ */
+ /*
+ * When cleaning up the fd here we cannot call fdset_del():
+ * the fd may already have been closed in the callback, and its
+ * value could have been reused by a new listen fd created in
+ * another thread.
+ */
+ if (remove1 || remove2) {
+ pfdentry->fd = -1;
+ need_shrink = 1;
+ }
+ }
+
+ if (need_shrink)
+ fdset_shrink(pfdset);
+ }
+
+ return NULL;
+}
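The fd_man helpers added above are consumed by socket.c later in this patch: an fdset is initialized, a dispatcher thread runs fdset_event_dispatch(), and descriptors are then registered with read/write callbacks. A minimal usage sketch follows; the fd, callback and context names are hypothetical, not part of the patch.

#include <pthread.h>
#include <stddef.h>
#include "fd_man.h"

static struct fdset g_fdset;

/* Hypothetical read callback; setting *remove asks the dispatcher to drop the fd. */
static void on_readable(int fd, void *ctx, int *remove)
{
        (void)fd; (void)ctx;
        *remove = 0;    /* keep the fd registered */
}

static int start_fd_dispatch(int some_fd, void *ctx)
{
        pthread_t tid;

        pthread_mutex_init(&g_fdset.fd_mutex, NULL);
        fdset_init(&g_fdset);   /* marks every entry as empty (fd == -1) */
        /* The dispatch loop polls all registered fds with a 1 s timeout. */
        if (pthread_create(&tid, NULL, fdset_event_dispatch, &g_fdset) != 0)
                return -1;
        /* Read callback only (wcb == NULL); ctx is handed back to on_readable(). */
        return fdset_add(&g_fdset, some_fd, on_readable, NULL, ctx);
}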
diff --git a/src/spdk/lib/vhost/rte_vhost/fd_man.h b/src/spdk/lib/vhost/rte_vhost/fd_man.h
new file mode 100644
index 00000000..3a9d269b
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost/fd_man.h
@@ -0,0 +1,69 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FD_MAN_H_
+#define _FD_MAN_H_
+#include <stdint.h>
+#include <pthread.h>
+#include <poll.h>
+
+#define MAX_FDS 1024
+
+typedef void (*fd_cb)(int fd, void *dat, int *remove);
+
+struct fdentry {
+ int fd; /* -1 indicates this entry is empty */
+ fd_cb rcb; /* callback when this fd is readable. */
+ fd_cb wcb; /* callback when this fd is writeable. */
+ void *dat; /* fd context */
+ int busy; /* whether this entry is being used in cb. */
+};
+
+struct fdset {
+ struct pollfd rwfds[MAX_FDS];
+ struct fdentry fd[MAX_FDS];
+ pthread_mutex_t fd_mutex;
+ int num; /* current fd number of this fdset */
+};
+
+
+void fdset_init(struct fdset *pfdset);
+
+int fdset_add(struct fdset *pfdset, int fd,
+ fd_cb rcb, fd_cb wcb, void *dat);
+
+void *fdset_del(struct fdset *pfdset, int fd);
+
+void *fdset_event_dispatch(void *arg);
+
+#endif
diff --git a/src/spdk/lib/vhost/rte_vhost/rte_vhost.h b/src/spdk/lib/vhost/rte_vhost/rte_vhost.h
new file mode 100644
index 00000000..29f5b613
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost/rte_vhost.h
@@ -0,0 +1,474 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_VHOST_H_
+#define _RTE_VHOST_H_
+
+/**
+ * @file
+ * Interface to vhost-user
+ */
+
+#include <stdint.h>
+#include <linux/vhost.h>
+#include <linux/virtio_ring.h>
+#include <sys/eventfd.h>
+
+#include <rte_config.h>
+#include <rte_memory.h>
+#include <rte_mempool.h>
+
+#define RTE_VHOST_USER_CLIENT (1ULL << 0)
+#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1)
+#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2)
+
+/**
+ * Information relating to memory regions, including offsets to
+ * addresses in QEMU's memory file.
+ */
+struct rte_vhost_mem_region {
+ uint64_t guest_phys_addr;
+ uint64_t guest_user_addr;
+ uint64_t host_user_addr;
+ uint64_t size;
+ void *mmap_addr;
+ uint64_t mmap_size;
+ int fd;
+};
+
+/**
+ * Memory structure includes region and mapping information.
+ */
+struct rte_vhost_memory {
+ uint32_t nregions;
+ struct rte_vhost_mem_region regions[0];
+};
+
+struct rte_vhost_vring {
+ struct vring_desc *desc;
+ struct vring_avail *avail;
+ struct vring_used *used;
+ uint64_t log_guest_addr;
+
+ int callfd;
+ int kickfd;
+ uint16_t size;
+
+ uint16_t last_avail_idx;
+ uint16_t last_used_idx;
+};
+
+/**
+ * Device and vring operations.
+ */
+struct vhost_device_ops {
+ int (*new_device)(int vid); /**< Add device. */
+ void (*destroy_device)(int vid); /**< Remove device. */
+
+ int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
+
+ /**
+ * Features could be changed after the feature negotiation.
+ * For example, VHOST_F_LOG_ALL will be set/cleared at the
+ * start/end of live migration, respectively. This callback
+ * is used to inform the application on such change.
+ */
+ int (*features_changed)(int vid, uint64_t features);
+ int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf);
+ int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd);
+ int (*vhost_nvme_get_cap)(int vid, uint64_t *cap);
+
+ int (*new_connection)(int vid);
+ void (*destroy_connection)(int vid);
+
+ int (*get_config)(int vid, uint8_t *config, uint32_t config_len);
+ int (*set_config)(int vid, uint8_t *config, uint32_t offset,
+ uint32_t len, uint32_t flags);
+
+ void *reserved[2]; /**< Reserved for future extension */
+};
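A minimal consumer of this struct only needs to provide new_device/destroy_device; the remaining callbacks may be left NULL. A sketch, assuming this header is included; the callback names are illustrative, not part of the patch.

static int example_new_device(int vid)
{
        (void)vid;      /* start processing the device's virtqueues */
        return 0;
}

static void example_destroy_device(int vid)
{
        (void)vid;      /* stop processing and release per-device state */
}

static const struct vhost_device_ops example_ops = {
        .new_device = example_new_device,
        .destroy_device = example_destroy_device,
};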
+
+/**
+ * Convert guest physical address to host virtual address
+ *
+ * @param mem
+ * the guest memory regions
+ * @param gpa
+ * the guest physical address for querying
+ * @return
+ * the host virtual address on success, 0 on failure
+ */
+static inline uint64_t __attribute__((always_inline))
+rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
+{
+ struct rte_vhost_mem_region *reg;
+ uint32_t i;
+
+ for (i = 0; i < mem->nregions; i++) {
+ reg = &mem->regions[i];
+ if (gpa >= reg->guest_phys_addr &&
+ gpa < reg->guest_phys_addr + reg->size) {
+ return gpa - reg->guest_phys_addr +
+ reg->host_user_addr;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Convert guest physical address to host virtual address safely
+ *
+ * This variant of rte_vhost_gpa_to_vva() ensures that the entire
+ * requested length is mapped and contiguous in the process address
+ * space.
+ *
+ * @param mem
+ * the guest memory regions
+ * @param gpa
+ * the guest physical address for querying
+ * @param len
+ * the size of the requested area to map,
+ * updated with actual size mapped
+ * @return
+ * the host virtual address on success, 0 on failure
+ */
+static inline uint64_t
+rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
+ uint64_t gpa, uint64_t *len)
+{
+ struct rte_vhost_mem_region *r;
+ uint32_t i;
+
+ for (i = 0; i < mem->nregions; i++) {
+ r = &mem->regions[i];
+ if (gpa >= r->guest_phys_addr &&
+ gpa < r->guest_phys_addr + r->size) {
+
+ if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
+ *len = r->guest_phys_addr + r->size - gpa;
+
+ return gpa - r->guest_phys_addr +
+ r->host_user_addr;
+ }
+ }
+ *len = 0;
+
+ return 0;
+}
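A backend typically fetches the region table once with rte_vhost_get_mem_table() (declared later in this header) and then translates guest physical addresses per descriptor; the returned length matters because a buffer may cross a region boundary. A sketch under those assumptions, with a hypothetical helper name:

#include <stdint.h>
#include <string.h>

/* Copy 'size' bytes of guest memory at guest-physical address 'gpa' into 'dst'. */
static int example_copy_from_guest(struct rte_vhost_memory *mem,
                                   uint64_t gpa, void *dst, uint64_t size)
{
        while (size > 0) {
                uint64_t len = size;
                uint64_t vva = rte_vhost_va_from_guest_pa(mem, gpa, &len);

                if (vva == 0 || len == 0)
                        return -1;      /* gpa (or part of it) is not mapped */
                memcpy(dst, (void *)(uintptr_t)vva, len);
                dst = (uint8_t *)dst + len;
                gpa += len;             /* continue in the next region if needed */
                size -= len;
        }
        return 0;
}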
+
+#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL))
+
+/**
+ * Log the memory write start with given address.
+ *
+ * This function only need be invoked when the live migration starts.
+ * Therefore, we won't need call it at all in the most of time. For
+ * making the performance impact be minimum, it's suggested to do a
+ * check before calling it:
+ *
+ * if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ * rte_vhost_log_write(vid, addr, len);
+ *
+ * @param vid
+ * vhost device ID
+ * @param addr
+ * the starting address for write
+ * @param len
+ * the length to write
+ */
+void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
+
+/**
+ * Log the used ring update start at given offset.
+ *
+ * As with rte_vhost_log_write(), it is suggested to do a check before
+ * calling it:
+ *
+ * if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ * rte_vhost_log_used_vring(vid, vring_idx, offset, len);
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * the vring index
+ * @param offset
+ * the offset inside the used ring
+ * @param len
+ * the length to write
+ */
+void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
+ uint64_t offset, uint64_t len);
+
+int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
+
+/**
+ * Register the vhost driver. The path may differ between calls to
+ * support multiple driver instances.
+ */
+int rte_vhost_driver_register(const char *path, uint64_t flags);
+
+/* Unregister vhost driver. This is only meaningful to vhost user. */
+int rte_vhost_driver_unregister(const char *path);
+
+/**
+ * Set the feature bits the vhost-user driver supports.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_set_features(const char *path, uint64_t features);
+
+/**
+ * Enable vhost-user driver features.
+ *
+ * Note that
+ * - the param @features should be a subset of the feature bits provided
+ * by rte_vhost_driver_set_features().
+ * - it must be invoked before vhost-user negotiation starts.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * Features to enable
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_enable_features(const char *path, uint64_t features);
+
+/**
+ * Disable vhost-user driver features.
+ *
+ * The two notes at rte_vhost_driver_enable_features() also apply here.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * Features to disable
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_disable_features(const char *path, uint64_t features);
+
+/**
+ * Get the feature bits before feature negotiation.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * A pointer to store the queried feature bits
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_get_features(const char *path, uint64_t *features);
+
+/**
+ * Get the feature bits after negotiation
+ *
+ * @param vid
+ * Vhost device ID
+ * @param features
+ * A pointer to store the queried feature bits
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
+
+/* Register callbacks. */
+int rte_vhost_driver_callback_register(const char *path,
+ struct vhost_device_ops const * const ops);
+
+/**
+ *
+ * Start the vhost-user driver.
+ *
+ * This function triggers the vhost-user negotiation.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_start(const char *path);
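Taken together, the calls above form the usual bring-up sequence for a vhost-user backend. A hedged sketch, assuming this header is included; the socket path, feature mask and ops are placeholders supplied by the application:

static int example_start_backend(const char *path, uint64_t features,
                                 const struct vhost_device_ops *ops)
{
        /* Default flags (0): act as the server side of the Unix socket. */
        if (rte_vhost_driver_register(path, 0) != 0)
                return -1;
        /* Non-net backends must override the builtin feature bits. */
        if (rte_vhost_driver_set_features(path, features) != 0 ||
            rte_vhost_driver_callback_register(path, ops) != 0 ||
            rte_vhost_driver_start(path) != 0) {
                rte_vhost_driver_unregister(path);
                return -1;
        }
        return 0;
}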
+
+/**
+ * Get the MTU value of the device if set in QEMU.
+ *
+ * @param vid
+ * virtio-net device ID
+ * @param mtu
+ * The variable to store the MTU value
+ *
+ * @return
+ * 0: success
+ * -EAGAIN: device not yet started
+ * -ENOTSUP: device does not support MTU feature
+ */
+int rte_vhost_get_mtu(int vid, uint16_t *mtu);
+
+/**
+ * Get the numa node from which the virtio net device's memory
+ * is allocated.
+ *
+ * @param vid
+ * vhost device ID
+ *
+ * @return
+ * The numa node, -1 on failure
+ */
+int rte_vhost_get_numa_node(int vid);
+
+/**
+ * Get the virtio net device's ifname, which is the vhost-user socket
+ * file path.
+ *
+ * @param vid
+ * vhost device ID
+ * @param buf
+ * The buffer to stored the queried ifname
+ * @param len
+ * The length of buf
+ *
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_ifname(int vid, char *buf, size_t len);
+
+/**
+ * Get how many avail entries are left in the queue
+ *
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * virtio queue index
+ *
+ * @return
+ * num of avail entries left
+ */
+uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
+
+struct rte_mbuf;
+struct rte_mempool;
+/**
+ * This function adds buffers to the virtio device's RX virtqueue. Buffers can
+ * be received from the physical port or from another virtual device. A packet
+ * count is returned to indicate the number of packets that were successfully
+ * added to the RX queue.
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * virtio queue index in mq case
+ * @param pkts
+ * array to contain packets to be enqueued
+ * @param count
+ * packets num to be enqueued
+ * @return
+ * num of packets enqueued
+ */
+uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count);
+
+/**
+ * This function gets guest buffers from the virtio device's TX virtqueue,
+ * constructs host mbufs, copies the guest buffer content into them and
+ * stores them in pkts to be processed.
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * virtio queue index in mq case
+ * @param mbuf_pool
+ * mbuf_pool where host mbuf is allocated.
+ * @param pkts
+ * array to contain packets to be dequeued
+ * @param count
+ * packets num to be dequeued
+ * @return
+ * num of packets dequeued
+ */
+uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
+
+/**
+ * Get guest mem table: a list of memory regions.
+ *
+ * An rte_vhost_memory object will be allocated internally to hold the
+ * guest memory regions. The application should free it in the
+ * destroy_device() callback.
+ *
+ * @param vid
+ * vhost device ID
+ * @param mem
+ * To store the returned mem regions
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
+
+/**
+ * Get guest vring info, including the vring address, vring size, etc.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param vring
+ * the structure to hold the requested vring info
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
+ struct rte_vhost_vring *vring);
+
+/**
+ * Set the index of the last descriptor in the avail and used guest vrings.
+ *
+ * If the user application operates directly on the buffers, it should use this
+ * function on device destruction so that the same values can be retrieved
+ * later, on device creation, via rte_vhost_get_vhost_vring().
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param last_avail_idx
+ * id of the last descriptor in avail ring to be set
+ * @param last_used_idx
+ * id of the last descriptor in used ring to be set
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_set_vhost_vring_last_idx(int vid, uint16_t vring_idx,
+ uint16_t last_avail_idx, uint16_t last_used_idx);
+
+#endif /* _RTE_VHOST_H_ */
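Following the flow described in the comment on rte_vhost_set_vhost_vring_last_idx() above, a backend that processes buffers directly can hand its ring positions back at destruction time and read them again at the next creation. A sketch building only on the declarations in this header; the helper names are hypothetical:

#include <stdint.h>
#include "rte_vhost.h"

/* On destroy_device(): push the backend's current ring positions back. */
static void example_save_ring_pos(int vid, uint16_t vring_idx,
                                  uint16_t last_avail, uint16_t last_used)
{
        rte_vhost_set_vhost_vring_last_idx(vid, vring_idx, last_avail, last_used);
}

/* On the next new_device(): read the preserved positions back. */
static int example_restore_ring_pos(int vid, uint16_t vring_idx,
                                    uint16_t *last_avail, uint16_t *last_used)
{
        struct rte_vhost_vring vring;

        if (rte_vhost_get_vhost_vring(vid, vring_idx, &vring) != 0)
                return -1;
        *last_avail = vring.last_avail_idx;
        *last_used = vring.last_used_idx;
        return 0;
}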
diff --git a/src/spdk/lib/vhost/rte_vhost/socket.c b/src/spdk/lib/vhost/rte_vhost/socket.c
new file mode 100644
index 00000000..1bc1e64b
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost/socket.c
@@ -0,0 +1,819 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/queue.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <rte_log.h>
+
+#include "fd_man.h"
+#include "vhost.h"
+#include "vhost_user.h"
+
+
+TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);
+
+/*
+ * Every time rte_vhost_driver_register() is invoked, an associated
+ * vhost_user_socket struct will be created.
+ */
+struct vhost_user_socket {
+ struct vhost_user_connection_list conn_list;
+ pthread_mutex_t conn_mutex;
+ char *path;
+ int socket_fd;
+ struct sockaddr_un un;
+ bool is_server;
+ bool reconnect;
+ bool dequeue_zero_copy;
+
+ /*
+ * "supported_features" holds the feature bits the vhost driver
+ * supports. "features" holds the feature bits left after
+ * rte_vhost_driver_disable/enable_features(); these are also the
+ * final feature bits used for the vhost-user feature negotiation.
+ */
+ uint64_t supported_features;
+ uint64_t features;
+
+ struct vhost_device_ops const *notify_ops;
+};
+
+struct vhost_user_connection {
+ struct vhost_user_socket *vsocket;
+ int connfd;
+ int vid;
+
+ TAILQ_ENTRY(vhost_user_connection) next;
+};
+
+#define MAX_VHOST_SOCKET 1024
+struct vhost_user {
+ struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
+ struct fdset fdset;
+ int vsocket_cnt;
+ pthread_mutex_t mutex;
+};
+
+#define MAX_VIRTIO_BACKLOG 128
+
+static void vhost_user_server_new_connection(int fd, void *data, int *remove);
+static void vhost_user_read_cb(int fd, void *dat, int *remove);
+static int create_unix_socket(struct vhost_user_socket *vsocket);
+static int vhost_user_start_client(struct vhost_user_socket *vsocket);
+
+static struct vhost_user vhost_user = {
+ .fdset = {
+ .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
+ .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
+ .num = 0
+ },
+ .vsocket_cnt = 0,
+ .mutex = PTHREAD_MUTEX_INITIALIZER,
+};
+
+/* Return the number of bytes read on success, or a negative value on failure. */
+int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fd_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+ int ret;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ ret = recvmsg(sockfd, &msgh, 0);
+ if (ret <= 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
+ return ret;
+ }
+
+ if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+ RTE_LOG(ERR, VHOST_CONFIG, "truncated msg\n");
+ return -1;
+ }
+
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(fds, CMSG_DATA(cmsg), fdsize);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fd_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+ int ret;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+
+ if (fds && fd_num > 0) {
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ if (cmsg == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n");
+ errno = EINVAL;
+ return -1;
+ }
+ cmsg->cmsg_len = CMSG_LEN(fdsize);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fdsize);
+ } else {
+ msgh.msg_control = NULL;
+ msgh.msg_controllen = 0;
+ }
+
+ do {
+ ret = sendmsg(sockfd, &msgh, 0);
+ } while (ret < 0 && errno == EINTR);
+
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n");
+ return ret;
+ }
+
+ return ret;
+}
+
+static void
+vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
+{
+ int vid;
+ size_t size;
+ struct vhost_user_connection *conn;
+ int ret;
+
+ conn = malloc(sizeof(*conn));
+ if (conn == NULL) {
+ close(fd);
+ return;
+ }
+
+ vid = vhost_new_device(vsocket->features);
+ if (vid == -1) {
+ goto err;
+ }
+
+ size = strnlen(vsocket->path, PATH_MAX);
+ vhost_set_ifname(vid, vsocket->path, size);
+
+ if (vsocket->dequeue_zero_copy)
+ vhost_enable_dequeue_zero_copy(vid);
+
+ RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
+
+ if (vsocket->notify_ops->new_connection) {
+ ret = vsocket->notify_ops->new_connection(vid);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add vhost user connection with fd %d\n",
+ fd);
+ goto err;
+ }
+ }
+
+ conn->connfd = fd;
+ conn->vsocket = vsocket;
+ conn->vid = vid;
+ ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
+ NULL, conn);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add fd %d into vhost server fdset\n",
+ fd);
+
+ if (vsocket->notify_ops->destroy_connection)
+ vsocket->notify_ops->destroy_connection(conn->vid);
+
+ goto err;
+ }
+
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+ return;
+
+err:
+ free(conn);
+ close(fd);
+}
+
+/* call back when there is new vhost-user connection from client */
+static void
+vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
+{
+ struct vhost_user_socket *vsocket = dat;
+
+ fd = accept(fd, NULL, NULL);
+ if (fd < 0)
+ return;
+
+ RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
+ vhost_user_add_connection(fd, vsocket);
+}
+
+static void
+vhost_user_read_cb(int connfd, void *dat, int *remove)
+{
+ struct vhost_user_connection *conn = dat;
+ struct vhost_user_socket *vsocket = conn->vsocket;
+ int ret;
+
+ ret = vhost_user_msg_handler(conn->vid, connfd);
+ if (ret < 0) {
+ close(connfd);
+ *remove = 1;
+ vhost_destroy_device(conn->vid);
+
+ if (vsocket->notify_ops->destroy_connection)
+ vsocket->notify_ops->destroy_connection(conn->vid);
+
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ TAILQ_REMOVE(&vsocket->conn_list, conn, next);
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+
+ free(conn);
+
+ if (vsocket->reconnect) {
+ create_unix_socket(vsocket);
+ vhost_user_start_client(vsocket);
+ }
+ }
+}
+
+static int
+create_unix_socket(struct vhost_user_socket *vsocket)
+{
+ int fd;
+ struct sockaddr_un *un = &vsocket->un;
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0)
+ return -1;
+ RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
+ vsocket->is_server ? "server" : "client", fd);
+
+ if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost-user: can't set nonblocking mode for socket, fd: "
+ "%d (%s)\n", fd, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ memset(un, 0, sizeof(*un));
+ un->sun_family = AF_UNIX;
+ strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
+ un->sun_path[sizeof(un->sun_path) - 1] = '\0';
+
+ vsocket->socket_fd = fd;
+ return 0;
+}
+
+static int
+vhost_user_start_server(struct vhost_user_socket *vsocket)
+{
+ int ret;
+ int fd = vsocket->socket_fd;
+ const char *path = vsocket->path;
+
+ ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to bind to %s: %s; remove it and try again\n",
+ path, strerror(errno));
+ goto err;
+ }
+ RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+ ret = listen(fd, MAX_VIRTIO_BACKLOG);
+ if (ret < 0)
+ goto err;
+
+ ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
+ NULL, vsocket);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add listen fd %d to vhost server fdset\n",
+ fd);
+ goto err;
+ }
+
+ return 0;
+
+err:
+ close(fd);
+ return -1;
+}
+
+struct vhost_user_reconnect {
+ struct sockaddr_un un;
+ int fd;
+ struct vhost_user_socket *vsocket;
+
+ TAILQ_ENTRY(vhost_user_reconnect) next;
+};
+
+TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
+struct vhost_user_reconnect_list {
+ struct vhost_user_reconnect_tailq_list head;
+ pthread_mutex_t mutex;
+};
+
+static struct vhost_user_reconnect_list reconn_list;
+static pthread_t reconn_tid;
+
+static int
+vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
+{
+ int ret, flags;
+
+ ret = connect(fd, un, sz);
+ if (ret < 0 && errno != EISCONN)
+ return -1;
+
+ flags = fcntl(fd, F_GETFL, 0);
+ if (flags < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "can't get flags for connfd %d\n", fd);
+ return -2;
+ }
+ if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "can't disable nonblocking on fd %d\n", fd);
+ return -2;
+ }
+ return 0;
+}
+
+static void *
+vhost_user_client_reconnect(void *arg __rte_unused)
+{
+ int ret;
+ struct vhost_user_reconnect *reconn, *next;
+
+ while (1) {
+ pthread_mutex_lock(&reconn_list.mutex);
+
+ /*
+ * An equivalent implementation of TAILQ_FOREACH_SAFE,
+ * which does not exist on all platforms.
+ */
+ for (reconn = TAILQ_FIRST(&reconn_list.head);
+ reconn != NULL; reconn = next) {
+ next = TAILQ_NEXT(reconn, next);
+
+ ret = vhost_user_connect_nonblock(reconn->fd,
+ (struct sockaddr *)&reconn->un,
+ sizeof(reconn->un));
+ if (ret == -2) {
+ close(reconn->fd);
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "reconnection for fd %d failed\n",
+ reconn->fd);
+ goto remove_fd;
+ }
+ if (ret == -1)
+ continue;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "%s: connected\n", reconn->vsocket->path);
+ vhost_user_add_connection(reconn->fd, reconn->vsocket);
+remove_fd:
+ TAILQ_REMOVE(&reconn_list.head, reconn, next);
+ free(reconn);
+ }
+
+ pthread_mutex_unlock(&reconn_list.mutex);
+ sleep(1);
+ }
+
+ return NULL;
+}
+
+static int
+vhost_user_reconnect_init(void)
+{
+ int ret;
+
+ pthread_mutex_init(&reconn_list.mutex, NULL);
+ TAILQ_INIT(&reconn_list.head);
+
+ ret = pthread_create(&reconn_tid, NULL,
+ vhost_user_client_reconnect, NULL);
+ if (ret < 0)
+ RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
+
+ return ret;
+}
+
+static int
+vhost_user_start_client(struct vhost_user_socket *vsocket)
+{
+ int ret;
+ int fd = vsocket->socket_fd;
+ const char *path = vsocket->path;
+ struct vhost_user_reconnect *reconn;
+
+ ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un,
+ sizeof(vsocket->un));
+ if (ret == 0) {
+ vhost_user_add_connection(fd, vsocket);
+ return 0;
+ }
+
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "failed to connect to %s: %s\n",
+ path, strerror(errno));
+
+ if (ret == -2 || !vsocket->reconnect) {
+ close(fd);
+ return -1;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path);
+ reconn = malloc(sizeof(*reconn));
+ if (reconn == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to allocate memory for reconnect\n");
+ close(fd);
+ return -1;
+ }
+ reconn->un = vsocket->un;
+ reconn->fd = fd;
+ reconn->vsocket = vsocket;
+ pthread_mutex_lock(&reconn_list.mutex);
+ TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
+ pthread_mutex_unlock(&reconn_list.mutex);
+
+ return 0;
+}
+
+static struct vhost_user_socket *
+find_vhost_user_socket(const char *path)
+{
+ int i;
+
+ for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+ struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+ if (!strcmp(vsocket->path, path))
+ return vsocket;
+ }
+
+ return NULL;
+}
+
+int
+rte_vhost_driver_disable_features(const char *path, uint64_t features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ vsocket->features &= ~features;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_enable_features(const char *path, uint64_t features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket) {
+ if ((vsocket->supported_features & features) != features) {
+ /*
+ * trying to enable features the driver doesn't
+ * support.
+ */
+ pthread_mutex_unlock(&vhost_user.mutex);
+ return -1;
+ }
+ vsocket->features |= features;
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_set_features(const char *path, uint64_t features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket) {
+ vsocket->supported_features = features;
+ vsocket->features = features;
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_features(const char *path, uint64_t *features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ *features = vsocket->features;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ if (!vsocket) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "socket file %s is not registered yet.\n", path);
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Register a new vhost-user socket; here we can act as a server
+ * (the default case) or as a client (when the RTE_VHOST_USER_CLIENT
+ * flag is set).
+ */
+int
+rte_vhost_driver_register(const char *path, uint64_t flags)
+{
+ int ret = -1;
+ struct vhost_user_socket *vsocket;
+
+ if (!path)
+ return -1;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+
+ if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: the number of vhost sockets reaches maximum\n");
+ goto out;
+ }
+
+ vsocket = malloc(sizeof(struct vhost_user_socket));
+ if (!vsocket)
+ goto out;
+ memset(vsocket, 0, sizeof(struct vhost_user_socket));
+ vsocket->path = strdup(path);
+ if (!vsocket->path) {
+ free(vsocket);
+ goto out;
+ }
+ TAILQ_INIT(&vsocket->conn_list);
+ pthread_mutex_init(&vsocket->conn_mutex, NULL);
+ vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
+
+ /*
+ * Set the supported features correctly for the builtin vhost-user
+ * net driver.
+ *
+ * Applications know nothing about the features the builtin virtio-net
+ * driver (virtio_net.c) supports, so they cannot invoke
+ * rte_vhost_driver_set_features() for it. To work around that, we set
+ * the feature bits unconditionally here. An application that implements
+ * another vhost-user driver (say, SCSI) should call
+ * rte_vhost_driver_set_features(), which will overwrite the following
+ * two values.
+ */
+ vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
+ vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES;
+
+ if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
+ vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
+ if (vsocket->reconnect && reconn_tid == 0) {
+ if (vhost_user_reconnect_init() < 0) {
+ free(vsocket->path);
+ free(vsocket);
+ goto out;
+ }
+ }
+ } else {
+ vsocket->is_server = true;
+ }
+ ret = create_unix_socket(vsocket);
+ if (ret < 0) {
+ free(vsocket->path);
+ free(vsocket);
+ goto out;
+ }
+
+ vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
+
+out:
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return ret;
+}
+
+static bool
+vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
+{
+ int found = false;
+ struct vhost_user_reconnect *reconn, *next;
+
+ pthread_mutex_lock(&reconn_list.mutex);
+
+ for (reconn = TAILQ_FIRST(&reconn_list.head);
+ reconn != NULL; reconn = next) {
+ next = TAILQ_NEXT(reconn, next);
+
+ if (reconn->vsocket == vsocket) {
+ TAILQ_REMOVE(&reconn_list.head, reconn, next);
+ close(reconn->fd);
+ free(reconn);
+ found = true;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&reconn_list.mutex);
+ return found;
+}
+
+/**
+ * Unregister the specified vhost socket
+ */
+int
+rte_vhost_driver_unregister(const char *path)
+{
+ int i;
+ int count;
+ struct vhost_user_connection *conn;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+
+ for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+ struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+ if (!strcmp(vsocket->path, path)) {
+ if (vsocket->is_server) {
+ fdset_del(&vhost_user.fdset, vsocket->socket_fd);
+ close(vsocket->socket_fd);
+ unlink(path);
+ } else if (vsocket->reconnect) {
+ vhost_user_remove_reconnect(vsocket);
+ }
+
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ TAILQ_FOREACH(conn, &vsocket->conn_list, next) {
+ close(conn->connfd);
+ }
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+
+ do {
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ conn = TAILQ_FIRST(&vsocket->conn_list);
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+ } while (conn != NULL);
+
+ free(vsocket->path);
+ free(vsocket);
+
+ count = --vhost_user.vsocket_cnt;
+ vhost_user.vsockets[i] = vhost_user.vsockets[count];
+ vhost_user.vsockets[count] = NULL;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return 0;
+ }
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return -1;
+}
+
+/*
+ * Register ops so that we can add/remove devices to/from the data core.
+ */
+int
+rte_vhost_driver_callback_register(const char *path,
+ struct vhost_device_ops const * const ops)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ vsocket->notify_ops = ops;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+struct vhost_device_ops const *
+vhost_driver_callback_get(const char *path)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? vsocket->notify_ops : NULL;
+}
+
+int
+rte_vhost_driver_start(const char *path)
+{
+ struct vhost_user_socket *vsocket;
+ static pthread_t fdset_tid;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ if (!vsocket)
+ return -1;
+
+ if (fdset_tid == 0) {
+ int ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch,
+ &vhost_user.fdset);
+ if (ret < 0)
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to create fdset handling thread");
+ }
+
+ if (vsocket->is_server)
+ return vhost_user_start_server(vsocket);
+ else
+ return vhost_user_start_client(vsocket);
+}
diff --git a/src/spdk/lib/vhost/rte_vhost/vhost.c b/src/spdk/lib/vhost/rte_vhost/vhost.c
new file mode 100644
index 00000000..9d4ae71b
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost/vhost.c
@@ -0,0 +1,482 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_vhost.h>
+
+#include "vhost.h"
+
+struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+struct virtio_net *
+get_device(int vid)
+{
+ struct virtio_net *dev = vhost_devices[vid];
+
+ if (unlikely(!dev)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) device not found.\n", vid);
+ }
+
+ return dev;
+}
+
+static void
+cleanup_vq(struct vhost_virtqueue *vq, int destroy)
+{
+ if ((vq->callfd >= 0) && (destroy != 0))
+ close(vq->callfd);
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+}
+
+/*
+ * Unmap any memory, close any file descriptors and
+ * free any memory owned by a device.
+ */
+void
+cleanup_device(struct virtio_net *dev, int destroy)
+{
+ uint32_t i;
+
+ vhost_backend_cleanup(dev);
+
+ for (i = 0; i < dev->nr_vring; i++)
+ cleanup_vq(dev->virtqueue[i], destroy);
+}
+
+/*
+ * Release virtqueues and device memory.
+ */
+static void
+free_device(struct virtio_net *dev)
+{
+ uint32_t i;
+ struct vhost_virtqueue *vq;
+
+ for (i = 0; i < dev->nr_vring; i++) {
+ vq = dev->virtqueue[i];
+
+ rte_free(vq->shadow_used_ring);
+
+ rte_free(vq);
+ }
+
+ rte_free(dev);
+}
+
+static void
+init_vring_queue(struct vhost_virtqueue *vq)
+{
+ memset(vq, 0, sizeof(struct vhost_virtqueue));
+
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+ vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ /* Backends are set to -1 indicating an inactive device. */
+ vq->backend = -1;
+
+ /*
+ * Always set the vq to enabled; this keeps compatibility with
+ * old QEMU versions that have no SET_VRING_ENABLE message.
+ */
+ vq->enabled = 1;
+
+ TAILQ_INIT(&vq->zmbuf_list);
+}
+
+static void
+reset_vring_queue(struct vhost_virtqueue *vq)
+{
+ int callfd;
+
+ callfd = vq->callfd;
+ init_vring_queue(vq);
+ vq->callfd = callfd;
+}
+
+int
+alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
+{
+ struct vhost_virtqueue *vq;
+
+ vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0);
+ if (vq == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory for vring:%u.\n", vring_idx);
+ return -1;
+ }
+
+ dev->virtqueue[vring_idx] = vq;
+ init_vring_queue(vq);
+
+ dev->nr_vring += 1;
+
+ return 0;
+}
+
+/*
+ * Reset some variables in the device structure, while keeping a few
+ * others untouched, such as vid, ifname and nr_vring: they should
+ * stay the same unless the device is removed.
+ */
+void
+reset_device(struct virtio_net *dev)
+{
+ uint32_t i;
+
+ dev->negotiated_features = 0;
+ dev->protocol_features = 0;
+ dev->flags = 0;
+
+ for (i = 0; i < dev->nr_vring; i++)
+ reset_vring_queue(dev->virtqueue[i]);
+}
+
+/*
+ * Invoked when a new vhost-user connection is established (when a
+ * new virtio device is being attached).
+ */
+int
+vhost_new_device(uint64_t features)
+{
+ struct virtio_net *dev;
+ int i;
+
+ dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
+ if (dev == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory for new dev.\n");
+ return -1;
+ }
+
+ for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+ if (vhost_devices[i] == NULL)
+ break;
+ }
+ if (i == MAX_VHOST_DEVICE) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to find a free slot for new device.\n");
+ rte_free(dev);
+ return -1;
+ }
+
+ vhost_devices[i] = dev;
+ dev->vid = i;
+ dev->features = features;
+
+ return i;
+}
+
+/*
+ * Invoked when the vhost-user connection is broken (when
+ * the virtio device is being detached).
+ */
+void
+vhost_destroy_device(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(vid);
+ }
+
+ cleanup_device(dev, 1);
+ free_device(dev);
+
+ vhost_devices[vid] = NULL;
+}
+
+void
+vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
+{
+ struct virtio_net *dev;
+ unsigned int len;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return;
+
+ len = if_len > sizeof(dev->ifname) ?
+ sizeof(dev->ifname) : if_len;
+
+ strncpy(dev->ifname, if_name, len);
+ dev->ifname[sizeof(dev->ifname) - 1] = '\0';
+}
+
+void
+vhost_enable_dequeue_zero_copy(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ dev->dequeue_zero_copy = 1;
+}
+
+int
+rte_vhost_get_mtu(int vid, uint16_t *mtu)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (!dev)
+ return -ENODEV;
+
+ if (!(dev->flags & VIRTIO_DEV_READY))
+ return -EAGAIN;
+
+ if (!(dev->negotiated_features & VIRTIO_NET_F_MTU))
+ return -ENOTSUP;
+
+ *mtu = dev->mtu;
+
+ return 0;
+}
+
+int
+rte_vhost_get_numa_node(int vid)
+{
+#ifdef RTE_LIBRTE_VHOST_NUMA
+ struct virtio_net *dev = get_device(vid);
+ int numa_node;
+ int ret;
+
+ if (dev == NULL)
+ return -1;
+
+ ret = get_mempolicy(&numa_node, NULL, 0, dev,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to query numa node: %d\n", vid, ret);
+ return -1;
+ }
+
+ return numa_node;
+#else
+ RTE_SET_USED(vid);
+ return -1;
+#endif
+}
+
+int
+rte_vhost_get_ifname(int vid, char *buf, size_t len)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return -1;
+
+ len = RTE_MIN(len, sizeof(dev->ifname));
+
+ strncpy(buf, dev->ifname, len);
+ buf[len - 1] = '\0';
+
+ return 0;
+}
+
+int
+rte_vhost_get_negotiated_features(int vid, uint64_t *features)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ *features = dev->negotiated_features;
+ return 0;
+}
+
+int
+rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem)
+{
+ struct virtio_net *dev;
+ struct rte_vhost_memory *m;
+ size_t size;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region);
+ m = malloc(sizeof(struct rte_vhost_memory) + size);
+ if (!m)
+ return -1;
+
+ m->nregions = dev->mem->nregions;
+ memcpy(m->regions, dev->mem->regions, size);
+ *mem = m;
+
+ return 0;
+}
+
+int
+rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
+ struct rte_vhost_vring *vring)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ vring->desc = vq->desc;
+ vring->avail = vq->avail;
+ vring->used = vq->used;
+ vring->log_guest_addr = vq->log_guest_addr;
+
+ vring->callfd = vq->callfd;
+ vring->kickfd = vq->kickfd;
+ vring->size = vq->size;
+
+ vring->last_avail_idx = vq->last_avail_idx;
+ vring->last_used_idx = vq->last_used_idx;
+
+ return 0;
+}
+
+uint16_t
+rte_vhost_avail_entries(int vid, uint16_t queue_id)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
+ vq = dev->virtqueue[queue_id];
+ if (!vq->enabled)
+ return 0;
+
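+	/* avail->idx is updated by the guest, so read it through a volatile
+	 * cast to force a reload; the uint16_t subtraction handles index
+	 * wrap-around naturally.
+	 */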
+ return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
+}
+
+int
+rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return -1;
+
+ if (enable) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "guest notification isn't supported.\n");
+ return -1;
+ }
+
+ dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY;
+ return 0;
+}
+
+void
+rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ vhost_log_write(dev, addr, len);
+}
+
+void
+rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
+ uint64_t offset, uint64_t len)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return;
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return;
+
+ vhost_log_used_vring(dev, vq, offset, len);
+}
+
+int
+rte_vhost_set_vhost_vring_last_idx(int vid, uint16_t vring_idx,
+ uint16_t last_avail_idx, uint16_t last_used_idx) {
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ vq->last_avail_idx = last_avail_idx;
+ vq->last_used_idx = last_used_idx;
+
+ return 0;
+}
diff --git a/src/spdk/lib/vhost/rte_vhost/vhost.h b/src/spdk/lib/vhost/rte_vhost/vhost.h
new file mode 100644
index 00000000..b0a0201d
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost/vhost.h
@@ -0,0 +1,321 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_CDEV_H_
+#define _VHOST_NET_CDEV_H_
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <unistd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <sys/socket.h>
+#include <linux/if.h>
+
+#include <rte_log.h>
+#include <rte_ether.h>
+
+#include "rte_vhost.h"
+#include "vhost_user.h"
+
+/* Used to indicate that the device is running on a data core */
+#define VIRTIO_DEV_RUNNING 1
+/* Used to indicate that the device is ready to operate */
+#define VIRTIO_DEV_READY 2
+
+/* Backend value set by guest. */
+#define VIRTIO_DEV_STOPPED -1
+
+#define BUF_VECTOR_MAX 256
+
+/**
+ * Structure containing the buffer address, length and descriptor index
+ * from the vring, used for scatter RX.
+ */
+struct buf_vector {
+ uint64_t buf_addr;
+ uint32_t buf_len;
+ uint32_t desc_idx;
+};
+
+/*
+ * A structure to hold some fields needed in the zero copy code path,
+ * mainly for associating an mbuf with the right desc_idx.
+ */
+struct zcopy_mbuf {
+ struct rte_mbuf *mbuf;
+ uint32_t desc_idx;
+ uint16_t in_use;
+
+ TAILQ_ENTRY(zcopy_mbuf) next;
+};
+TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf);
+
+/**
+ * Structure containing variables relevant to RX/TX virtqueues.
+ */
+struct vhost_virtqueue {
+ struct vring_desc *desc;
+ struct vring_avail *avail;
+ struct vring_used *used;
+ uint32_t size;
+
+ uint16_t last_avail_idx;
+ uint16_t last_used_idx;
+#define VIRTIO_INVALID_EVENTFD (-1)
+#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
+
+ /* Backend value to determine if the device should be started or stopped */
+ int backend;
+ /* Used to notify the guest (trigger interrupt) */
+ int callfd;
+ /* Currently unused as polling mode is enabled */
+ int kickfd;
+ int enabled;
+
+ /* Physical address of used ring, for logging */
+ uint64_t log_guest_addr;
+
+ uint16_t nr_zmbuf;
+ uint16_t zmbuf_size;
+ uint16_t last_zmbuf_idx;
+ struct zcopy_mbuf *zmbufs;
+ struct zcopy_mbuf_list zmbuf_list;
+
+ struct vring_used_elem *shadow_used_ring;
+ uint16_t shadow_used_idx;
+} __rte_cache_aligned;
+
+/* Old kernels have no such macros defined */
+#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
+ #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
+#endif
+
+#ifndef VIRTIO_NET_F_MQ
+ #define VIRTIO_NET_F_MQ 22
+#endif
+
+#define VHOST_MAX_VRING 0x100
+#define VHOST_MAX_QUEUE_PAIRS 0x80
+
+#ifndef VIRTIO_NET_F_MTU
+ #define VIRTIO_NET_F_MTU 3
+#endif
+
+/*
+ * Define virtio 1.0 for older kernels
+ */
+#ifndef VIRTIO_F_VERSION_1
+ #define VIRTIO_F_VERSION_1 32
+#endif
+
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+
+/* Features supported by this builtin vhost-user net driver. */
+#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
+ (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+ (1ULL << VIRTIO_NET_F_CTRL_RX) | \
+ (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+ (1ULL << VIRTIO_NET_F_MQ) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VHOST_F_LOG_ALL) | \
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VIRTIO_NET_F_HOST_TSO4) | \
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+ (1ULL << VIRTIO_NET_F_CSUM) | \
+ (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
+ (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
+ (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
+ (1ULL << VIRTIO_NET_F_MTU))
+
+
+struct guest_page {
+ uint64_t guest_phys_addr;
+ uint64_t host_phys_addr;
+ uint64_t size;
+};
+
+/**
+ * Device structure contains all configuration information relating
+ * to the device.
+ */
+struct virtio_net {
+ /* Frontend (QEMU) memory and memory region information */
+ struct rte_vhost_memory *mem;
+ uint64_t features;
+ uint64_t negotiated_features;
+ uint64_t protocol_features;
+ int vid;
+ uint32_t is_nvme;
+ uint32_t flags;
+ uint16_t vhost_hlen;
+ /* to tell if we need to broadcast a RARP packet */
+ rte_atomic16_t broadcast_rarp;
+ uint32_t nr_vring;
+ int dequeue_zero_copy;
+ struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
+ char ifname[IF_NAME_SZ];
+ uint64_t log_size;
+ uint64_t log_base;
+ uint64_t log_addr;
+ struct ether_addr mac;
+ uint16_t mtu;
+
+ struct vhost_device_ops const *notify_ops;
+
+ uint32_t nr_guest_pages;
+ uint32_t max_guest_pages;
+ struct guest_page *guest_pages;
+ int has_new_mem_table;
+ struct VhostUserMemory mem_table;
+ int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS];
+} __rte_cache_aligned;
+
+
+#define VHOST_LOG_PAGE 4096
+
+static inline void __attribute__((always_inline))
+vhost_log_page(uint8_t *log_base, uint64_t page)
+{
+ log_base[page / 8] |= 1 << (page % 8);
+}
+
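+/*
+ * Mark the guest-physical range [addr, addr + len) as dirty in the log
+ * bitmap shared with the frontend: one bit per VHOST_LOG_PAGE (4 KiB)
+ * page. This only takes effect when VHOST_F_LOG_ALL has been negotiated
+ * and a log base has been set, which is typically the case during live
+ * migration.
+ */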
+static inline void __attribute__((always_inline))
+vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
+{
+ uint64_t page;
+
+ if (likely(((dev->negotiated_features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+ !dev->log_base || !len))
+ return;
+
+ if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+ return;
+
+ /* To make sure guest memory updates are committed before logging */
+ rte_smp_wmb();
+
+ page = addr / VHOST_LOG_PAGE;
+ while (page * VHOST_LOG_PAGE < addr + len) {
+ vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+ page += 1;
+ }
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t offset, uint64_t len)
+{
+ vhost_log_write(dev, vq->log_guest_addr + offset, len);
+}
+
+/* Macros for printing using RTE_LOG */
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
+#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+#define VHOST_MAX_PRINT_BUFF 6072
+#define VHOST_LOG_LEVEL RTE_LOG_DEBUG
+#define VHOST_LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
+#define PRINT_PACKET(device, addr, size, header) do { \
+ char *pkt_addr = (char *)(addr); \
+ unsigned int index; \
+ char packet[VHOST_MAX_PRINT_BUFF]; \
+ \
+ if ((header)) \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
+ else \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
+ for (index = 0; index < (size); index++) { \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
+ "%02hhx ", pkt_addr[index]); \
+ } \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
+ \
+ VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \
+} while (0)
+#else
+#define VHOST_LOG_LEVEL RTE_LOG_INFO
+#define VHOST_LOG_DEBUG(log_type, fmt, args...) do {} while (0)
+#define PRINT_PACKET(device, addr, size, header) do {} while (0)
+#endif
+
+extern uint64_t VHOST_FEATURES;
+#define MAX_VHOST_DEVICE 1024
+extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* Convert guest physical address to host physical address */
+static inline phys_addr_t __attribute__((always_inline))
+gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
+{
+ uint32_t i;
+ struct guest_page *page;
+
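+	/* Linear scan of the guest page array built by add_guest_pages();
+	 * returns 0 when no single entry covers the whole requested range.
+	 */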
+ for (i = 0; i < dev->nr_guest_pages; i++) {
+ page = &dev->guest_pages[i];
+
+ if (gpa >= page->guest_phys_addr &&
+ gpa + size < page->guest_phys_addr + page->size) {
+ return gpa - page->guest_phys_addr +
+ page->host_phys_addr;
+ }
+ }
+
+ return 0;
+}
+
+struct virtio_net *get_device(int vid);
+
+int vhost_new_device(uint64_t features);
+void cleanup_device(struct virtio_net *dev, int destroy);
+void reset_device(struct virtio_net *dev);
+void vhost_destroy_device(int);
+
+int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx);
+
+void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
+void vhost_enable_dequeue_zero_copy(int vid);
+
+struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
+
+/*
+ * Backend-specific cleanup.
+ *
+ * TODO: fix it; we have one backend now
+ */
+void vhost_backend_cleanup(struct virtio_net *dev);
+
+#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/src/spdk/lib/vhost/rte_vhost/vhost_user.c b/src/spdk/lib/vhost/rte_vhost/vhost_user.c
new file mode 100644
index 00000000..b708a8a7
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost/vhost_user.c
@@ -0,0 +1,1360 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <asm/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <assert.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_log.h>
+
+#include "vhost.h"
+#include "vhost_user.h"
+
+#define VIRTIO_MIN_MTU 68
+#define VIRTIO_MAX_MTU 65535
+
+static const char *vhost_message_str[VHOST_USER_MAX] = {
+ [VHOST_USER_NONE] = "VHOST_USER_NONE",
+ [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+ [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+ [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+ [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+ [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+ [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+ [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+ [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+ [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+ [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+ [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+ [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+ [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+ [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
+ [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
+ [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
+ [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
+ [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
+ [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
+ [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU",
+ [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
+ [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
+ [VHOST_USER_NVME_ADMIN] = "VHOST_USER_NVME_ADMIN",
+ [VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL",
+ [VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP",
+ [VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP",
+ [VHOST_USER_NVME_IO_CMD] = "VHOST_USER_NVME_IO_CMD"
+};
+
+static uint64_t
+get_blk_size(int fd)
+{
+ struct stat stat;
+ int ret;
+
+ ret = fstat(fd, &stat);
+ return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
+}
+
+static void
+free_mem_region(struct virtio_net *dev)
+{
+ uint32_t i;
+ struct rte_vhost_mem_region *reg;
+
+ if (!dev || !dev->mem)
+ return;
+
+ for (i = 0; i < dev->mem->nregions; i++) {
+ reg = &dev->mem->regions[i];
+ if (reg->host_user_addr) {
+ munmap(reg->mmap_addr, reg->mmap_size);
+ close(reg->fd);
+ }
+ }
+}
+
+void
+vhost_backend_cleanup(struct virtio_net *dev)
+{
+ uint32_t i;
+
+ if (dev->mem) {
+ if (dev->has_new_mem_table) {
+ for (i = 0; i < dev->mem->nregions; i++) {
+ close(dev->mem_table_fds[i]);
+ }
+ dev->has_new_mem_table = 0;
+ }
+ free_mem_region(dev);
+ rte_free(dev->mem);
+ dev->mem = NULL;
+ }
+
+ free(dev->guest_pages);
+ dev->guest_pages = NULL;
+
+ if (dev->log_addr) {
+ munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+ dev->log_addr = 0;
+ }
+}
+
+/*
+ * This function just returns success at the moment; there is nothing
+ * to set up for SET_OWNER.
+ */
+static int
+vhost_user_set_owner(void)
+{
+ return 0;
+}
+
+static int
+vhost_user_reset_owner(struct virtio_net *dev)
+{
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ cleanup_device(dev, 0);
+ reset_device(dev);
+ return 0;
+}
+
+/*
+ * The features that we support are requested.
+ */
+static uint64_t
+vhost_user_get_features(struct virtio_net *dev)
+{
+ return dev->features;
+}
+
+/*
+ * We receive the negotiated features supported by us and the virtio device.
+ */
+static int
+vhost_user_set_features(struct virtio_net *dev, uint64_t features)
+{
+ uint64_t vhost_features = 0;
+
+ vhost_features = vhost_user_get_features(dev);
+ if (features & ~vhost_features) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) received invalid negotiated features.\n",
+ dev->vid);
+ return -1;
+ }
+
+ if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->negotiated_features != features) {
+ if (dev->notify_ops->features_changed) {
+ dev->notify_ops->features_changed(dev->vid, features);
+ } else {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+ }
+
+ dev->negotiated_features = features;
+ if (dev->negotiated_features &
+ ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
+ dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ } else {
+ dev->vhost_hlen = sizeof(struct virtio_net_hdr);
+ }
+ VHOST_LOG_DEBUG(VHOST_CONFIG,
+ "(%d) mergeable RX buffers %s, virtio 1 %s\n",
+ dev->vid,
+ (dev->negotiated_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
+ (dev->negotiated_features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
+
+ return 0;
+}
+
+/*
+ * The virtio device sends us the size of the descriptor ring.
+ */
+static int
+vhost_user_set_vring_num(struct virtio_net *dev,
+ VhostUserMsg *msg)
+{
+ struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+
+ vq->size = msg->payload.state.num;
+
+ if (dev->dequeue_zero_copy) {
+ vq->nr_zmbuf = 0;
+ vq->last_zmbuf_idx = 0;
+ vq->zmbuf_size = vq->size;
+ vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
+ sizeof(struct zcopy_mbuf), 0);
+ if (vq->zmbufs == NULL) {
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "failed to allocate mem for zero copy; "
+ "zero copy is force disabled\n");
+ dev->dequeue_zero_copy = 0;
+ }
+ }
+
+ vq->shadow_used_ring = rte_malloc(NULL,
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_ring) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to allocate memory for shadow used ring.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Reallocate the virtio_net and vhost_virtqueue data structures so that they
+ * sit on the same NUMA node as the memory backing the vring descriptors.
+ */
+#ifdef RTE_LIBRTE_VHOST_NUMA
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index)
+{
+ int oldnode, newnode;
+ struct virtio_net *old_dev;
+ struct vhost_virtqueue *old_vq, *vq;
+ int ret;
+
+ old_dev = dev;
+ vq = old_vq = dev->virtqueue[index];
+
+ ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
+ MPOL_F_NODE | MPOL_F_ADDR);
+
+ /* check if we need to reallocate vq */
+ ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Unable to get vq numa information.\n");
+ return dev;
+ }
+ if (oldnode != newnode) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "reallocate vq from %d to %d node\n", oldnode, newnode);
+ vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);
+ if (!vq)
+ return dev;
+
+ memcpy(vq, old_vq, sizeof(*vq));
+ rte_free(old_vq);
+ }
+
+ /* check if we need to reallocate dev */
+ ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Unable to get dev numa information.\n");
+ goto out;
+ }
+ if (oldnode != newnode) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "reallocate dev from %d to %d node\n",
+ oldnode, newnode);
+ dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
+ if (!dev) {
+ dev = old_dev;
+ goto out;
+ }
+
+ memcpy(dev, old_dev, sizeof(*dev));
+ rte_free(old_dev);
+ }
+
+out:
+ dev->virtqueue[index] = vq;
+ vhost_devices[dev->vid] = dev;
+
+ return dev;
+}
+#else
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index __rte_unused)
+{
+ return dev;
+}
+#endif
+
+/*
+ * Converts a QEMU virtual address to a vhost virtual address. This function
+ * is used to convert the ring addresses into our address space.
+ */
+static uint64_t
+qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
+{
+ struct rte_vhost_mem_region *reg;
+ uint32_t i;
+
+ /* Find the region where the address lives. */
+ for (i = 0; i < dev->mem->nregions; i++) {
+ reg = &dev->mem->regions[i];
+
+ if (qva >= reg->guest_user_addr &&
+ qva < reg->guest_user_addr + reg->size) {
+
+ if (unlikely(*len > reg->guest_user_addr + reg->size - qva))
+ *len = reg->guest_user_addr + reg->size - qva;
+
+ return qva - reg->guest_user_addr +
+ reg->host_user_addr;
+ }
+ }
+
+ return 0;
+}
+
+static int vhost_setup_mem_table(struct virtio_net *dev);
+
+/*
+ * The virtio device sends us the desc, used and avail ring addresses.
+ * This function then converts these to our address space.
+ */
+static int
+vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg)
+{
+ struct vhost_virtqueue *vq;
+ uint64_t len;
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ if (dev->has_new_mem_table) {
+ vhost_setup_mem_table(dev);
+ dev->has_new_mem_table = 0;
+ }
+
+ if (dev->mem == NULL)
+ return -1;
+
+ /* addr->index refers to the queue index: txq is 1, rxq is 0. */
+ vq = dev->virtqueue[msg->payload.addr.index];
+
+ /* The addresses are converted from QEMU virtual to Vhost virtual. */
+ len = sizeof(struct vring_desc) * vq->size;
+ vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
+ msg->payload.addr.desc_user_addr, &len);
+ if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to map desc ring.\n",
+ dev->vid);
+ return -1;
+ }
+
+ dev = numa_realloc(dev, msg->payload.addr.index);
+ vq = dev->virtqueue[msg->payload.addr.index];
+
+ len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
+ vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
+ msg->payload.addr.avail_user_addr, &len);
+ if (vq->avail == 0 ||
+ len != sizeof(struct vring_avail)
+ + sizeof(uint16_t) * vq->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to find avail ring address.\n",
+ dev->vid);
+ return -1;
+ }
+
+ len = sizeof(struct vring_used) +
+ sizeof(struct vring_used_elem) * vq->size;
+ vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
+ msg->payload.addr.used_user_addr, &len);
+ if (vq->used == 0 || len != sizeof(struct vring_used) +
+ sizeof(struct vring_used_elem) * vq->size) {
+
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to find used ring address.\n",
+ dev->vid);
+ return -1;
+ }
+
+ if (vq->last_used_idx != vq->used->idx) {
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
+ "some packets maybe resent for Tx and dropped for Rx\n",
+ vq->last_used_idx, vq->used->idx);
+ vq->last_used_idx = vq->used->idx;
+ vq->last_avail_idx = vq->used->idx;
+ }
+
+ vq->log_guest_addr = msg->payload.addr.log_guest_addr;
+
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
+ dev->vid, vq->desc);
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
+ dev->vid, vq->avail);
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
+ dev->vid, vq->used);
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
+ dev->vid, vq->log_guest_addr);
+
+ return 0;
+}
+
+/*
+ * The virtio device sends us the available ring last used index.
+ */
+static int
+vhost_user_set_vring_base(struct virtio_net *dev,
+ VhostUserMsg *msg)
+{
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num;
+ dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num;
+
+ return 0;
+}
+
+static void
+add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
+ uint64_t host_phys_addr, uint64_t size)
+{
+ struct guest_page *page, *last_page;
+
+ if (dev->nr_guest_pages == dev->max_guest_pages) {
+ dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2);
+ dev->guest_pages = realloc(dev->guest_pages,
+ dev->max_guest_pages * sizeof(*page));
+ }
+
+ if (dev->nr_guest_pages > 0) {
+ last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
+ /* merge if the two pages are contiguous */
+ if (host_phys_addr == last_page->host_phys_addr +
+ last_page->size) {
+ last_page->size += size;
+ return;
+ }
+ }
+
+ page = &dev->guest_pages[dev->nr_guest_pages++];
+ page->guest_phys_addr = guest_phys_addr;
+ page->host_phys_addr = host_phys_addr;
+ page->size = size;
+}
+
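+/*
+ * Record guest-physical to host-physical translations for a region by
+ * walking it in page_size chunks and translating each chunk's host virtual
+ * address with rte_mem_virt2phy(); contiguous entries are merged in
+ * add_one_guest_page().
+ */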
+static void
+add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
+ uint64_t page_size)
+{
+ uint64_t reg_size = reg->size;
+ uint64_t host_user_addr = reg->host_user_addr;
+ uint64_t guest_phys_addr = reg->guest_phys_addr;
+ uint64_t host_phys_addr;
+ uint64_t size;
+
+ host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
+ size = page_size - (guest_phys_addr & (page_size - 1));
+ size = RTE_MIN(size, reg_size);
+
+ add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+ host_user_addr += size;
+ guest_phys_addr += size;
+ reg_size -= size;
+
+ while (reg_size > 0) {
+ size = RTE_MIN(reg_size, page_size);
+ host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)
+ host_user_addr);
+ add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+
+ host_user_addr += size;
+ guest_phys_addr += size;
+ reg_size -= size;
+ }
+}
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+/* TODO: enable it only in debug mode? */
+static void
+dump_guest_pages(struct virtio_net *dev)
+{
+ uint32_t i;
+ struct guest_page *page;
+
+ for (i = 0; i < dev->nr_guest_pages; i++) {
+ page = &dev->guest_pages[i];
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "guest physical page region %u\n"
+ "\t guest_phys_addr: %" PRIx64 "\n"
+ "\t host_phys_addr : %" PRIx64 "\n"
+ "\t size : %" PRIx64 "\n",
+ i,
+ page->guest_phys_addr,
+ page->host_phys_addr,
+ page->size);
+ }
+}
+#else
+#define dump_guest_pages(dev)
+#endif
+
+static int
+vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+ uint32_t i;
+
+ if (dev->has_new_mem_table) {
+ /*
+ * The previous mem table was not consumed, so close the
+ * file descriptors from that mem table before copying
+ * the new one.
+ */
+ for (i = 0; i < dev->mem_table.nregions; i++) {
+ close(dev->mem_table_fds[i]);
+ }
+ }
+
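+	/* Only record the new table here; it is applied by
+	 * vhost_setup_mem_table(), either on the next SET_VRING_ADDR or,
+	 * for vhost-user-nvme devices, immediately below.
+	 */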
+ memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table));
+ memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds));
+ dev->has_new_mem_table = 1;
+ /* vhost-user-nvme will not send a set vring addr message,
+ * so apply the new memory table now.
+ */
+ if (dev->has_new_mem_table && dev->is_nvme) {
+ vhost_setup_mem_table(dev);
+ dev->has_new_mem_table = 0;
+ }
+
+ return 0;
+}
+
+static int
+vhost_setup_mem_table(struct virtio_net *dev)
+{
+ struct VhostUserMemory memory = dev->mem_table;
+ struct rte_vhost_mem_region *reg;
+ struct vhost_virtqueue *vq;
+ void *mmap_addr;
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+ uint64_t alignment;
+ uint32_t i;
+ int fd;
+
+ if (dev->mem) {
+ free_mem_region(dev);
+ rte_free(dev->mem);
+ dev->mem = NULL;
+ }
+
+ for (i = 0; i < dev->nr_vring; i++) {
+ vq = dev->virtqueue[i];
+ /* Those addresses won't be valid anymore in the host address space
+ * after setting the new mem table. The initiator needs to resend
+ * these addresses.
+ */
+ vq->desc = NULL;
+ vq->avail = NULL;
+ vq->used = NULL;
+ }
+
+ dev->nr_guest_pages = 0;
+ if (!dev->guest_pages) {
+ dev->max_guest_pages = 8;
+ dev->guest_pages = malloc(dev->max_guest_pages *
+ sizeof(struct guest_page));
+ }
+
+ dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
+ sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
+ if (dev->mem == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to allocate memory for dev->mem\n",
+ dev->vid);
+ return -1;
+ }
+ dev->mem->nregions = memory.nregions;
+
+ for (i = 0; i < memory.nregions; i++) {
+ fd = dev->mem_table_fds[i];
+ reg = &dev->mem->regions[i];
+
+ reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
+ reg->guest_user_addr = memory.regions[i].userspace_addr;
+ reg->size = memory.regions[i].memory_size;
+ reg->fd = fd;
+
+ mmap_offset = memory.regions[i].mmap_offset;
+ mmap_size = reg->size + mmap_offset;
+
+ /* On older long-term Linux kernels, such as 2.6.32 and 3.2.72,
+ * mmap() without the MAP_ANONYMOUS flag must be called with a
+ * length aligned to the hugepage size, or it will fail with
+ * EINVAL.
+ *
+ * To avoid that failure, make sure the length is kept aligned
+ * here.
+ */
+ alignment = get_blk_size(fd);
+ if (alignment == (uint64_t)-1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "couldn't get hugepage size through fstat\n");
+ goto err_mmap;
+ }
+ mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+
+ mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+
+ if (mmap_addr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mmap region %u failed.\n", i);
+ goto err_mmap;
+ }
+
+ if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "MADV_DONTDUMP advice setting failed.\n");
+ }
+
+ reg->mmap_addr = mmap_addr;
+ reg->mmap_size = mmap_size;
+ reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
+ mmap_offset;
+
+ if (dev->dequeue_zero_copy)
+ add_guest_pages(dev, reg, alignment);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "guest memory region %u, size: 0x%" PRIx64 "\n"
+ "\t guest physical addr: 0x%" PRIx64 "\n"
+ "\t guest virtual addr: 0x%" PRIx64 "\n"
+ "\t host virtual addr: 0x%" PRIx64 "\n"
+ "\t mmap addr : 0x%" PRIx64 "\n"
+ "\t mmap size : 0x%" PRIx64 "\n"
+ "\t mmap align: 0x%" PRIx64 "\n"
+ "\t mmap off : 0x%" PRIx64 "\n",
+ i, reg->size,
+ reg->guest_phys_addr,
+ reg->guest_user_addr,
+ reg->host_user_addr,
+ (uint64_t)(uintptr_t)mmap_addr,
+ mmap_size,
+ alignment,
+ mmap_offset);
+ }
+
+ dump_guest_pages(dev);
+
+ return 0;
+
+err_mmap:
+ free_mem_region(dev);
+ rte_free(dev->mem);
+ dev->mem = NULL;
+ return -1;
+}
+
+static int
+vq_is_ready(struct vhost_virtqueue *vq)
+{
+ return vq && vq->desc &&
+ vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
+ vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD &&
+ vq->kickfd != VIRTIO_INVALID_EVENTFD &&
+ vq->callfd != VIRTIO_INVALID_EVENTFD;
+}
+
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+ struct vhost_virtqueue *vq;
+ uint32_t i;
+
+ if (dev->nr_vring == 0)
+ return 0;
+
+ for (i = 0; i < dev->nr_vring; i++) {
+ vq = dev->virtqueue[i];
+
+ if (vq_is_ready(vq)) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "virtio is now ready for processing.\n");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static void
+vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+ struct vhost_vring_file file;
+ struct vhost_virtqueue *vq;
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.fd = VIRTIO_INVALID_EVENTFD;
+ else
+ file.fd = pmsg->fds[0];
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring call idx:%d file:%d\n", file.index, file.fd);
+
+ vq = dev->virtqueue[file.index];
+ if (vq->callfd >= 0)
+ close(vq->callfd);
+
+ vq->callfd = file.fd;
+}
+
+static void
+vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+ struct vhost_vring_file file;
+ struct vhost_virtqueue *vq;
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.fd = VIRTIO_INVALID_EVENTFD;
+ else
+ file.fd = pmsg->fds[0];
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring kick idx:%d file:%d\n", file.index, file.fd);
+
+ vq = dev->virtqueue[file.index];
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+ vq->kickfd = file.fd;
+}
+
+static void
+free_zmbufs(struct vhost_virtqueue *vq)
+{
+ struct zcopy_mbuf *zmbuf, *next;
+
+ for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+ zmbuf != NULL; zmbuf = next) {
+ next = TAILQ_NEXT(zmbuf, next);
+
+ rte_pktmbuf_free(zmbuf->mbuf);
+ TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+ }
+
+ rte_free(vq->zmbufs);
+}
+
+/*
+ * When virtio is stopped, QEMU will send us the GET_VRING_BASE message.
+ */
+static int
+vhost_user_get_vring_base(struct virtio_net *dev,
+ VhostUserMsg *msg)
+{
+ struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+
+ /* We have to stop the queue (virtio) if it is running. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ dev->flags &= ~VIRTIO_DEV_READY;
+
+ /* Here we are safe to get the last used index */
+ msg->payload.state.num = vq->last_used_idx;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num);
+ /*
+ * Based on the current QEMU vhost-user implementation, this message is
+ * sent only from vhost_vring_stop.
+ * TODO: clean up the vring; it isn't usable from this point on.
+ */
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ if (vq->callfd >= 0)
+ close(vq->callfd);
+
+ vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ if (dev->dequeue_zero_copy)
+ free_zmbufs(vq);
+ rte_free(vq->shadow_used_ring);
+ vq->shadow_used_ring = NULL;
+
+ return 0;
+}
+
+/*
+ * When the virtio queues are ready to work, QEMU will send us a message
+ * to enable the virtio queue pair.
+ */
+static int
+vhost_user_set_vring_enable(struct virtio_net *dev,
+ VhostUserMsg *msg)
+{
+ int enable = (int)msg->payload.state.num;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set queue enable: %d to qp idx: %d\n",
+ enable, msg->payload.state.index);
+
+ if (dev->notify_ops->vring_state_changed)
+ dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable);
+
+ dev->virtqueue[msg->payload.state.index]->enabled = enable;
+
+ return 0;
+}
+
+static void
+vhost_user_set_protocol_features(struct virtio_net *dev,
+ uint64_t protocol_features)
+{
+ if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
+ return;
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ dev->protocol_features = protocol_features;
+}
+
+static int
+vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+ int fd = msg->fds[0];
+ uint64_t size, off;
+ void *addr;
+
+ if (fd < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
+ return -1;
+ }
+
+ if (msg->size != sizeof(VhostUserLog)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid log base msg size: %"PRId32" != %d\n",
+ msg->size, (int)sizeof(VhostUserLog));
+ return -1;
+ }
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ size = msg->payload.log.mmap_size;
+ off = msg->payload.log.mmap_offset;
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "log mmap size: %"PRId64", offset: %"PRId64"\n",
+ size, off);
+
+ /*
+ * mmap from offset 0 to work around a hugepage mmap bug: mmap will
+ * fail when the offset is not page-size aligned.
+ */
+ addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ close(fd);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
+ return -1;
+ }
+
+ /*
+ * Free any previously mapped log memory in case VHOST_USER_SET_LOG_BASE
+ * is received more than once.
+ */
+ if (dev->log_addr) {
+ munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+ }
+ dev->log_addr = (uint64_t)(uintptr_t)addr;
+ dev->log_base = dev->log_addr + off;
+ dev->log_size = size;
+
+ return 0;
+}
+
+/*
+ * A RARP packet is constructed and broadcast to notify switches about
+ * the new location of the migrated VM, so that packets from outside will
+ * not be lost after migration.
+ *
+ * However, we don't actually "send" a RARP packet here; instead, we set
+ * the 'broadcast_rarp' flag to let rte_vhost_dequeue_burst() inject it.
+ */
+static int
+vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+ uint8_t *mac = (uint8_t *)&msg->payload.u64;
+
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
+ mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+ memcpy(dev->mac.addr_bytes, mac, 6);
+
+ /*
+ * Set the flag to inject a RARP broadcast packet at
+ * rte_vhost_dequeue_burst().
+ *
+ * rte_smp_wmb() is for making sure the mac is copied
+ * before the flag is set.
+ */
+ rte_smp_wmb();
+ rte_atomic16_set(&dev->broadcast_rarp, 1);
+
+ return 0;
+}
+
+static int
+vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+ if (msg->payload.u64 < VIRTIO_MIN_MTU ||
+ msg->payload.u64 > VIRTIO_MAX_MTU) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
+ msg->payload.u64);
+
+ return -1;
+ }
+
+ dev->mtu = msg->payload.u64;
+
+ return 0;
+}
+
+/* Return the number of bytes read on success, or a negative value on failure. */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+ int ret;
+
+ ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+ msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+ if (ret <= 0)
+ return ret;
+
+ if (msg && msg->size) {
+ if (msg->size > sizeof(msg->payload)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid msg size: %d\n", msg->size);
+ return -1;
+ }
+ ret = read(sockfd, &msg->payload, msg->size);
+ if (ret <= 0)
+ return ret;
+ if (ret != (int)msg->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "read control message failed\n");
+ return -1;
+ }
+ }
+
+ return ret;
+}
+
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+ int ret;
+
+ if (!msg)
+ return 0;
+
+ msg->flags &= ~VHOST_USER_VERSION_MASK;
+ msg->flags &= ~VHOST_USER_NEED_REPLY;
+ msg->flags |= VHOST_USER_VERSION;
+ msg->flags |= VHOST_USER_REPLY_MASK;
+
+ ret = send_fd_message(sockfd, (char *)msg,
+ VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+
+ return ret;
+}
+
+/*
+ * Allocate a queue pair if it hasn't been allocated yet
+ */
+static int
+vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
+{
+ uint16_t vring_idx;
+
+ switch (msg->request) {
+ case VHOST_USER_SET_VRING_KICK:
+ case VHOST_USER_SET_VRING_CALL:
+ case VHOST_USER_SET_VRING_ERR:
+ vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ break;
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_ENABLE:
+ vring_idx = msg->payload.state.index;
+ break;
+ case VHOST_USER_SET_VRING_ADDR:
+ vring_idx = msg->payload.addr.index;
+ break;
+ default:
+ return 0;
+ }
+
+ if (vring_idx >= VHOST_MAX_VRING) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid vring index: %u\n", vring_idx);
+ return -1;
+ }
+
+ if (dev->virtqueue[vring_idx])
+ return 0;
+
+ return alloc_vring_queue(dev, vring_idx);
+}
+
+static int
+vhost_user_nvme_io_request_passthrough(struct virtio_net *dev,
+ uint16_t qid, uint16_t tail_head,
+ bool is_submission_queue)
+{
+ return -1;
+}
+
+static int
+vhost_user_nvme_admin_passthrough(struct virtio_net *dev,
+ void *cmd, void *cqe, void *buf)
+{
+ if (dev->notify_ops->vhost_nvme_admin_passthrough) {
+ return dev->notify_ops->vhost_nvme_admin_passthrough(dev->vid, cmd, cqe, buf);
+ }
+
+ return -1;
+}
+
+static int
+vhost_user_nvme_set_cq_call(struct virtio_net *dev, uint16_t qid, int fd)
+{
+ if (dev->notify_ops->vhost_nvme_set_cq_call) {
+ return dev->notify_ops->vhost_nvme_set_cq_call(dev->vid, qid, fd);
+ }
+
+ return -1;
+}
+
+static int
+vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap)
+{
+ if (dev->notify_ops->vhost_nvme_get_cap) {
+ return dev->notify_ops->vhost_nvme_get_cap(dev->vid, cap);
+ }
+
+ return -1;
+}
+
+int
+vhost_user_msg_handler(int vid, int fd)
+{
+ struct virtio_net *dev;
+ struct VhostUserMsg msg;
+ struct vhost_vring_file file;
+ int ret;
+ uint64_t cap;
+ uint64_t enable;
+ uint8_t cqe[16];
+ uint8_t cmd[64];
+ uint8_t buf[4096];
+ uint16_t qid, tail_head;
+ bool is_submission_queue;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ if (!dev->notify_ops) {
+ dev->notify_ops = vhost_driver_callback_get(dev->ifname);
+ if (!dev->notify_ops) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to get callback ops for driver %s\n",
+ dev->ifname);
+ return -1;
+ }
+ }
+
+ ret = read_vhost_message(fd, &msg);
+ if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
+ if (ret < 0)
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost read message failed\n");
+ else if (ret == 0)
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vhost peer closed\n");
+ else
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost read incorrect message\n");
+
+ return -1;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG, "%s: read message %s\n",
+ dev->ifname, vhost_message_str[msg.request]);
+
+ ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to alloc queue\n");
+ return -1;
+ }
+
+ switch (msg.request) {
+ case VHOST_USER_GET_CONFIG:
+ if (dev->notify_ops->get_config(dev->vid,
+ msg.payload.config.region,
+ msg.payload.config.size) != 0) {
+ msg.size = sizeof(uint64_t);
+ }
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_CONFIG:
+ if ((dev->notify_ops->set_config(dev->vid,
+ msg.payload.config.region,
+ msg.payload.config.offset,
+ msg.payload.config.size,
+ msg.payload.config.flags)) != 0) {
+ ret = 1;
+ } else {
+ ret = 0;
+ }
+ break;
+ case VHOST_USER_NVME_ADMIN:
+ if (!dev->is_nvme) {
+ dev->is_nvme = 1;
+ }
+ memcpy(cmd, msg.payload.nvme.cmd.req, sizeof(cmd));
+ ret = vhost_user_nvme_admin_passthrough(dev, cmd, cqe, buf);
+ memcpy(msg.payload.nvme.cmd.cqe, cqe, sizeof(cqe));
+ msg.size = sizeof(cqe);
+ /* NVMe Identify Command */
+ if (cmd[0] == 0x06) {
+ memcpy(msg.payload.nvme.buf, &buf, 4096);
+ msg.size += 4096;
+ }
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_NVME_SET_CQ_CALL:
+ file.index = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ file.fd = msg.fds[0];
+ ret = vhost_user_nvme_set_cq_call(dev, file.index, file.fd);
+ break;
+ case VHOST_USER_NVME_GET_CAP:
+ ret = vhost_user_nvme_get_cap(dev, &cap);
+ if (!ret)
+ msg.payload.u64 = cap;
+ else
+ msg.payload.u64 = 0;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_NVME_START_STOP:
+ enable = msg.payload.u64;
+ /* device must be started before set cq call */
+ if (enable) {
+ if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+ if (dev->notify_ops->new_device(dev->vid) == 0)
+ dev->flags |= VIRTIO_DEV_RUNNING;
+ }
+ } else {
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+ }
+ break;
+ case VHOST_USER_NVME_IO_CMD:
+ qid = msg.payload.nvme_io.qid;
+ tail_head = msg.payload.nvme_io.tail_head;
+ is_submission_queue = (msg.payload.nvme_io.queue_type == VHOST_USER_NVME_SUBMISSION_QUEUE) ? true : false;
+ vhost_user_nvme_io_request_passthrough(dev, qid, tail_head, is_submission_queue);
+ break;
+ case VHOST_USER_GET_FEATURES:
+ msg.payload.u64 = vhost_user_get_features(dev);
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_FEATURES:
+ vhost_user_set_features(dev, msg.payload.u64);
+ break;
+
+ case VHOST_USER_GET_PROTOCOL_FEATURES:
+ msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_PROTOCOL_FEATURES:
+ vhost_user_set_protocol_features(dev, msg.payload.u64);
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ vhost_user_set_owner();
+ break;
+ case VHOST_USER_RESET_OWNER:
+ vhost_user_reset_owner(dev);
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ ret = vhost_user_set_mem_table(dev, &msg);
+ break;
+
+ case VHOST_USER_SET_LOG_BASE:
+ vhost_user_set_log_base(dev, &msg);
+
+ /* it needs a reply */
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_LOG_FD:
+ close(msg.fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ vhost_user_set_vring_num(dev, &msg);
+ break;
+ case VHOST_USER_SET_VRING_ADDR:
+ vhost_user_set_vring_addr(dev, &msg);
+ break;
+ case VHOST_USER_SET_VRING_BASE:
+ vhost_user_set_vring_base(dev, &msg);
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ vhost_user_get_vring_base(dev, &msg);
+ msg.size = sizeof(msg.payload.state);
+ send_vhost_message(fd, &msg);
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+ vhost_user_set_vring_kick(dev, &msg);
+ break;
+ case VHOST_USER_SET_VRING_CALL:
+ vhost_user_set_vring_call(dev, &msg);
+ break;
+
+ case VHOST_USER_SET_VRING_ERR:
+ if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+ close(msg.fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+ break;
+
+ case VHOST_USER_GET_QUEUE_NUM:
+ msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+
+ case VHOST_USER_SET_VRING_ENABLE:
+ vhost_user_set_vring_enable(dev, &msg);
+ break;
+ case VHOST_USER_SEND_RARP:
+ vhost_user_send_rarp(dev, &msg);
+ break;
+
+ case VHOST_USER_NET_SET_MTU:
+ ret = vhost_user_net_set_mtu(dev, &msg);
+ break;
+
+ default:
+ ret = -1;
+ break;
+
+ }
+
+ if (msg.flags & VHOST_USER_NEED_REPLY) {
+ msg.payload.u64 = !!ret;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ }
+
+ if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
+ dev->flags |= VIRTIO_DEV_READY;
+
+ if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+ if (dev->dequeue_zero_copy) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "dequeue zero copy is enabled\n");
+ }
+
+ if (dev->notify_ops->new_device(dev->vid) == 0)
+ dev->flags |= VIRTIO_DEV_RUNNING;
+ }
+ }
+
+ return 0;
+}
diff --git a/src/spdk/lib/vhost/rte_vhost/vhost_user.h b/src/spdk/lib/vhost/rte_vhost/vhost_user.h
new file mode 100644
index 00000000..cb5ff0a6
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost/vhost_user.h
@@ -0,0 +1,182 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "rte_vhost.h"
+
+/* refer to hw/virtio/vhost-user.c */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+/*
+ * Maximum size of virtio device config space
+ */
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+
+#define VHOST_USER_PROTOCOL_F_MQ 0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
+#define VHOST_USER_PROTOCOL_F_RARP 2
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
+#define VHOST_USER_PROTOCOL_F_NET_MTU 4
+#define VHOST_USER_PROTOCOL_F_CONFIG 9
+
+#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
+ (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_CONFIG))
+
+typedef enum VhostUserRequest {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ VHOST_USER_SEND_RARP = 19,
+ VHOST_USER_NET_SET_MTU = 20,
+ VHOST_USER_GET_CONFIG = 24,
+ VHOST_USER_SET_CONFIG = 25,
+ VHOST_USER_NVME_ADMIN = 80,
+ VHOST_USER_NVME_SET_CQ_CALL = 81,
+ VHOST_USER_NVME_GET_CAP = 82,
+ VHOST_USER_NVME_START_STOP = 83,
+ VHOST_USER_NVME_IO_CMD = 84,
+ VHOST_USER_MAX
+} VhostUserRequest;
+
+typedef enum VhostUserSlaveRequest {
+ VHOST_USER_SLAVE_NONE = 0,
+ VHOST_USER_SLAVE_IOTLB_MSG = 1,
+ VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
+ VHOST_USER_SLAVE_MAX
+} VhostUserSlaveRequest;
+
+typedef struct VhostUserMemoryRegion {
+ uint64_t guest_phys_addr;
+ uint64_t memory_size;
+ uint64_t userspace_addr;
+ uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+ uint32_t nregions;
+ uint32_t padding;
+ VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserLog {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+} VhostUserLog;
+
+typedef struct VhostUserConfig {
+ uint32_t offset;
+ uint32_t size;
+ uint32_t flags;
+ uint8_t region[VHOST_USER_MAX_CONFIG_SIZE];
+} VhostUserConfig;
+
+enum VhostUserNvmeQueueTypes {
+ VHOST_USER_NVME_SUBMISSION_QUEUE = 1,
+ VHOST_USER_NVME_COMPLETION_QUEUE = 2,
+};
+
+typedef struct VhostUserNvmeIO {
+ enum VhostUserNvmeQueueTypes queue_type;
+ uint32_t qid;
+ uint32_t tail_head;
+} VhostUserNvmeIO;
+
+typedef struct VhostUserMsg {
+ VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK 0x3
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+#define VHOST_USER_NEED_REPLY (0x1 << 3)
+ uint32_t flags;
+ uint32_t size; /* the following payload size */
+ union {
+#define VHOST_USER_VRING_IDX_MASK 0xff
+#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
+ uint64_t u64;
+ struct vhost_vring_state state;
+ struct vhost_vring_addr addr;
+ VhostUserMemory memory;
+ VhostUserLog log;
+ VhostUserConfig config;
+ struct nvme {
+ union {
+ uint8_t req[64];
+ uint8_t cqe[16];
+ } cmd;
+ uint8_t buf[4096];
+ } nvme;
+ struct VhostUserNvmeIO nvme_io;
+ } payload;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
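+/* With the packed attribute the header is the request (4), flags (4) and
+ * size (4) fields, i.e. 12 bytes on the wire; the payload follows, and any
+ * file descriptors are passed out of band as ancillary data by
+ * read_fd_message()/send_fd_message().
+ */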
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION 0x1
+
+
+/* vhost_user.c */
+int vhost_user_msg_handler(int vid, int fd);
+
+/* socket.c */
+int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+
+#endif
diff --git a/src/spdk/lib/vhost/vhost.c b/src/spdk/lib/vhost/vhost.c
new file mode 100644
index 00000000..0cacf613
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost.c
@@ -0,0 +1,1503 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/barrier.h"
+
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+
+static uint32_t *g_num_ctrlrs;
+
+/* Path to the folder where the character device will be created. Can be set by the user. */
+static char dev_dirname[PATH_MAX] = "";
+
+struct spdk_vhost_dev_event_ctx {
+ /** Pointer to the controller obtained before enqueuing the event */
+ struct spdk_vhost_dev *vdev;
+
+ /** ID of the vdev to send event to. */
+ unsigned vdev_id;
+
+ /** User callback function to be executed on given lcore. */
+ spdk_vhost_event_fn cb_fn;
+
+ /** Semaphore used to signal that event is done. */
+ sem_t sem;
+
+ /** Response to be written by enqueued event. */
+ int response;
+};
+
+static int new_connection(int vid);
+static int start_device(int vid);
+static void stop_device(int vid);
+static void destroy_connection(int vid);
+static int get_config(int vid, uint8_t *config, uint32_t len);
+static int set_config(int vid, uint8_t *config, uint32_t offset,
+ uint32_t size, uint32_t flags);
+
+const struct vhost_device_ops g_spdk_vhost_ops = {
+ .new_device = start_device,
+ .destroy_device = stop_device,
+ .get_config = get_config,
+ .set_config = set_config,
+ .new_connection = new_connection,
+ .destroy_connection = destroy_connection,
+ .vhost_nvme_admin_passthrough = spdk_vhost_nvme_admin_passthrough,
+ .vhost_nvme_set_cq_call = spdk_vhost_nvme_set_cq_call,
+ .vhost_nvme_get_cap = spdk_vhost_nvme_get_cap,
+};
+
+static TAILQ_HEAD(, spdk_vhost_dev) g_spdk_vhost_devices = TAILQ_HEAD_INITIALIZER(
+ g_spdk_vhost_devices);
+static pthread_mutex_t g_spdk_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+void *spdk_vhost_gpa_to_vva(struct spdk_vhost_dev *vdev, uint64_t addr, uint64_t len)
+{
+ void *vva;
+ uint64_t newlen;
+
+ newlen = len;
+ vva = (void *)rte_vhost_va_from_guest_pa(vdev->mem, addr, &newlen);
+ if (newlen != len) {
+ return NULL;
+ }
+
+ return vva;
+
+}
+
+static void
+spdk_vhost_log_req_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t req_id)
+{
+ struct vring_desc *desc, *desc_table;
+ uint32_t desc_table_size;
+ int rc;
+
+ if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) {
+ return;
+ }
+
+ rc = spdk_vhost_vq_get_desc(vdev, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Can't log used ring descriptors!\n");
+ return;
+ }
+
+ do {
+ if (spdk_vhost_vring_desc_is_wr(desc)) {
+			/* To be honest, only pages really touched should be logged, but
+			 * doing so would require tracking those changes in each backend.
+			 * Also, the backend will most likely touch all/most of those pages, so
+			 * for now let's assume we touched all pages passed to us as writable buffers. */
+ rte_vhost_log_write(vdev->vid, desc->addr, desc->len);
+ }
+ spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+ } while (desc);
+}
+
+static void
+spdk_vhost_log_used_vring_elem(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t idx)
+{
+ uint64_t offset, len;
+ uint16_t vq_idx;
+
+ if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) {
+ return;
+ }
+
+ offset = offsetof(struct vring_used, ring[idx]);
+ len = sizeof(virtqueue->vring.used->ring[idx]);
+ vq_idx = virtqueue - vdev->virtqueue;
+
+ rte_vhost_log_used_vring(vdev->vid, vq_idx, offset, len);
+}
+
+static void
+spdk_vhost_log_used_vring_idx(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue)
+{
+ uint64_t offset, len;
+ uint16_t vq_idx;
+
+ if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) {
+ return;
+ }
+
+ offset = offsetof(struct vring_used, idx);
+ len = sizeof(virtqueue->vring.used->idx);
+ vq_idx = virtqueue - vdev->virtqueue;
+
+ rte_vhost_log_used_vring(vdev->vid, vq_idx, offset, len);
+}
+
+/*
+ * Get available requests from avail ring.
+ */
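+/*
+ * Note: avail->idx and last_avail_idx are free-running 16-bit counters, so the
+ * (avail_idx - last_idx) subtraction below is wrap-safe, and ring slots are
+ * indexed with (last_idx + i) & (size - 1) since the ring size is a power of two.
+ */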
+uint16_t
+spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
+ uint16_t reqs_len)
+{
+ struct rte_vhost_vring *vring = &virtqueue->vring;
+ struct vring_avail *avail = vring->avail;
+ uint16_t size_mask = vring->size - 1;
+ uint16_t last_idx = vring->last_avail_idx, avail_idx = avail->idx;
+ uint16_t count, i;
+
+ count = avail_idx - last_idx;
+ if (spdk_likely(count == 0)) {
+ return 0;
+ }
+
+ if (spdk_unlikely(count > vring->size)) {
+ /* TODO: the queue is unrecoverably broken and should be marked so.
+ * For now we will fail silently and report there are no new avail entries.
+ */
+ return 0;
+ }
+
+ count = spdk_min(count, reqs_len);
+ vring->last_avail_idx += count;
+ for (i = 0; i < count; i++) {
+ reqs[i] = vring->avail->ring[(last_idx + i) & size_mask];
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+ "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n",
+ last_idx, avail_idx, count);
+
+ return count;
+}
+
+static bool
+spdk_vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
+{
+ return !!(cur_desc->flags & VRING_DESC_F_INDIRECT);
+}
+
+int
+spdk_vhost_vq_get_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
+ uint32_t *desc_table_size)
+{
+ if (spdk_unlikely(req_idx >= virtqueue->vring.size)) {
+ return -1;
+ }
+
+ *desc = &virtqueue->vring.desc[req_idx];
+
+ if (spdk_vhost_vring_desc_is_indirect(*desc)) {
+ assert(spdk_vhost_dev_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC));
+ *desc_table_size = (*desc)->len / sizeof(**desc);
+ *desc_table = spdk_vhost_gpa_to_vva(vdev, (*desc)->addr,
+ sizeof(**desc) * *desc_table_size);
+ *desc = *desc_table;
+ if (*desc == NULL) {
+ return -1;
+ }
+
+ return 0;
+ }
+
+ *desc_table = virtqueue->vring.desc;
+ *desc_table_size = virtqueue->vring.size;
+
+ return 0;
+}
+
+int
+spdk_vhost_vq_used_signal(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue)
+{
+ if (virtqueue->used_req_cnt == 0) {
+ return 0;
+ }
+
+ virtqueue->req_cnt += virtqueue->used_req_cnt;
+ virtqueue->used_req_cnt = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+ "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n",
+ virtqueue - vdev->virtqueue, virtqueue->vring.last_used_idx);
+
+ eventfd_write(virtqueue->vring.callfd, (eventfd_t)1);
+ return 1;
+}
+
+
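+/*
+ * Adaptive interrupt coalescing: once per stats-check interval, scale each
+ * virtqueue's IRQ delay in proportion to how far its request rate exceeds the
+ * configured threshold, so busier queues get their completions batched more
+ * aggressively before an interrupt is sent.
+ */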
+static void
+check_dev_io_stats(struct spdk_vhost_dev *vdev, uint64_t now)
+{
+ struct spdk_vhost_virtqueue *virtqueue;
+ uint32_t irq_delay_base = vdev->coalescing_delay_time_base;
+ uint32_t io_threshold = vdev->coalescing_io_rate_threshold;
+ int32_t irq_delay;
+ uint32_t req_cnt;
+ uint16_t q_idx;
+
+ if (now < vdev->next_stats_check_time) {
+ return;
+ }
+
+ vdev->next_stats_check_time = now + vdev->stats_check_interval;
+ for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) {
+ virtqueue = &vdev->virtqueue[q_idx];
+
+ req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt;
+ if (req_cnt <= io_threshold) {
+ continue;
+ }
+
+ irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold;
+ virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay);
+
+ virtqueue->req_cnt = 0;
+ virtqueue->next_event_time = now;
+ }
+}
+
+void
+spdk_vhost_dev_used_signal(struct spdk_vhost_dev *vdev)
+{
+ struct spdk_vhost_virtqueue *virtqueue;
+ uint64_t now;
+ uint16_t q_idx;
+
+ if (vdev->coalescing_delay_time_base == 0) {
+ for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) {
+ virtqueue = &vdev->virtqueue[q_idx];
+
+ if (virtqueue->vring.desc == NULL ||
+ (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
+ continue;
+ }
+
+ spdk_vhost_vq_used_signal(vdev, virtqueue);
+ }
+ } else {
+ now = spdk_get_ticks();
+ check_dev_io_stats(vdev, now);
+
+ for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) {
+ virtqueue = &vdev->virtqueue[q_idx];
+
+ /* No need for event right now */
+ if (now < virtqueue->next_event_time ||
+ (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
+ continue;
+ }
+
+ if (!spdk_vhost_vq_used_signal(vdev, virtqueue)) {
+ continue;
+ }
+
+ /* Syscall is quite long so update time */
+ now = spdk_get_ticks();
+ virtqueue->next_event_time = now + virtqueue->irq_delay_time;
+ }
+ }
+}
+
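+/*
+ * Convert the user-supplied coalescing parameters into internal units: the delay
+ * becomes TSC ticks and the IOPS threshold becomes a per-stats-interval request
+ * count. As a rough, purely illustrative example (assuming a 2 GHz tick rate and
+ * a 10 ms stats-check interval; both values are platform/configuration dependent):
+ * delay_base_us = 50 yields delay_time_base = 50 * 2000000000 / 1000000 = 100000
+ * ticks, and iops_threshold = 60000 yields io_rate = 60000 * 10 / 1000 = 600
+ * requests per interval.
+ */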
+int
+spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
+ uint32_t iops_threshold)
+{
+ uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL;
+ uint32_t io_rate = iops_threshold * SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS / 1000U;
+
+ if (delay_time_base >= UINT32_MAX) {
+		SPDK_ERRLOG("Delay time of %"PRIu32" is too big\n", delay_base_us);
+ return -EINVAL;
+ } else if (io_rate == 0) {
+ SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate,
+ 1000U / SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS);
+ return -EINVAL;
+ }
+
+ vdev->coalescing_delay_time_base = delay_time_base;
+ vdev->coalescing_io_rate_threshold = io_rate;
+
+ vdev->coalescing_delay_us = delay_base_us;
+ vdev->coalescing_iops_threshold = iops_threshold;
+ return 0;
+}
+
+void
+spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
+ uint32_t *iops_threshold)
+{
+ if (delay_base_us) {
+ *delay_base_us = vdev->coalescing_delay_us;
+ }
+
+ if (iops_threshold) {
+ *iops_threshold = vdev->coalescing_iops_threshold;
+ }
+}
+
+/*
+ * Enqueue id and len to used ring.
+ */
+void
+spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t id, uint32_t len)
+{
+ struct rte_vhost_vring *vring = &virtqueue->vring;
+ struct vring_used *used = vring->used;
+ uint16_t last_idx = vring->last_used_idx & (vring->size - 1);
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+ "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
+ virtqueue - vdev->virtqueue, vring->last_used_idx, id, len);
+
+ spdk_vhost_log_req_desc(vdev, virtqueue, id);
+
+ vring->last_used_idx++;
+ used->ring[last_idx].id = id;
+ used->ring[last_idx].len = len;
+
+ /* Ensure the used ring is updated before we log it or increment used->idx. */
+ spdk_smp_wmb();
+
+ spdk_vhost_log_used_vring_elem(vdev, virtqueue, last_idx);
+	*(volatile uint16_t *)&used->idx = vring->last_used_idx;
+ spdk_vhost_log_used_vring_idx(vdev, virtqueue);
+
+ /* Ensure all our used ring changes are visible to the guest at the time
+ * of interrupt.
+ * TODO: this is currently an sfence on x86. For other architectures we
+ * will most likely need an smp_mb(), but smp_mb() is an overkill for x86.
+ */
+ spdk_wmb();
+
+ virtqueue->used_req_cnt++;
+}
+
+int
+spdk_vhost_vring_desc_get_next(struct vring_desc **desc,
+ struct vring_desc *desc_table, uint32_t desc_table_size)
+{
+ struct vring_desc *old_desc = *desc;
+ uint16_t next_idx;
+
+ if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) {
+ *desc = NULL;
+ return 0;
+ }
+
+ next_idx = old_desc->next;
+ if (spdk_unlikely(next_idx >= desc_table_size)) {
+ *desc = NULL;
+ return -1;
+ }
+
+ *desc = &desc_table[next_idx];
+ return 0;
+}
+
+bool
+spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
+{
+ return !!(cur_desc->flags & VRING_DESC_F_WRITE);
+}
+
+#define _2MB_OFFSET(ptr) ((ptr) & (0x200000 - 1))
+
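+/*
+ * Translate a single descriptor's guest-physical buffer into host iovecs.
+ * Buffers are conservatively split at 2 MB boundaries, unless both sides of a
+ * boundary translate to contiguous host virtual addresses (see the in-line
+ * comment below), to cope with vhost memory regions allocated from hugepage
+ * memory. Returns 0 on success, or -1 if translation fails or
+ * SPDK_VHOST_IOVS_MAX would be exceeded.
+ */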
+int
+spdk_vhost_vring_desc_to_iov(struct spdk_vhost_dev *vdev, struct iovec *iov,
+ uint16_t *iov_index, const struct vring_desc *desc)
+{
+ uint32_t remaining = desc->len;
+ uint32_t to_boundary;
+ uint32_t len;
+ uintptr_t payload = desc->addr;
+ uintptr_t vva;
+
+ while (remaining) {
+ if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
+ SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX);
+ return -1;
+ }
+ vva = (uintptr_t)rte_vhost_gpa_to_vva(vdev->mem, payload);
+ if (vva == 0) {
+ SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload);
+ return -1;
+ }
+ to_boundary = 0x200000 - _2MB_OFFSET(payload);
+ if (spdk_likely(remaining <= to_boundary)) {
+ len = remaining;
+ } else {
+ /*
+ * Descriptor crosses a 2MB hugepage boundary. vhost memory regions are allocated
+ * from hugepage memory, so this means this descriptor may be described by
+ * discontiguous vhost memory regions. Do not blindly split on the 2MB boundary,
+ * only split it if the two sides of the boundary do not map to the same vhost
+ * memory region. This helps ensure we do not exceed the max number of IOVs
+ * defined by SPDK_VHOST_IOVS_MAX.
+ */
+ len = to_boundary;
+ while (len < remaining) {
+ if (vva + len != (uintptr_t)rte_vhost_gpa_to_vva(vdev->mem, payload + len)) {
+ break;
+ }
+ len += spdk_min(remaining - len, 0x200000);
+ }
+ }
+ iov[*iov_index].iov_base = (void *)vva;
+ iov[*iov_index].iov_len = len;
+ remaining -= len;
+ payload += len;
+ (*iov_index)++;
+ }
+
+ return 0;
+}
+
+static struct spdk_vhost_dev *
+spdk_vhost_dev_find_by_id(unsigned id)
+{
+ struct spdk_vhost_dev *vdev;
+
+ TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
+ if (vdev->id == id) {
+ return vdev;
+ }
+ }
+
+ return NULL;
+}
+
+static struct spdk_vhost_dev *
+spdk_vhost_dev_find_by_vid(int vid)
+{
+ struct spdk_vhost_dev *vdev;
+
+ TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
+ if (vdev->vid == vid) {
+ return vdev;
+ }
+ }
+
+ return NULL;
+}
+
+#define SHIFT_2MB 21
+#define SIZE_2MB (1ULL << SHIFT_2MB)
+#define FLOOR_2MB(x)	((((uintptr_t)x) / SIZE_2MB) << SHIFT_2MB)
+#define CEIL_2MB(x)	(((((uintptr_t)x) + SIZE_2MB - 1) / SIZE_2MB) << SHIFT_2MB)
+
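+/*
+ * Register each guest memory region with SPDK so that vtophys translation
+ * works for guest buffers. Region boundaries are rounded out to 2 MB before
+ * registration, which is assumed to match the hugepage granularity of the
+ * underlying mappings.
+ */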
+static void
+spdk_vhost_dev_mem_register(struct spdk_vhost_dev *vdev)
+{
+ struct rte_vhost_mem_region *region;
+ uint32_t i;
+
+ for (i = 0; i < vdev->mem->nregions; i++) {
+ uint64_t start, end, len;
+ region = &vdev->mem->regions[i];
+ start = FLOOR_2MB(region->mmap_addr);
+ end = CEIL_2MB(region->mmap_addr + region->mmap_size);
+ len = end - start;
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n",
+ start, len);
+
+ if (spdk_mem_register((void *)start, len) != 0) {
+ SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n",
+ i);
+ continue;
+ }
+ }
+}
+
+static void
+spdk_vhost_dev_mem_unregister(struct spdk_vhost_dev *vdev)
+{
+ struct rte_vhost_mem_region *region;
+ uint32_t i;
+
+ for (i = 0; i < vdev->mem->nregions; i++) {
+ uint64_t start, end, len;
+ region = &vdev->mem->regions[i];
+ start = FLOOR_2MB(region->mmap_addr);
+ end = CEIL_2MB(region->mmap_addr + region->mmap_size);
+ len = end - start;
+
+ if (spdk_vtophys((void *) start) == SPDK_VTOPHYS_ERROR) {
+ continue; /* region has not been registered */
+ }
+
+ if (spdk_mem_unregister((void *)start, len) != 0) {
+ assert(false);
+ }
+ }
+
+}
+
+static void
+spdk_vhost_free_reactor(uint32_t lcore)
+{
+ g_num_ctrlrs[lcore]--;
+}
+
+struct spdk_vhost_dev *
+spdk_vhost_dev_find(const char *ctrlr_name)
+{
+ struct spdk_vhost_dev *vdev;
+ size_t dev_dirname_len = strlen(dev_dirname);
+
+ if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) {
+ ctrlr_name += dev_dirname_len;
+ }
+
+ TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
+ if (strcmp(vdev->name, ctrlr_name) == 0) {
+ return vdev;
+ }
+ }
+
+ return NULL;
+}
+
+static int
+spdk_vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
+{
+ int rc;
+
+ if (cpumask == NULL) {
+ return -1;
+ }
+
+ if (mask == NULL) {
+ spdk_cpuset_copy(cpumask, spdk_app_get_core_mask());
+ return 0;
+ }
+
+ rc = spdk_app_parse_core_mask(mask, cpumask);
+ if (rc < 0) {
+ SPDK_ERRLOG("invalid cpumask %s\n", mask);
+ return -1;
+ }
+
+ if (spdk_cpuset_count(cpumask) == 0) {
+ SPDK_ERRLOG("no cpu is selected among reactor mask(=%s)\n",
+ spdk_cpuset_fmt(spdk_app_get_core_mask()));
+ return -1;
+ }
+
+ return 0;
+}
+
+static void *
+_start_rte_driver(void *arg)
+{
+ char *path = arg;
+
+ if (rte_vhost_driver_start(path) != 0) {
+ return NULL;
+ }
+
+ return path;
+}
+
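+/*
+ * Register a new vhost controller: validate the name and cpumask, create the
+ * vhost-user domain socket at dev_dirname + name, hook up g_spdk_vhost_ops and
+ * start the rte_vhost driver for it. Intended to be called from the
+ * per-backend construct paths, e.g. (hypothetical values)
+ * spdk_vhost_dev_register(&bvdev->vdev, "vhost.0", "0x1", &vhost_blk_device_backend).
+ */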
+int
+spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
+ const struct spdk_vhost_dev_backend *backend)
+{
+ static unsigned ctrlr_num;
+ char path[PATH_MAX];
+ struct stat file_stat;
+ struct spdk_cpuset *cpumask;
+ int rc;
+
+ assert(vdev);
+
+	/* We expect devices inside g_spdk_vhost_devices to be sorted in ascending
+	 * order with regard to vdev->id. For now we always set vdev->id = ctrlr_num++
+	 * and append each vdev to the very end of the g_spdk_vhost_devices list.
+	 * This is required for the vhost "foreach" events to work.
+ */
+ if (ctrlr_num == UINT_MAX) {
+ assert(false);
+ return -EINVAL;
+ }
+
+ if (name == NULL) {
+ SPDK_ERRLOG("Can't register controller with no name\n");
+ return -EINVAL;
+ }
+
+ cpumask = spdk_cpuset_alloc();
+ if (!cpumask) {
+ SPDK_ERRLOG("spdk_cpuset_alloc failed\n");
+ return -ENOMEM;
+ }
+
+ if (spdk_vhost_parse_core_mask(mask_str, cpumask) != 0) {
+ SPDK_ERRLOG("cpumask %s is invalid (app mask is 0x%s)\n",
+ mask_str, spdk_cpuset_fmt(spdk_app_get_core_mask()));
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (spdk_vhost_dev_find(name)) {
+ SPDK_ERRLOG("vhost controller %s already exists.\n", name);
+ rc = -EEXIST;
+ goto out;
+ }
+
+ if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
+ SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname,
+ name);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* Register vhost driver to handle vhost messages. */
+ if (stat(path, &file_stat) != -1) {
+ if (!S_ISSOCK(file_stat.st_mode)) {
+ SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
+ "The file already exists and is not a socket.\n",
+ path);
+ rc = -EIO;
+ goto out;
+ } else if (unlink(path) != 0) {
+ SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
+				    "The socket already exists and could not be unlinked.\n",
+ path);
+ rc = -EIO;
+ goto out;
+ }
+ }
+
+ if (rte_vhost_driver_register(path, 0) != 0) {
+ SPDK_ERRLOG("Could not register controller %s with vhost library\n", name);
+ SPDK_ERRLOG("Check if domain socket %s already exists\n", path);
+ rc = -EIO;
+ goto out;
+ }
+ if (rte_vhost_driver_set_features(path, backend->virtio_features) ||
+ rte_vhost_driver_disable_features(path, backend->disabled_features)) {
+ SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", name);
+
+ rte_vhost_driver_unregister(path);
+ rc = -EIO;
+ goto out;
+ }
+
+ if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) {
+ rte_vhost_driver_unregister(path);
+ SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", name);
+ rc = -EIO;
+ goto out;
+ }
+
+ /* The following might start a POSIX thread that polls for incoming
+ * socket connections and calls backend->start/stop_device. These backend
+ * callbacks are also protected by the global SPDK vhost mutex, so we're
+ * safe with not initializing the vdev just yet.
+ */
+ if (spdk_call_unaffinitized(_start_rte_driver, path) == NULL) {
+ SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n",
+ name, errno, spdk_strerror(errno));
+ rte_vhost_driver_unregister(path);
+ rc = -EIO;
+ goto out;
+ }
+
+ vdev->name = strdup(name);
+ vdev->path = strdup(path);
+ vdev->id = ctrlr_num++;
+ vdev->vid = -1;
+ vdev->lcore = -1;
+ vdev->cpumask = cpumask;
+ vdev->registered = true;
+ vdev->backend = backend;
+
+ spdk_vhost_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
+ SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);
+ vdev->next_stats_check_time = 0;
+ vdev->stats_check_interval = SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS * spdk_get_ticks_hz() /
+ 1000UL;
+
+ TAILQ_INSERT_TAIL(&g_spdk_vhost_devices, vdev, tailq);
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name);
+ return 0;
+
+out:
+ spdk_cpuset_free(cpumask);
+ return rc;
+}
+
+int
+spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev)
+{
+ if (vdev->vid != -1) {
+		SPDK_ERRLOG("Controller %s still has a valid connection.\n", vdev->name);
+ return -EBUSY;
+ }
+
+ if (vdev->registered && rte_vhost_driver_unregister(vdev->path) != 0) {
+ SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
+ "Check if domain socket %s still exists\n",
+ vdev->name, vdev->path);
+ return -EIO;
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name);
+
+ free(vdev->name);
+ free(vdev->path);
+ spdk_cpuset_free(vdev->cpumask);
+ TAILQ_REMOVE(&g_spdk_vhost_devices, vdev, tailq);
+ return 0;
+}
+
+static struct spdk_vhost_dev *
+spdk_vhost_dev_next(unsigned i)
+{
+ struct spdk_vhost_dev *vdev;
+
+ TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
+ if (vdev->id > i) {
+ return vdev;
+ }
+ }
+
+ return NULL;
+}
+
+const char *
+spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
+{
+ assert(vdev != NULL);
+ return vdev->name;
+}
+
+const struct spdk_cpuset *
+spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
+{
+ assert(vdev != NULL);
+ return vdev->cpumask;
+}
+
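+/*
+ * Pick the least-loaded reactor core from the controller's cpumask by
+ * comparing the per-core controller counts tracked in g_num_ctrlrs.
+ */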
+static uint32_t
+spdk_vhost_allocate_reactor(struct spdk_cpuset *cpumask)
+{
+ uint32_t i, selected_core;
+ uint32_t min_ctrlrs;
+
+ min_ctrlrs = INT_MAX;
+ selected_core = spdk_env_get_first_core();
+
+ SPDK_ENV_FOREACH_CORE(i) {
+ if (!spdk_cpuset_get_cpu(cpumask, i)) {
+ continue;
+ }
+
+ if (g_num_ctrlrs[i] < min_ctrlrs) {
+ selected_core = i;
+ min_ctrlrs = g_num_ctrlrs[i];
+ }
+ }
+
+ g_num_ctrlrs[selected_core]++;
+ return selected_core;
+}
+
+void
+spdk_vhost_dev_backend_event_done(void *event_ctx, int response)
+{
+ struct spdk_vhost_dev_event_ctx *ctx = event_ctx;
+
+ ctx->response = response;
+ sem_post(&ctx->sem);
+}
+
+static void
+spdk_vhost_event_cb(void *arg1, void *arg2)
+{
+ struct spdk_vhost_dev_event_ctx *ctx = arg1;
+
+ ctx->cb_fn(ctx->vdev, ctx);
+}
+
+static void
+spdk_vhost_event_async_fn(void *arg1, void *arg2)
+{
+ struct spdk_vhost_dev_event_ctx *ctx = arg1;
+ struct spdk_vhost_dev *vdev;
+ struct spdk_event *ev;
+
+ if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) {
+ ev = spdk_event_allocate(spdk_env_get_current_core(), spdk_vhost_event_async_fn, arg1, arg2);
+ spdk_event_call(ev);
+ return;
+ }
+
+ vdev = spdk_vhost_dev_find_by_id(ctx->vdev_id);
+ if (vdev != ctx->vdev) {
+ /* vdev has been changed after enqueuing this event */
+ vdev = NULL;
+ }
+
+ if (vdev != NULL && vdev->lcore >= 0 &&
+ (uint32_t)vdev->lcore != spdk_env_get_current_core()) {
+		/* if vdev has been relocated to another core, it is no longer thread-safe
+		 * to access its contents here. Even though we're running under the global vhost
+ * mutex, the controller itself (and its pollers) are not. We need to chase
+ * the vdev thread as many times as necessary.
+ */
+ ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_async_fn, arg1, arg2);
+ spdk_event_call(ev);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return;
+ }
+
+ ctx->cb_fn(vdev, arg2);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+
+ free(ctx);
+}
+
+static void spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev,
+ spdk_vhost_event_fn fn, void *arg);
+
+static void
+spdk_vhost_event_async_foreach_fn(void *arg1, void *arg2)
+{
+ struct spdk_vhost_dev_event_ctx *ctx = arg1;
+ struct spdk_vhost_dev *vdev;
+ struct spdk_event *ev;
+
+ if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) {
+ ev = spdk_event_allocate(spdk_env_get_current_core(),
+ spdk_vhost_event_async_foreach_fn, arg1, arg2);
+ spdk_event_call(ev);
+ return;
+ }
+
+ vdev = spdk_vhost_dev_find_by_id(ctx->vdev_id);
+ if (vdev != ctx->vdev) {
+ /* ctx->vdev is probably a dangling pointer at this point.
+ * It must have been removed in the meantime, so we just skip
+ * it in our foreach chain. */
+ goto out_unlock_continue;
+ }
+
+ /* the assert is just for static analyzers, vdev cannot be NULL here */
+ assert(vdev != NULL);
+ if (vdev->lcore >= 0 &&
+ (uint32_t)vdev->lcore != spdk_env_get_current_core()) {
+		/* if vdev has been relocated to another core, it is no longer thread-safe
+		 * to access its contents here. Even though we're running under the global vhost
+ * mutex, the controller itself (and its pollers) are not. We need to chase
+ * the vdev thread as many times as necessary.
+ */
+ ev = spdk_event_allocate(vdev->lcore,
+ spdk_vhost_event_async_foreach_fn, arg1, arg2);
+ spdk_event_call(ev);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return;
+ }
+
+ ctx->cb_fn(vdev, arg2);
+
+out_unlock_continue:
+ vdev = spdk_vhost_dev_next(ctx->vdev_id);
+ spdk_vhost_external_event_foreach_continue(vdev, ctx->cb_fn, arg2);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+
+ free(ctx);
+}
+
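+/*
+ * Send a synchronous event to the controller's lcore and wait (up to
+ * timeout_sec seconds) until the callback reports completion via
+ * spdk_vhost_dev_backend_event_done(). The global vhost mutex is released
+ * while waiting so the target lcore can acquire it.
+ */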
+static int
+_spdk_vhost_event_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn,
+ unsigned timeout_sec, const char *errmsg)
+{
+ struct spdk_vhost_dev_event_ctx ev_ctx = {0};
+ struct spdk_event *ev;
+ struct timespec timeout;
+ int rc;
+
+ rc = sem_init(&ev_ctx.sem, 0, 0);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to initialize semaphore for vhost timed event\n");
+ return -errno;
+ }
+
+ ev_ctx.vdev = vdev;
+ ev_ctx.cb_fn = cb_fn;
+ ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_cb, &ev_ctx, NULL);
+ assert(ev);
+ spdk_event_call(ev);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+
+ clock_gettime(CLOCK_REALTIME, &timeout);
+ timeout.tv_sec += timeout_sec;
+
+ rc = sem_timedwait(&ev_ctx.sem, &timeout);
+ if (rc != 0) {
+ SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
+ sem_wait(&ev_ctx.sem);
+ }
+
+ sem_destroy(&ev_ctx.sem);
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+ return ev_ctx.response;
+}
+
+static int
+spdk_vhost_event_async_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn, void *arg,
+ bool foreach)
+{
+ struct spdk_vhost_dev_event_ctx *ev_ctx;
+ struct spdk_event *ev;
+ spdk_event_fn fn;
+
+ ev_ctx = calloc(1, sizeof(*ev_ctx));
+ if (ev_ctx == NULL) {
+ SPDK_ERRLOG("Failed to alloc vhost event.\n");
+ assert(false);
+ return -ENOMEM;
+ }
+
+ ev_ctx->vdev = vdev;
+ ev_ctx->vdev_id = vdev->id;
+ ev_ctx->cb_fn = cb_fn;
+
+ fn = foreach ? spdk_vhost_event_async_foreach_fn : spdk_vhost_event_async_fn;
+ ev = spdk_event_allocate(ev_ctx->vdev->lcore, fn, ev_ctx, arg);
+ assert(ev);
+ spdk_event_call(ev);
+
+ return 0;
+}
+
+static void
+stop_device(int vid)
+{
+ struct spdk_vhost_dev *vdev;
+ struct rte_vhost_vring *q;
+ int rc;
+ uint16_t i;
+
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+ vdev = spdk_vhost_dev_find_by_vid(vid);
+ if (vdev == NULL) {
+ SPDK_ERRLOG("Couldn't find device with vid %d to stop.\n", vid);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return;
+ }
+
+ if (vdev->lcore == -1) {
+ SPDK_ERRLOG("Controller %s is not loaded.\n", vdev->name);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return;
+ }
+
+ rc = _spdk_vhost_event_send(vdev, vdev->backend->stop_device, 3, "stop device");
+ if (rc != 0) {
+ SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vid);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return;
+ }
+
+ for (i = 0; i < vdev->max_queues; i++) {
+ q = &vdev->virtqueue[i].vring;
+ if (q->desc == NULL) {
+ continue;
+ }
+ rte_vhost_set_vhost_vring_last_idx(vdev->vid, i, q->last_avail_idx, q->last_used_idx);
+ }
+
+ spdk_vhost_dev_mem_unregister(vdev);
+ free(vdev->mem);
+ spdk_vhost_free_reactor(vdev->lcore);
+ vdev->lcore = -1;
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+}
+
+static int
+start_device(int vid)
+{
+ struct spdk_vhost_dev *vdev;
+ int rc = -1;
+ uint16_t i;
+
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+
+ vdev = spdk_vhost_dev_find_by_vid(vid);
+ if (vdev == NULL) {
+ SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid);
+ goto out;
+ }
+
+ if (vdev->lcore != -1) {
+ SPDK_ERRLOG("Controller %s already loaded.\n", vdev->name);
+ goto out;
+ }
+
+ vdev->max_queues = 0;
+ memset(vdev->virtqueue, 0, sizeof(vdev->virtqueue));
+ for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
+ if (rte_vhost_get_vhost_vring(vid, i, &vdev->virtqueue[i].vring)) {
+ continue;
+ }
+
+ if (vdev->virtqueue[i].vring.desc == NULL ||
+ vdev->virtqueue[i].vring.size == 0) {
+ continue;
+ }
+
+ /* Disable notifications. */
+ if (rte_vhost_enable_guest_notification(vid, i, 0) != 0) {
+ SPDK_ERRLOG("vhost device %d: Failed to disable guest notification on queue %"PRIu16"\n", vid, i);
+ goto out;
+ }
+
+ vdev->max_queues = i + 1;
+ }
+
+ if (rte_vhost_get_negotiated_features(vid, &vdev->negotiated_features) != 0) {
+ SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
+ goto out;
+ }
+
+ if (rte_vhost_get_mem_table(vid, &vdev->mem) != 0) {
+ SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
+ goto out;
+ }
+
+ /*
+	 * Not entirely sure, but this looks like some kind of QEMU bug: guest I/O
+	 * might be frozen after live-migration unless all queues are kicked. It looks
+	 * like the previous vhost instance failed to effectively deliver all interrupts
+	 * before the GET_VRING_BASE message. This shouldn't harm the guest, since
+	 * spurious interrupts should be ignored by the guest virtio driver.
+ *
+ * Tested on QEMU 2.10.91 and 2.11.50.
+ */
+ for (i = 0; i < vdev->max_queues; i++) {
+ if (vdev->virtqueue[i].vring.callfd != -1) {
+ eventfd_write(vdev->virtqueue[i].vring.callfd, (eventfd_t)1);
+ }
+ }
+
+ vdev->lcore = spdk_vhost_allocate_reactor(vdev->cpumask);
+ spdk_vhost_dev_mem_register(vdev);
+ rc = _spdk_vhost_event_send(vdev, vdev->backend->start_device, 3, "start device");
+ if (rc != 0) {
+ spdk_vhost_dev_mem_unregister(vdev);
+ free(vdev->mem);
+ spdk_vhost_free_reactor(vdev->lcore);
+ vdev->lcore = -1;
+ }
+
+out:
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return rc;
+}
+
+static int
+get_config(int vid, uint8_t *config, uint32_t len)
+{
+ struct spdk_vhost_dev *vdev;
+ int rc = -1;
+
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+ vdev = spdk_vhost_dev_find_by_vid(vid);
+ if (vdev == NULL) {
+ SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid);
+ goto out;
+ }
+
+ if (vdev->backend->vhost_get_config) {
+ rc = vdev->backend->vhost_get_config(vdev, config, len);
+ }
+
+out:
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return rc;
+}
+
+static int
+set_config(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags)
+{
+ struct spdk_vhost_dev *vdev;
+ int rc = -1;
+
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+ vdev = spdk_vhost_dev_find_by_vid(vid);
+ if (vdev == NULL) {
+ SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid);
+ goto out;
+ }
+
+ if (vdev->backend->vhost_set_config) {
+ rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags);
+ }
+
+out:
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return rc;
+}
+
+int
+spdk_vhost_set_socket_path(const char *basename)
+{
+ int ret;
+
+ if (basename && strlen(basename) > 0) {
+ ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename);
+ if (ret <= 0) {
+ return -EINVAL;
+ }
+ if ((size_t)ret >= sizeof(dev_dirname) - 2) {
+ SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret);
+ return -EINVAL;
+ }
+
+ if (dev_dirname[ret - 1] != '/') {
+ dev_dirname[ret] = '/';
+ dev_dirname[ret + 1] = '\0';
+ }
+ }
+
+ return 0;
+}
+
+static void *
+session_shutdown(void *arg)
+{
+ struct spdk_vhost_dev *vdev = NULL;
+
+ TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
+ rte_vhost_driver_unregister(vdev->path);
+ vdev->registered = false;
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n");
+ spdk_event_call((struct spdk_event *)arg);
+ return NULL;
+}
+
+void
+spdk_vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ assert(vdev->backend->dump_info_json != NULL);
+ vdev->backend->dump_info_json(vdev, w);
+}
+
+int
+spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
+{
+ return vdev->backend->remove_device(vdev);
+}
+
+static int
+new_connection(int vid)
+{
+ struct spdk_vhost_dev *vdev;
+ char ifname[PATH_MAX];
+
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+ if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) {
+ SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return -1;
+ }
+
+ vdev = spdk_vhost_dev_find(ifname);
+ if (vdev == NULL) {
+ SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return -1;
+ }
+
+	/* since pollers are not running, it is safe not to use spdk_event here */
+ if (vdev->vid != -1) {
+ SPDK_ERRLOG("Device with vid %d is already connected.\n", vid);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return -1;
+ }
+
+ vdev->vid = vid;
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return 0;
+}
+
+static void
+destroy_connection(int vid)
+{
+ struct spdk_vhost_dev *vdev;
+
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+ vdev = spdk_vhost_dev_find_by_vid(vid);
+ if (vdev == NULL) {
+ SPDK_ERRLOG("Couldn't find device with vid %d to destroy connection for.\n", vid);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ return;
+ }
+
+	/* since pollers are not running, it is safe not to use spdk_event here */
+ vdev->vid = -1;
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+}
+
+void
+spdk_vhost_call_external_event(const char *ctrlr_name, spdk_vhost_event_fn fn, void *arg)
+{
+ struct spdk_vhost_dev *vdev;
+
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+ vdev = spdk_vhost_dev_find(ctrlr_name);
+
+ if (vdev == NULL) {
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+ fn(NULL, arg);
+ return;
+ }
+
+ if (vdev->lcore == -1) {
+ fn(vdev, arg);
+ } else {
+ spdk_vhost_event_async_send(vdev, fn, arg, false);
+ }
+
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+}
+
+static void
+spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev,
+ spdk_vhost_event_fn fn, void *arg)
+{
+ if (vdev == NULL) {
+ fn(NULL, arg);
+ return;
+ }
+
+ while (vdev->lcore == -1) {
+ fn(vdev, arg);
+ vdev = spdk_vhost_dev_next(vdev->id);
+ if (vdev == NULL) {
+ fn(NULL, arg);
+ return;
+ }
+ }
+
+ spdk_vhost_event_async_send(vdev, fn, arg, true);
+}
+
+void
+spdk_vhost_call_external_event_foreach(spdk_vhost_event_fn fn, void *arg)
+{
+ struct spdk_vhost_dev *vdev;
+
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+ vdev = TAILQ_FIRST(&g_spdk_vhost_devices);
+ spdk_vhost_external_event_foreach_continue(vdev, fn, arg);
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+}
+
+void
+spdk_vhost_lock(void)
+{
+ pthread_mutex_lock(&g_spdk_vhost_mutex);
+}
+
+void
+spdk_vhost_unlock(void)
+{
+ pthread_mutex_unlock(&g_spdk_vhost_mutex);
+}
+
+int
+spdk_vhost_init(void)
+{
+ uint32_t last_core;
+ size_t len;
+ int ret;
+
+ if (dev_dirname[0] == '\0') {
+ if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
+ SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
+ return -1;
+ }
+
+ len = strlen(dev_dirname);
+ if (dev_dirname[len - 1] != '/') {
+ dev_dirname[len] = '/';
+ dev_dirname[len + 1] = '\0';
+ }
+ }
+
+ last_core = spdk_env_get_last_core();
+ g_num_ctrlrs = calloc(last_core + 1, sizeof(uint32_t));
+ if (!g_num_ctrlrs) {
+ SPDK_ERRLOG("Could not allocate array size=%u for g_num_ctrlrs\n",
+ last_core + 1);
+ return -1;
+ }
+
+ ret = spdk_vhost_scsi_controller_construct();
+ if (ret != 0) {
+ SPDK_ERRLOG("Cannot construct vhost controllers\n");
+ return -1;
+ }
+
+ ret = spdk_vhost_blk_controller_construct();
+ if (ret != 0) {
+ SPDK_ERRLOG("Cannot construct vhost block controllers\n");
+ return -1;
+ }
+
+ ret = spdk_vhost_nvme_controller_construct();
+ if (ret != 0) {
+ SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+_spdk_vhost_fini_remove_vdev_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ spdk_vhost_fini_cb fini_cb = arg;
+
+ if (vdev != NULL) {
+ spdk_vhost_dev_remove(vdev);
+ return 0;
+ }
+
+ /* All devices are removed now. */
+ free(g_num_ctrlrs);
+ fini_cb();
+ return 0;
+}
+
+static void
+_spdk_vhost_fini(void *arg1, void *arg2)
+{
+ spdk_vhost_fini_cb fini_cb = arg1;
+
+ spdk_vhost_call_external_event_foreach(_spdk_vhost_fini_remove_vdev_cb, fini_cb);
+}
+
+void
+spdk_vhost_fini(spdk_vhost_fini_cb fini_cb)
+{
+ pthread_t tid;
+ int rc;
+ struct spdk_event *fini_ev;
+
+ fini_ev = spdk_event_allocate(spdk_env_get_current_core(), _spdk_vhost_fini, fini_cb, NULL);
+
+ /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK
+ * ops for stopping a device or removing a connection, we need to call it from
+ * a separate thread to avoid deadlock.
+ */
+ rc = pthread_create(&tid, NULL, &session_shutdown, fini_ev);
+	if (rc != 0) {
+ SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc));
+ abort();
+ }
+ pthread_detach(tid);
+}
+
+struct spdk_vhost_write_config_json_ctx {
+ struct spdk_json_write_ctx *w;
+ struct spdk_event *done_ev;
+};
+
+static int
+spdk_vhost_config_json_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct spdk_vhost_write_config_json_ctx *ctx = arg;
+ uint32_t delay_base_us;
+ uint32_t iops_threshold;
+
+ if (vdev == NULL) {
+ spdk_json_write_array_end(ctx->w);
+ spdk_event_call(ctx->done_ev);
+ free(ctx);
+ return 0;
+ }
+
+ vdev->backend->write_config_json(vdev, ctx->w);
+
+ spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
+ if (delay_base_us) {
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_string(ctx->w, "method", "set_vhost_controller_coalescing");
+
+ spdk_json_write_named_object_begin(ctx->w, "params");
+ spdk_json_write_named_string(ctx->w, "ctrlr", vdev->name);
+ spdk_json_write_named_uint32(ctx->w, "delay_base_us", delay_base_us);
+ spdk_json_write_named_uint32(ctx->w, "iops_threshold", iops_threshold);
+ spdk_json_write_object_end(ctx->w);
+
+ spdk_json_write_object_end(ctx->w);
+ }
+
+ return 0;
+}
+
+void
+spdk_vhost_config_json(struct spdk_json_write_ctx *w, struct spdk_event *done_ev)
+{
+ struct spdk_vhost_write_config_json_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_event_call(done_ev);
+ return;
+ }
+
+ ctx->w = w;
+ ctx->done_ev = done_ev;
+
+ spdk_json_write_array_begin(w);
+
+ spdk_vhost_call_external_event_foreach(spdk_vhost_config_json_cb, ctx);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST)
+SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING)
diff --git a/src/spdk/lib/vhost/vhost_blk.c b/src/spdk/lib/vhost/vhost_blk.c
new file mode 100644
index 00000000..6a9a1896
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_blk.c
@@ -0,0 +1,901 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/virtio_blk.h>
+
+#include "spdk/env.h"
+#include "spdk/bdev.h"
+#include "spdk/conf.h"
+#include "spdk/thread.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/vhost.h"
+
+#include "vhost_internal.h"
+
+struct spdk_vhost_blk_task {
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_vhost_blk_dev *bvdev;
+ struct spdk_vhost_virtqueue *vq;
+
+ volatile uint8_t *status;
+
+ uint16_t req_idx;
+
+ /* for io wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+
+ /* If set, the task is currently used for I/O processing. */
+ bool used;
+
+ /** Number of bytes that were written. */
+ uint32_t used_len;
+ uint16_t iovcnt;
+ struct iovec iovs[SPDK_VHOST_IOVS_MAX];
+};
+
+struct spdk_vhost_blk_dev {
+ struct spdk_vhost_dev vdev;
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *bdev_desc;
+ struct spdk_io_channel *bdev_io_channel;
+ struct spdk_poller *requestq_poller;
+ struct spdk_vhost_dev_destroy_ctx destroy_ctx;
+ bool readonly;
+};
+
+/* forward declaration */
+static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
+
+static int
+process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
+ struct spdk_vhost_virtqueue *vq);
+
+static void
+blk_task_finish(struct spdk_vhost_blk_task *task)
+{
+ assert(task->bvdev->vdev.task_cnt > 0);
+ task->bvdev->vdev.task_cnt--;
+ task->used = false;
+}
+
+static void
+invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
+{
+ if (task->status) {
+ *task->status = status;
+ }
+
+ spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx,
+ task->used_len);
+ blk_task_finish(task);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
+}
+
+/*
+ * Process the task's descriptor chain and set up the data-related fields.
+ * On success returns 0 and stores the total size of the supplied buffers in *length.
+ *
+ * FIXME: Make this function return rd_cnt and wr_cnt
+ */
+static int
+blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
+ struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
+{
+ struct vring_desc *desc, *desc_table;
+ uint16_t out_cnt = 0, cnt = 0;
+ uint32_t desc_table_size, len = 0;
+ int rc;
+
+ rc = spdk_vhost_vq_get_desc(vdev, vq, req_idx, &desc, &desc_table, &desc_table_size);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
+ return -1;
+ }
+
+ while (1) {
+ /*
+ * Maximum cnt reached?
+		 * Should not happen if the request is well formed; otherwise this is a BUG.
+ */
+ if (spdk_unlikely(cnt == *iovs_cnt)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Max IOVs in request reached (req_idx = %"PRIu16").\n",
+ req_idx);
+ return -1;
+ }
+
+ if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &cnt, desc))) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
+ req_idx, cnt);
+ return -1;
+ }
+
+ len += desc->len;
+
+ out_cnt += spdk_vhost_vring_desc_is_wr(desc);
+
+ rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: Descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
+ vdev->name, req_idx);
+ return -1;
+ } else if (desc == NULL) {
+ break;
+ }
+ }
+
+	/*
+	 * There must be at least two descriptors.
+	 * The first contains the request, so it must be readable.
+	 * The last descriptor contains the buffer for the response, so it must be writable.
+	 */
+ if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
+ return -1;
+ }
+
+ *length = len;
+ *iovs_cnt = cnt;
+ return 0;
+}
+
+static void
+blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
+{
+ *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
+ spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx,
+ task->used_len);
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d, status: %s\n", task,
+ task->req_idx, success ? "OK" : "FAIL");
+ blk_task_finish(task);
+}
+
+static void
+blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_vhost_blk_task *task = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+ blk_request_finish(success, task);
+}
+
+static void
+blk_request_resubmit(void *arg)
+{
+ struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
+ int rc = 0;
+
+ rc = process_blk_request(task, task->bvdev, task->vq);
+ if (rc == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
+ }
+}
+
+static inline void
+blk_request_queue_io(struct spdk_vhost_blk_task *task)
+{
+ int rc;
+ struct spdk_vhost_blk_dev *bvdev = task->bvdev;
+ struct spdk_bdev *bdev = bvdev->bdev;
+
+ task->bdev_io_wait.bdev = bdev;
+ task->bdev_io_wait.cb_fn = blk_request_resubmit;
+ task->bdev_io_wait.cb_arg = task;
+
+ rc = spdk_bdev_queue_io_wait(bdev, bvdev->bdev_io_channel, &task->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc);
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ }
+}
+
+static int
+process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
+ struct spdk_vhost_virtqueue *vq)
+{
+ const struct virtio_blk_outhdr *req;
+ struct iovec *iov;
+ uint32_t type;
+ uint32_t payload_len;
+ int rc;
+
+ if (blk_iovs_setup(&bvdev->vdev, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
+ /* Only READ and WRITE are supported for now. */
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ iov = &task->iovs[0];
+ if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
+ "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
+ iov->iov_len, sizeof(*req), task->req_idx);
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ req = iov->iov_base;
+
+ iov = &task->iovs[task->iovcnt - 1];
+ if (spdk_unlikely(iov->iov_len != 1)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
+ "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
+ iov->iov_len, 1, task->req_idx);
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ task->status = iov->iov_base;
+ payload_len -= sizeof(*req) + sizeof(*task->status);
+ task->iovcnt -= 2;
+
+ type = req->type;
+#ifdef VIRTIO_BLK_T_BARRIER
+	/* Don't care about the barrier for now (as QEMU's virtio-blk does). */
+ type &= ~VIRTIO_BLK_T_BARRIER;
+#endif
+
+ switch (type) {
+ case VIRTIO_BLK_T_IN:
+ case VIRTIO_BLK_T_OUT:
+ if (spdk_unlikely((payload_len & (512 - 1)) != 0)) {
+			SPDK_ERRLOG("%s - passed IO buffer is not a multiple of 512 bytes (req_idx = %"PRIu16").\n",
+ type ? "WRITE" : "READ", task->req_idx);
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ if (type == VIRTIO_BLK_T_IN) {
+ task->used_len = payload_len + sizeof(*task->status);
+ rc = spdk_bdev_readv(bvdev->bdev_desc, bvdev->bdev_io_channel,
+ &task->iovs[1], task->iovcnt, req->sector * 512,
+ payload_len, blk_request_complete_cb, task);
+ } else if (!bvdev->readonly) {
+ task->used_len = sizeof(*task->status);
+ rc = spdk_bdev_writev(bvdev->bdev_desc, bvdev->bdev_io_channel,
+ &task->iovs[1], task->iovcnt, req->sector * 512,
+ payload_len, blk_request_complete_cb, task);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
+ rc = -1;
+ }
+
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+ blk_request_queue_io(task);
+ } else {
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ return -1;
+ }
+ }
+ break;
+ case VIRTIO_BLK_T_GET_ID:
+ if (!task->iovcnt || !payload_len) {
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+ task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
+ spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
+ task->used_len, ' ');
+ blk_request_finish(true, task);
+ break;
+ default:
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
+{
+ struct spdk_vhost_blk_task *task;
+ int rc;
+ uint16_t reqs[32];
+ uint16_t reqs_cnt, i;
+
+ reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
+ if (!reqs_cnt) {
+ return;
+ }
+
+ for (i = 0; i < reqs_cnt; i++) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
+ reqs[i]);
+
+ if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
+ SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
+ bvdev->vdev.name, reqs[i], vq->vring.size);
+ spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0);
+ continue;
+ }
+
+ task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
+ if (spdk_unlikely(task->used)) {
+ SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
+ bvdev->vdev.name, reqs[i]);
+ spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0);
+ continue;
+ }
+
+ bvdev->vdev.task_cnt++;
+
+ task->used = true;
+ task->iovcnt = SPDK_COUNTOF(task->iovs);
+ task->status = NULL;
+ task->used_len = 0;
+
+ rc = process_blk_request(task, bvdev, vq);
+ if (rc == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
+ reqs[i]);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
+ }
+ }
+}
+
+static int
+vdev_worker(void *arg)
+{
+ struct spdk_vhost_blk_dev *bvdev = arg;
+ uint16_t q_idx;
+
+ for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) {
+ process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]);
+ }
+
+ spdk_vhost_dev_used_signal(&bvdev->vdev);
+
+ return -1;
+}
+
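+/*
+ * Poller body used once the backing bdev has been hot-removed: pull the next
+ * available request, write VIRTIO_BLK_S_IOERR into its status byte and
+ * complete it immediately so the guest does not hang on in-flight I/O.
+ */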
+static void
+no_bdev_process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
+{
+ struct iovec iovs[SPDK_VHOST_IOVS_MAX];
+ uint32_t length;
+ uint16_t iovcnt, req_idx;
+
+ if (spdk_vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
+ return;
+ }
+
+ iovcnt = SPDK_COUNTOF(iovs);
+ if (blk_iovs_setup(&bvdev->vdev, vq, req_idx, iovs, &iovcnt, &length) == 0) {
+ *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
+ }
+
+ spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, req_idx, 0);
+}
+
+static int
+no_bdev_vdev_worker(void *arg)
+{
+ struct spdk_vhost_blk_dev *bvdev = arg;
+ uint16_t q_idx;
+
+ for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) {
+ no_bdev_process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]);
+ }
+
+ spdk_vhost_dev_used_signal(&bvdev->vdev);
+
+ if (bvdev->vdev.task_cnt == 0 && bvdev->bdev_io_channel) {
+ spdk_put_io_channel(bvdev->bdev_io_channel);
+ bvdev->bdev_io_channel = NULL;
+ }
+
+ return -1;
+}
+
+static struct spdk_vhost_blk_dev *
+to_blk_dev(struct spdk_vhost_dev *vdev)
+{
+ if (vdev == NULL) {
+ return NULL;
+ }
+
+ if (vdev->backend != &vhost_blk_device_backend) {
+ SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
+ return NULL;
+ }
+
+ return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
+}
+
+struct spdk_bdev *
+spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
+{
+ struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
+
+ assert(bvdev != NULL);
+ return bvdev->bdev;
+}
+
+static int
+_bdev_remove_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct spdk_vhost_blk_dev *bvdev = arg;
+
+ SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
+ bvdev->vdev.name);
+ if (bvdev->requestq_poller) {
+ spdk_poller_unregister(&bvdev->requestq_poller);
+ bvdev->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvdev, 0);
+ }
+
+ spdk_bdev_close(bvdev->bdev_desc);
+ bvdev->bdev_desc = NULL;
+ bvdev->bdev = NULL;
+ return 0;
+}
+
+static void
+bdev_remove_cb(void *remove_ctx)
+{
+ struct spdk_vhost_blk_dev *bvdev = remove_ctx;
+
+ spdk_vhost_call_external_event(bvdev->vdev.name, _bdev_remove_cb, bvdev);
+}
+
+static void
+free_task_pool(struct spdk_vhost_blk_dev *bvdev)
+{
+ struct spdk_vhost_virtqueue *vq;
+ uint16_t i;
+
+ for (i = 0; i < bvdev->vdev.max_queues; i++) {
+ vq = &bvdev->vdev.virtqueue[i];
+ if (vq->tasks == NULL) {
+ continue;
+ }
+
+ spdk_dma_free(vq->tasks);
+ vq->tasks = NULL;
+ }
+}
+
+static int
+alloc_task_pool(struct spdk_vhost_blk_dev *bvdev)
+{
+ struct spdk_vhost_virtqueue *vq;
+ struct spdk_vhost_blk_task *task;
+ uint32_t task_cnt;
+ uint16_t i;
+ uint32_t j;
+
+ for (i = 0; i < bvdev->vdev.max_queues; i++) {
+ vq = &bvdev->vdev.virtqueue[i];
+ if (vq->vring.desc == NULL) {
+ continue;
+ }
+
+ task_cnt = vq->vring.size;
+ if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
+ /* sanity check */
+			SPDK_ERRLOG("Controller %s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
+ bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
+ free_task_pool(bvdev);
+ return -1;
+ }
+ vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
+ SPDK_CACHE_LINE_SIZE, NULL);
+ if (vq->tasks == NULL) {
+ SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
+ bvdev->vdev.name, task_cnt, i);
+ free_task_pool(bvdev);
+ return -1;
+ }
+
+ for (j = 0; j < task_cnt; j++) {
+ task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
+ task->bvdev = bvdev;
+ task->req_idx = j;
+ task->vq = vq;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * A new device is added to a data core. First the device is added to the main linked list
+ * and then allocated to a specific data core.
+ */
+static int
+spdk_vhost_blk_start(struct spdk_vhost_dev *vdev, void *event_ctx)
+{
+ struct spdk_vhost_blk_dev *bvdev;
+ int i, rc = 0;
+
+ bvdev = to_blk_dev(vdev);
+ if (bvdev == NULL) {
+ SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n");
+ rc = -1;
+ goto out;
+ }
+
+ /* validate all I/O queues are in a contiguous index range */
+ for (i = 0; i < vdev->max_queues; i++) {
+ if (vdev->virtqueue[i].vring.desc == NULL) {
+			SPDK_ERRLOG("%s: queue %d is empty\n", vdev->name, i);
+ rc = -1;
+ goto out;
+ }
+ }
+
+ rc = alloc_task_pool(bvdev);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name);
+ goto out;
+ }
+
+ if (bvdev->bdev) {
+ bvdev->bdev_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
+ if (!bvdev->bdev_io_channel) {
+ free_task_pool(bvdev);
+ SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name);
+ rc = -1;
+ goto out;
+ }
+ }
+
+ bvdev->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
+ bvdev, 0);
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n",
+ vdev->name, vdev->lcore);
+out:
+ spdk_vhost_dev_backend_event_done(event_ctx, rc);
+ return rc;
+}
+
+static int
+destroy_device_poller_cb(void *arg)
+{
+ struct spdk_vhost_blk_dev *bvdev = arg;
+ int i;
+
+ if (bvdev->vdev.task_cnt > 0) {
+ return -1;
+ }
+
+ for (i = 0; i < bvdev->vdev.max_queues; i++) {
+ bvdev->vdev.virtqueue[i].next_event_time = 0;
+ spdk_vhost_vq_used_signal(&bvdev->vdev, &bvdev->vdev.virtqueue[i]);
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", bvdev->vdev.name);
+
+ if (bvdev->bdev_io_channel) {
+ spdk_put_io_channel(bvdev->bdev_io_channel);
+ bvdev->bdev_io_channel = NULL;
+ }
+
+ free_task_pool(bvdev);
+ spdk_poller_unregister(&bvdev->destroy_ctx.poller);
+ spdk_vhost_dev_backend_event_done(bvdev->destroy_ctx.event_ctx, 0);
+
+ return -1;
+}
+
+static int
+spdk_vhost_blk_stop(struct spdk_vhost_dev *vdev, void *event_ctx)
+{
+ struct spdk_vhost_blk_dev *bvdev;
+
+ bvdev = to_blk_dev(vdev);
+ if (bvdev == NULL) {
+ SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n");
+ goto err;
+ }
+
+ bvdev->destroy_ctx.event_ctx = event_ctx;
+ spdk_poller_unregister(&bvdev->requestq_poller);
+ bvdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb,
+ bvdev, 1000);
+ return 0;
+
+err:
+ spdk_vhost_dev_backend_event_done(event_ctx, -1);
+ return -1;
+}
+
+static void
+spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
+ struct spdk_vhost_blk_dev *bvdev;
+
+ bvdev = to_blk_dev(vdev);
+ if (bvdev == NULL) {
+ return;
+ }
+
+ assert(bvdev != NULL);
+ spdk_json_write_name(w, "block");
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_name(w, "readonly");
+ spdk_json_write_bool(w, bvdev->readonly);
+
+ spdk_json_write_name(w, "bdev");
+ if (bdev) {
+ spdk_json_write_string(w, spdk_bdev_get_name(bdev));
+ } else {
+ spdk_json_write_null(w);
+ }
+
+ spdk_json_write_object_end(w);
+}
+
+static void
+spdk_vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_vhost_blk_dev *bvdev;
+
+ bvdev = to_blk_dev(vdev);
+ if (bvdev == NULL) {
+ return;
+ }
+
+ if (!bvdev->bdev) {
+ return;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "construct_vhost_blk_controller");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", vdev->name);
+ spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
+ spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask));
+ spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static int spdk_vhost_blk_destroy(struct spdk_vhost_dev *dev);
+
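+/*
+ * Fill in the virtio_blk_config area reported to the guest. The capacity is
+ * expressed in 512-byte virtio sectors, e.g. a bdev with 262144 blocks of
+ * 4096 bytes (1 GiB) would be reported as (262144 * 4096) / 512 = 2097152
+ * sectors.
+ */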
+static int
+spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
+ uint32_t len)
+{
+ struct virtio_blk_config *blkcfg = (struct virtio_blk_config *)config;
+ struct spdk_vhost_blk_dev *bvdev;
+ struct spdk_bdev *bdev;
+ uint32_t blk_size;
+ uint64_t blkcnt;
+
+ bvdev = to_blk_dev(vdev);
+ if (bvdev == NULL) {
+		SPDK_ERRLOG("Trying to get virtio_blk configuration failed\n");
+ return -1;
+ }
+
+ if (len < sizeof(*blkcfg)) {
+ return -1;
+ }
+
+ bdev = bvdev->bdev;
+ if (bdev == NULL) {
+ /* We can't just return -1 here as this GET_CONFIG message might
+ * be caused by a QEMU VM reboot. Returning -1 will indicate an
+		 * error to QEMU, which might then decide to terminate itself.
+ * We don't want that. A simple reboot shouldn't break the system.
+ *
+ * Presenting a block device with block size 0 and block count 0
+ * doesn't cause any problems on QEMU side and the virtio-pci
+ * device is even still available inside the VM, but there will
+ * be no block device created for it - the kernel drivers will
+ * silently reject it.
+ */
+ blk_size = 0;
+ blkcnt = 0;
+ } else {
+ blk_size = spdk_bdev_get_block_size(bdev);
+ blkcnt = spdk_bdev_get_num_blocks(bdev);
+ }
+
+ memset(blkcfg, 0, sizeof(*blkcfg));
+ blkcfg->blk_size = blk_size;
+ /* minimum I/O size in blocks */
+ blkcfg->min_io_size = 1;
+	/* expressed in 512-byte sectors */
+ blkcfg->capacity = (blkcnt * blk_size) / 512;
+ blkcfg->size_max = 131072;
+ /* -2 for REQ and RESP and -1 for region boundary splitting */
+ blkcfg->seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
+ /* QEMU can overwrite this value when started */
+ blkcfg->num_queues = SPDK_VHOST_MAX_VQUEUES;
+
+ return 0;
+}
+
+static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
+ .virtio_features = SPDK_VHOST_FEATURES |
+ (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) |
+ (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) |
+ (1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
+ (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) |
+ (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
+ (1ULL << VIRTIO_BLK_F_MQ),
+ .disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
+ (1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
+ (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI),
+ .start_device = spdk_vhost_blk_start,
+ .stop_device = spdk_vhost_blk_stop,
+ .vhost_get_config = spdk_vhost_blk_get_config,
+ .dump_info_json = spdk_vhost_blk_dump_info_json,
+ .write_config_json = spdk_vhost_blk_write_config_json,
+ .remove_device = spdk_vhost_blk_destroy,
+};
+
+int
+spdk_vhost_blk_controller_construct(void)
+{
+ struct spdk_conf_section *sp;
+ unsigned ctrlr_num;
+ char *bdev_name;
+ char *cpumask;
+ char *name;
+ bool readonly;
+
+ for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
+ if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
+ continue;
+ }
+
+ if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) {
+ SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+ spdk_conf_section_get_name(sp));
+ return -1;
+ }
+
+ name = spdk_conf_section_get_val(sp, "Name");
+ if (name == NULL) {
+ SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
+ return -1;
+ }
+
+ cpumask = spdk_conf_section_get_val(sp, "Cpumask");
+ readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);
+
+ bdev_name = spdk_conf_section_get_val(sp, "Dev");
+ if (bdev_name == NULL) {
+ continue;
+ }
+
+ if (spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) {
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
+{
+ struct spdk_vhost_blk_dev *bvdev = NULL;
+ struct spdk_bdev *bdev;
+ int ret = 0;
+
+ spdk_vhost_lock();
+ bdev = spdk_bdev_get_by_name(dev_name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("Controller %s: bdev '%s' not found\n",
+ name, dev_name);
+ ret = -ENODEV;
+ goto out;
+ }
+
+ bvdev = spdk_dma_zmalloc(sizeof(*bvdev), SPDK_CACHE_LINE_SIZE, NULL);
+ if (bvdev == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
+ if (ret != 0) {
+ SPDK_ERRLOG("Controller %s: could not open bdev '%s', error=%d\n",
+ name, dev_name, ret);
+ goto out;
+ }
+
+ bvdev->bdev = bdev;
+ bvdev->readonly = readonly;
+ ret = spdk_vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend);
+ if (ret != 0) {
+ spdk_bdev_close(bvdev->bdev_desc);
+ goto out;
+ }
+
+ if (readonly && rte_vhost_driver_enable_features(bvdev->vdev.path, (1ULL << VIRTIO_BLK_F_RO))) {
+		SPDK_ERRLOG("Controller %s: failed to set as read-only\n", name);
+ spdk_bdev_close(bvdev->bdev_desc);
+
+ if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) {
+ SPDK_ERRLOG("Controller %s: failed to remove controller\n", name);
+ }
+
+ ret = -1;
+ goto out;
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name);
+out:
+ if (ret != 0 && bvdev) {
+ spdk_dma_free(bvdev);
+ }
+ spdk_vhost_unlock();
+ return ret;
+}
+
+static int
+spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev)
+{
+ struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
+ int rc;
+
+ if (!bvdev) {
+ return -EINVAL;
+ }
+
+ rc = spdk_vhost_dev_unregister(&bvdev->vdev);
+ if (rc != 0) {
+ return rc;
+ }
+
+ if (bvdev->bdev_desc) {
+ spdk_bdev_close(bvdev->bdev_desc);
+ bvdev->bdev_desc = NULL;
+ }
+ bvdev->bdev = NULL;
+
+ spdk_dma_free(bvdev);
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
+SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
diff --git a/src/spdk/lib/vhost/vhost_internal.h b/src/spdk/lib/vhost/vhost_internal.h
new file mode 100644
index 00000000..9c0ad211
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_internal.h
@@ -0,0 +1,277 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VHOST_INTERNAL_H
+#define SPDK_VHOST_INTERNAL_H
+
+#include "spdk/stdinc.h"
+
+#include <rte_vhost.h>
+
+#include "spdk_internal/log.h"
+#include "spdk/event.h"
+#include "spdk/rpc.h"
+
+#define SPDK_CACHE_LINE_SIZE RTE_CACHE_LINE_SIZE
+
+#ifndef VHOST_USER_F_PROTOCOL_FEATURES
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+#endif
+
+#ifndef VIRTIO_F_VERSION_1
+#define VIRTIO_F_VERSION_1 32
+#endif
+
+#ifndef VIRTIO_BLK_F_MQ
+#define VIRTIO_BLK_F_MQ 12 /* support more than one vq */
+#endif
+
+#ifndef VIRTIO_BLK_F_CONFIG_WCE
+#define VIRTIO_BLK_F_CONFIG_WCE 11
+#endif
+
+#define SPDK_VHOST_MAX_VQUEUES 256
+#define SPDK_VHOST_MAX_VQ_SIZE 1024
+
+#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8
+
+#define SPDK_VHOST_IOVS_MAX 129
+
+/*
+ * Rate at which stats are checked for interrupt coalescing.
+ */
+#define SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS 10
+/*
+ * Default threshold at which interrupts start to be coalesced.
+ */
+#define SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD 60000
+
+/*
+ * Currently coalescing is not used by default.
+ * Setting this to a value > 0 here or via RPC will enable coalescing.
+ */
+#define SPDK_VHOST_COALESCING_DELAY_BASE_US 0
+
+
+#define SPDK_VHOST_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
+ (1ULL << VIRTIO_RING_F_EVENT_IDX) | \
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC))
+
+#define SPDK_VHOST_DISABLED_FEATURES ((1ULL << VIRTIO_RING_F_EVENT_IDX) | \
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY))
+
+struct spdk_vhost_virtqueue {
+ struct rte_vhost_vring vring;
+ void *tasks;
+
+ /* Request count from last stats check */
+ uint32_t req_cnt;
+
+ /* Request count from last event */
+ uint16_t used_req_cnt;
+
+ /* How long interrupt is delayed */
+ uint32_t irq_delay_time;
+
+ /* Next time when we need to send event */
+ uint64_t next_event_time;
+
+} __attribute((aligned(SPDK_CACHE_LINE_SIZE)));
+
+struct spdk_vhost_dev_backend {
+ uint64_t virtio_features;
+ uint64_t disabled_features;
+
+ /**
+ * Callbacks for starting and pausing the device.
+ * The first param is struct spdk_vhost_dev *.
+	 * The second one is the event context that has to be
+ * passed to spdk_vhost_dev_backend_event_done().
+ */
+ spdk_vhost_event_fn start_device;
+ spdk_vhost_event_fn stop_device;
+
+ int (*vhost_get_config)(struct spdk_vhost_dev *vdev, uint8_t *config, uint32_t len);
+ int (*vhost_set_config)(struct spdk_vhost_dev *vdev, uint8_t *config,
+ uint32_t offset, uint32_t size, uint32_t flags);
+
+ void (*dump_info_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
+ void (*write_config_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
+ int (*remove_device)(struct spdk_vhost_dev *vdev);
+};
+
+struct spdk_vhost_dev {
+ struct rte_vhost_memory *mem;
+ char *name;
+ char *path;
+
+ /* Unique device ID. */
+ unsigned id;
+
+ /* rte_vhost device ID. */
+ int vid;
+ int task_cnt;
+ int32_t lcore;
+ struct spdk_cpuset *cpumask;
+ bool registered;
+
+ const struct spdk_vhost_dev_backend *backend;
+
+	/* Saved original values used to set up coalescing, kept to avoid
+	 * integer rounding issues during save/load of the config.
+ */
+ uint32_t coalescing_delay_us;
+ uint32_t coalescing_iops_threshold;
+
+ uint32_t coalescing_delay_time_base;
+
+ /* Threshold when event coalescing for virtqueue will be turned on. */
+ uint32_t coalescing_io_rate_threshold;
+
+ /* Next time when stats for event coalescing will be checked. */
+ uint64_t next_stats_check_time;
+
+ /* Interval used for event coalescing checking. */
+ uint64_t stats_check_interval;
+
+ uint16_t max_queues;
+
+ uint64_t negotiated_features;
+
+ struct spdk_vhost_virtqueue virtqueue[SPDK_VHOST_MAX_VQUEUES];
+
+ TAILQ_ENTRY(spdk_vhost_dev) tailq;
+};
+
+struct spdk_vhost_dev_destroy_ctx {
+ struct spdk_poller *poller;
+ void *event_ctx;
+};
+
+struct spdk_vhost_dev *spdk_vhost_dev_find(const char *ctrlr_name);
+
+void *spdk_vhost_gpa_to_vva(struct spdk_vhost_dev *vdev, uint64_t addr, uint64_t len);
+
+uint16_t spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *vq, uint16_t *reqs,
+ uint16_t reqs_len);
+
+/**
+ * Get a virtio descriptor at given index in given virtqueue.
+ * The descriptor will provide access to the entire descriptor
+ * chain. The subsequent descriptors are accessible via
+ * \c spdk_vhost_vring_desc_get_next.
+ * \param vdev vhost device
+ * \param vq virtqueue
+ * \param req_idx descriptor index
+ * \param desc pointer to be set to the descriptor
+ * \param desc_table descriptor table to be used with
+ * \c spdk_vhost_vring_desc_get_next. This might be either
+ * default virtqueue descriptor table or per-chain indirect
+ * table.
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid.
+ * If -1 is returned, the content of params is undefined.
+ */
+int spdk_vhost_vq_get_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq,
+ uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
+ uint32_t *desc_table_size);
+
+/**
+ * Send IRQ/call client (if pending) for \c vq.
+ * \param vdev vhost device
+ * \param vq virtqueue
+ * \return
+ * 0 - if no interrupt was signalled
+ * 1 - if interrupt was signalled
+ */
+int spdk_vhost_vq_used_signal(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq);
+
+
+/**
+ * Send IRQs for all queues that need to be signaled.
+ * \param vdev vhost device
+ */
+void spdk_vhost_dev_used_signal(struct spdk_vhost_dev *vdev);
+
+void spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq,
+ uint16_t id, uint32_t len);
+
+/**
+ * Get subsequent descriptor from given table.
+ * \param desc current descriptor, will be set to the
+ * next descriptor (NULL in case this is the last
+ * descriptor in the chain or the next desc is invalid)
+ * \param desc_table descriptor table
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if the given descriptor is invalid.
+ * The *desc* param will be set regardless of the
+ * return value.
+ */
+int spdk_vhost_vring_desc_get_next(struct vring_desc **desc,
+ struct vring_desc *desc_table, uint32_t desc_table_size);
+bool spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc);
+
+int spdk_vhost_vring_desc_to_iov(struct spdk_vhost_dev *vdev, struct iovec *iov,
+ uint16_t *iov_index, const struct vring_desc *desc);
+
+static inline bool __attribute__((always_inline))
+spdk_vhost_dev_has_feature(struct spdk_vhost_dev *vdev, unsigned feature_id)
+{
+ return vdev->negotiated_features & (1ULL << feature_id);
+}
+
+int spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
+ const struct spdk_vhost_dev_backend *backend);
+int spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev);
+
+int spdk_vhost_scsi_controller_construct(void);
+int spdk_vhost_blk_controller_construct(void);
+void spdk_vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
+void spdk_vhost_dev_backend_event_done(void *event_ctx, int response);
+void spdk_vhost_lock(void);
+void spdk_vhost_unlock(void);
+int spdk_remove_vhost_controller(struct spdk_vhost_dev *vdev);
+int spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf);
+int spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd);
+int spdk_vhost_nvme_get_cap(int vid, uint64_t *cap);
+int spdk_vhost_nvme_controller_construct(void);
+int spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues);
+int spdk_vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev);
+int spdk_vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev,
+ const char *bdev_name);
+
+#endif /* SPDK_VHOST_INTERNAL_H */
diff --git a/src/spdk/lib/vhost/vhost_nvme.c b/src/spdk/lib/vhost/vhost_nvme.c
new file mode 100644
index 00000000..35015d93
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_nvme.c
@@ -0,0 +1,1465 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/nvme.h"
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/barrier.h"
+#include "spdk/vhost.h"
+#include "spdk/bdev.h"
+#include "spdk/version.h"
+#include "spdk/nvme_spec.h"
+#include "spdk/likely.h"
+
+#include "vhost_internal.h"
+
+#define MAX_IO_QUEUES 31
+#define MAX_IOVS 64
+#define MAX_NAMESPACE 8
+#define MAX_QUEUE_ENTRIES_SUPPORTED 256
+#define MAX_BATCH_IO 8
+
+struct spdk_vhost_nvme_sq {
+ uint16_t sqid;
+ uint16_t size;
+ uint16_t cqid;
+ bool valid;
+ struct spdk_nvme_cmd *sq_cmd;
+ uint16_t sq_head;
+ uint16_t sq_tail;
+};
+
+struct spdk_vhost_nvme_cq {
+ uint8_t phase;
+ uint16_t size;
+ uint16_t cqid;
+ bool valid;
+ volatile struct spdk_nvme_cpl *cq_cqe;
+ uint16_t cq_head;
+ uint16_t guest_signaled_cq_head;
+ uint32_t need_signaled_cnt;
+ STAILQ_HEAD(, spdk_vhost_nvme_task) cq_full_waited_tasks;
+ bool irq_enabled;
+ int virq;
+};
+
+struct spdk_vhost_nvme_ns {
+ struct spdk_bdev *bdev;
+ uint32_t block_size;
+ uint64_t capacity;
+ uint32_t nsid;
+ uint32_t active_ns;
+ struct spdk_bdev_desc *bdev_desc;
+ struct spdk_io_channel *bdev_io_channel;
+ struct spdk_nvme_ns_data nsdata;
+};
+
+struct spdk_vhost_nvme_task {
+ struct spdk_nvme_cmd cmd;
+ struct spdk_vhost_nvme_dev *nvme;
+ uint16_t sqid;
+ uint16_t cqid;
+
+ /** array of iovecs to transfer. */
+ struct iovec iovs[MAX_IOVS];
+
+ /** Number of iovecs in iovs array. */
+ int iovcnt;
+
+ /** Current iovec position. */
+ int iovpos;
+
+ /** Offset in current iovec. */
+ uint32_t iov_offset;
+
+ /* for bdev_io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+ struct spdk_vhost_nvme_sq *sq;
+ struct spdk_vhost_nvme_ns *ns;
+
+ /* parent pointer. */
+ struct spdk_vhost_nvme_task *parent;
+ uint8_t dnr;
+ uint8_t sct;
+ uint8_t sc;
+ uint32_t num_children;
+ STAILQ_ENTRY(spdk_vhost_nvme_task) stailq;
+};
+
+struct spdk_vhost_nvme_dev {
+ struct spdk_vhost_dev vdev;
+
+ uint32_t num_io_queues;
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_cc_register cc;
+ union spdk_nvme_csts_register csts;
+ struct spdk_nvme_ctrlr_data cdata;
+
+ uint32_t num_sqs;
+ uint32_t num_cqs;
+
+ uint32_t num_ns;
+ struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE];
+
+ volatile uint32_t *dbbuf_dbs;
+ volatile uint32_t *dbbuf_eis;
+ struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1];
+ struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1];
+
+ TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq;
+ STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks;
+ struct spdk_poller *requestq_poller;
+ struct spdk_vhost_dev_destroy_ctx destroy_ctx;
+};
+
+static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend;
+
+/*
+ * Report the SPDK version as the firmware revision.
+ * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts.
+ */
+#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING
+
+static int
+spdk_nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq,
+ struct spdk_vhost_nvme_task *task);
+
+static struct spdk_vhost_nvme_dev *
+to_nvme_dev(struct spdk_vhost_dev *vdev)
+{
+ if (vdev->backend != &spdk_vhost_nvme_device_backend) {
+ SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name);
+ return NULL;
+ }
+
+ return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev);
+}
+
+static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs);
+
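+/*
+ * Layout of the doorbell/event-index buffers set up by the Doorbell Buffer
+ * Config command: for queue pair `qid`, the SQ tail doorbell occupies dword
+ * 2 * qid * db_stride and the CQ head doorbell dword (2 * qid + 1) * db_stride.
+ */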
+static inline unsigned int sq_offset(unsigned int qid, uint32_t db_stride)
+{
+ return qid * 2 * db_stride;
+}
+
+static inline unsigned int cq_offset(unsigned int qid, uint32_t db_stride)
+{
+ return (qid * 2 + 1) * db_stride;
+}
+
+static void
+nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq)
+{
+ cq->cq_head++;
+ if (cq->cq_head >= cq->size) {
+ cq->cq_head = 0;
+ cq->phase = !cq->phase;
+ }
+}
+
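+/*
+ * The CQ is full when posting one more entry would catch up with the head
+ * index the guest last reported through its doorbell buffer.
+ */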
+static bool
+nvme_cq_is_full(struct spdk_vhost_nvme_cq *cq)
+{
+ return ((cq->cq_head + 1) % cq->size == cq->guest_signaled_cq_head);
+}
+
+static void
+nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq)
+{
+ sq->sq_head = (sq->sq_head + 1) % sq->size;
+}
+
+static struct spdk_vhost_nvme_sq *
+spdk_vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid)
+{
+ if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) {
+ return NULL;
+ }
+
+ return &dev->sq_queue[qid];
+}
+
+static struct spdk_vhost_nvme_cq *
+spdk_vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid)
+{
+ if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) {
+ return NULL;
+ }
+
+ return &dev->cq_queue[qid];
+}
+
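+/*
+ * Translate the command's PRPs into an iovec array. PRP1 addresses the first
+ * (possibly unaligned) page; any remaining data is described either directly
+ * by PRP2 (when it fits within one more page) or by a PRP list that PRP2
+ * points to. A 4096-byte memory page size is assumed.
+ */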
+static int
+spdk_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd,
+ struct spdk_vhost_nvme_task *task, uint32_t len)
+{
+ uint64_t prp1, prp2;
+ void *vva;
+ uint32_t i;
+ uint32_t residue_len, nents, mps = 4096;
+ uint64_t *prp_list;
+
+ prp1 = cmd->dptr.prp.prp1;
+ prp2 = cmd->dptr.prp.prp2;
+
+	/* PRP1 may start at an unaligned page address */
+ residue_len = mps - (prp1 % mps);
+ residue_len = spdk_min(len, residue_len);
+
+ vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp1, residue_len);
+ if (spdk_unlikely(vva == NULL)) {
+ SPDK_ERRLOG("GPA to VVA failed\n");
+ return -1;
+ }
+ task->iovs[0].iov_base = vva;
+ task->iovs[0].iov_len = residue_len;
+ len -= residue_len;
+
+ if (len) {
+ if (spdk_unlikely(prp2 == 0)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PRP2=0 in command\n");
+ return -1;
+ }
+
+ if (len <= mps) {
+ /* 2 PRP used */
+ task->iovcnt = 2;
+ vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp2, len);
+ if (spdk_unlikely(vva == NULL)) {
+ return -1;
+ }
+ task->iovs[1].iov_base = vva;
+ task->iovs[1].iov_len = len;
+ } else {
+ /* PRP list used */
+ nents = (len + mps - 1) / mps;
+ vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp2, nents * sizeof(*prp_list));
+ if (spdk_unlikely(vva == NULL)) {
+ return -1;
+ }
+ prp_list = vva;
+ i = 0;
+ while (len != 0) {
+ residue_len = spdk_min(len, mps);
+ vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp_list[i], residue_len);
+ if (spdk_unlikely(vva == NULL)) {
+ return -1;
+ }
+ task->iovs[i + 1].iov_base = vva;
+ task->iovs[i + 1].iov_len = residue_len;
+ len -= residue_len;
+ i++;
+ }
+ task->iovcnt = i + 1;
+ }
+ } else {
+ /* 1 PRP used */
+ task->iovcnt = 1;
+ }
+
+ return 0;
+}
+
+static void
+spdk_nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme)
+{
+ struct spdk_vhost_nvme_cq *cq;
+ uint32_t qid, cq_head;
+
+ assert(nvme != NULL);
+
+ for (qid = 1; qid <= MAX_IO_QUEUES; qid++) {
+ cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid);
+ if (!cq || !cq->valid) {
+ continue;
+ }
+
+ cq_head = nvme->dbbuf_dbs[cq_offset(qid, 1)];
+ if (cq->irq_enabled && cq->need_signaled_cnt && (cq->cq_head != cq_head)) {
+ eventfd_write(cq->virq, (eventfd_t)1);
+ cq->need_signaled_cnt = 0;
+ }
+ }
+}
+
+static void
+spdk_vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task)
+{
+ struct spdk_vhost_nvme_dev *nvme = task->nvme;
+ struct spdk_nvme_cpl cqe = {0};
+ struct spdk_vhost_nvme_cq *cq;
+ struct spdk_vhost_nvme_sq *sq;
+ struct spdk_nvme_cmd *cmd = &task->cmd;
+ uint16_t cqid = task->cqid;
+ uint16_t sqid = task->sqid;
+
+ cq = spdk_vhost_nvme_get_cq_from_qid(nvme, cqid);
+ sq = spdk_vhost_nvme_get_sq_from_qid(nvme, sqid);
+ if (spdk_unlikely(!cq || !sq)) {
+ return;
+ }
+
+ cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(cqid, 1)];
+ if (spdk_unlikely(nvme_cq_is_full(cq))) {
+ STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq);
+ return;
+ }
+
+ cqe.sqid = sqid;
+ cqe.sqhd = sq->sq_head;
+ cqe.cid = cmd->cid;
+ cqe.status.dnr = task->dnr;
+ cqe.status.sct = task->sct;
+ cqe.status.sc = task->sc;
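+	/* Write the CQE with the inverted phase bit first, then flip the phase
+	 * after a write barrier, so the guest never observes a partially
+	 * written completion entry.
+	 */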
+ cqe.status.p = !cq->phase;
+ cq->cq_cqe[cq->cq_head] = cqe;
+ spdk_smp_wmb();
+ cq->cq_cqe[cq->cq_head].status.p = cq->phase;
+
+ nvme_inc_cq_head(cq);
+ cq->need_signaled_cnt++;
+
+	/* MMIO Control */
+ nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1);
+
+ STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+}
+
+static void
+blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_vhost_nvme_task *task = cb_arg;
+ struct spdk_nvme_cmd *cmd = &task->cmd;
+ int sc, sct;
+
+ assert(bdev_io != NULL);
+
+ spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc);
+ spdk_bdev_free_io(bdev_io);
+
+ task->dnr = !success;
+ task->sct = sct;
+ task->sc = sc;
+
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("I/O error, sector %u\n", cmd->cdw10);
+ }
+
+ spdk_vhost_nvme_task_complete(task);
+}
+
+static void
+blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_vhost_nvme_task *child = cb_arg;
+ struct spdk_vhost_nvme_task *task = child->parent;
+ struct spdk_vhost_nvme_dev *nvme = task->nvme;
+ int sct, sc;
+
+ assert(bdev_io != NULL);
+
+ task->num_children--;
+ if (!success) {
+ task->dnr = 1;
+ spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc);
+ task->sct = sct;
+ task->sc = sc;
+ }
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (!task->num_children) {
+ spdk_vhost_nvme_task_complete(task);
+ }
+
+ STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq);
+}
+
+static struct spdk_vhost_nvme_ns *
+spdk_vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid)
+{
+ if (spdk_unlikely(!nsid || nsid > dev->num_ns)) {
+ return NULL;
+ }
+
+ return &dev->ns[nsid - 1];
+}
+
+static void
+vhost_nvme_resubmit_task(void *arg)
+{
+ struct spdk_vhost_nvme_task *task = (struct spdk_vhost_nvme_task *)arg;
+ int rc;
+
+ rc = spdk_nvme_process_sq(task->nvme, task->sq, task);
+ if (rc) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "vhost_nvme: task resubmit failed, rc = %d.\n", rc);
+ }
+}
+
+static int
+vhost_nvme_queue_task(struct spdk_vhost_nvme_task *task)
+{
+ int rc;
+
+ task->bdev_io_wait.bdev = task->ns->bdev;
+ task->bdev_io_wait.cb_fn = vhost_nvme_resubmit_task;
+ task->bdev_io_wait.cb_arg = task;
+
+ rc = spdk_bdev_queue_io_wait(task->ns->bdev, task->ns->bdev_io_channel, &task->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in vhost_nvme_queue_task, rc=%d.\n", rc);
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ spdk_vhost_nvme_task_complete(task);
+ }
+
+ return rc;
+}
+
+static int
+spdk_nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq,
+ struct spdk_vhost_nvme_task *task)
+{
+ struct spdk_vhost_nvme_task *child;
+ struct spdk_nvme_cmd *cmd = &task->cmd;
+ struct spdk_vhost_nvme_ns *ns;
+ int ret = -1;
+ uint32_t len, nlba, block_size;
+ uint64_t slba;
+ struct spdk_nvme_dsm_range *range;
+ uint16_t i, num_ranges = 0;
+
+ task->nvme = nvme;
+ task->dnr = 0;
+ task->sct = 0;
+ task->sc = 0;
+
+ ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid);
+ if (spdk_unlikely(!ns)) {
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+ spdk_vhost_nvme_task_complete(task);
+ return -1;
+ }
+
+ block_size = ns->block_size;
+ task->num_children = 0;
+ task->cqid = sq->cqid;
+ task->sqid = sq->sqid;
+
+ task->ns = ns;
+
+ if (spdk_unlikely(!ns->active_ns)) {
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+ spdk_vhost_nvme_task_complete(task);
+ return -1;
+ }
+
+ /* valid only for Read/Write commands */
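+	/* NLB in CDW12 is zero-based, hence the +1; SLBA spans CDW10 (low) and CDW11 (high). */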
+ nlba = (cmd->cdw12 & 0xffff) + 1;
+ slba = cmd->cdw11;
+ slba = (slba << 32) | cmd->cdw10;
+
+ if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE ||
+ cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
+ if (cmd->psdt != SPDK_NVME_PSDT_PRP) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PSDT %u%ub in command\n",
+ cmd->psdt >> 1, cmd->psdt & 1u);
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INVALID_FIELD;
+ spdk_vhost_nvme_task_complete(task);
+ return -1;
+ }
+
+ if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
+ num_ranges = (cmd->cdw10 & 0xff) + 1;
+ len = num_ranges * sizeof(struct spdk_nvme_dsm_range);
+ } else {
+ len = nlba * block_size;
+ }
+
+ ret = spdk_nvme_map_prps(nvme, cmd, task, len);
+ if (spdk_unlikely(ret != 0)) {
+ SPDK_ERRLOG("nvme command map prps failed\n");
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INVALID_FIELD;
+ spdk_vhost_nvme_task_complete(task);
+ return -1;
+ }
+ }
+
+ switch (cmd->opc) {
+ case SPDK_NVME_OPC_READ:
+ ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel,
+ task->iovs, task->iovcnt, slba * block_size,
+ nlba * block_size, blk_request_complete_cb, task);
+ break;
+ case SPDK_NVME_OPC_WRITE:
+ ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel,
+ task->iovs, task->iovcnt, slba * block_size,
+ nlba * block_size, blk_request_complete_cb, task);
+ break;
+ case SPDK_NVME_OPC_FLUSH:
+ ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel,
+ 0, ns->capacity,
+ blk_request_complete_cb, task);
+ break;
+ case SPDK_NVME_OPC_DATASET_MANAGEMENT:
+ range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base;
+ for (i = 0; i < num_ranges; i++) {
+ if (!STAILQ_EMPTY(&nvme->free_tasks)) {
+ child = STAILQ_FIRST(&nvme->free_tasks);
+ STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+ } else {
+				SPDK_ERRLOG("No free tasks available\n");
+ ret = -1;
+ break;
+ }
+ task->num_children++;
+ child->parent = task;
+ ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel,
+ range[i].starting_lba * block_size,
+ range[i].length * block_size,
+ blk_unmap_complete_cb, child);
+ if (ret) {
+ STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq);
+ break;
+ }
+ }
+ break;
+ default:
+ ret = -1;
+ break;
+ }
+
+ if (spdk_unlikely(ret)) {
+ if (ret == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "No memory, start to queue io.\n");
+ task->sq = sq;
+ ret = vhost_nvme_queue_task(task);
+ } else {
+ /* post error status to cqe */
+			SPDK_ERRLOG("Error submitting command %u, ret %d\n", cmd->opc, ret);
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ spdk_vhost_nvme_task_complete(task);
+ }
+ }
+
+ return ret;
+}
+
+static int
+nvme_worker(void *arg)
+{
+ struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg;
+ struct spdk_vhost_nvme_sq *sq;
+ struct spdk_vhost_nvme_cq *cq;
+ struct spdk_vhost_nvme_task *task;
+ uint32_t qid, dbbuf_sq;
+ int ret;
+ int count = -1;
+
+ if (spdk_unlikely(!nvme->num_sqs)) {
+ return -1;
+ }
+
+	/* The worker thread can't start before the admin Doorbell Buffer
+	 * Config command has been processed.
+ */
+ if (spdk_unlikely(!nvme->dbbuf_dbs)) {
+ return -1;
+ }
+
+ for (qid = 1; qid <= MAX_IO_QUEUES; qid++) {
+
+ sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid);
+ if (!sq->valid) {
+ continue;
+ }
+ cq = spdk_vhost_nvme_get_cq_from_qid(nvme, sq->cqid);
+ if (spdk_unlikely(!cq)) {
+ return -1;
+ }
+ cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(sq->cqid, 1)];
+ if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) &&
+ !nvme_cq_is_full(cq))) {
+ task = STAILQ_FIRST(&cq->cq_full_waited_tasks);
+ STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq);
+ spdk_vhost_nvme_task_complete(task);
+ }
+
+ dbbuf_sq = nvme->dbbuf_dbs[sq_offset(qid, 1)];
+ sq->sq_tail = (uint16_t)dbbuf_sq;
+ count = 0;
+
+ while (sq->sq_head != sq->sq_tail) {
+ if (spdk_unlikely(!sq->sq_cmd)) {
+ break;
+ }
+ if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) {
+ task = STAILQ_FIRST(&nvme->free_tasks);
+ STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+ } else {
+ return -1;
+ }
+
+ task->cmd = sq->sq_cmd[sq->sq_head];
+ nvme_inc_sq_head(sq);
+
+ /* processing IO */
+ ret = spdk_nvme_process_sq(nvme, sq, task);
+ if (spdk_unlikely(ret)) {
+ SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, task->cmd.cid, sq->sq_head,
+ sq->sq_tail);
+ }
+
+ /* MMIO Control */
+ nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1);
+
+ /* Maximum batch I/Os to pick up at once */
+ if (count++ == MAX_BATCH_IO) {
+ break;
+ }
+ }
+ }
+
+ /* Completion Queue */
+ spdk_nvme_cq_signal_fd(nvme);
+
+ return count;
+}
+
+static int
+vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ uint64_t dbs_dma_addr, eis_dma_addr;
+
+ dbs_dma_addr = cmd->dptr.prp.prp1;
+ eis_dma_addr = cmd->dptr.prp.prp2;
+
+ if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) {
+ return -1;
+ }
+ /* Guest Physical Address to Host Virtual Address */
+ nvme->dbbuf_dbs = spdk_vhost_gpa_to_vva(&nvme->vdev, dbs_dma_addr, 4096);
+ nvme->dbbuf_eis = spdk_vhost_gpa_to_vva(&nvme->vdev, eis_dma_addr, 4096);
+ if (!nvme->dbbuf_dbs || !nvme->dbbuf_eis) {
+ return -1;
+ }
+	/* zero the doorbell buffer memory */
+ memset((void *)nvme->dbbuf_dbs, 0, 4096);
+ memset((void *)nvme->dbbuf_eis, 0, 4096);
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ return 0;
+}
+
+static int
+vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ uint16_t qid, qsize, cqid;
+ uint64_t dma_addr;
+ uint64_t requested_len;
+ struct spdk_vhost_nvme_cq *cq;
+ struct spdk_vhost_nvme_sq *sq;
+
+	/* the queue must be physically contiguous (PC bit) */
+ if (!(cmd->cdw11 & 0x1)) {
+ return -1;
+ }
+
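+	/* CDW10 carries the QID (bits 15:0) and the zero-based queue size
+	 * (bits 31:16); CDW11 carries the CQID in bits 31:16.
+	 */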
+ cqid = (cmd->cdw11 >> 16) & 0xffff;
+ qid = cmd->cdw10 & 0xffff;
+ qsize = (cmd->cdw10 >> 16) & 0xffff;
+ dma_addr = cmd->dptr.prp.prp1;
+ if (!dma_addr || dma_addr % 4096) {
+ return -1;
+ }
+
+ sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid);
+ cq = spdk_vhost_nvme_get_cq_from_qid(nvme, cqid);
+ if (!sq || !cq) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u or CQID %u\n",
+ qid, cqid);
+ cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+ return -1;
+ }
+
+ sq->sqid = qid;
+ sq->cqid = cqid;
+ sq->size = qsize + 1;
+ sq->sq_head = sq->sq_tail = 0;
+ requested_len = sizeof(struct spdk_nvme_cmd) * sq->size;
+ sq->sq_cmd = spdk_vhost_gpa_to_vva(&nvme->vdev, dma_addr, requested_len);
+ if (!sq->sq_cmd) {
+ return -1;
+ }
+ nvme->num_sqs++;
+ sq->valid = true;
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ return 0;
+}
+
+static int
+vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ uint16_t qid;
+ struct spdk_vhost_nvme_sq *sq;
+
+ qid = cmd->cdw10 & 0xffff;
+ sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid);
+ if (!sq) {
+ return -1;
+ }
+
+	/* We have not yet seen a submission queue deleted while I/O is still
+	 * running against it; if that ever happens, we must ensure the poller
+	 * no longer touches this submission queue.
+ */
+ nvme->num_sqs--;
+ sq->valid = false;
+
+ memset(sq, 0, sizeof(*sq));
+ sq->sq_cmd = NULL;
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+
+ return 0;
+}
+
+static int
+vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ uint16_t qsize, qid;
+ uint64_t dma_addr;
+ struct spdk_vhost_nvme_cq *cq;
+ uint64_t requested_len;
+
+	/* the queue must be physically contiguous (PC bit) */
+ if (!(cmd->cdw11 & 0x1)) {
+ return -1;
+ }
+
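+	/* CDW10 carries the QID (bits 15:0) and the zero-based queue size
+	 * (bits 31:16); bit 1 of CDW11 indicates whether interrupts are enabled.
+	 */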
+ qid = cmd->cdw10 & 0xffff;
+ qsize = (cmd->cdw10 >> 16) & 0xffff;
+ dma_addr = cmd->dptr.prp.prp1;
+ if (!dma_addr || dma_addr % 4096) {
+ return -1;
+ }
+
+ cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid);
+ if (!cq) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid);
+ cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+ return -1;
+ }
+ cq->cqid = qid;
+ cq->size = qsize + 1;
+ cq->phase = 1;
+ cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1;
+ /* Setup virq through vhost messages */
+ cq->virq = -1;
+ cq->cq_head = 0;
+ cq->guest_signaled_cq_head = 0;
+ cq->need_signaled_cnt = 0;
+ requested_len = sizeof(struct spdk_nvme_cpl) * cq->size;
+ cq->cq_cqe = spdk_vhost_gpa_to_vva(&nvme->vdev, dma_addr, requested_len);
+ if (!cq->cq_cqe) {
+ return -1;
+ }
+ nvme->num_cqs++;
+ cq->valid = true;
+ STAILQ_INIT(&cq->cq_full_waited_tasks);
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ return 0;
+}
+
+static int
+vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ uint16_t qid;
+ struct spdk_vhost_nvme_cq *cq;
+
+ qid = cmd->cdw10 & 0xffff;
+ cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid);
+ if (!cq) {
+ return -1;
+ }
+ nvme->num_cqs--;
+ cq->valid = false;
+
+ memset(cq, 0, sizeof(*cq));
+ cq->cq_cqe = NULL;
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ return 0;
+}
+
+static struct spdk_vhost_nvme_dev *
+spdk_vhost_nvme_get_by_name(int vid)
+{
+ struct spdk_vhost_nvme_dev *nvme;
+
+ TAILQ_FOREACH(nvme, &g_nvme_ctrlrs, tailq) {
+ if (nvme->vdev.vid == vid) {
+ return nvme;
+ }
+ }
+
+ return NULL;
+}
+
+int
+spdk_vhost_nvme_get_cap(int vid, uint64_t *cap)
+{
+ struct spdk_vhost_nvme_dev *nvme;
+
+ nvme = spdk_vhost_nvme_get_by_name(vid);
+ if (!nvme) {
+ return -1;
+ }
+
+ *cap = nvme->cap.raw;
+ return 0;
+}
+
+int
+spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf)
+{
+ struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd;
+ struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe;
+ struct spdk_vhost_nvme_ns *ns;
+ int ret = 0;
+ struct spdk_vhost_nvme_dev *nvme;
+ uint32_t cq_head, sq_tail;
+
+ nvme = spdk_vhost_nvme_get_by_name(vid);
+ if (!nvme) {
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc);
+ switch (req->opc) {
+ case SPDK_NVME_OPC_IDENTIFY:
+ if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) {
+ memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data));
+
+ } else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) {
+ ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, req->nsid);
+ if (!ns) {
+ cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE;
+ cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ break;
+ }
+ memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data));
+ }
+		/* completed successfully */
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ break;
+ case SPDK_NVME_OPC_CREATE_IO_CQ:
+ ret = vhost_nvme_create_io_cq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_DELETE_IO_CQ:
+ ret = vhost_nvme_delete_io_cq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_CREATE_IO_SQ:
+ ret = vhost_nvme_create_io_sq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_DELETE_IO_SQ:
+ ret = vhost_nvme_delete_io_sq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_GET_FEATURES:
+ case SPDK_NVME_OPC_SET_FEATURES:
+ if (req->cdw10 == SPDK_NVME_FEAT_NUMBER_OF_QUEUES) {
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ cpl->cdw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16);
+ } else {
+ cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+ }
+ break;
+ case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG:
+ ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_ABORT:
+ sq_tail = nvme->dbbuf_dbs[sq_offset(1, 1)] & 0xffffu;
+ cq_head = nvme->dbbuf_dbs[cq_offset(1, 1)] & 0xffffu;
+ SPDK_NOTICELOG("ABORT: CID %u, SQ_TAIL %u, CQ_HEAD %u\n",
+ (req->cdw10 >> 16) & 0xffffu, sq_tail, cq_head);
+		/* TODO: ABORT always fails for now */
+ cpl->cdw0 = 1;
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ break;
+ }
+
+ if (ret) {
+		SPDK_ERRLOG("Admin passthrough failed for opcode %u\n", req->opc);
+ }
+
+ return 0;
+}
+
+int
+spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd)
+{
+ struct spdk_vhost_nvme_dev *nvme;
+ struct spdk_vhost_nvme_cq *cq;
+
+ nvme = spdk_vhost_nvme_get_by_name(vid);
+ if (!nvme) {
+ return -1;
+ }
+
+ cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid);
+ if (!cq) {
+ return -1;
+ }
+ if (cq->irq_enabled) {
+ cq->virq = fd;
+ } else {
+		SPDK_ERRLOG("NVMe QID %d has interrupts disabled\n", qid);
+ }
+
+ return 0;
+}
+
+static void
+free_task_pool(struct spdk_vhost_nvme_dev *nvme)
+{
+ struct spdk_vhost_nvme_task *task;
+
+ while (!STAILQ_EMPTY(&nvme->free_tasks)) {
+ task = STAILQ_FIRST(&nvme->free_tasks);
+ STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+ spdk_dma_free(task);
+ }
+}
+
+static int
+alloc_task_pool(struct spdk_vhost_nvme_dev *nvme)
+{
+ uint32_t entries, i;
+ struct spdk_vhost_nvme_task *task;
+
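+	/* Pre-allocate one task per possible queue entry across all I/O queues. */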
+ entries = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED;
+
+ for (i = 0; i < entries; i++) {
+ task = spdk_dma_zmalloc(sizeof(struct spdk_vhost_nvme_task),
+ SPDK_CACHE_LINE_SIZE, NULL);
+ if (task == NULL) {
+			SPDK_ERRLOG("Controller %s: task pool allocation failed\n",
+				    nvme->vdev.name);
+ free_task_pool(nvme);
+ return -1;
+ }
+ STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+ }
+
+ return 0;
+}
+
+/* A new device event enables the virtual NVMe controller. */
+static int
+spdk_vhost_nvme_start_device(struct spdk_vhost_dev *vdev, void *event_ctx)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_ns *ns_dev;
+ uint32_t i;
+
+ if (nvme == NULL) {
+ return -1;
+ }
+
+ if (alloc_task_pool(nvme)) {
+ return -1;
+ }
+
+ SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vdev->vid,
+ vdev->path, vdev->lcore);
+
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns_dev = &nvme->ns[i];
+ ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc);
+ if (!ns_dev->bdev_io_channel) {
+ return -1;
+ }
+ }
+
+ /* Start the NVMe Poller */
+ nvme->requestq_poller = spdk_poller_register(nvme_worker, nvme, 0);
+
+ spdk_vhost_dev_backend_event_done(event_ctx, 0);
+ return 0;
+}
+
+static void
+spdk_vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns)
+{
+ ns->active_ns = 0;
+ spdk_bdev_close(ns->bdev_desc);
+ ns->bdev_desc = NULL;
+ ns->bdev = NULL;
+}
+
+static void
+bdev_remove_cb(void *remove_ctx)
+{
+ struct spdk_vhost_nvme_ns *ns = remove_ctx;
+
+ SPDK_NOTICELOG("Removing NS %u, Block Device %s\n",
+ ns->nsid, spdk_bdev_get_name(ns->bdev));
+
+ spdk_vhost_nvme_deactive_ns(ns);
+}
+
+static int
+destroy_device_poller_cb(void *arg)
+{
+ struct spdk_vhost_nvme_dev *nvme = arg;
+ struct spdk_vhost_nvme_dev *dev, *tmp;
+ struct spdk_vhost_nvme_ns *ns_dev;
+ uint32_t i;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n");
+
+ TAILQ_FOREACH_SAFE(dev, &g_nvme_ctrlrs, tailq, tmp) {
+ if (dev == nvme) {
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns_dev = &nvme->ns[i];
+ if (ns_dev->bdev_io_channel) {
+ spdk_put_io_channel(ns_dev->bdev_io_channel);
+ ns_dev->bdev_io_channel = NULL;
+ }
+ }
+ nvme->num_sqs = 0;
+ nvme->num_cqs = 0;
+ nvme->dbbuf_dbs = NULL;
+ nvme->dbbuf_eis = NULL;
+ }
+ }
+
+ spdk_poller_unregister(&nvme->destroy_ctx.poller);
+ spdk_vhost_dev_backend_event_done(nvme->destroy_ctx.event_ctx, 0);
+
+ return -1;
+}
+
+/* Disable the virtual NVMe controller. */
+static int
+spdk_vhost_nvme_stop_device(struct spdk_vhost_dev *vdev, void *event_ctx)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+
+ if (nvme == NULL) {
+ return -1;
+ }
+
+ free_task_pool(nvme);
+ SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vdev->vid, vdev->path);
+
+ nvme->destroy_ctx.event_ctx = event_ctx;
+ spdk_poller_unregister(&nvme->requestq_poller);
+ nvme->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb, nvme, 1000);
+
+ return 0;
+}
+
+static void
+spdk_vhost_nvme_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_ns *ns_dev;
+ uint32_t i;
+
+ if (nvme == NULL) {
+ return;
+ }
+
+ spdk_json_write_named_array_begin(w, "namespaces");
+
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns_dev = &nvme->ns[i];
+ if (!ns_dev->active_ns) {
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_uint32(w, "nsid", ns_dev->nsid);
+ spdk_json_write_named_string(w, "bdev", spdk_bdev_get_name(ns_dev->bdev));
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_array_end(w);
+}
+
+static void
+spdk_vhost_nvme_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_ns *ns_dev;
+ uint32_t i;
+
+ if (nvme == NULL) {
+ return;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "construct_vhost_nvme_controller");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name);
+ spdk_json_write_named_uint32(w, "io_queues", nvme->num_io_queues);
+ spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(nvme->vdev.cpumask));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns_dev = &nvme->ns[i];
+ if (!ns_dev->active_ns) {
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "add_vhost_nvme_ns");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name);
+ spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(ns_dev->bdev));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+}
+
+static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = {
+ .start_device = spdk_vhost_nvme_start_device,
+ .stop_device = spdk_vhost_nvme_stop_device,
+ .dump_info_json = spdk_vhost_nvme_dump_info_json,
+ .write_config_json = spdk_vhost_nvme_write_config_json,
+ .remove_device = spdk_vhost_nvme_dev_remove,
+};
+
+static int
+spdk_vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev)
+{
+ struct spdk_nvme_ctrlr_data *cdata = &dev->cdata;
+ struct spdk_nvme_ns_data *nsdata;
+ uint64_t num_blocks;
+ uint32_t i;
+
+ /* Identify Namespace */
+ cdata->nn = dev->num_ns;
+ for (i = 0; i < dev->num_ns; i++) {
+ nsdata = &dev->ns[i].nsdata;
+ if (dev->ns[i].active_ns) {
+ num_blocks = spdk_bdev_get_num_blocks(dev->ns[i].bdev);
+ nsdata->nsze = num_blocks;
+ /* ncap must be non-zero for active Namespace */
+ nsdata->ncap = num_blocks;
+ nsdata->nuse = num_blocks;
+ nsdata->nlbaf = 0;
+ nsdata->flbas.format = 0;
+ nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(dev->ns[i].bdev));
+ nsdata->noiob = spdk_bdev_get_optimal_io_boundary(dev->ns[i].bdev);
+ dev->ns[i].block_size = spdk_bdev_get_block_size(dev->ns[i].bdev);
+ dev->ns[i].capacity = num_blocks * dev->ns[i].block_size;
+ } else {
+ memset(nsdata, 0, sizeof(*nsdata));
+ }
+ }
+ return 0;
+}
+
+static int
+spdk_vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev)
+{
+ struct spdk_nvme_ctrlr_data *cdata = &dev->cdata;
+ char sn[20];
+
+ /* Controller Capabilities */
+ dev->cap.bits.cqr = 1;
+ dev->cap.bits.to = 1;
+ dev->cap.bits.dstrd = 0;
+ dev->cap.bits.css = SPDK_NVME_CAP_CSS_NVM;
+ dev->cap.bits.mpsmin = 0;
+ dev->cap.bits.mpsmax = 0;
+	/* MQES is a 0-based value */
+ dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1;
+
+ /* Controller Configuration */
+ dev->cc.bits.en = 0;
+
+ /* Controller Status */
+ dev->csts.bits.rdy = 0;
+
+ /* Identify Controller */
+ spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' ');
+ cdata->vid = 0x8086;
+ cdata->ssvid = 0x8086;
+ spdk_strcpy_pad(cdata->mn, "SPDK Virtual NVMe Controller", sizeof(cdata->mn), ' ');
+ snprintf(sn, sizeof(sn), "NVMe_%s", dev->vdev.name);
+ spdk_strcpy_pad(cdata->sn, sn, sizeof(cdata->sn), ' ');
+ cdata->ieee[0] = 0xe4;
+ cdata->ieee[1] = 0xd2;
+ cdata->ieee[2] = 0x5c;
+ cdata->ver.bits.mjr = 1;
+ cdata->ver.bits.mnr = 0;
+ cdata->mdts = 5; /* 128 KiB */
+ cdata->rab = 6;
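+	/* Entry sizes are log2 values: 2^6 = 64-byte SQ entries, 2^4 = 16-byte CQ entries. */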
+ cdata->sqes.min = 6;
+ cdata->sqes.max = 6;
+ cdata->cqes.min = 4;
+ cdata->cqes.max = 4;
+ cdata->oncs.dsm = 1;
+ /* Emulated NVMe controller */
+ cdata->oacs.doorbell_buffer_config = 1;
+
+ spdk_vhost_nvme_ns_identify_update(dev);
+
+ return 0;
+}
+
+int
+spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues)
+{
+ struct spdk_vhost_nvme_dev *dev = spdk_dma_zmalloc(sizeof(struct spdk_vhost_nvme_dev),
+ SPDK_CACHE_LINE_SIZE, NULL);
+ int rc;
+
+ if (dev == NULL) {
+ return -ENOMEM;
+ }
+
+ if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) {
+ spdk_dma_free(dev);
+ return -EINVAL;
+ }
+
+ spdk_vhost_lock();
+ rc = spdk_vhost_dev_register(&dev->vdev, name, cpumask,
+ &spdk_vhost_nvme_device_backend);
+
+ if (rc) {
+ spdk_dma_free(dev);
+ spdk_vhost_unlock();
+ return rc;
+ }
+
+ dev->num_io_queues = num_io_queues;
+ STAILQ_INIT(&dev->free_tasks);
+ TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq);
+
+ spdk_vhost_nvme_ctrlr_identify_update(dev);
+
+ SPDK_NOTICELOG("Controller %s: Constructed\n", name);
+ spdk_vhost_unlock();
+ return rc;
+}
+
+int
+spdk_vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_dev *dev, *tmp;
+ struct spdk_vhost_nvme_ns *ns;
+ int rc;
+ uint32_t i;
+
+ if (nvme == NULL) {
+ return -EINVAL;
+ }
+
+ TAILQ_FOREACH_SAFE(dev, &g_nvme_ctrlrs, tailq, tmp) {
+ if (dev == nvme) {
+ TAILQ_REMOVE(&g_nvme_ctrlrs, dev, tailq);
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns = &nvme->ns[i];
+ if (ns->active_ns) {
+ spdk_vhost_nvme_deactive_ns(ns);
+ }
+ }
+ }
+ }
+
+ rc = spdk_vhost_dev_unregister(vdev);
+ if (rc != 0) {
+ return rc;
+ }
+
+ spdk_dma_free(nvme);
+ return 0;
+}
+
+int
+spdk_vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_ns *ns;
+ struct spdk_bdev *bdev;
+ int rc = -1;
+
+ if (nvme == NULL) {
+ return -ENODEV;
+ }
+
+ if (nvme->num_ns == MAX_NAMESPACE) {
+ SPDK_ERRLOG("Can't support %d Namespaces\n", nvme->num_ns);
+ return -ENOSPC;
+ }
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (!bdev) {
+ SPDK_ERRLOG("could not find bdev %s\n", bdev_name);
+ return -ENODEV;
+ }
+
+ ns = &nvme->ns[nvme->num_ns];
+ rc = spdk_bdev_open(bdev, true, bdev_remove_cb, ns, &nvme->ns[nvme->num_ns].bdev_desc);
+ if (rc != 0) {
+ SPDK_ERRLOG("Could not open bdev '%s', error=%d\n",
+ bdev_name, rc);
+ return rc;
+ }
+
+ nvme->ns[nvme->num_ns].bdev = bdev;
+ nvme->ns[nvme->num_ns].active_ns = 1;
+ nvme->ns[nvme->num_ns].nsid = nvme->num_ns + 1;
+ nvme->num_ns++;
+
+ spdk_vhost_nvme_ns_identify_update(nvme);
+
+ return rc;
+}
+
+int
+spdk_vhost_nvme_controller_construct(void)
+{
+ struct spdk_conf_section *sp;
+ const char *name;
+ const char *bdev_name;
+ const char *cpumask;
+ int rc, i = 0;
+ struct spdk_vhost_dev *vdev;
+ uint32_t ctrlr_num, io_queues;
+
+ for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
+ if (!spdk_conf_section_match_prefix(sp, "VhostNvme")) {
+ continue;
+ }
+
+ if (sscanf(spdk_conf_section_get_name(sp), "VhostNvme%u", &ctrlr_num) != 1) {
+ SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+ spdk_conf_section_get_name(sp));
+ return -1;
+ }
+
+ name = spdk_conf_section_get_val(sp, "Name");
+ if (name == NULL) {
+ SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num);
+ return -1;
+ }
+
+ cpumask = spdk_conf_section_get_val(sp, "Cpumask");
+ rc = spdk_conf_section_get_intval(sp, "NumberOfQueues");
+ if (rc > 0) {
+ io_queues = rc;
+ } else {
+ io_queues = 1;
+ }
+
+ rc = spdk_vhost_nvme_dev_construct(name, cpumask, io_queues);
+ if (rc < 0) {
+ SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num);
+ return -1;
+ }
+
+ vdev = spdk_vhost_dev_find(name);
+ if (!vdev) {
+ return -1;
+ }
+
+ for (i = 0; spdk_conf_section_get_nval(sp, "Namespace", i) != NULL; i++) {
+ bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0);
+ if (!bdev_name) {
+ SPDK_ERRLOG("namespace configuration missing bdev name\n");
+ break;
+ }
+ rc = spdk_vhost_nvme_dev_add_ns(vdev, bdev_name);
+ if (rc < 0) {
+				SPDK_WARNLOG("VhostNvme%u: constructing namespace with bdev %s failed\n",
+					     ctrlr_num, bdev_name);
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME)
diff --git a/src/spdk/lib/vhost/vhost_rpc.c b/src/spdk/lib/vhost/vhost_rpc.c
new file mode 100644
index 00000000..0e546c36
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_rpc.c
@@ -0,0 +1,814 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/log.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/env.h"
+
+#include "spdk/scsi.h"
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+#include "spdk/bdev.h"
+
+struct rpc_vhost_scsi_ctrlr {
+ char *ctrlr;
+ char *cpumask;
+};
+
+static void
+free_rpc_vhost_scsi_ctrlr(struct rpc_vhost_scsi_ctrlr *req)
+{
+ free(req->ctrlr);
+ free(req->cpumask);
+}
+
+static const struct spdk_json_object_decoder rpc_construct_vhost_ctrlr[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr, ctrlr), spdk_json_decode_string },
+ {"cpumask", offsetof(struct rpc_vhost_scsi_ctrlr, cpumask), spdk_json_decode_string, true},
+};
+
+static void
+spdk_rpc_construct_vhost_scsi_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_scsi_ctrlr req = {0};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_vhost_ctrlr,
+ SPDK_COUNTOF(rpc_construct_vhost_ctrlr),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_scsi_dev_construct(req.ctrlr, req.cpumask);
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_vhost_scsi_ctrlr(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ if (w == NULL) {
+ return;
+ }
+
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_vhost_scsi_ctrlr(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("construct_vhost_scsi_controller", spdk_rpc_construct_vhost_scsi_controller,
+ SPDK_RPC_RUNTIME)
+
+struct rpc_add_vhost_scsi_ctrlr_lun {
+ char *ctrlr;
+ uint32_t scsi_target_num;
+ char *bdev_name;
+
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+free_rpc_add_vhost_scsi_ctrlr_lun(struct rpc_add_vhost_scsi_ctrlr_lun *req)
+{
+ free(req->ctrlr);
+ free(req->bdev_name);
+ free(req);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_add_lun[] = {
+ {"ctrlr", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, ctrlr), spdk_json_decode_string },
+ {"scsi_target_num", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, scsi_target_num), spdk_json_decode_uint32},
+ {"bdev_name", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, bdev_name), spdk_json_decode_string },
+};
+
+static int
+spdk_rpc_add_vhost_scsi_lun_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct rpc_add_vhost_scsi_ctrlr_lun *rpc = arg;
+ struct spdk_jsonrpc_request *request = rpc->request;
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (vdev == NULL) {
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_scsi_dev_add_tgt(vdev, rpc->scsi_target_num, rpc->bdev_name);
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_add_vhost_scsi_ctrlr_lun(rpc);
+
+ w = spdk_jsonrpc_begin_result(request);
+ if (w == NULL) {
+ return -1;
+ }
+
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return 0;
+
+invalid:
+ free_rpc_add_vhost_scsi_ctrlr_lun(rpc);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+ return rc;
+}
+
+static void
+spdk_rpc_add_vhost_scsi_lun(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_add_vhost_scsi_ctrlr_lun *req;
+ int rc;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto invalid;
+ }
+
+ req->request = request;
+ if (spdk_json_decode_object(params, rpc_vhost_add_lun,
+ SPDK_COUNTOF(rpc_vhost_add_lun),
+ req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ if (req->ctrlr == NULL) {
+ SPDK_ERRLOG("No controller name\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_add_vhost_scsi_lun_cb, req);
+
+ return;
+
+invalid:
+ if (req) {
+ free_rpc_add_vhost_scsi_ctrlr_lun(req);
+ }
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("add_vhost_scsi_lun", spdk_rpc_add_vhost_scsi_lun, SPDK_RPC_RUNTIME)
+
+struct rpc_remove_vhost_scsi_ctrlr_target {
+ char *ctrlr;
+ uint32_t scsi_target_num;
+
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+free_rpc_remove_vhost_scsi_ctrlr_target(struct rpc_remove_vhost_scsi_ctrlr_target *req)
+{
+ free(req->ctrlr);
+ free(req);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_remove_target[] = {
+ {"ctrlr", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, ctrlr), spdk_json_decode_string },
+ {"scsi_target_num", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, scsi_target_num), spdk_json_decode_uint32},
+};
+
+static int
+spdk_rpc_remove_vhost_scsi_target_finish_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct rpc_remove_vhost_scsi_ctrlr_target *rpc = arg;
+ struct spdk_jsonrpc_request *request = rpc->request;
+ struct spdk_json_write_ctx *w;
+
+ free_rpc_remove_vhost_scsi_ctrlr_target(rpc);
+
+ w = spdk_jsonrpc_begin_result(request);
+ if (w == NULL) {
+ return -1;
+ }
+
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return 0;
+}
+
+static int
+spdk_rpc_remove_vhost_scsi_target_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct rpc_remove_vhost_scsi_ctrlr_target *rpc = arg;
+ struct spdk_jsonrpc_request *request = rpc->request;
+ int rc;
+
+ if (vdev == NULL) {
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_scsi_dev_remove_tgt(vdev, rpc->scsi_target_num,
+ spdk_rpc_remove_vhost_scsi_target_finish_cb, rpc);
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ return 0;
+
+invalid:
+ free_rpc_remove_vhost_scsi_ctrlr_target(rpc);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc));
+ return rc;
+}
+
+static void
+spdk_rpc_remove_vhost_scsi_target(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_remove_vhost_scsi_ctrlr_target *req;
+ int rc;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto invalid;
+ }
+
+ req->request = request;
+ if (spdk_json_decode_object(params, rpc_vhost_remove_target,
+ SPDK_COUNTOF(rpc_vhost_remove_target),
+ req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_remove_vhost_scsi_target_cb, req);
+
+ return;
+
+invalid:
+ if (req) {
+ free_rpc_remove_vhost_scsi_ctrlr_target(req);
+ }
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+
+SPDK_RPC_REGISTER("remove_vhost_scsi_target", spdk_rpc_remove_vhost_scsi_target, SPDK_RPC_RUNTIME)
+
+struct rpc_vhost_blk_ctrlr {
+ char *ctrlr;
+ char *dev_name;
+ char *cpumask;
+ bool readonly;
+};
+
+static const struct spdk_json_object_decoder rpc_construct_vhost_blk_ctrlr[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_blk_ctrlr, ctrlr), spdk_json_decode_string },
+ {"dev_name", offsetof(struct rpc_vhost_blk_ctrlr, dev_name), spdk_json_decode_string },
+ {"cpumask", offsetof(struct rpc_vhost_blk_ctrlr, cpumask), spdk_json_decode_string, true},
+ {"readonly", offsetof(struct rpc_vhost_blk_ctrlr, readonly), spdk_json_decode_bool, true},
+};
+
+static void
+free_rpc_vhost_blk_ctrlr(struct rpc_vhost_blk_ctrlr *req)
+{
+ free(req->ctrlr);
+ free(req->dev_name);
+ free(req->cpumask);
+}
+
+static void
+spdk_rpc_construct_vhost_blk_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_blk_ctrlr req = {0};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_vhost_blk_ctrlr,
+ SPDK_COUNTOF(rpc_construct_vhost_blk_ctrlr),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_blk_construct(req.ctrlr, req.cpumask, req.dev_name, req.readonly);
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_vhost_blk_ctrlr(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ if (w == NULL) {
+ return;
+ }
+
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_vhost_blk_ctrlr(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+
+}
+SPDK_RPC_REGISTER("construct_vhost_blk_controller", spdk_rpc_construct_vhost_blk_controller,
+ SPDK_RPC_RUNTIME)
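+/*
+ * Example JSON-RPC parameters for construct_vhost_blk_controller ("cpumask"
+ * and "readonly" are optional; all values shown are illustrative only):
+ *   {"ctrlr": "vhost.1", "dev_name": "Malloc1", "cpumask": "0x2", "readonly": false}
+ */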
+
+struct rpc_remove_vhost_ctrlr {
+ char *ctrlr;
+
+ struct spdk_jsonrpc_request *request;
+};
+
+static const struct spdk_json_object_decoder rpc_remove_vhost_ctrlr[] = {
+ {"ctrlr", offsetof(struct rpc_remove_vhost_ctrlr, ctrlr), spdk_json_decode_string },
+};
+
+static void
+free_rpc_remove_vhost_ctrlr(struct rpc_remove_vhost_ctrlr *req)
+{
+ free(req->ctrlr);
+ free(req);
+}
+
+static int
+spdk_rpc_remove_vhost_controller_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct rpc_remove_vhost_ctrlr *ctx = arg;
+ struct spdk_jsonrpc_request *request = ctx->request;
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (vdev == NULL) {
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_dev_remove(vdev);
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_remove_vhost_ctrlr(ctx);
+
+ w = spdk_jsonrpc_begin_result(request);
+ if (w == NULL) {
+ return 0;
+ }
+
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return 0;
+
+invalid:
+ free_rpc_remove_vhost_ctrlr(ctx);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+ return -1;
+}
+
+static void
+spdk_rpc_remove_vhost_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_remove_vhost_ctrlr *req;
+ int rc;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto invalid;
+ }
+
+ req->request = request;
+ if (spdk_json_decode_object(params, rpc_remove_vhost_ctrlr,
+ SPDK_COUNTOF(rpc_remove_vhost_ctrlr), req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_remove_vhost_controller_cb, req);
+ return;
+
+invalid:
+ if (req) {
+ free_rpc_remove_vhost_ctrlr(req);
+ }
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+
+}
+SPDK_RPC_REGISTER("remove_vhost_controller", spdk_rpc_remove_vhost_controller, SPDK_RPC_RUNTIME)
+
+struct rpc_get_vhost_ctrlrs {
+ char *name;
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+_spdk_rpc_get_vhost_controller(struct spdk_json_write_ctx *w, struct spdk_vhost_dev *vdev)
+{
+ uint32_t delay_base_us, iops_threshold;
+
+ spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "ctrlr", spdk_vhost_dev_get_name(vdev));
+ spdk_json_write_named_string_fmt(w, "cpumask", "0x%s", spdk_cpuset_fmt(vdev->cpumask));
+ spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us);
+ spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold);
+ spdk_json_write_named_string(w, "socket", vdev->path);
+
+ spdk_json_write_named_object_begin(w, "backend_specific");
+ spdk_vhost_dump_info_json(vdev, w);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static int
+spdk_rpc_get_vhost_controllers_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct rpc_get_vhost_ctrlrs *ctx = arg;
+
+ assert(ctx->name == NULL);
+
+ if (vdev == NULL) {
+ spdk_json_write_array_end(ctx->w);
+ spdk_jsonrpc_end_result(ctx->request, ctx->w);
+ free(ctx);
+ return 0;
+ }
+
+ _spdk_rpc_get_vhost_controller(ctx->w, vdev);
+ return 0;
+}
+
+static int
+spdk_rpc_get_vhost_controller_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct rpc_get_vhost_ctrlrs *ctx = arg;
+
+ assert(ctx->name != NULL);
+
+ if (vdev == NULL) {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(ENODEV));
+ goto free_name_ctx;
+ }
+
+ ctx->w = spdk_jsonrpc_begin_result(ctx->request);
+ if (ctx->w == NULL) {
+ goto free_name_ctx;
+ }
+
+ spdk_json_write_array_begin(ctx->w);
+ _spdk_rpc_get_vhost_controller(ctx->w, vdev);
+ spdk_json_write_array_end(ctx->w);
+
+ spdk_jsonrpc_end_result(ctx->request, ctx->w);
+
+free_name_ctx:
+ free(ctx->name);
+ free(ctx);
+ return 0;
+}
+
+static const struct spdk_json_object_decoder rpc_get_vhost_ctrlrs_decoders[] = {
+ {"name", offsetof(struct rpc_get_vhost_ctrlrs, name), spdk_json_decode_string, true},
+};
+
+static void
+spdk_rpc_get_vhost_controllers(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_get_vhost_ctrlrs *ctx;
+ struct spdk_json_write_ctx *w;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(ENOMEM));
+ return;
+ }
+
+ if (params && spdk_json_decode_object(params, rpc_get_vhost_ctrlrs_decoders,
+ SPDK_COUNTOF(rpc_get_vhost_ctrlrs_decoders), ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ free(ctx);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ if (ctx->name) {
+ ctx->request = request;
+ spdk_vhost_call_external_event(ctx->name, spdk_rpc_get_vhost_controller_cb, ctx);
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ if (w == NULL) {
+ free(ctx);
+ return;
+ }
+
+ spdk_json_write_array_begin(w);
+
+ ctx->w = w;
+ ctx->request = request;
+ spdk_vhost_call_external_event_foreach(spdk_rpc_get_vhost_controllers_cb, ctx);
+}
+SPDK_RPC_REGISTER("get_vhost_controllers", spdk_rpc_get_vhost_controllers, SPDK_RPC_RUNTIME)
+
+
+struct rpc_vhost_ctrlr_coalescing {
+ char *ctrlr;
+ uint32_t delay_base_us;
+ uint32_t iops_threshold;
+ struct spdk_jsonrpc_request *request;
+};
+
+static const struct spdk_json_object_decoder rpc_set_vhost_ctrlr_coalescing[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_ctrlr_coalescing, ctrlr), spdk_json_decode_string },
+ {"delay_base_us", offsetof(struct rpc_vhost_ctrlr_coalescing, delay_base_us), spdk_json_decode_uint32},
+ {"iops_threshold", offsetof(struct rpc_vhost_ctrlr_coalescing, iops_threshold), spdk_json_decode_uint32},
+};
+
+static void
+free_rpc_set_vhost_controllers_event_coalescing(struct rpc_vhost_ctrlr_coalescing *req)
+{
+ if (!req) {
+ return;
+ }
+
+ free(req->ctrlr);
+ free(req);
+}
+
+static int
+spdk_rpc_set_vhost_controller_coalescing_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct rpc_vhost_ctrlr_coalescing *req = arg;
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (vdev == NULL) {
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_set_coalescing(vdev, req->delay_base_us, req->iops_threshold);
+ if (rc) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(req->request);
+ if (w != NULL) {
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(req->request, w);
+ }
+
+ free_rpc_set_vhost_controllers_event_coalescing(req);
+ return 0;
+
+invalid:
+ spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+ free_rpc_set_vhost_controllers_event_coalescing(req);
+ return 0;
+}
+
+static void
+spdk_rpc_set_vhost_controller_coalescing(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_ctrlr_coalescing *req;
+ int rc;
+
+ req = calloc(1, sizeof(struct rpc_vhost_ctrlr_coalescing));
+ if (!req) {
+ rc = -ENOMEM;
+ goto invalid;
+ }
+
+ if (spdk_json_decode_object(params, rpc_set_vhost_ctrlr_coalescing,
+ SPDK_COUNTOF(rpc_set_vhost_ctrlr_coalescing), req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ req->request = request;
+ spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_set_vhost_controller_coalescing_cb, req);
+ return;
+
+invalid:
+ free_rpc_set_vhost_controllers_event_coalescing(req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("set_vhost_controller_coalescing", spdk_rpc_set_vhost_controller_coalescing,
+ SPDK_RPC_RUNTIME)
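+/*
+ * Example JSON-RPC parameters for set_vhost_controller_coalescing (values are
+ * illustrative only):
+ *   {"ctrlr": "vhost.0", "delay_base_us": 100, "iops_threshold": 60000}
+ */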
+
+struct rpc_vhost_nvme_ctrlr {
+ char *ctrlr;
+ uint32_t io_queues;
+ char *cpumask;
+};
+
+static const struct spdk_json_object_decoder rpc_construct_vhost_nvme_ctrlr[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr, ctrlr), spdk_json_decode_string },
+ {"io_queues", offsetof(struct rpc_vhost_nvme_ctrlr, io_queues), spdk_json_decode_uint32},
+ {"cpumask", offsetof(struct rpc_vhost_nvme_ctrlr, cpumask), spdk_json_decode_string, true},
+};
+
+static void
+free_rpc_vhost_nvme_ctrlr(struct rpc_vhost_nvme_ctrlr *req)
+{
+ free(req->ctrlr);
+ free(req->cpumask);
+}
+
+static void
+spdk_rpc_construct_vhost_nvme_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_nvme_ctrlr req = {0};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_vhost_nvme_ctrlr,
+ SPDK_COUNTOF(rpc_construct_vhost_nvme_ctrlr),
+ &req)) {
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_nvme_dev_construct(req.ctrlr, req.cpumask, req.io_queues);
+ if (rc < 0) {
+ free_rpc_vhost_nvme_ctrlr(&req);
+ goto invalid;
+ }
+
+ free_rpc_vhost_nvme_ctrlr(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ if (w == NULL) {
+ return;
+ }
+
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+
+}
+SPDK_RPC_REGISTER("construct_vhost_nvme_controller", spdk_rpc_construct_vhost_nvme_controller,
+ SPDK_RPC_RUNTIME)
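+/*
+ * Example JSON-RPC parameters for construct_vhost_nvme_controller (values are
+ * illustrative only):
+ *   {"ctrlr": "vhost.nvme.0", "io_queues": 4, "cpumask": "0x1"}
+ */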
+
+struct rpc_add_vhost_nvme_ctrlr_ns {
+ char *ctrlr;
+ char *bdev_name;
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+free_rpc_add_vhost_nvme_ctrlr_ns(struct rpc_add_vhost_nvme_ctrlr_ns *req)
+{
+ free(req->ctrlr);
+ free(req->bdev_name);
+ free(req);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_nvme_add_ns[] = {
+ {"ctrlr", offsetof(struct rpc_add_vhost_nvme_ctrlr_ns, ctrlr), spdk_json_decode_string },
+ {"bdev_name", offsetof(struct rpc_add_vhost_nvme_ctrlr_ns, bdev_name), spdk_json_decode_string },
+};
+
+static int
+spdk_rpc_add_vhost_nvme_ns_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct rpc_add_vhost_nvme_ctrlr_ns *rpc = arg;
+ struct spdk_jsonrpc_request *request = rpc->request;
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (vdev == NULL) {
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_nvme_dev_add_ns(vdev, rpc->bdev_name);
+ if (rc < 0) {
+ goto invalid;
+ }
+ free_rpc_add_vhost_nvme_ctrlr_ns(rpc);
+
+ w = spdk_jsonrpc_begin_result(request);
+ if (w == NULL) {
+ return -1;
+ }
+
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return 0;
+
+invalid:
+ free_rpc_add_vhost_nvme_ctrlr_ns(rpc);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+ return rc;
+}
+
+static void
+spdk_rpc_add_vhost_nvme_ns(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_add_vhost_nvme_ctrlr_ns *req;
+ int rc;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto invalid;
+ }
+
+ req->request = request;
+ if (spdk_json_decode_object(params, rpc_vhost_nvme_add_ns,
+ SPDK_COUNTOF(rpc_vhost_nvme_add_ns),
+ req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_add_vhost_nvme_ns_cb, req);
+ return;
+
+invalid:
+ if (req) {
+ free_rpc_add_vhost_nvme_ctrlr_ns(req);
+ }
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("add_vhost_nvme_ns", spdk_rpc_add_vhost_nvme_ns, SPDK_RPC_RUNTIME)
+
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_rpc", SPDK_LOG_VHOST_RPC)
diff --git a/src/spdk/lib/vhost/vhost_scsi.c b/src/spdk/lib/vhost/vhost_scsi.c
new file mode 100644
index 00000000..aefa4c45
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_scsi.c
@@ -0,0 +1,1271 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include <linux/virtio_scsi.h>
+
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/scsi.h"
+#include "spdk/scsi_spec.h"
+#include "spdk/conf.h"
+#include "spdk/event.h"
+#include "spdk/util.h"
+#include "spdk/likely.h"
+
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+
+/* Features supported by SPDK VHOST lib. */
+#define SPDK_VHOST_SCSI_FEATURES (SPDK_VHOST_FEATURES | \
+ (1ULL << VIRTIO_SCSI_F_INOUT) | \
+ (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \
+ (1ULL << VIRTIO_SCSI_F_CHANGE ) | \
+ (1ULL << VIRTIO_SCSI_F_T10_PI ))
+
+/* Features that are specified in VIRTIO SCSI but currently not supported:
+ * - Live migration
+ * - T10 PI
+ */
+#define SPDK_VHOST_SCSI_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
+ (1ULL << VIRTIO_SCSI_F_T10_PI ))
+
+#define MGMT_POLL_PERIOD_US (1000 * 5)
+
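+/*
+ * Virtio-scsi defines a fixed virtqueue layout: queue 0 is the control queue,
+ * queue 1 is the event queue, and queues 2 and above are request (I/O) queues.
+ */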
+#define VIRTIO_SCSI_CONTROLQ 0
+#define VIRTIO_SCSI_EVENTQ 1
+#define VIRTIO_SCSI_REQUESTQ 2
+
+struct spdk_scsi_dev_vhost_state {
+ bool removed;
+ spdk_vhost_event_fn remove_cb;
+ void *remove_ctx;
+};
+
+struct spdk_vhost_scsi_dev {
+ struct spdk_vhost_dev vdev;
+ struct spdk_scsi_dev *scsi_dev[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS];
+ struct spdk_scsi_dev_vhost_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS];
+
+ struct spdk_poller *requestq_poller;
+ struct spdk_poller *mgmt_poller;
+ struct spdk_vhost_dev_destroy_ctx destroy_ctx;
+} __rte_cache_aligned;
+
+struct spdk_vhost_scsi_task {
+ struct spdk_scsi_task scsi;
+ struct iovec iovs[SPDK_VHOST_IOVS_MAX];
+
+ union {
+ struct virtio_scsi_cmd_resp *resp;
+ struct virtio_scsi_ctrl_tmf_resp *tmf_resp;
+ };
+
+ struct spdk_vhost_scsi_dev *svdev;
+ struct spdk_scsi_dev *scsi_dev;
+
+ /** Number of bytes that were written. */
+ uint32_t used_len;
+
+ int req_idx;
+
+ /* If set, the task is currently used for I/O processing. */
+ bool used;
+
+ struct spdk_vhost_virtqueue *vq;
+};
+
+static int spdk_vhost_scsi_start(struct spdk_vhost_dev *, void *);
+static int spdk_vhost_scsi_stop(struct spdk_vhost_dev *, void *);
+static void spdk_vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev,
+ struct spdk_json_write_ctx *w);
+static void spdk_vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev,
+ struct spdk_json_write_ctx *w);
+static int spdk_vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev);
+
+const struct spdk_vhost_dev_backend spdk_vhost_scsi_device_backend = {
+ .virtio_features = SPDK_VHOST_SCSI_FEATURES,
+ .disabled_features = SPDK_VHOST_SCSI_DISABLED_FEATURES,
+ .start_device = spdk_vhost_scsi_start,
+ .stop_device = spdk_vhost_scsi_stop,
+ .dump_info_json = spdk_vhost_scsi_dump_info_json,
+ .write_config_json = spdk_vhost_scsi_write_config_json,
+ .remove_device = spdk_vhost_scsi_dev_remove,
+};
+
+static void
+spdk_vhost_scsi_task_put(struct spdk_vhost_scsi_task *task)
+{
+ spdk_scsi_task_put(&task->scsi);
+}
+
+static void
+spdk_vhost_scsi_task_free_cb(struct spdk_scsi_task *scsi_task)
+{
+ struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi);
+
+ assert(task->svdev->vdev.task_cnt > 0);
+ task->svdev->vdev.task_cnt--;
+ task->used = false;
+}
+
+static void
+process_removed_devs(struct spdk_vhost_scsi_dev *svdev)
+{
+ struct spdk_scsi_dev *dev;
+ struct spdk_scsi_dev_vhost_state *state;
+ int i;
+
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) {
+ dev = svdev->scsi_dev[i];
+ state = &svdev->scsi_dev_state[i];
+
+ if (dev && state->removed && !spdk_scsi_dev_has_pending_tasks(dev)) {
+ spdk_scsi_dev_free_io_channels(dev);
+ svdev->scsi_dev[i] = NULL;
+ spdk_scsi_dev_destruct(dev);
+ if (state->remove_cb) {
+ state->remove_cb(&svdev->vdev, state->remove_ctx);
+ state->remove_cb = NULL;
+ }
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: hot-detached device 'Dev %u'.\n",
+ svdev->vdev.name, i);
+ }
+ }
+}
+
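+/*
+ * Enqueue a virtio-scsi event (e.g. a transport reset with a rescan or removal
+ * reason) for the given SCSI target on the event queue so the guest driver can
+ * react to hotplug and hot-detach.
+ */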
+static void
+eventq_enqueue(struct spdk_vhost_scsi_dev *svdev, unsigned scsi_dev_num, uint32_t event,
+ uint32_t reason)
+{
+ struct spdk_vhost_virtqueue *vq;
+ struct vring_desc *desc, *desc_table;
+ struct virtio_scsi_event *desc_ev;
+ uint32_t desc_table_size, req_size = 0;
+ uint16_t req;
+ int rc;
+
+ assert(scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS);
+ vq = &svdev->vdev.virtqueue[VIRTIO_SCSI_EVENTQ];
+
+ if (spdk_vhost_vq_avail_ring_get(vq, &req, 1) != 1) {
+ SPDK_ERRLOG("Controller %s: Failed to send virtio event (no avail ring entries?).\n",
+ svdev->vdev.name);
+ return;
+ }
+
+ rc = spdk_vhost_vq_get_desc(&svdev->vdev, vq, req, &desc, &desc_table, &desc_table_size);
+ if (rc != 0 || desc->len < sizeof(*desc_ev)) {
+ SPDK_ERRLOG("Controller %s: Invalid eventq descriptor at index %"PRIu16".\n",
+ svdev->vdev.name, req);
+ goto out;
+ }
+
+ desc_ev = spdk_vhost_gpa_to_vva(&svdev->vdev, desc->addr, sizeof(*desc_ev));
+ if (desc_ev == NULL) {
+ SPDK_ERRLOG("Controller %s: Eventq descriptor at index %"PRIu16" points to unmapped guest memory address %p.\n",
+ svdev->vdev.name, req, (void *)(uintptr_t)desc->addr);
+ goto out;
+ }
+
+ desc_ev->event = event;
+ desc_ev->lun[0] = 1;
+ desc_ev->lun[1] = scsi_dev_num;
+ /* virtio LUN id 0 can refer either to the entire device
+ * or to the actual LUN 0 (the only one supported by vhost for now)
+ */
+ desc_ev->lun[2] = 0 >> 8;
+ desc_ev->lun[3] = 0 & 0xFF;
+ /* virtio doesn't specify any strict format for the LUN id (bytes 2 and 3);
+ * the current implementation follows the Linux kernel sources
+ */
+ memset(&desc_ev->lun[4], 0, 4);
+ desc_ev->reason = reason;
+ req_size = sizeof(*desc_ev);
+
+out:
+ spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, req, req_size);
+}
+
+static void
+submit_completion(struct spdk_vhost_scsi_task *task)
+{
+ spdk_vhost_vq_used_ring_enqueue(&task->svdev->vdev, task->vq, task->req_idx,
+ task->used_len);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Finished task (%p) req_idx=%d\n", task, task->req_idx);
+
+ spdk_vhost_scsi_task_put(task);
+}
+
+static void
+spdk_vhost_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task)
+{
+ struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi);
+
+ submit_completion(task);
+}
+
+static void
+spdk_vhost_scsi_task_cpl(struct spdk_scsi_task *scsi_task)
+{
+ struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi);
+
+ /* The SCSI task has completed. Do final processing and then post
+ * notification to the virtqueue's "used" ring.
+ */
+ task->resp->status = task->scsi.status;
+
+ if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) {
+ memcpy(task->resp->sense, task->scsi.sense_data, task->scsi.sense_data_len);
+ task->resp->sense_len = task->scsi.sense_data_len;
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Task (%p) req_idx=%d failed - status=%u\n", task, task->req_idx,
+ task->scsi.status);
+ }
+ assert(task->scsi.transfer_len == task->scsi.length);
+ task->resp->resid = task->scsi.length - task->scsi.data_transferred;
+
+ submit_completion(task);
+}
+
+static void
+task_submit(struct spdk_vhost_scsi_task *task)
+{
+ task->resp->response = VIRTIO_SCSI_S_OK;
+ spdk_scsi_dev_queue_task(task->scsi_dev, &task->scsi);
+}
+
+static void
+mgmt_task_submit(struct spdk_vhost_scsi_task *task, enum spdk_scsi_task_func func)
+{
+ task->tmf_resp->response = VIRTIO_SCSI_S_OK;
+ spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, &task->scsi, func);
+}
+
+static void
+invalid_request(struct spdk_vhost_scsi_task *task)
+{
+ spdk_vhost_vq_used_ring_enqueue(&task->svdev->vdev, task->vq, task->req_idx,
+ task->used_len);
+ spdk_vhost_scsi_task_put(task);
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Invalid request (status=%" PRIu8")\n",
+ task->resp ? task->resp->response : -1);
+}
+
+static int
+spdk_vhost_scsi_task_init_target(struct spdk_vhost_scsi_task *task, const __u8 *lun)
+{
+ struct spdk_scsi_dev *dev;
+ uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF;
+
+ SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN", lun, 8);
+
+ /* The first byte must be 1 and the second is the target number */
+ if (lun[0] != 1 || lun[1] >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+ return -1;
+ }
+
+ dev = task->svdev->scsi_dev[lun[1]];
+ task->scsi_dev = dev;
+ if (dev == NULL || task->svdev->scsi_dev_state[lun[1]].removed) {
+ /* If the device has been hot-detached, return 0 to allow sending
+ * an additional hotremove event via sense codes.
+ */
+ return task->svdev->scsi_dev_state[lun[1]].removed ? 0 : -1;
+ }
+
+ task->scsi.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0);
+ task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_id);
+ return 0;
+}
+
+static void
+process_ctrl_request(struct spdk_vhost_scsi_task *task)
+{
+ struct spdk_vhost_dev *vdev = &task->svdev->vdev;
+ struct vring_desc *desc, *desc_table;
+ struct virtio_scsi_ctrl_tmf_req *ctrl_req;
+ struct virtio_scsi_ctrl_an_resp *an_resp;
+ uint32_t desc_table_size, used_len = 0;
+ int rc;
+
+ spdk_scsi_task_construct(&task->scsi, spdk_vhost_scsi_task_mgmt_cpl, spdk_vhost_scsi_task_free_cb);
+ rc = spdk_vhost_vq_get_desc(vdev, task->vq, task->req_idx, &desc, &desc_table, &desc_table_size);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("%s: Invalid controlq descriptor at index %d.\n",
+ vdev->name, task->req_idx);
+ goto out;
+ }
+
+ ctrl_req = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*ctrl_req));
+ if (ctrl_req == NULL) {
+ SPDK_ERRLOG("%s: Invalid task management request at index %d.\n",
+ vdev->name, task->req_idx);
+ goto out;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE,
+ "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; kickfd %d; size %d\n",
+ task->req_idx, desc, (void *)desc->addr, desc->len, desc->flags, task->vq->vring.last_used_idx,
+ task->vq->vring.kickfd, task->vq->vring.size);
+ SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "Request descriptor", (uint8_t *)ctrl_req,
+ desc->len);
+
+ spdk_vhost_scsi_task_init_target(task, ctrl_req->lun);
+
+ spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+ if (spdk_unlikely(desc == NULL)) {
+ SPDK_ERRLOG("%s: No response descriptor for controlq request %d.\n",
+ vdev->name, task->req_idx);
+ goto out;
+ }
+
+ /* Process the TMF request */
+ switch (ctrl_req->type) {
+ case VIRTIO_SCSI_T_TMF:
+ task->tmf_resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*task->tmf_resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_tmf_resp) || task->tmf_resp == NULL)) {
+ SPDK_ERRLOG("%s: TMF response descriptor at index %d points to invalid guest memory region\n",
+ vdev->name, task->req_idx);
+ goto out;
+ }
+
+ /* Check if we are processing a valid request */
+ if (task->scsi_dev == NULL) {
+ task->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET;
+ break;
+ }
+
+ switch (ctrl_req->subtype) {
+ case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
+ /* Handle LUN reset */
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN reset\n");
+
+ mgmt_task_submit(task, SPDK_SCSI_TASK_FUNC_LUN_RESET);
+ return;
+ default:
+ task->tmf_resp->response = VIRTIO_SCSI_S_ABORTED;
+ /* Unsupported command */
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "Unsupported TMF command %x\n", ctrl_req->subtype);
+ break;
+ }
+ break;
+ case VIRTIO_SCSI_T_AN_QUERY:
+ case VIRTIO_SCSI_T_AN_SUBSCRIBE: {
+ an_resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*an_resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_an_resp) || an_resp == NULL)) {
+ SPDK_WARNLOG("%s: Asynchronous response descriptor points to invalid guest memory region\n",
+ vdev->name);
+ goto out;
+ }
+
+ an_resp->response = VIRTIO_SCSI_S_ABORTED;
+ break;
+ }
+ default:
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "Unsupported control command %x\n", ctrl_req->type);
+ break;
+ }
+
+ used_len = sizeof(struct virtio_scsi_ctrl_tmf_resp);
+out:
+ spdk_vhost_vq_used_ring_enqueue(vdev, task->vq, task->req_idx, used_len);
+ spdk_vhost_scsi_task_put(task);
+}
+
+/*
+ * Process the task's descriptor chain and set up the data-related fields.
+ * Returns:
+ * -1 if the request is invalid and must be aborted,
+ * 0 if all data fields are set.
+ */
+static int
+task_data_setup(struct spdk_vhost_scsi_task *task,
+ struct virtio_scsi_cmd_req **req)
+{
+ struct spdk_vhost_dev *vdev = &task->svdev->vdev;
+ struct vring_desc *desc, *desc_table;
+ struct iovec *iovs = task->iovs;
+ uint16_t iovcnt = 0;
+ uint32_t desc_table_len, len = 0;
+ int rc;
+
+ spdk_scsi_task_construct(&task->scsi, spdk_vhost_scsi_task_cpl, spdk_vhost_scsi_task_free_cb);
+
+ rc = spdk_vhost_vq_get_desc(vdev, task->vq, task->req_idx, &desc, &desc_table, &desc_table_len);
+ /* First descriptor must be readable */
+ if (spdk_unlikely(rc != 0 || spdk_vhost_vring_desc_is_wr(desc) ||
+ desc->len < sizeof(struct virtio_scsi_cmd_req))) {
+ SPDK_WARNLOG("%s: invalid first (request) descriptor at index %"PRIu16".\n",
+ vdev->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ *req = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(**req));
+ if (spdk_unlikely(*req == NULL)) {
+ SPDK_WARNLOG("%s: Request descriptor at index %d points to invalid guest memory region\n",
+ vdev->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ /* Each request must have at least 2 descriptors (i.e. a request and a response) */
+ spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (desc == NULL) {
+ SPDK_WARNLOG("%s: Descriptor chain at index %d contains neither payload nor response buffer.\n",
+ vdev->name, task->req_idx);
+ goto invalid_task;
+ }
+ task->scsi.dxfer_dir = spdk_vhost_vring_desc_is_wr(desc) ? SPDK_SCSI_DIR_FROM_DEV :
+ SPDK_SCSI_DIR_TO_DEV;
+ task->scsi.iovs = iovs;
+
+ if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) {
+ /*
+ * FROM_DEV (READ): [RD_req][WR_resp][WR_buf0]...[WR_bufN]
+ */
+ task->resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*task->resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) {
+ SPDK_WARNLOG("%s: Response descriptor at index %d points to invalid guest memory region\n",
+ vdev->name, task->req_idx);
+ goto invalid_task;
+ }
+ rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_WARNLOG("%s: invalid descriptor chain at request index %d (descriptor id overflow?).\n",
+ vdev->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ if (desc == NULL) {
+ /*
+ * The TEST UNIT READY command and some others may carry no payload; this is not an error.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA,
+ "No payload descriptors for FROM DEV command req_idx=%"PRIu16".\n", task->req_idx);
+ SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_DATA, "CDB=", (*req)->cdb, VIRTIO_SCSI_CDB_SIZE);
+ task->used_len = sizeof(struct virtio_scsi_cmd_resp);
+ task->scsi.iovcnt = 1;
+ task->scsi.iovs[0].iov_len = 0;
+ task->scsi.length = 0;
+ task->scsi.transfer_len = 0;
+ return 0;
+ }
+
+ /* All remaining descriptors are data. */
+ while (desc) {
+ if (spdk_unlikely(!spdk_vhost_vring_desc_is_wr(desc))) {
+ SPDK_WARNLOG("FROM DEV cmd: descriptor nr %" PRIu16" in payload chain is read only.\n", iovcnt);
+ goto invalid_task;
+ }
+
+ if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &iovcnt, desc))) {
+ goto invalid_task;
+ }
+ len += desc->len;
+
+ rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_WARNLOG("%s: invalid payload in descriptor chain starting at index %d.\n",
+ vdev->name, task->req_idx);
+ goto invalid_task;
+ }
+ }
+
+ task->used_len = sizeof(struct virtio_scsi_cmd_resp) + len;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "TO DEV");
+ /*
+ * TO_DEV (WRITE):[RD_req][RD_buf0]...[RD_bufN][WR_resp]
+ * No need to check descriptor WR flag as this is done while setting scsi.dxfer_dir.
+ */
+
+ /* Process descriptors up to response. */
+ while (!spdk_vhost_vring_desc_is_wr(desc)) {
+ if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &iovcnt, desc))) {
+ goto invalid_task;
+ }
+ len += desc->len;
+
+ spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (spdk_unlikely(desc == NULL)) {
+ SPDK_WARNLOG("TO_DEV cmd: no response descriptor.\n");
+ goto invalid_task;
+ }
+ }
+
+ task->resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*task->resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) {
+ SPDK_WARNLOG("%s: Response descriptor at index %d points to invalid guest memory region\n",
+ vdev->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ task->used_len = sizeof(struct virtio_scsi_cmd_resp);
+ }
+
+ task->scsi.iovcnt = iovcnt;
+ task->scsi.length = len;
+ task->scsi.transfer_len = len;
+ return 0;
+
+invalid_task:
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "%s: Invalid task at index %"PRIu16".\n",
+ vdev->name, task->req_idx);
+ return -1;
+}
+
+static int
+process_request(struct spdk_vhost_scsi_task *task)
+{
+ struct virtio_scsi_cmd_req *req;
+ int result;
+
+ result = task_data_setup(task, &req);
+ if (result) {
+ return result;
+ }
+
+ result = spdk_vhost_scsi_task_init_target(task, req->lun);
+ if (spdk_unlikely(result != 0)) {
+ task->resp->response = VIRTIO_SCSI_S_BAD_TARGET;
+ return -1;
+ }
+
+ task->scsi.cdb = req->cdb;
+ SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_DATA, "request CDB", req->cdb, VIRTIO_SCSI_CDB_SIZE);
+
+ if (spdk_unlikely(task->scsi.lun == NULL)) {
+ spdk_scsi_task_process_null_lun(&task->scsi);
+ task->resp->response = VIRTIO_SCSI_S_OK;
+ return 1;
+ }
+
+ return 0;
+}
+
+static void
+process_controlq(struct spdk_vhost_scsi_dev *svdev, struct spdk_vhost_virtqueue *vq)
+{
+ struct spdk_vhost_scsi_task *task;
+ uint16_t reqs[32];
+ uint16_t reqs_cnt, i;
+
+ reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
+ for (i = 0; i < reqs_cnt; i++) {
+ if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
+ SPDK_ERRLOG("%s: invalid entry in avail ring. Buffer '%"PRIu16"' exceeds virtqueue size (%"PRIu16")\n",
+ svdev->vdev.name, reqs[i], vq->vring.size);
+ spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0);
+ continue;
+ }
+
+ task = &((struct spdk_vhost_scsi_task *)vq->tasks)[reqs[i]];
+ if (spdk_unlikely(task->used)) {
+ SPDK_ERRLOG("%s: invalid entry in avail ring. Buffer '%"PRIu16"' is still in use!\n",
+ svdev->vdev.name, reqs[i]);
+ spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0);
+ continue;
+ }
+
+ svdev->vdev.task_cnt++;
+ memset(&task->scsi, 0, sizeof(task->scsi));
+ task->tmf_resp = NULL;
+ task->used = true;
+ process_ctrl_request(task);
+ }
+}
+
+static void
+process_requestq(struct spdk_vhost_scsi_dev *svdev, struct spdk_vhost_virtqueue *vq)
+{
+ struct spdk_vhost_scsi_task *task;
+ uint16_t reqs[32];
+ uint16_t reqs_cnt, i;
+ int result;
+
+ reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
+ assert(reqs_cnt <= SPDK_COUNTOF(reqs));
+
+ for (i = 0; i < reqs_cnt; i++) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Starting processing request idx %"PRIu16"======\n",
+ reqs[i]);
+
+ if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
+ SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
+ svdev->vdev.name, reqs[i], vq->vring.size);
+ spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0);
+ continue;
+ }
+
+ task = &((struct spdk_vhost_scsi_task *)vq->tasks)[reqs[i]];
+ if (spdk_unlikely(task->used)) {
+ SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
+ svdev->vdev.name, reqs[i]);
+ spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0);
+ continue;
+ }
+
+ svdev->vdev.task_cnt++;
+ memset(&task->scsi, 0, sizeof(task->scsi));
+ task->resp = NULL;
+ task->used = true;
+ task->used_len = 0;
+ result = process_request(task);
+ if (likely(result == 0)) {
+ task_submit(task);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d submitted ======\n", task,
+ task->req_idx);
+ } else if (result > 0) {
+ spdk_vhost_scsi_task_cpl(&task->scsi);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d finished early ======\n", task,
+ task->req_idx);
+ } else {
+ invalid_request(task);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d failed ======\n", task,
+ task->req_idx);
+ }
+ }
+}
+
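+/*
+ * Management poller, registered with a MGMT_POLL_PERIOD_US period: reaps
+ * hot-detached SCSI targets and services the control and event queues.
+ */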
+static int
+vdev_mgmt_worker(void *arg)
+{
+ struct spdk_vhost_scsi_dev *svdev = arg;
+
+ process_removed_devs(svdev);
+ spdk_vhost_vq_used_signal(&svdev->vdev, &svdev->vdev.virtqueue[VIRTIO_SCSI_EVENTQ]);
+
+ process_controlq(svdev, &svdev->vdev.virtqueue[VIRTIO_SCSI_CONTROLQ]);
+ spdk_vhost_vq_used_signal(&svdev->vdev, &svdev->vdev.virtqueue[VIRTIO_SCSI_CONTROLQ]);
+
+ return -1;
+}
+
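+/* Request-queue poller: processes every I/O queue (index 2 and up) on each iteration. */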
+static int
+vdev_worker(void *arg)
+{
+ struct spdk_vhost_scsi_dev *svdev = arg;
+ uint32_t q_idx;
+
+ for (q_idx = VIRTIO_SCSI_REQUESTQ; q_idx < svdev->vdev.max_queues; q_idx++) {
+ process_requestq(svdev, &svdev->vdev.virtqueue[q_idx]);
+ }
+
+ spdk_vhost_dev_used_signal(&svdev->vdev);
+
+ return -1;
+}
+
+static struct spdk_vhost_scsi_dev *
+to_scsi_dev(struct spdk_vhost_dev *ctrlr)
+{
+ if (ctrlr == NULL) {
+ return NULL;
+ }
+
+ if (ctrlr->backend != &spdk_vhost_scsi_device_backend) {
+ SPDK_ERRLOG("%s: not a vhost-scsi device.\n", ctrlr->name);
+ return NULL;
+ }
+
+ return SPDK_CONTAINEROF(ctrlr, struct spdk_vhost_scsi_dev, vdev);
+}
+
+int
+spdk_vhost_scsi_dev_construct(const char *name, const char *cpumask)
+{
+ struct spdk_vhost_scsi_dev *svdev = spdk_dma_zmalloc(sizeof(struct spdk_vhost_scsi_dev),
+ SPDK_CACHE_LINE_SIZE, NULL);
+ int rc;
+
+ if (svdev == NULL) {
+ return -ENOMEM;
+ }
+
+ spdk_vhost_lock();
+ rc = spdk_vhost_dev_register(&svdev->vdev, name, cpumask,
+ &spdk_vhost_scsi_device_backend);
+
+ if (rc) {
+ spdk_dma_free(svdev);
+ }
+
+ spdk_vhost_unlock();
+ return rc;
+}
+
+static int
+spdk_vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev)
+{
+ struct spdk_vhost_scsi_dev *svdev = to_scsi_dev(vdev);
+ int rc, i;
+
+ if (svdev == NULL) {
+ return -EINVAL;
+ }
+
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) {
+ if (svdev->scsi_dev[i]) {
+ if (vdev->registered) {
+ SPDK_ERRLOG("Trying to remove non-empty controller: %s.\n", vdev->name);
+ return -EBUSY;
+ }
+
+ rc = spdk_vhost_scsi_dev_remove_tgt(vdev, i, NULL, NULL);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to force-remove target %d\n", vdev->name, i);
+ return rc;
+ }
+ }
+ }
+
+ rc = spdk_vhost_dev_unregister(vdev);
+ if (rc != 0) {
+ return rc;
+ }
+
+ spdk_dma_free(svdev);
+ return 0;
+}
+
+struct spdk_scsi_dev *
+spdk_vhost_scsi_dev_get_tgt(struct spdk_vhost_dev *vdev, uint8_t num)
+{
+ struct spdk_vhost_scsi_dev *svdev;
+
+ assert(num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS);
+ svdev = to_scsi_dev(vdev);
+
+ return svdev ? svdev->scsi_dev[num] : NULL;
+}
+
+static void
+spdk_vhost_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg)
+{
+ struct spdk_vhost_scsi_dev *svdev = arg;
+ const struct spdk_scsi_dev *scsi_dev;
+ unsigned scsi_dev_num;
+
+ assert(lun != NULL);
+ assert(svdev != NULL);
+ if (svdev->vdev.lcore != -1 &&
+ !spdk_vhost_dev_has_feature(&svdev->vdev, VIRTIO_SCSI_F_HOTPLUG)) {
+ SPDK_WARNLOG("%s: hotremove is not enabled for this controller.\n", svdev->vdev.name);
+ return;
+ }
+
+ scsi_dev = spdk_scsi_lun_get_dev(lun);
+ for (scsi_dev_num = 0; scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_dev_num++) {
+ if (svdev->scsi_dev[scsi_dev_num] == scsi_dev) {
+ break;
+ }
+ }
+
+ if (scsi_dev_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+ /* The entire device has already been removed. */
+ return;
+ }
+
+ /* A LUN was hot-removed; since there is only one LUN per target, remove the entire target. */
+ spdk_vhost_scsi_dev_remove_tgt(&svdev->vdev, scsi_dev_num, NULL, NULL);
+}
+
+int
+spdk_vhost_scsi_dev_add_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num,
+ const char *bdev_name)
+{
+ struct spdk_vhost_scsi_dev *svdev;
+ char target_name[SPDK_SCSI_DEV_MAX_NAME];
+ int lun_id_list[1];
+ const char *bdev_names_list[1];
+
+ svdev = to_scsi_dev(vdev);
+ if (svdev == NULL) {
+ return -EINVAL;
+ }
+
+ if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+ SPDK_ERRLOG("Controller %d target number too big (max %d)\n", scsi_tgt_num,
+ SPDK_VHOST_SCSI_CTRLR_MAX_DEVS);
+ return -EINVAL;
+ }
+
+ if (bdev_name == NULL) {
+ SPDK_ERRLOG("No lun name specified\n");
+ return -EINVAL;
+ }
+
+ if (svdev->scsi_dev[scsi_tgt_num] != NULL) {
+ SPDK_ERRLOG("Controller %s target %u already occupied\n", vdev->name, scsi_tgt_num);
+ return -EEXIST;
+ }
+
+ /*
+ * Only one LUN per target is supported at this stage.
+ */
+ snprintf(target_name, sizeof(target_name), "Target %u", scsi_tgt_num);
+ lun_id_list[0] = 0;
+ bdev_names_list[0] = (char *)bdev_name;
+
+ svdev->scsi_dev_state[scsi_tgt_num].removed = false;
+ svdev->scsi_dev[scsi_tgt_num] = spdk_scsi_dev_construct(target_name, bdev_names_list, lun_id_list,
+ 1,
+ SPDK_SPC_PROTOCOL_IDENTIFIER_SAS, spdk_vhost_scsi_lun_hotremove, svdev);
+
+ if (svdev->scsi_dev[scsi_tgt_num] == NULL) {
+ SPDK_ERRLOG("Couldn't create spdk SCSI target '%s' using bdev '%s' in controller: %s\n",
+ target_name, bdev_name, vdev->name);
+ return -EINVAL;
+ }
+ spdk_scsi_dev_add_port(svdev->scsi_dev[scsi_tgt_num], 0, "vhost");
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: defined target '%s' using bdev '%s'\n",
+ vdev->name, target_name, bdev_name);
+
+ if (vdev->lcore == -1) {
+ /* All done. */
+ return 0;
+ }
+
+ spdk_scsi_dev_allocate_io_channels(svdev->scsi_dev[scsi_tgt_num]);
+
+ if (spdk_vhost_dev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
+ eventq_enqueue(svdev, scsi_tgt_num, VIRTIO_SCSI_T_TRANSPORT_RESET,
+ VIRTIO_SCSI_EVT_RESET_RESCAN);
+ } else {
+ SPDK_NOTICELOG("Device %s does not support hotplug. "
+ "Please restart the driver or perform a rescan.\n",
+ vdev->name);
+ }
+
+ return 0;
+}
+
+int
+spdk_vhost_scsi_dev_remove_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num,
+ spdk_vhost_event_fn cb_fn, void *cb_arg)
+{
+ struct spdk_vhost_scsi_dev *svdev;
+ struct spdk_scsi_dev *scsi_dev;
+ struct spdk_scsi_dev_vhost_state *scsi_dev_state;
+ int rc = 0;
+
+ if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+ SPDK_ERRLOG("%s: invalid target number %d\n", vdev->name, scsi_tgt_num);
+ return -EINVAL;
+ }
+
+ svdev = to_scsi_dev(vdev);
+ if (svdev == NULL) {
+ return -ENODEV;
+ }
+
+ scsi_dev = svdev->scsi_dev[scsi_tgt_num];
+ if (scsi_dev == NULL) {
+ SPDK_ERRLOG("Controller %s target %u is not occupied\n", vdev->name, scsi_tgt_num);
+ return -ENODEV;
+ }
+
+ if (svdev->vdev.lcore == -1) {
+ /* controller is not in use, remove dev and exit */
+ svdev->scsi_dev[scsi_tgt_num] = NULL;
+ spdk_scsi_dev_destruct(scsi_dev);
+ if (cb_fn) {
+ rc = cb_fn(vdev, cb_arg);
+ }
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: removed target 'Target %u'\n",
+ vdev->name, scsi_tgt_num);
+ return rc;
+ }
+
+ if (!spdk_vhost_dev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
+ SPDK_WARNLOG("%s: 'Target %u' is in use and hot-detach is not enabled for this controller.\n",
+ svdev->vdev.name, scsi_tgt_num);
+ return -ENOTSUP;
+ }
+
+ scsi_dev_state = &svdev->scsi_dev_state[scsi_tgt_num];
+ if (scsi_dev_state->removed) {
+ SPDK_WARNLOG("%s: 'Target %u' has been already marked to hotremove.\n", svdev->vdev.name,
+ scsi_tgt_num);
+ return -EBUSY;
+ }
+
+ scsi_dev_state->remove_cb = cb_fn;
+ scsi_dev_state->remove_ctx = cb_arg;
+ scsi_dev_state->removed = true;
+ eventq_enqueue(svdev, scsi_tgt_num, VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED);
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: queued 'Target %u' for hot-detach.\n", vdev->name, scsi_tgt_num);
+ return 0;
+}
+
+int
+spdk_vhost_scsi_controller_construct(void)
+{
+ struct spdk_conf_section *sp = spdk_conf_first_section(NULL);
+ struct spdk_vhost_dev *vdev;
+ int i, dev_num;
+ unsigned ctrlr_num = 0;
+ char *bdev_name, *tgt_num_str;
+ char *cpumask;
+ char *name;
+ char *tgt = NULL;
+
+ while (sp != NULL) {
+ if (!spdk_conf_section_match_prefix(sp, "VhostScsi")) {
+ sp = spdk_conf_next_section(sp);
+ continue;
+ }
+
+ if (sscanf(spdk_conf_section_get_name(sp), "VhostScsi%u", &ctrlr_num) != 1) {
+ SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+ spdk_conf_section_get_name(sp));
+ return -1;
+ }
+
+ name = spdk_conf_section_get_val(sp, "Name");
+ cpumask = spdk_conf_section_get_val(sp, "Cpumask");
+
+ if (spdk_vhost_scsi_dev_construct(name, cpumask) < 0) {
+ return -1;
+ }
+
+ vdev = spdk_vhost_dev_find(name);
+ assert(vdev);
+
+ for (i = 0; ; i++) {
+
+ tgt = spdk_conf_section_get_nval(sp, "Target", i);
+ if (tgt == NULL) {
+ break;
+ }
+
+ tgt_num_str = spdk_conf_section_get_nmval(sp, "Target", i, 0);
+ if (tgt_num_str == NULL) {
+ SPDK_ERRLOG("%s: Invalid or missing target number\n", name);
+ return -1;
+ }
+
+ dev_num = (int)strtol(tgt_num_str, NULL, 10);
+ bdev_name = spdk_conf_section_get_nmval(sp, "Target", i, 1);
+ if (bdev_name == NULL) {
+ SPDK_ERRLOG("%s: Invalid or missing bdev name for target %d\n", name, dev_num);
+ return -1;
+ } else if (spdk_conf_section_get_nmval(sp, "Target", i, 2)) {
+ SPDK_ERRLOG("%s: Only one LUN per vhost SCSI device supported\n", name);
+ return -1;
+ }
+
+ if (spdk_vhost_scsi_dev_add_tgt(vdev, dev_num, bdev_name) < 0) {
+ return -1;
+ }
+ }
+
+ sp = spdk_conf_next_section(sp);
+ }
+
+ return 0;
+}
+
+static void
+free_task_pool(struct spdk_vhost_scsi_dev *svdev)
+{
+ struct spdk_vhost_virtqueue *vq;
+ uint16_t i;
+
+ for (i = 0; i < svdev->vdev.max_queues; i++) {
+ vq = &svdev->vdev.virtqueue[i];
+ if (vq->tasks == NULL) {
+ continue;
+ }
+
+ spdk_dma_free(vq->tasks);
+ vq->tasks = NULL;
+ }
+}
+
+static int
+alloc_task_pool(struct spdk_vhost_scsi_dev *svdev)
+{
+ struct spdk_vhost_virtqueue *vq;
+ struct spdk_vhost_scsi_task *task;
+ uint32_t task_cnt;
+ uint16_t i;
+ uint32_t j;
+
+ for (i = 0; i < svdev->vdev.max_queues; i++) {
+ vq = &svdev->vdev.virtqueue[i];
+ if (vq->vring.desc == NULL) {
+ continue;
+ }
+
+ task_cnt = vq->vring.size;
+ if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
+ /* sanity check */
+ SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
+ svdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
+ free_task_pool(svdev);
+ return -1;
+ }
+ vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_scsi_task) * task_cnt,
+ SPDK_CACHE_LINE_SIZE, NULL);
+ if (vq->tasks == NULL) {
+ SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
+ svdev->vdev.name, task_cnt, i);
+ free_task_pool(svdev);
+ return -1;
+ }
+
+ for (j = 0; j < task_cnt; j++) {
+ task = &((struct spdk_vhost_scsi_task *)vq->tasks)[j];
+ task->svdev = svdev;
+ task->vq = vq;
+ task->req_idx = j;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Start the vhost-scsi device on its assigned data core: allocate the per-virtqueue
+ * task pools, open I/O channels for every configured SCSI target and register the
+ * request and management pollers.
+ */
+static int
+spdk_vhost_scsi_start(struct spdk_vhost_dev *vdev, void *event_ctx)
+{
+ struct spdk_vhost_scsi_dev *svdev;
+ uint32_t i;
+ int rc;
+
+ svdev = to_scsi_dev(vdev);
+ if (svdev == NULL) {
+ SPDK_ERRLOG("Trying to start non-scsi controller as a scsi one.\n");
+ rc = -1;
+ goto out;
+ }
+
+ /* validate all I/O queues are in a contiguous index range */
+ for (i = VIRTIO_SCSI_REQUESTQ; i < vdev->max_queues; i++) {
+ if (vdev->virtqueue[i].vring.desc == NULL) {
+ SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i);
+ rc = -1;
+ goto out;
+ }
+ }
+
+ rc = alloc_task_pool(svdev);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to alloc task pool.\n", vdev->name);
+ goto out;
+ }
+
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
+ if (svdev->scsi_dev[i] == NULL) {
+ continue;
+ }
+ spdk_scsi_dev_allocate_io_channels(svdev->scsi_dev[i]);
+ }
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n",
+ vdev->name, vdev->lcore);
+
+ svdev->requestq_poller = spdk_poller_register(vdev_worker, svdev, 0);
+ if (vdev->virtqueue[VIRTIO_SCSI_CONTROLQ].vring.desc &&
+ vdev->virtqueue[VIRTIO_SCSI_EVENTQ].vring.desc) {
+ svdev->mgmt_poller = spdk_poller_register(vdev_mgmt_worker, svdev,
+ MGMT_POLL_PERIOD_US);
+ }
+out:
+ spdk_vhost_dev_backend_event_done(event_ctx, rc);
+ return rc;
+}
+
+static int
+destroy_device_poller_cb(void *arg)
+{
+ struct spdk_vhost_scsi_dev *svdev = arg;
+ uint32_t i;
+
+ if (svdev->vdev.task_cnt > 0) {
+ return -1;
+ }
+
+
+ for (i = 0; i < svdev->vdev.max_queues; i++) {
+ spdk_vhost_vq_used_signal(&svdev->vdev, &svdev->vdev.virtqueue[i]);
+ }
+
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
+ if (svdev->scsi_dev[i] == NULL) {
+ continue;
+ }
+ spdk_scsi_dev_free_io_channels(svdev->scsi_dev[i]);
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", svdev->vdev.name);
+
+ free_task_pool(svdev);
+
+ spdk_poller_unregister(&svdev->destroy_ctx.poller);
+ spdk_vhost_dev_backend_event_done(svdev->destroy_ctx.event_ctx, 0);
+
+ return -1;
+}
+
+static int
+spdk_vhost_scsi_stop(struct spdk_vhost_dev *vdev, void *event_ctx)
+{
+ struct spdk_vhost_scsi_dev *svdev;
+
+ svdev = to_scsi_dev(vdev);
+ if (svdev == NULL) {
+ SPDK_ERRLOG("Trying to stop non-scsi controller as a scsi one.\n");
+ goto err;
+ }
+
+ svdev->destroy_ctx.event_ctx = event_ctx;
+ spdk_poller_unregister(&svdev->requestq_poller);
+ spdk_poller_unregister(&svdev->mgmt_poller);
+ svdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb, svdev,
+ 1000);
+
+ return 0;
+
+err:
+ spdk_vhost_dev_backend_event_done(event_ctx, -1);
+ return -1;
+}
+
+static void
+spdk_vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_scsi_dev *sdev;
+ struct spdk_scsi_lun *lun;
+ uint32_t dev_idx;
+ uint32_t lun_idx;
+
+ assert(vdev != NULL);
+ spdk_json_write_name(w, "scsi");
+ spdk_json_write_array_begin(w);
+ for (dev_idx = 0; dev_idx < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; dev_idx++) {
+ sdev = spdk_vhost_scsi_dev_get_tgt(vdev, dev_idx);
+ if (!sdev) {
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_name(w, "scsi_dev_num");
+ spdk_json_write_uint32(w, dev_idx);
+
+ spdk_json_write_name(w, "id");
+ spdk_json_write_int32(w, spdk_scsi_dev_get_id(sdev));
+
+ spdk_json_write_name(w, "target_name");
+ spdk_json_write_string(w, spdk_scsi_dev_get_name(sdev));
+
+ spdk_json_write_name(w, "luns");
+ spdk_json_write_array_begin(w);
+
+ for (lun_idx = 0; lun_idx < SPDK_SCSI_DEV_MAX_LUN; lun_idx++) {
+ lun = spdk_scsi_dev_get_lun(sdev, lun_idx);
+ if (!lun) {
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_name(w, "id");
+ spdk_json_write_int32(w, spdk_scsi_lun_get_id(lun));
+
+ spdk_json_write_name(w, "bdev_name");
+ spdk_json_write_string(w, spdk_scsi_lun_get_bdev_name(lun));
+
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_array_end(w);
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_array_end(w);
+}
+
+static void
+spdk_vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_vhost_scsi_dev *svdev;
+ struct spdk_scsi_lun *lun;
+ uint32_t i;
+
+ svdev = to_scsi_dev(vdev);
+ if (!svdev) {
+ return;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "construct_vhost_scsi_controller");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", vdev->name);
+ spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+
+ for (i = 0; i < SPDK_COUNTOF(svdev->scsi_dev); i++) {
+ if (svdev->scsi_dev[i] == NULL || svdev->scsi_dev_state[i].removed) {
+ continue;
+ }
+
+ lun = spdk_scsi_dev_get_lun(svdev->scsi_dev[i], 0);
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "add_vhost_scsi_lun");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", vdev->name);
+ spdk_json_write_named_uint32(w, "scsi_target_num", i);
+
+ spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi", SPDK_LOG_VHOST_SCSI)
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_queue", SPDK_LOG_VHOST_SCSI_QUEUE)
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_data", SPDK_LOG_VHOST_SCSI_DATA)