| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
| commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) | |
| tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf | /src/spdk/lib/vhost |
| parent | Initial commit. (diff) | |
| download | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip |
Adding upstream version 14.2.21.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/vhost')
| Mode | Path | Insertions |
|---|---|---|
| -rw-r--r-- | src/spdk/lib/vhost/Makefile | 47 |
| -rw-r--r-- | src/spdk/lib/vhost/rte_vhost/Makefile | 46 |
| -rw-r--r-- | src/spdk/lib/vhost/rte_vhost/fd_man.c | 300 |
| -rw-r--r-- | src/spdk/lib/vhost/rte_vhost/fd_man.h | 69 |
| -rw-r--r-- | src/spdk/lib/vhost/rte_vhost/rte_vhost.h | 474 |
| -rw-r--r-- | src/spdk/lib/vhost/rte_vhost/socket.c | 819 |
| -rw-r--r-- | src/spdk/lib/vhost/rte_vhost/vhost.c | 482 |
| -rw-r--r-- | src/spdk/lib/vhost/rte_vhost/vhost.h | 321 |
| -rw-r--r-- | src/spdk/lib/vhost/rte_vhost/vhost_user.c | 1360 |
| -rw-r--r-- | src/spdk/lib/vhost/rte_vhost/vhost_user.h | 182 |
| -rw-r--r-- | src/spdk/lib/vhost/vhost.c | 1503 |
| -rw-r--r-- | src/spdk/lib/vhost/vhost_blk.c | 901 |
| -rw-r--r-- | src/spdk/lib/vhost/vhost_internal.h | 277 |
| -rw-r--r-- | src/spdk/lib/vhost/vhost_nvme.c | 1465 |
| -rw-r--r-- | src/spdk/lib/vhost/vhost_rpc.c | 814 |
| -rw-r--r-- | src/spdk/lib/vhost/vhost_scsi.c | 1271 |
16 files changed, 10331 insertions, 0 deletions
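For orientation before the patch body: the diff adds SPDK's vhost library together with a bundled copy of DPDK's rte_vhost, whose public socket API is declared in the new rte_vhost.h (rte_vhost_driver_register, rte_vhost_driver_set_features, rte_vhost_driver_callback_register, rte_vhost_driver_start). Below is a minimal sketch of how an application might wire these calls together, based only on the declarations and doc comments added by this patch; the socket path, feature mask, and callback bodies are placeholders and are not part of the patch itself.

```c
/*
 * Minimal vhost-user target sketch against the rte_vhost API added below.
 * Assumes a DPDK build environment (rte_vhost.h pulls in rte_config.h etc.).
 */
#include <stdio.h>
#include "rte_vhost.h"

/* Called when a new virtio device (guest connection) is attached. */
static int
new_device_cb(int vid)
{
	printf("vhost device %d attached\n", vid);
	return 0;
}

/* Called when the device is detached again. */
static void
destroy_device_cb(int vid)
{
	printf("vhost device %d detached\n", vid);
}

static const struct vhost_device_ops ops = {
	.new_device = new_device_cb,
	.destroy_device = destroy_device_cb,
};

int
main(void)
{
	const char *path = "/tmp/vhost.0";	/* placeholder socket path */

	/* Register the socket; flags 0 means the default server mode. */
	if (rte_vhost_driver_register(path, 0) != 0)
		return 1;
	/* Placeholder feature mask; a real target passes the bits it supports. */
	if (rte_vhost_driver_set_features(path, 0) != 0)
		return 1;
	if (rte_vhost_driver_callback_register(path, &ops) != 0)
		return 1;
	/* Spawns the fdset dispatch thread and starts vhost-user negotiation. */
	return rte_vhost_driver_start(path) == 0 ? 0 : 1;
}
```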
diff --git a/src/spdk/lib/vhost/Makefile b/src/spdk/lib/vhost/Makefile new file mode 100644 index 00000000..b46978e2 --- /dev/null +++ b/src/spdk/lib/vhost/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I. +CFLAGS += -Irte_vhost +CFLAGS += $(ENV_CFLAGS) + +C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c vhost_nvme.c + +LIBNAME = vhost + +DIRS-y += rte_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/vhost/rte_vhost/Makefile b/src/spdk/lib/vhost/rte_vhost/Makefile new file mode 100644 index 00000000..b0ae6335 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I. +CFLAGS += $(ENV_CFLAGS) +CFLAGS += -include rte_config.h + +# These are the DPDK vhost files copied (for now) into SPDK +C_SRCS += fd_man.c socket.c vhost_user.c vhost.c + +LIBNAME = rte_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/vhost/rte_vhost/fd_man.c b/src/spdk/lib/vhost/rte_vhost/fd_man.c new file mode 100644 index 00000000..2ceacc9a --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/fd_man.c @@ -0,0 +1,300 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> +#include <string.h> + +#include <rte_common.h> +#include <rte_log.h> + +#include "fd_man.h" + +#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL) + +static int +get_last_valid_idx(struct fdset *pfdset, int last_valid_idx) +{ + int i; + + for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--) + ; + + return i; +} + +static void +fdset_move(struct fdset *pfdset, int dst, int src) +{ + pfdset->fd[dst] = pfdset->fd[src]; + pfdset->rwfds[dst] = pfdset->rwfds[src]; +} + +static void +fdset_shrink_nolock(struct fdset *pfdset) +{ + int i; + int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1); + + for (i = 0; i < last_valid_idx; i++) { + if (pfdset->fd[i].fd != -1) + continue; + + fdset_move(pfdset, i, last_valid_idx); + last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1); + } + pfdset->num = last_valid_idx + 1; +} + +/* + * Find deleted fd entries and remove them + */ +static void +fdset_shrink(struct fdset *pfdset) +{ + pthread_mutex_lock(&pfdset->fd_mutex); + fdset_shrink_nolock(pfdset); + pthread_mutex_unlock(&pfdset->fd_mutex); +} + +/** + * Returns the index in the fdset for a given fd. + * @return + * index for the fd, or -1 if fd isn't in the fdset. + */ +static int +fdset_find_fd(struct fdset *pfdset, int fd) +{ + int i; + + for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++) + ; + + return i == pfdset->num ? -1 : i; +} + +static void +fdset_add_fd(struct fdset *pfdset, int idx, int fd, + fd_cb rcb, fd_cb wcb, void *dat) +{ + struct fdentry *pfdentry = &pfdset->fd[idx]; + struct pollfd *pfd = &pfdset->rwfds[idx]; + + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; + + pfd->fd = fd; + pfd->events = rcb ? POLLIN : 0; + pfd->events |= wcb ? POLLOUT : 0; + pfd->revents = 0; +} + +void +fdset_init(struct fdset *pfdset) +{ + int i; + + if (pfdset == NULL) + return; + + for (i = 0; i < MAX_FDS; i++) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].dat = NULL; + } + pfdset->num = 0; +} + +/** + * Register the fd in the fdset with read/write handler and context. + */ +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -1; + + pthread_mutex_lock(&pfdset->fd_mutex); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + fdset_shrink_nolock(pfdset); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } + } + + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); + pthread_mutex_unlock(&pfdset->fd_mutex); + + return 0; +} + +/** + * Unregister the fd from the fdset. + * Returns context of a given fd or NULL. + */ +void * +fdset_del(struct fdset *pfdset, int fd) +{ + int i; + void *dat = NULL; + + if (pfdset == NULL || fd == -1) + return NULL; + + do { + pthread_mutex_lock(&pfdset->fd_mutex); + + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy == 0) { + /* busy indicates r/wcb is executing! */ + dat = pfdset->fd[i].dat; + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + i = -1; + } + pthread_mutex_unlock(&pfdset->fd_mutex); + } while (i != -1); + + return dat; +} + + +/** + * This functions runs in infinite blocking loop until there is no fd in + * pfdset. It calls corresponding r/w handler if there is event on the fd. 
+ * + * Before the callback is called, we set the flag to busy status; If other + * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it + * will wait until the flag is reset to zero(which indicates the callback is + * finished), then it could free the context after fdset_del. + */ +void * +fdset_event_dispatch(void *arg) +{ + int i; + struct pollfd *pfd; + struct fdentry *pfdentry; + fd_cb rcb, wcb; + void *dat; + int fd, numfds; + int remove1, remove2; + int need_shrink; + struct fdset *pfdset = arg; + + if (pfdset == NULL) + return NULL; + + while (1) { + + /* + * When poll is blocked, other threads might unregister + * listenfds from and register new listenfds into fdset. + * When poll returns, the entries for listenfds in the fdset + * might have been updated. It is ok if there is unwanted call + * for new listenfds. + */ + pthread_mutex_lock(&pfdset->fd_mutex); + numfds = pfdset->num; + pthread_mutex_unlock(&pfdset->fd_mutex); + + poll(pfdset->rwfds, numfds, 1000 /* millisecs */); + + need_shrink = 0; + for (i = 0; i < numfds; i++) { + pthread_mutex_lock(&pfdset->fd_mutex); + + pfdentry = &pfdset->fd[i]; + fd = pfdentry->fd; + pfd = &pfdset->rwfds[i]; + + if (fd < 0) { + need_shrink = 1; + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + if (!pfd->revents) { + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + remove1 = remove2 = 0; + + rcb = pfdentry->rcb; + wcb = pfdentry->wcb; + dat = pfdentry->dat; + pfdentry->busy = 1; + + pthread_mutex_unlock(&pfdset->fd_mutex); + + if (rcb && pfd->revents & (POLLIN | FDPOLLERR)) + rcb(fd, dat, &remove1); + if (wcb && pfd->revents & (POLLOUT | FDPOLLERR)) + wcb(fd, dat, &remove2); + pfdentry->busy = 0; + /* + * fdset_del needs to check busy flag. + * We don't allow fdset_del to be called in callback + * directly. + */ + /* + * When we are to clean up the fd from fdset, + * because the fd is closed in the cb, + * the old fd val could be reused by when creates new + * listen fd in another thread, we couldn't call + * fd_set_del. + */ + if (remove1 || remove2) { + pfdentry->fd = -1; + need_shrink = 1; + } + } + + if (need_shrink) + fdset_shrink(pfdset); + } + + return NULL; +} diff --git a/src/spdk/lib/vhost/rte_vhost/fd_man.h b/src/spdk/lib/vhost/rte_vhost/fd_man.h new file mode 100644 index 00000000..3a9d269b --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/fd_man.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _FD_MAN_H_ +#define _FD_MAN_H_ +#include <stdint.h> +#include <pthread.h> +#include <poll.h> + +#define MAX_FDS 1024 + +typedef void (*fd_cb)(int fd, void *dat, int *remove); + +struct fdentry { + int fd; /* -1 indicates this entry is empty */ + fd_cb rcb; /* callback when this fd is readable. */ + fd_cb wcb; /* callback when this fd is writeable. */ + void *dat; /* fd context */ + int busy; /* whether this entry is being used in cb. */ +}; + +struct fdset { + struct pollfd rwfds[MAX_FDS]; + struct fdentry fd[MAX_FDS]; + pthread_mutex_t fd_mutex; + int num; /* current fd number of this fdset */ +}; + + +void fdset_init(struct fdset *pfdset); + +int fdset_add(struct fdset *pfdset, int fd, + fd_cb rcb, fd_cb wcb, void *dat); + +void *fdset_del(struct fdset *pfdset, int fd); + +void *fdset_event_dispatch(void *arg); + +#endif diff --git a/src/spdk/lib/vhost/rte_vhost/rte_vhost.h b/src/spdk/lib/vhost/rte_vhost/rte_vhost.h new file mode 100644 index 00000000..29f5b613 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/rte_vhost.h @@ -0,0 +1,474 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _RTE_VHOST_H_ +#define _RTE_VHOST_H_ + +/** + * @file + * Interface to vhost-user + */ + +#include <stdint.h> +#include <linux/vhost.h> +#include <linux/virtio_ring.h> +#include <sys/eventfd.h> + +#include <rte_config.h> +#include <rte_memory.h> +#include <rte_mempool.h> + +#define RTE_VHOST_USER_CLIENT (1ULL << 0) +#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) +#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct rte_vhost_mem_region { + uint64_t guest_phys_addr; + uint64_t guest_user_addr; + uint64_t host_user_addr; + uint64_t size; + void *mmap_addr; + uint64_t mmap_size; + int fd; +}; + +/** + * Memory structure includes region and mapping information. + */ +struct rte_vhost_memory { + uint32_t nregions; + struct rte_vhost_mem_region regions[0]; +}; + +struct rte_vhost_vring { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + + int callfd; + int kickfd; + uint16_t size; + + uint16_t last_avail_idx; + uint16_t last_used_idx; +}; + +/** + * Device and vring operations. + */ +struct vhost_device_ops { + int (*new_device)(int vid); /**< Add device. */ + void (*destroy_device)(int vid); /**< Remove device. */ + + int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ + + /** + * Features could be changed after the feature negotiation. + * For example, VHOST_F_LOG_ALL will be set/cleared at the + * start/end of live migration, respectively. This callback + * is used to inform the application on such change. + */ + int (*features_changed)(int vid, uint64_t features); + int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf); + int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd); + int (*vhost_nvme_get_cap)(int vid, uint64_t *cap); + + int (*new_connection)(int vid); + void (*destroy_connection)(int vid); + + int (*get_config)(int vid, uint8_t *config, uint32_t config_len); + int (*set_config)(int vid, uint8_t *config, uint32_t offset, + uint32_t len, uint32_t flags); + + void *reserved[2]; /**< Reserved for future extension */ +}; + +/** + * Convert guest physical address to host virtual address + * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @return + * the host virtual address on success, 0 on failure + */ +static inline uint64_t __attribute__((always_inline)) +rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + if (gpa >= reg->guest_phys_addr && + gpa < reg->guest_phys_addr + reg->size) { + return gpa - reg->guest_phys_addr + + reg->host_user_addr; + } + } + + return 0; +} + +/** + * Convert guest physical address to host virtual address safely + * + * This variant of rte_vhost_gpa_to_vva() takes care all the + * requested length is mapped and contiguous in process address + * space. 
+ * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @param len + * the size of the requested area to map, + * updated with actual size mapped + * @return + * the host virtual address on success, 0 on failure */ +static inline uint64_t +rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, + uint64_t gpa, uint64_t *len) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + r = &mem->regions[i]; + if (gpa >= r->guest_phys_addr && + gpa < r->guest_phys_addr + r->size) { + + if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) + *len = r->guest_phys_addr + r->size - gpa; + + return gpa - r->guest_phys_addr + + r->host_user_addr; + } + } + *len = 0; + + return 0; +} + +#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL)) + +/** + * Log the memory write start with given address. + * + * This function only need be invoked when the live migration starts. + * Therefore, we won't need call it at all in the most of time. For + * making the performance impact be minimum, it's suggested to do a + * check before calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_write(vid, addr, len); + * + * @param vid + * vhost device ID + * @param addr + * the starting address for write + * @param len + * the length to write + */ +void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len); + +/** + * Log the used ring update start at given offset. + * + * Same as rte_vhost_log_write, it's suggested to do a check before + * calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_used_vring(vid, vring_idx, offset, len); + * + * @param vid + * vhost device ID + * @param vring_idx + * the vring index + * @param offset + * the offset inside the used ring + * @param len + * the length to write + */ +void rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len); + +int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); + +/** + * Register vhost driver. path could be different for multiple + * instance support. + */ +int rte_vhost_driver_register(const char *path, uint64_t flags); + +/* Unregister vhost driver. This is only meaningful to vhost user. */ +int rte_vhost_driver_unregister(const char *path); + +/** + * Set the feature bits the vhost-user driver supports. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_set_features(const char *path, uint64_t features); + +/** + * Enable vhost-user driver features. + * + * Note that + * - the param @features should be a subset of the feature bits provided + * by rte_vhost_driver_set_features(). + * - it must be invoked before vhost-user negotiation starts. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to enable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_enable_features(const char *path, uint64_t features); + +/** + * Disable vhost-user driver features. + * + * The two notes at rte_vhost_driver_enable_features() also apply here. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to disable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_disable_features(const char *path, uint64_t features); + +/** + * Get the feature bits before feature negotiation. 
+ * + * @param path + * The vhost-user socket file path + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_get_features(const char *path, uint64_t *features); + +/** + * Get the feature bits after negotiation + * + * @param vid + * Vhost device ID + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_negotiated_features(int vid, uint64_t *features); + +/* Register callbacks. */ +int rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops); + +/** + * + * Start the vhost-user driver. + * + * This function triggers the vhost-user negotiation. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_start(const char *path); + +/** + * Get the MTU value of the device if set in QEMU. + * + * @param vid + * virtio-net device ID + * @param mtu + * The variable to store the MTU value + * + * @return + * 0: success + * -EAGAIN: device not yet started + * -ENOTSUP: device does not support MTU feature + */ +int rte_vhost_get_mtu(int vid, uint16_t *mtu); + +/** + * Get the numa node from which the virtio net device's memory + * is allocated. + * + * @param vid + * vhost device ID + * + * @return + * The numa node, -1 on failure + */ +int rte_vhost_get_numa_node(int vid); + +/** + * Get the virtio net device's ifname, which is the vhost-user socket + * file path. + * + * @param vid + * vhost device ID + * @param buf + * The buffer to stored the queried ifname + * @param len + * The length of buf + * + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_ifname(int vid, char *buf, size_t len); + +/** + * Get how many avail entries are left in the queue + * + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index + * + * @return + * num of avail entires left + */ +uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); + +struct rte_mbuf; +struct rte_mempool; +/** + * This function adds buffers to the virtio devices RX virtqueue. Buffers can + * be received from the physical port or from another virtual device. A packet + * count is returned to indicate the number of packets that were succesfully + * added to the RX queue. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param pkts + * array to contain packets to be enqueued + * @param count + * packets num to be enqueued + * @return + * num of packets enqueued + */ +uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count); + +/** + * This function gets guest buffers from the virtio device TX virtqueue, + * construct host mbufs, copies guest buffer content to host mbufs and + * store them in pkts to be processed. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param mbuf_pool + * mbuf_pool where host mbuf is allocated. + * @param pkts + * array to contain packets to be dequeued + * @param count + * packets num to be dequeued + * @return + * num of packets dequeued + */ +uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); + +/** + * Get guest mem table: a list of memory regions. + * + * An rte_vhost_vhost_memory object will be allocated internaly, to hold the + * guest memory regions. 
Application should free it at destroy_device() + * callback. + * + * @param vid + * vhost device ID + * @param mem + * To store the returned mem regions + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); + +/** + * Get guest vring info, including the vring address, vring size, etc. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param vring + * the structure to hold the requested vring info + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring); + +/** + * Set id of the last descriptors in avail and used guest vrings. + * + * In case user application operates directly on buffers, it should use this + * function on device destruction to retrieve the same values later on in device + * creation via rte_vhost_get_vhost_vring(int, uint16_t, struct rte_vhost_vring *) + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param last_avail_idx + * id of the last descriptor in avail ring to be set + * @param last_used_idx + * id of the last descriptor in used ring to be set + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_set_vhost_vring_last_idx(int vid, uint16_t vring_idx, + uint16_t last_avail_idx, uint16_t last_used_idx); + +#endif /* _RTE_VHOST_H_ */ diff --git a/src/spdk/lib/vhost/rte_vhost/socket.c b/src/spdk/lib/vhost/rte_vhost/socket.c new file mode 100644 index 00000000..1bc1e64b --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/socket.c @@ -0,0 +1,819 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdint.h> +#include <stdio.h> +#include <stdbool.h> +#include <limits.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/queue.h> +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> + +#include <rte_log.h> + +#include "fd_man.h" +#include "vhost.h" +#include "vhost_user.h" + + +TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); + +/* + * Every time rte_vhost_driver_register() is invoked, an associated + * vhost_user_socket struct will be created. + */ +struct vhost_user_socket { + struct vhost_user_connection_list conn_list; + pthread_mutex_t conn_mutex; + char *path; + int socket_fd; + struct sockaddr_un un; + bool is_server; + bool reconnect; + bool dequeue_zero_copy; + + /* + * The "supported_features" indicates the feature bits the + * vhost driver supports. The "features" indicates the feature + * bits after the rte_vhost_driver_features_disable/enable(). + * It is also the final feature bits used for vhost-user + * features negotiation. + */ + uint64_t supported_features; + uint64_t features; + + struct vhost_device_ops const *notify_ops; +}; + +struct vhost_user_connection { + struct vhost_user_socket *vsocket; + int connfd; + int vid; + + TAILQ_ENTRY(vhost_user_connection) next; +}; + +#define MAX_VHOST_SOCKET 1024 +struct vhost_user { + struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; + struct fdset fdset; + int vsocket_cnt; + pthread_mutex_t mutex; +}; + +#define MAX_VIRTIO_BACKLOG 128 + +static void vhost_user_server_new_connection(int fd, void *data, int *remove); +static void vhost_user_read_cb(int fd, void *dat, int *remove); +static int create_unix_socket(struct vhost_user_socket *vsocket); +static int vhost_user_start_client(struct vhost_user_socket *vsocket); + +static struct vhost_user vhost_user = { + .fdset = { + .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, + .fd_mutex = PTHREAD_MUTEX_INITIALIZER, + .num = 0 + }, + .vsocket_cnt = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +/* return bytes# of read on success or negative val on failure. 
*/ +int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); + return ret; + } + + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(fds, CMSG_DATA(cmsg), fdsize); + break; + } + } + + return ret; +} + +int +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + if (cmsg == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n"); + errno = EINVAL; + return -1; + } + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + ret = sendmsg(sockfd, &msgh, 0); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); + return ret; + } + + return ret; +} + +static void +vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) +{ + int vid; + size_t size; + struct vhost_user_connection *conn; + int ret; + + conn = malloc(sizeof(*conn)); + if (conn == NULL) { + close(fd); + return; + } + + vid = vhost_new_device(vsocket->features); + if (vid == -1) { + goto err; + } + + size = strnlen(vsocket->path, PATH_MAX); + vhost_set_ifname(vid, vsocket->path, size); + + if (vsocket->dequeue_zero_copy) + vhost_enable_dequeue_zero_copy(vid); + + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); + + if (vsocket->notify_ops->new_connection) { + ret = vsocket->notify_ops->new_connection(vid); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add vhost user connection with fd %d\n", + fd); + goto err; + } + } + + conn->connfd = fd; + conn->vsocket = vsocket; + conn->vid = vid; + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, + NULL, conn); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add fd %d into vhost server fdset\n", + fd); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + goto err; + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + return; + +err: + free(conn); + close(fd); +} + +/* call back when there is new vhost-user connection from client */ +static void +vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) +{ + struct vhost_user_socket *vsocket = dat; + + fd = accept(fd, NULL, NULL); + if (fd < 0) + return; + + 
RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); + vhost_user_add_connection(fd, vsocket); +} + +static void +vhost_user_read_cb(int connfd, void *dat, int *remove) +{ + struct vhost_user_connection *conn = dat; + struct vhost_user_socket *vsocket = conn->vsocket; + int ret; + + ret = vhost_user_msg_handler(conn->vid, connfd); + if (ret < 0) { + close(connfd); + *remove = 1; + vhost_destroy_device(conn->vid); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + + free(conn); + + if (vsocket->reconnect) { + create_unix_socket(vsocket); + vhost_user_start_client(vsocket); + } + } +} + +static int +create_unix_socket(struct vhost_user_socket *vsocket) +{ + int fd; + struct sockaddr_un *un = &vsocket->un; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", + vsocket->is_server ? "server" : "client", fd); + + if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "vhost-user: can't set nonblocking mode for socket, fd: " + "%d (%s)\n", fd, strerror(errno)); + close(fd); + return -1; + } + + memset(un, 0, sizeof(*un)); + un->sun_family = AF_UNIX; + strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path)); + un->sun_path[sizeof(un->sun_path) - 1] = '\0'; + + vsocket->socket_fd = fd; + return 0; +} + +static int +vhost_user_start_server(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + + ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to bind to %s: %s; remove it and try again\n", + path, strerror(errno)); + goto err; + } + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); + + ret = listen(fd, MAX_VIRTIO_BACKLOG); + if (ret < 0) + goto err; + + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, + NULL, vsocket); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add listen fd %d to vhost server fdset\n", + fd); + goto err; + } + + return 0; + +err: + close(fd); + return -1; +} + +struct vhost_user_reconnect { + struct sockaddr_un un; + int fd; + struct vhost_user_socket *vsocket; + + TAILQ_ENTRY(vhost_user_reconnect) next; +}; + +TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); +struct vhost_user_reconnect_list { + struct vhost_user_reconnect_tailq_list head; + pthread_mutex_t mutex; +}; + +static struct vhost_user_reconnect_list reconn_list; +static pthread_t reconn_tid; + +static int +vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) +{ + int ret, flags; + + ret = connect(fd, un, sz); + if (ret < 0 && errno != EISCONN) + return -1; + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't get flags for connfd %d\n", fd); + return -2; + } + if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't disable nonblocking on fd %d\n", fd); + return -2; + } + return 0; +} + +static void * +vhost_user_client_reconnect(void *arg __rte_unused) +{ + int ret; + struct vhost_user_reconnect *reconn, *next; + + while (1) { + pthread_mutex_lock(&reconn_list.mutex); + + /* + * An equal implementation of TAILQ_FOREACH_SAFE, + * which does not exist on all platforms. 
+ */ + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + ret = vhost_user_connect_nonblock(reconn->fd, + (struct sockaddr *)&reconn->un, + sizeof(reconn->un)); + if (ret == -2) { + close(reconn->fd); + RTE_LOG(ERR, VHOST_CONFIG, + "reconnection for fd %d failed\n", + reconn->fd); + goto remove_fd; + } + if (ret == -1) + continue; + + RTE_LOG(INFO, VHOST_CONFIG, + "%s: connected\n", reconn->vsocket->path); + vhost_user_add_connection(reconn->fd, reconn->vsocket); +remove_fd: + TAILQ_REMOVE(&reconn_list.head, reconn, next); + free(reconn); + } + + pthread_mutex_unlock(&reconn_list.mutex); + sleep(1); + } + + return NULL; +} + +static int +vhost_user_reconnect_init(void) +{ + int ret; + + pthread_mutex_init(&reconn_list.mutex, NULL); + TAILQ_INIT(&reconn_list.head); + + ret = pthread_create(&reconn_tid, NULL, + vhost_user_client_reconnect, NULL); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); + + return ret; +} + +static int +vhost_user_start_client(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + struct vhost_user_reconnect *reconn; + + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un, + sizeof(vsocket->un)); + if (ret == 0) { + vhost_user_add_connection(fd, vsocket); + return 0; + } + + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to connect to %s: %s\n", + path, strerror(errno)); + + if (ret == -2 || !vsocket->reconnect) { + close(fd); + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path); + reconn = malloc(sizeof(*reconn)); + if (reconn == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for reconnect\n"); + close(fd); + return -1; + } + reconn->un = vsocket->un; + reconn->fd = fd; + reconn->vsocket = vsocket; + pthread_mutex_lock(&reconn_list.mutex); + TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); + pthread_mutex_unlock(&reconn_list.mutex); + + return 0; +} + +static struct vhost_user_socket * +find_vhost_user_socket(const char *path) +{ + int i; + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) + return vsocket; + } + + return NULL; +} + +int +rte_vhost_driver_disable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->features &= ~features; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_enable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + if ((vsocket->supported_features & features) != features) { + /* + * trying to enable features the driver doesn't + * support. + */ + pthread_mutex_unlock(&vhost_user.mutex); + return -1; + } + vsocket->features |= features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_set_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + vsocket->supported_features = features; + vsocket->features = features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
0 : -1; +} + +int +rte_vhost_driver_get_features(const char *path, uint64_t *features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + *features = vsocket->features; + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + return -1; + } else { + return 0; + } +} + +/* + * Register a new vhost-user socket; here we could act as server + * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag + * is set. + */ +int +rte_vhost_driver_register(const char *path, uint64_t flags) +{ + int ret = -1; + struct vhost_user_socket *vsocket; + + if (!path) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + + if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: the number of vhost sockets reaches maximum\n"); + goto out; + } + + vsocket = malloc(sizeof(struct vhost_user_socket)); + if (!vsocket) + goto out; + memset(vsocket, 0, sizeof(struct vhost_user_socket)); + vsocket->path = strdup(path); + if (!vsocket->path) { + free(vsocket); + goto out; + } + TAILQ_INIT(&vsocket->conn_list); + pthread_mutex_init(&vsocket->conn_mutex, NULL); + vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + + /* + * Set the supported features correctly for the builtin vhost-user + * net driver. + * + * Applications know nothing about features the builtin virtio net + * driver (virtio_net.c) supports, thus it's not possible for them + * to invoke rte_vhost_driver_set_features(). To workaround it, here + * we set it unconditionally. If the application want to implement + * another vhost-user driver (say SCSI), it should call the + * rte_vhost_driver_set_features(), which will overwrite following + * two values. 
+ */ + vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; + vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { + vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); + if (vsocket->reconnect && reconn_tid == 0) { + if (vhost_user_reconnect_init() < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + } + } else { + vsocket->is_server = true; + } + ret = create_unix_socket(vsocket); + if (ret < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + + vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; + +out: + pthread_mutex_unlock(&vhost_user.mutex); + + return ret; +} + +static bool +vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) +{ + int found = false; + struct vhost_user_reconnect *reconn, *next; + + pthread_mutex_lock(&reconn_list.mutex); + + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + if (reconn->vsocket == vsocket) { + TAILQ_REMOVE(&reconn_list.head, reconn, next); + close(reconn->fd); + free(reconn); + found = true; + break; + } + } + pthread_mutex_unlock(&reconn_list.mutex); + return found; +} + +/** + * Unregister the specified vhost socket + */ +int +rte_vhost_driver_unregister(const char *path) +{ + int i; + int count; + struct vhost_user_connection *conn; + + pthread_mutex_lock(&vhost_user.mutex); + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) { + if (vsocket->is_server) { + fdset_del(&vhost_user.fdset, vsocket->socket_fd); + close(vsocket->socket_fd); + unlink(path); + } else if (vsocket->reconnect) { + vhost_user_remove_reconnect(vsocket); + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_FOREACH(conn, &vsocket->conn_list, next) { + close(conn->connfd); + } + pthread_mutex_unlock(&vsocket->conn_mutex); + + do { + pthread_mutex_lock(&vsocket->conn_mutex); + conn = TAILQ_FIRST(&vsocket->conn_list); + pthread_mutex_unlock(&vsocket->conn_mutex); + } while (conn != NULL); + + free(vsocket->path); + free(vsocket); + + count = --vhost_user.vsocket_cnt; + vhost_user.vsockets[i] = vhost_user.vsockets[count]; + vhost_user.vsockets[count] = NULL; + pthread_mutex_unlock(&vhost_user.mutex); + + return 0; + } + } + pthread_mutex_unlock(&vhost_user.mutex); + + return -1; +} + +/* + * Register ops so that we can add/remove device to data core. + */ +int +rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->notify_ops = ops; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +struct vhost_device_ops const * +vhost_driver_callback_get(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
vsocket->notify_ops : NULL; +} + +int +rte_vhost_driver_start(const char *path) +{ + struct vhost_user_socket *vsocket; + static pthread_t fdset_tid; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) + return -1; + + if (fdset_tid == 0) { + int ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch, + &vhost_user.fdset); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "failed to create fdset handling thread"); + } + + if (vsocket->is_server) + return vhost_user_start_server(vsocket); + else + return vhost_user_start_client(vsocket); +} diff --git a/src/spdk/lib/vhost/rte_vhost/vhost.c b/src/spdk/lib/vhost/rte_vhost/vhost.c new file mode 100644 index 00000000..9d4ae71b --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/vhost.c @@ -0,0 +1,482 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#ifdef RTE_LIBRTE_VHOST_NUMA +#include <numaif.h> +#endif + +#include <rte_ethdev.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_memory.h> +#include <rte_malloc.h> +#include <rte_vhost.h> + +#include "vhost.h" + +struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +struct virtio_net * +get_device(int vid) +{ + struct virtio_net *dev = vhost_devices[vid]; + + if (unlikely(!dev)) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) device not found.\n", vid); + } + + return dev; +} + +static void +cleanup_vq(struct vhost_virtqueue *vq, int destroy) +{ + if ((vq->callfd >= 0) && (destroy != 0)) + close(vq->callfd); + if (vq->kickfd >= 0) + close(vq->kickfd); +} + +/* + * Unmap any memory, close any file descriptors and + * free any memory owned by a device. 
+ */ +void +cleanup_device(struct virtio_net *dev, int destroy) +{ + uint32_t i; + + vhost_backend_cleanup(dev); + + for (i = 0; i < dev->nr_vring; i++) + cleanup_vq(dev->virtqueue[i], destroy); +} + +/* + * Release virtqueues and device memory. + */ +static void +free_device(struct virtio_net *dev) +{ + uint32_t i; + struct vhost_virtqueue *vq; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + rte_free(vq->shadow_used_ring); + + rte_free(vq); + } + + rte_free(dev); +} + +static void +init_vring_queue(struct vhost_virtqueue *vq) +{ + memset(vq, 0, sizeof(struct vhost_virtqueue)); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + /* Backends are set to -1 indicating an inactive device. */ + vq->backend = -1; + + /* + * always set the vq to enabled; this is to keep compatibility + * with the old QEMU, whereas there is no SET_VRING_ENABLE message. + */ + vq->enabled = 1; + + TAILQ_INIT(&vq->zmbuf_list); +} + +static void +reset_vring_queue(struct vhost_virtqueue *vq) +{ + int callfd; + + callfd = vq->callfd; + init_vring_queue(vq); + vq->callfd = callfd; +} + +int +alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) +{ + struct vhost_virtqueue *vq; + + vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0); + if (vq == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for vring:%u.\n", vring_idx); + return -1; + } + + dev->virtqueue[vring_idx] = vq; + init_vring_queue(vq); + + dev->nr_vring += 1; + + return 0; +} + +/* + * Reset some variables in device structure, while keeping few + * others untouched, such as vid, ifname, nr_vring: they + * should be same unless the device is removed. + */ +void +reset_device(struct virtio_net *dev) +{ + uint32_t i; + + dev->negotiated_features = 0; + dev->protocol_features = 0; + dev->flags = 0; + + for (i = 0; i < dev->nr_vring; i++) + reset_vring_queue(dev->virtqueue[i]); +} + +/* + * Invoked when there is a new vhost-user connection established (when + * there is a new virtio device being attached). + */ +int +vhost_new_device(uint64_t features) +{ + struct virtio_net *dev; + int i; + + dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); + if (dev == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for new dev.\n"); + return -1; + } + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + if (vhost_devices[i] == NULL) + break; + } + if (i == MAX_VHOST_DEVICE) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find a free slot for new device.\n"); + rte_free(dev); + return -1; + } + + vhost_devices[i] = dev; + dev->vid = i; + dev->features = features; + + return i; +} + +/* + * Invoked when there is the vhost-user connection is broken (when + * the virtio device is being detached). + */ +void +vhost_destroy_device(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(vid); + } + + cleanup_device(dev, 1); + free_device(dev); + + vhost_devices[vid] = NULL; +} + +void +vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) +{ + struct virtio_net *dev; + unsigned int len; + + dev = get_device(vid); + if (dev == NULL) + return; + + len = if_len > sizeof(dev->ifname) ? 
+ sizeof(dev->ifname) : if_len; + + strncpy(dev->ifname, if_name, len); + dev->ifname[sizeof(dev->ifname) - 1] = '\0'; +} + +void +vhost_enable_dequeue_zero_copy(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + dev->dequeue_zero_copy = 1; +} + +int +rte_vhost_get_mtu(int vid, uint16_t *mtu) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -ENODEV; + + if (!(dev->flags & VIRTIO_DEV_READY)) + return -EAGAIN; + + if (!(dev->negotiated_features & VIRTIO_NET_F_MTU)) + return -ENOTSUP; + + *mtu = dev->mtu; + + return 0; +} + +int +rte_vhost_get_numa_node(int vid) +{ +#ifdef RTE_LIBRTE_VHOST_NUMA + struct virtio_net *dev = get_device(vid); + int numa_node; + int ret; + + if (dev == NULL) + return -1; + + ret = get_mempolicy(&numa_node, NULL, 0, dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to query numa node: %d\n", vid, ret); + return -1; + } + + return numa_node; +#else + RTE_SET_USED(vid); + return -1; +#endif +} + +int +rte_vhost_get_ifname(int vid, char *buf, size_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + len = RTE_MIN(len, sizeof(dev->ifname)); + + strncpy(buf, dev->ifname, len); + buf[len - 1] = '\0'; + + return 0; +} + +int +rte_vhost_get_negotiated_features(int vid, uint64_t *features) +{ + struct virtio_net *dev; + + dev = get_device(vid); + if (!dev) + return -1; + + *features = dev->negotiated_features; + return 0; +} + +int +rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + struct virtio_net *dev; + struct rte_vhost_memory *m; + size_t size; + + dev = get_device(vid); + if (!dev) + return -1; + + size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region); + m = malloc(sizeof(struct rte_vhost_memory) + size); + if (!m) + return -1; + + m->nregions = dev->mem->nregions; + memcpy(m->regions, dev->mem->regions, size); + *mem = m; + + return 0; +} + +int +rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vring->desc = vq->desc; + vring->avail = vq->avail; + vring->used = vq->used; + vring->log_guest_addr = vq->log_guest_addr; + + vring->callfd = vq->callfd; + vring->kickfd = vq->kickfd; + vring->size = vq->size; + + vring->last_avail_idx = vq->last_avail_idx; + vring->last_used_idx = vq->last_used_idx; + + return 0; +} + +uint16_t +rte_vhost_avail_entries(int vid, uint16_t queue_id) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + vq = dev->virtqueue[queue_id]; + if (!vq->enabled) + return 0; + + return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; +} + +int +rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + if (enable) { + RTE_LOG(ERR, VHOST_CONFIG, + "guest notification isn't supported.\n"); + return -1; + } + + dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY; + return 0; +} + +void +rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + vhost_log_write(dev, addr, len); +} + +void +rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, 
uint64_t len) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (dev == NULL) + return; + + if (vring_idx >= VHOST_MAX_VRING) + return; + vq = dev->virtqueue[vring_idx]; + if (!vq) + return; + + vhost_log_used_vring(dev, vq, offset, len); +} + +int +rte_vhost_set_vhost_vring_last_idx(int vid, uint16_t vring_idx, + uint16_t last_avail_idx, uint16_t last_used_idx) { + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vq->last_avail_idx = last_avail_idx; + vq->last_used_idx = last_used_idx; + + return 0; +} diff --git a/src/spdk/lib/vhost/rte_vhost/vhost.h b/src/spdk/lib/vhost/rte_vhost/vhost.h new file mode 100644 index 00000000..b0a0201d --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/vhost.h @@ -0,0 +1,321 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_CDEV_H_ +#define _VHOST_NET_CDEV_H_ +#include <stdint.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <unistd.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <sys/socket.h> +#include <linux/if.h> + +#include <rte_log.h> +#include <rte_ether.h> + +#include "rte_vhost.h" +#include "vhost_user.h" + +/* Used to indicate that the device is running on a data core */ +#define VIRTIO_DEV_RUNNING 1 +/* Used to indicate that the device is ready to operate */ +#define VIRTIO_DEV_READY 2 + +/* Backend value set by guest. */ +#define VIRTIO_DEV_STOPPED -1 + +#define BUF_VECTOR_MAX 256 + +/** + * Structure contains buffer address, length and descriptor index + * from vring to do scatter RX. 
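/*
 * Illustrative sketch (an assumed usage example, not library code): calling
 * the public getters defined above -- rte_vhost_get_negotiated_features()
 * and rte_vhost_get_vhost_vring() -- to inspect queue 0 of device `vid`.
 * The struct fields printed are exactly the ones those getters fill in.
 */
#include <stdio.h>
#include <inttypes.h>

#include "rte_vhost.h"

static void
dump_vring0(int vid)
{
	struct rte_vhost_vring vring;
	uint64_t features;

	if (rte_vhost_get_negotiated_features(vid, &features) != 0)
		return;
	if (rte_vhost_get_vhost_vring(vid, 0, &vring) != 0)
		return;

	printf("vid %d: features 0x%" PRIx64 ", vq0 size %u, "
	       "last_avail %u, last_used %u\n",
	       vid, features, (unsigned)vring.size,
	       (unsigned)vring.last_avail_idx, (unsigned)vring.last_used_idx);
}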
+ */ +struct buf_vector { + uint64_t buf_addr; + uint32_t buf_len; + uint32_t desc_idx; +}; + +/* + * A structure to hold some fields needed in zero copy code path, + * mainly for associating an mbuf with the right desc_idx. + */ +struct zcopy_mbuf { + struct rte_mbuf *mbuf; + uint32_t desc_idx; + uint16_t in_use; + + TAILQ_ENTRY(zcopy_mbuf) next; +}; +TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf); + +/** + * Structure contains variables relevant to RX/TX virtqueues. + */ +struct vhost_virtqueue { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint32_t size; + + uint16_t last_avail_idx; + uint16_t last_used_idx; +#define VIRTIO_INVALID_EVENTFD (-1) +#define VIRTIO_UNINITIALIZED_EVENTFD (-2) + + /* Backend value to determine if device should started/stopped */ + int backend; + /* Used to notify the guest (trigger interrupt) */ + int callfd; + /* Currently unused as polling mode is enabled */ + int kickfd; + int enabled; + + /* Physical address of used ring, for logging */ + uint64_t log_guest_addr; + + uint16_t nr_zmbuf; + uint16_t zmbuf_size; + uint16_t last_zmbuf_idx; + struct zcopy_mbuf *zmbufs; + struct zcopy_mbuf_list zmbuf_list; + + struct vring_used_elem *shadow_used_ring; + uint16_t shadow_used_idx; +} __rte_cache_aligned; + +/* Old kernels have no such macros defined */ +#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE + #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 +#endif + +#ifndef VIRTIO_NET_F_MQ + #define VIRTIO_NET_F_MQ 22 +#endif + +#define VHOST_MAX_VRING 0x100 +#define VHOST_MAX_QUEUE_PAIRS 0x80 + +#ifndef VIRTIO_NET_F_MTU + #define VIRTIO_NET_F_MTU 3 +#endif + +/* + * Define virtio 1.0 for older kernels + */ +#ifndef VIRTIO_F_VERSION_1 + #define VIRTIO_F_VERSION_1 32 +#endif + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +/* Features supported by this builtin vhost-user net driver. */ +#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (1ULL << VIRTIO_NET_F_MQ) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_NET_F_MTU)) + + +struct guest_page { + uint64_t guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; +}; + +/** + * Device structure contains all configuration information relating + * to the device. + */ +struct virtio_net { + /* Frontend (QEMU) memory and memory region information */ + struct rte_vhost_memory *mem; + uint64_t features; + uint64_t negotiated_features; + uint64_t protocol_features; + int vid; + uint32_t is_nvme; + uint32_t flags; + uint16_t vhost_hlen; + /* to tell if we need broadcast rarp packet */ + rte_atomic16_t broadcast_rarp; + uint32_t nr_vring; + int dequeue_zero_copy; + struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; +#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? 
PATH_MAX : IFNAMSIZ) + char ifname[IF_NAME_SZ]; + uint64_t log_size; + uint64_t log_base; + uint64_t log_addr; + struct ether_addr mac; + uint16_t mtu; + + struct vhost_device_ops const *notify_ops; + + uint32_t nr_guest_pages; + uint32_t max_guest_pages; + struct guest_page *guest_pages; + int has_new_mem_table; + struct VhostUserMemory mem_table; + int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS]; +} __rte_cache_aligned; + + +#define VHOST_LOG_PAGE 4096 + +static inline void __attribute__((always_inline)) +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + log_base[page / 8] |= 1 << (page % 8); +} + +static inline void __attribute__((always_inline)) +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->negotiated_features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} + +static inline void __attribute__((always_inline)) +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VHOST_MAX_PRINT_BUFF 6072 +#define VHOST_LOG_LEVEL RTE_LOG_DEBUG +#define VHOST_LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#define PRINT_PACKET(device, addr, size, header) do { \ + char *pkt_addr = (char *)(addr); \ + unsigned int index; \ + char packet[VHOST_MAX_PRINT_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ + else \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ + \ + VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \ +} while (0) +#else +#define VHOST_LOG_LEVEL RTE_LOG_INFO +#define VHOST_LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) +#define PRINT_PACKET(device, addr, size, header) do {} while (0) +#endif + +extern uint64_t VHOST_FEATURES; +#define MAX_VHOST_DEVICE 1024 +extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +/* Convert guest physical address to host physical address */ +static inline phys_addr_t __attribute__((always_inline)) +gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + if (gpa >= page->guest_phys_addr && + gpa + size < page->guest_phys_addr + page->size) { + return gpa - page->guest_phys_addr + + page->host_phys_addr; + } + } + + return 0; +} + +struct virtio_net *get_device(int vid); + +int vhost_new_device(uint64_t features); +void cleanup_device(struct virtio_net *dev, int destroy); +void reset_device(struct virtio_net *dev); +void vhost_destroy_device(int); + +int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); + +void vhost_set_ifname(int, const char *if_name, unsigned int if_len); +void vhost_enable_dequeue_zero_copy(int vid); + +struct vhost_device_ops const *vhost_driver_callback_get(const char *path); + +/* + * Backend-specific cleanup. + * + * TODO: fix it; we have one backend now + */ +void vhost_backend_cleanup(struct virtio_net *dev); + +#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/src/spdk/lib/vhost/rte_vhost/vhost_user.c b/src/spdk/lib/vhost/rte_vhost/vhost_user.c new file mode 100644 index 00000000..b708a8a7 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/vhost_user.c @@ -0,0 +1,1360 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
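/*
 * Worked sketch of the dirty-log addressing used by vhost_log_page() and
 * vhost_log_write() above: one bit per 4 KiB page of guest memory, eight
 * pages per bitmap byte. This standalone helper only recomputes which bits
 * a write of `len` bytes at `addr` would touch; it is illustrative and not
 * part of the library. Example: an 8 KiB write at guest address 0x1000
 * marks pages 1 and 2, i.e. bits 1 and 2 of log byte 0.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define LOG_PAGE_SIZE 4096ULL

static void
show_dirty_log_bits(uint64_t addr, uint64_t len)
{
	uint64_t page = addr / LOG_PAGE_SIZE;
	uint64_t end = addr + len;

	while (page * LOG_PAGE_SIZE < end) {
		/* Same byte/bit split as vhost_log_page(): page / 8, page % 8. */
		printf("page %" PRIu64 " -> log byte %" PRIu64 ", bit %u\n",
		       page, page / 8, (unsigned)(page % 8));
		page++;
	}
}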
+ */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <unistd.h> +#include <sys/mman.h> +#include <asm/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <assert.h> +#ifdef RTE_LIBRTE_VHOST_NUMA +#include <numaif.h> +#endif + +#include <rte_common.h> +#include <rte_malloc.h> +#include <rte_log.h> + +#include "vhost.h" +#include "vhost_user.h" + +#define VIRTIO_MIN_MTU 68 +#define VIRTIO_MAX_MTU 65535 + +static const char *vhost_message_str[VHOST_USER_MAX] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", + [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG", + [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG", + [VHOST_USER_NVME_ADMIN] = "VHOST_USER_NVME_ADMIN", + [VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL", + [VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP", + [VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP", + [VHOST_USER_NVME_IO_CMD] = "VHOST_USER_NVME_IO_CMD" +}; + +static uint64_t +get_blk_size(int fd) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; +} + +static void +free_mem_region(struct virtio_net *dev) +{ + uint32_t i; + struct rte_vhost_mem_region *reg; + + if (!dev || !dev->mem) + return; + + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + if (reg->host_user_addr) { + munmap(reg->mmap_addr, reg->mmap_size); + close(reg->fd); + } + } +} + +void +vhost_backend_cleanup(struct virtio_net *dev) +{ + uint32_t i; + + if (dev->mem) { + if (dev->has_new_mem_table) { + for (i = 0; i < dev->mem->nregions; i++) { + close(dev->mem_table_fds[i]); + } + dev->has_new_mem_table = 0; + } + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + free(dev->guest_pages); + dev->guest_pages = NULL; + + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + dev->log_addr = 0; + } +} + +/* + * This function just returns success at the moment unless + * the device hasn't been initialised. 
+ */ +static int +vhost_user_set_owner(void) +{ + return 0; +} + +static int +vhost_user_reset_owner(struct virtio_net *dev) +{ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + cleanup_device(dev, 0); + reset_device(dev); + return 0; +} + +/* + * The features that we support are requested. + */ +static uint64_t +vhost_user_get_features(struct virtio_net *dev) +{ + return dev->features; +} + +/* + * We receive the negotiated features supported by us and the virtio device. + */ +static int +vhost_user_set_features(struct virtio_net *dev, uint64_t features) +{ + uint64_t vhost_features = 0; + + vhost_features = vhost_user_get_features(dev); + if (features & ~vhost_features) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) received invalid negotiated features.\n", + dev->vid); + return -1; + } + + if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->negotiated_features != features) { + if (dev->notify_ops->features_changed) { + dev->notify_ops->features_changed(dev->vid, features); + } else { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + } + + dev->negotiated_features = features; + if (dev->negotiated_features & + ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { + dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + dev->vhost_hlen = sizeof(struct virtio_net_hdr); + } + VHOST_LOG_DEBUG(VHOST_CONFIG, + "(%d) mergeable RX buffers %s, virtio 1 %s\n", + dev->vid, + (dev->negotiated_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", + (dev->negotiated_features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); + + return 0; +} + +/* + * The virtio device sends us the size of the descriptor ring. + */ +static int +vhost_user_set_vring_num(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + vq->size = msg->payload.state.num; + + if (dev->dequeue_zero_copy) { + vq->nr_zmbuf = 0; + vq->last_zmbuf_idx = 0; + vq->zmbuf_size = vq->size; + vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0); + if (vq->zmbufs == NULL) { + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to allocate mem for zero copy; " + "zero copy is force disabled\n"); + dev->dequeue_zero_copy = 0; + } + } + + vq->shadow_used_ring = rte_malloc(NULL, + vq->size * sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_ring) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + return 0; +} + +/* + * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the + * same numa node as the memory of vring descriptor. 
+ */ +#ifdef RTE_LIBRTE_VHOST_NUMA +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index) +{ + int oldnode, newnode; + struct virtio_net *old_dev; + struct vhost_virtqueue *old_vq, *vq; + int ret; + + old_dev = dev; + vq = old_vq = dev->virtqueue[index]; + + ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, + MPOL_F_NODE | MPOL_F_ADDR); + + /* check if we need to reallocate vq */ + ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get vq numa information.\n"); + return dev; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate vq from %d to %d node\n", oldnode, newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); + if (!vq) + return dev; + + memcpy(vq, old_vq, sizeof(*vq)); + rte_free(old_vq); + } + + /* check if we need to reallocate dev */ + ret = get_mempolicy(&oldnode, NULL, 0, old_dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get dev numa information.\n"); + goto out; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate dev from %d to %d node\n", + oldnode, newnode); + dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); + if (!dev) { + dev = old_dev; + goto out; + } + + memcpy(dev, old_dev, sizeof(*dev)); + rte_free(old_dev); + } + +out: + dev->virtqueue[index] = vq; + vhost_devices[dev->vid] = dev; + + return dev; +} +#else +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index __rte_unused) +{ + return dev; +} +#endif + +/* + * Converts QEMU virtual address to Vhost virtual address. This function is + * used to convert the ring addresses to our address space. + */ +static uint64_t +qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + /* Find the region where the address lives. */ + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + + if (qva >= reg->guest_user_addr && + qva < reg->guest_user_addr + reg->size) { + + if (unlikely(*len > reg->guest_user_addr + reg->size - qva)) + *len = reg->guest_user_addr + reg->size - qva; + + return qva - reg->guest_user_addr + + reg->host_user_addr; + } + } + + return 0; +} + +static int vhost_setup_mem_table(struct virtio_net *dev); + +/* + * The virtio device sends us the desc, used and avail ring addresses. + * This function then converts these to our address space. + */ +static int +vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq; + uint64_t len; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + if (dev->has_new_mem_table) { + vhost_setup_mem_table(dev); + dev->has_new_mem_table = 0; + } + + if (dev->mem == NULL) + return -1; + + /* addr->index refers to the queue index. The txq 1, rxq is 0. */ + vq = dev->virtqueue[msg->payload.addr.index]; + + /* The addresses are converted from QEMU virtual to Vhost virtual. 
*/ + len = sizeof(struct vring_desc) * vq->size; + vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.desc_user_addr, &len); + if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to map desc ring.\n", + dev->vid); + return -1; + } + + dev = numa_realloc(dev, msg->payload.addr.index); + vq = dev->virtqueue[msg->payload.addr.index]; + + len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; + vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.avail_user_addr, &len); + if (vq->avail == 0 || + len != sizeof(struct vring_avail) + + sizeof(uint16_t) * vq->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find avail ring address.\n", + dev->vid); + return -1; + } + + len = sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size; + vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.used_user_addr, &len); + if (vq->used == 0 || len != sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size) { + + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find used ring address.\n", + dev->vid); + return -1; + } + + if (vq->last_used_idx != vq->used->idx) { + RTE_LOG(WARNING, VHOST_CONFIG, + "last_used_idx (%u) and vq->used->idx (%u) mismatches; " + "some packets maybe resent for Tx and dropped for Rx\n", + vq->last_used_idx, vq->used->idx); + vq->last_used_idx = vq->used->idx; + vq->last_avail_idx = vq->used->idx; + } + + vq->log_guest_addr = msg->payload.addr.log_guest_addr; + + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + dev->vid, vq->desc); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + dev->vid, vq->avail); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + dev->vid, vq->used); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + dev->vid, vq->log_guest_addr); + + return 0; +} + +/* + * The virtio device sends us the available ring last used index. + */ +static int +vhost_user_set_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + /* Remove from the data plane. 
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num; + dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num; + + return 0; +} + +static void +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, + uint64_t host_phys_addr, uint64_t size) +{ + struct guest_page *page, *last_page; + + if (dev->nr_guest_pages == dev->max_guest_pages) { + dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2); + dev->guest_pages = realloc(dev->guest_pages, + dev->max_guest_pages * sizeof(*page)); + } + + if (dev->nr_guest_pages > 0) { + last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; + /* merge if the two pages are continuous */ + if (host_phys_addr == last_page->host_phys_addr + + last_page->size) { + last_page->size += size; + return; + } + } + + page = &dev->guest_pages[dev->nr_guest_pages++]; + page->guest_phys_addr = guest_phys_addr; + page->host_phys_addr = host_phys_addr; + page->size = size; +} + +static void +add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, + uint64_t page_size) +{ + uint64_t reg_size = reg->size; + uint64_t host_user_addr = reg->host_user_addr; + uint64_t guest_phys_addr = reg->guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; + + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); + size = page_size - (guest_phys_addr & (page_size - 1)); + size = RTE_MIN(size, reg_size); + + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + + while (reg_size > 0) { + size = RTE_MIN(reg_size, page_size); + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) + host_user_addr); + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + } +} + +#ifdef RTE_LIBRTE_VHOST_DEBUG +/* TODO: enable it only in debug mode? */ +static void +dump_guest_pages(struct virtio_net *dev) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + RTE_LOG(INFO, VHOST_CONFIG, + "guest physical page region %u\n" + "\t guest_phys_addr: %" PRIx64 "\n" + "\t host_phys_addr : %" PRIx64 "\n" + "\t size : %" PRIx64 "\n", + i, + page->guest_phys_addr, + page->host_phys_addr, + page->size); + } +} +#else +#define dump_guest_pages(dev) +#endif + +static int +vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + uint32_t i; + + if (dev->has_new_mem_table) { + /* + * The previous mem table was not consumed, so close the + * file descriptors from that mem table before copying + * the new one. + */ + for (i = 0; i < dev->mem_table.nregions; i++) { + close(dev->mem_table_fds[i]); + } + } + + memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table)); + memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds)); + dev->has_new_mem_table = 1; + /* vhost-user-nvme will not send + * set vring addr message, enable + * memory address table now. 
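/*
 * Illustrative sketch of the merge rule in add_one_guest_page() above: a new
 * guest page is appended to the table unless its host physical address
 * continues the previous entry, in which case the previous entry is simply
 * extended. Standalone, fixed-size variant for clarity only; the real code
 * grows the array with realloc(), and callers add pages in ascending guest
 * order, so only host contiguity is checked here as well.
 */
#include <stdint.h>

struct gp_entry { uint64_t gpa, hpa, size; };

static unsigned
gp_table_add(struct gp_entry *tbl, unsigned nr,
	     uint64_t gpa, uint64_t hpa, uint64_t size)
{
	if (nr > 0 && hpa == tbl[nr - 1].hpa + tbl[nr - 1].size) {
		/* Host-contiguous with the previous entry: coalesce. */
		tbl[nr - 1].size += size;
		return nr;
	}

	/* Caller must ensure the table has room for one more entry. */
	tbl[nr].gpa = gpa;
	tbl[nr].hpa = hpa;
	tbl[nr].size = size;
	return nr + 1;
}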
+ */ + if (dev->has_new_mem_table && dev->is_nvme) { + vhost_setup_mem_table(dev); + dev->has_new_mem_table = 0; + } + + return 0; +} + + static int +vhost_setup_mem_table(struct virtio_net *dev) +{ + struct VhostUserMemory memory = dev->mem_table; + struct rte_vhost_mem_region *reg; + struct vhost_virtqueue *vq; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + uint32_t i; + int fd; + + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + /* Those addresses won't be valid anymore in host address space + * after setting new mem table. Initiator need to resend these + * addresses. + */ + vq->desc = NULL; + vq->avail = NULL; + vq->used = NULL; + } + + dev->nr_guest_pages = 0; + if (!dev->guest_pages) { + dev->max_guest_pages = 8; + dev->guest_pages = malloc(dev->max_guest_pages * + sizeof(struct guest_page)); + } + + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + + sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); + if (dev->mem == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); + return -1; + } + dev->mem->nregions = memory.nregions; + + for (i = 0; i < memory.nregions; i++) { + fd = dev->mem_table_fds[i]; + reg = &dev->mem->regions[i]; + + reg->guest_phys_addr = memory.regions[i].guest_phys_addr; + reg->guest_user_addr = memory.regions[i].userspace_addr; + reg->size = memory.regions[i].memory_size; + reg->fd = fd; + + mmap_offset = memory.regions[i].mmap_offset; + mmap_size = reg->size + mmap_offset; + + /* mmap() without flag of MAP_ANONYMOUS, should be called + * with length argument aligned with hugepagesz at older + * longterm version Linux, like 2.6.32 and 3.2.72, or + * mmap() will fail with EINVAL. + * + * to avoid failure, make sure in caller to keep length + * aligned. 
+ */ + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + goto err_mmap; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (mmap_addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap region %u failed.\n", i); + goto err_mmap; + } + + if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "MADV_DONTDUMP advice setting failed.\n"); + } + + reg->mmap_addr = mmap_addr; + reg->mmap_size = mmap_size; + reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + if (dev->dequeue_zero_copy) + add_guest_pages(dev, reg, alignment); + + RTE_LOG(INFO, VHOST_CONFIG, + "guest memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + i, reg->size, + reg->guest_phys_addr, + reg->guest_user_addr, + reg->host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + } + + dump_guest_pages(dev); + + return 0; + +err_mmap: + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + return -1; +} + +static int +vq_is_ready(struct vhost_virtqueue *vq) +{ + return vq && vq->desc && + vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->kickfd != VIRTIO_INVALID_EVENTFD && + vq->callfd != VIRTIO_INVALID_EVENTFD; +} + +static int +virtio_is_ready(struct virtio_net *dev) +{ + struct vhost_virtqueue *vq; + uint32_t i; + + if (dev->nr_vring == 0) + return 0; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + if (vq_is_ready(vq)) { + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is now ready for processing.\n"); + return 1; + } + } + + return 0; +} + +static void +vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring call idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = file.fd; +} + +static void +vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + /* Remove from the data plane. 
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring kick idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->kickfd >= 0) + close(vq->kickfd); + vq->kickfd = file.fd; +} + +static void +free_zmbufs(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + rte_pktmbuf_free(zmbuf->mbuf); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + } + + rte_free(vq->zmbufs); +} + +/* + * when virtio is stopped, qemu will send us the GET_VRING_BASE message. + */ +static int +vhost_user_get_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + /* We have to stop the queue (virtio) if it is running. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->flags &= ~VIRTIO_DEV_READY; + + /* Here we are safe to get the last used index */ + msg->payload.state.num = vq->last_used_idx; + + RTE_LOG(INFO, VHOST_CONFIG, + "vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num); + /* + * Based on current qemu vhost-user implementation, this message is + * sent and only sent in vhost_vring_stop. + * TODO: cleanup the vring, it isn't usable since here. + */ + if (vq->kickfd >= 0) + close(vq->kickfd); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (dev->dequeue_zero_copy) + free_zmbufs(vq); + rte_free(vq->shadow_used_ring); + vq->shadow_used_ring = NULL; + + return 0; +} + +/* + * when virtio queues are ready to work, qemu will send us to + * enable the virtio queue pair. + */ +static int +vhost_user_set_vring_enable(struct virtio_net *dev, + VhostUserMsg *msg) +{ + int enable = (int)msg->payload.state.num; + + RTE_LOG(INFO, VHOST_CONFIG, + "set queue enable: %d to qp idx: %d\n", + enable, msg->payload.state.index); + + if (dev->notify_ops->vring_state_changed) + dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable); + + dev->virtqueue[msg->payload.state.index]->enabled = enable; + + return 0; +} + +static void +vhost_user_set_protocol_features(struct virtio_net *dev, + uint64_t protocol_features) +{ + if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) + return; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->protocol_features = protocol_features; +} + +static int +vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + uint64_t size, off; + void *addr; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); + return -1; + } + + if (msg->size != sizeof(VhostUserLog)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid log base msg size: %"PRId32" != %d\n", + msg->size, (int)sizeof(VhostUserLog)); + return -1; + } + + /* Remove from the data plane. 
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + RTE_LOG(INFO, VHOST_CONFIG, + "log mmap size: %"PRId64", offset: %"PRId64"\n", + size, off); + + /* + * mmap from 0 to workaround a hugepage mmap bug: mmap will + * fail when offset is not page size aligned. + */ + addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); + return -1; + } + + /* + * Free previously mapped log memory on occasionally + * multiple VHOST_USER_SET_LOG_BASE. + */ + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + } + dev->log_addr = (uint64_t)(uintptr_t)addr; + dev->log_base = dev->log_addr + off; + dev->log_size = size; + + return 0; +} + +/* + * An rarp packet is constructed and broadcasted to notify switches about + * the new location of the migrated VM, so that packets from outside will + * not be lost after migration. + * + * However, we don't actually "send" a rarp packet here, instead, we set + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. + */ +static int +vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + uint8_t *mac = (uint8_t *)&msg->payload.u64; + + RTE_LOG(DEBUG, VHOST_CONFIG, + ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + memcpy(dev->mac.addr_bytes, mac, 6); + + /* + * Set the flag to inject a RARP broadcast packet at + * rte_vhost_dequeue_burst(). + * + * rte_smp_wmb() is for making sure the mac is copied + * before the flag is set. + */ + rte_smp_wmb(); + rte_atomic16_set(&dev->broadcast_rarp, 1); + + return 0; +} + +static int +vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + if (msg->payload.u64 < VIRTIO_MIN_MTU || + msg->payload.u64 > VIRTIO_MAX_MTU) { + RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", + msg->payload.u64); + + return -1; + } + + dev->mtu = msg->payload.u64; + + return 0; +} + +/* return bytes# of read on success or negative val on failure. 
*/ +static int +read_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS); + if (ret <= 0) + return ret; + + if (msg && msg->size) { + if (msg->size > sizeof(msg->payload)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid msg size: %d\n", msg->size); + return -1; + } + ret = read(sockfd, &msg->payload, msg->size); + if (ret <= 0) + return ret; + if (ret != (int)msg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "read control message failed\n"); + return -1; + } + } + + return ret; +} + +static int +send_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + if (!msg) + return 0; + + msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags &= ~VHOST_USER_NEED_REPLY; + msg->flags |= VHOST_USER_VERSION; + msg->flags |= VHOST_USER_REPLY_MASK; + + ret = send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); + + return ret; +} + +/* + * Allocate a queue pair if it hasn't been allocated yet + */ +static int +vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) +{ + uint16_t vring_idx; + + switch (msg->request) { + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + break; + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + vring_idx = msg->payload.state.index; + break; + case VHOST_USER_SET_VRING_ADDR: + vring_idx = msg->payload.addr.index; + break; + default: + return 0; + } + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid vring index: %u\n", vring_idx); + return -1; + } + + if (dev->virtqueue[vring_idx]) + return 0; + + return alloc_vring_queue(dev, vring_idx); +} + +static int +vhost_user_nvme_io_request_passthrough(struct virtio_net *dev, + uint16_t qid, uint16_t tail_head, + bool is_submission_queue) +{ + return -1; +} + +static int +vhost_user_nvme_admin_passthrough(struct virtio_net *dev, + void *cmd, void *cqe, void *buf) +{ + if (dev->notify_ops->vhost_nvme_admin_passthrough) { + return dev->notify_ops->vhost_nvme_admin_passthrough(dev->vid, cmd, cqe, buf); + } + + return -1; +} + +static int +vhost_user_nvme_set_cq_call(struct virtio_net *dev, uint16_t qid, int fd) +{ + if (dev->notify_ops->vhost_nvme_set_cq_call) { + return dev->notify_ops->vhost_nvme_set_cq_call(dev->vid, qid, fd); + } + + return -1; +} + +static int +vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap) +{ + if (dev->notify_ops->vhost_nvme_get_cap) { + return dev->notify_ops->vhost_nvme_get_cap(dev->vid, cap); + } + + return -1; +} + +int +vhost_user_msg_handler(int vid, int fd) +{ + struct virtio_net *dev; + struct VhostUserMsg msg; + struct vhost_vring_file file; + int ret; + uint64_t cap; + uint64_t enable; + uint8_t cqe[16]; + uint8_t cmd[64]; + uint8_t buf[4096]; + uint16_t qid, tail_head; + bool is_submission_queue; + + dev = get_device(vid); + if (dev == NULL) + return -1; + + if (!dev->notify_ops) { + dev->notify_ops = vhost_driver_callback_get(dev->ifname); + if (!dev->notify_ops) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get callback ops for driver %s\n", + dev->ifname); + return -1; + } + } + + ret = read_vhost_message(fd, &msg); + if (ret <= 0 || msg.request >= VHOST_USER_MAX) { + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read message failed\n"); + else if (ret == 0) + RTE_LOG(INFO, VHOST_CONFIG, + "vhost peer closed\n"); + 
else + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read incorrect message\n"); + + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: read message %s\n", + dev->ifname, vhost_message_str[msg.request]); + + ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to alloc queue\n"); + return -1; + } + + switch (msg.request) { + case VHOST_USER_GET_CONFIG: + if (dev->notify_ops->get_config(dev->vid, + msg.payload.config.region, + msg.payload.config.size) != 0) { + msg.size = sizeof(uint64_t); + } + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_CONFIG: + if ((dev->notify_ops->set_config(dev->vid, + msg.payload.config.region, + msg.payload.config.offset, + msg.payload.config.size, + msg.payload.config.flags)) != 0) { + ret = 1; + } else { + ret = 0; + } + break; + case VHOST_USER_NVME_ADMIN: + if (!dev->is_nvme) { + dev->is_nvme = 1; + } + memcpy(cmd, msg.payload.nvme.cmd.req, sizeof(cmd)); + ret = vhost_user_nvme_admin_passthrough(dev, cmd, cqe, buf); + memcpy(msg.payload.nvme.cmd.cqe, cqe, sizeof(cqe)); + msg.size = sizeof(cqe); + /* NVMe Identify Command */ + if (cmd[0] == 0x06) { + memcpy(msg.payload.nvme.buf, &buf, 4096); + msg.size += 4096; + } + send_vhost_message(fd, &msg); + break; + case VHOST_USER_NVME_SET_CQ_CALL: + file.index = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; + file.fd = msg.fds[0]; + ret = vhost_user_nvme_set_cq_call(dev, file.index, file.fd); + break; + case VHOST_USER_NVME_GET_CAP: + ret = vhost_user_nvme_get_cap(dev, &cap); + if (!ret) + msg.payload.u64 = cap; + else + msg.payload.u64 = 0; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_NVME_START_STOP: + enable = msg.payload.u64; + /* device must be started before set cq call */ + if (enable) { + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } else { + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + } + break; + case VHOST_USER_NVME_IO_CMD: + qid = msg.payload.nvme_io.qid; + tail_head = msg.payload.nvme_io.tail_head; + is_submission_queue = (msg.payload.nvme_io.queue_type == VHOST_USER_NVME_SUBMISSION_QUEUE) ? 
true : false; + vhost_user_nvme_io_request_passthrough(dev, qid, tail_head, is_submission_queue); + break; + case VHOST_USER_GET_FEATURES: + msg.payload.u64 = vhost_user_get_features(dev); + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_FEATURES: + vhost_user_set_features(dev, msg.payload.u64); + break; + + case VHOST_USER_GET_PROTOCOL_FEATURES: + msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + vhost_user_set_protocol_features(dev, msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + vhost_user_set_owner(); + break; + case VHOST_USER_RESET_OWNER: + vhost_user_reset_owner(dev); + break; + + case VHOST_USER_SET_MEM_TABLE: + ret = vhost_user_set_mem_table(dev, &msg); + break; + + case VHOST_USER_SET_LOG_BASE: + vhost_user_set_log_base(dev, &msg); + + /* it needs a reply */ + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_LOG_FD: + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + break; + + case VHOST_USER_SET_VRING_NUM: + vhost_user_set_vring_num(dev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + vhost_user_set_vring_addr(dev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + vhost_user_set_vring_base(dev, &msg); + break; + + case VHOST_USER_GET_VRING_BASE: + vhost_user_get_vring_base(dev, &msg); + msg.size = sizeof(msg.payload.state); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_KICK: + vhost_user_set_vring_kick(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + vhost_user_set_vring_call(dev, &msg); + break; + + case VHOST_USER_SET_VRING_ERR: + if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + break; + + case VHOST_USER_GET_QUEUE_NUM: + msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_ENABLE: + vhost_user_set_vring_enable(dev, &msg); + break; + case VHOST_USER_SEND_RARP: + vhost_user_send_rarp(dev, &msg); + break; + + case VHOST_USER_NET_SET_MTU: + ret = vhost_user_net_set_mtu(dev, &msg); + break; + + default: + ret = -1; + break; + + } + + if (msg.flags & VHOST_USER_NEED_REPLY) { + msg.payload.u64 = !!ret; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + } + + if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { + dev->flags |= VIRTIO_DEV_READY; + + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->dequeue_zero_copy) { + RTE_LOG(INFO, VHOST_CONFIG, + "dequeue zero copy is enabled\n"); + } + + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } + + return 0; +} diff --git a/src/spdk/lib/vhost/rte_vhost/vhost_user.h b/src/spdk/lib/vhost/rte_vhost/vhost_user.h new file mode 100644 index 00000000..cb5ff0a6 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/vhost_user.h @@ -0,0 +1,182 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include <stdint.h> +#include <linux/vhost.h> + +#include "rte_vhost.h" + +/* refer to hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +/* + * Maximum size of virtio device config space + */ +#define VHOST_USER_MAX_CONFIG_SIZE 256 + +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#define VHOST_USER_PROTOCOL_F_RARP 2 +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#define VHOST_USER_PROTOCOL_F_NET_MTU 4 +#define VHOST_USER_PROTOCOL_F_CONFIG 9 + +#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ + (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \ + (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ + (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_NVME_ADMIN = 80, + VHOST_USER_NVME_SET_CQ_CALL = 81, + VHOST_USER_NVME_GET_CAP = 82, + VHOST_USER_NVME_START_STOP = 83, + VHOST_USER_NVME_IO_CMD = 84, + VHOST_USER_MAX +} VhostUserRequest; + +typedef enum VhostUserSlaveRequest { + VHOST_USER_SLAVE_NONE = 0, + VHOST_USER_SLAVE_IOTLB_MSG = 1, + VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, + VHOST_USER_SLAVE_MAX +} VhostUserSlaveRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion 
regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +typedef struct VhostUserConfig { + uint32_t offset; + uint32_t size; + uint32_t flags; + uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; +} VhostUserConfig; + +enum VhostUserNvmeQueueTypes { + VHOST_USER_NVME_SUBMISSION_QUEUE = 1, + VHOST_USER_NVME_COMPLETION_QUEUE = 2, +}; + +typedef struct VhostUserNvmeIO { + enum VhostUserNvmeQueueTypes queue_type; + uint32_t qid; + uint32_t tail_head; +} VhostUserNvmeIO; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + VhostUserConfig config; + struct nvme { + union { + uint8_t req[64]; + uint8_t cqe[16]; + } cmd; + uint8_t buf[4096]; + } nvme; + struct VhostUserNvmeIO nvme_io; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)) VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + + +/* vhost_user.c */ +int vhost_user_msg_handler(int vid, int fd); + +/* socket.c */ +int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); +int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); + +#endif diff --git a/src/spdk/lib/vhost/vhost.c b/src/spdk/lib/vhost/vhost.c new file mode 100644 index 00000000..0cacf613 --- /dev/null +++ b/src/spdk/lib/vhost/vhost.c @@ -0,0 +1,1503 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
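/*
 * Illustrative sketch (assumed helper, not part of the protocol code): how
 * the u64 payload of SET_VRING_KICK/SET_VRING_CALL is decoded with the masks
 * defined above -- the low byte is the vring index, and the NOFD bit signals
 * that no file descriptor accompanies the message -- mirroring what
 * vhost_user_set_vring_kick()/_call() do with pmsg->fds[0].
 */
#include <stdint.h>

#include "vhost_user.h"

struct vring_fd_req {
	uint16_t index;
	int fd;		/* -1 (VIRTIO_INVALID_EVENTFD) when the NOFD bit is set */
};

static struct vring_fd_req
decode_vring_fd_payload(uint64_t u64, int attached_fd)
{
	struct vring_fd_req req;

	req.index = u64 & VHOST_USER_VRING_IDX_MASK;
	req.fd = (u64 & VHOST_USER_VRING_NOFD_MASK) ? -1 : attached_fd;
	return req;
}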
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/barrier.h" + +#include "spdk/vhost.h" +#include "vhost_internal.h" + +static uint32_t *g_num_ctrlrs; + +/* Path to folder where character device will be created. Can be set by user. */ +static char dev_dirname[PATH_MAX] = ""; + +struct spdk_vhost_dev_event_ctx { + /** Pointer to the controller obtained before enqueuing the event */ + struct spdk_vhost_dev *vdev; + + /** ID of the vdev to send event to. */ + unsigned vdev_id; + + /** User callback function to be executed on given lcore. */ + spdk_vhost_event_fn cb_fn; + + /** Semaphore used to signal that event is done. */ + sem_t sem; + + /** Response to be written by enqueued event. */ + int response; +}; + +static int new_connection(int vid); +static int start_device(int vid); +static void stop_device(int vid); +static void destroy_connection(int vid); +static int get_config(int vid, uint8_t *config, uint32_t len); +static int set_config(int vid, uint8_t *config, uint32_t offset, + uint32_t size, uint32_t flags); + +const struct vhost_device_ops g_spdk_vhost_ops = { + .new_device = start_device, + .destroy_device = stop_device, + .get_config = get_config, + .set_config = set_config, + .new_connection = new_connection, + .destroy_connection = destroy_connection, + .vhost_nvme_admin_passthrough = spdk_vhost_nvme_admin_passthrough, + .vhost_nvme_set_cq_call = spdk_vhost_nvme_set_cq_call, + .vhost_nvme_get_cap = spdk_vhost_nvme_get_cap, +}; + +static TAILQ_HEAD(, spdk_vhost_dev) g_spdk_vhost_devices = TAILQ_HEAD_INITIALIZER( + g_spdk_vhost_devices); +static pthread_mutex_t g_spdk_vhost_mutex = PTHREAD_MUTEX_INITIALIZER; + +void *spdk_vhost_gpa_to_vva(struct spdk_vhost_dev *vdev, uint64_t addr, uint64_t len) +{ + void *vva; + uint64_t newlen; + + newlen = len; + vva = (void *)rte_vhost_va_from_guest_pa(vdev->mem, addr, &newlen); + if (newlen != len) { + return NULL; + } + + return vva; + +} + +static void +spdk_vhost_log_req_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_id) +{ + struct vring_desc *desc, *desc_table; + uint32_t desc_table_size; + int rc; + + if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) { + return; + } + + rc = spdk_vhost_vq_get_desc(vdev, virtqueue, req_id, &desc, &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Can't log used ring descriptors!\n"); + return; + } + + do { + if (spdk_vhost_vring_desc_is_wr(desc)) { + /* To be honest, only pages realy touched should be logged, but + * doing so would require tracking those changes in each backed. + * Also backend most likely will touch all/most of those pages so + * for lets assume we touched all pages passed to as writeable buffers. 
*/ + rte_vhost_log_write(vdev->vid, desc->addr, desc->len); + } + spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + } while (desc); +} + +static void +spdk_vhost_log_used_vring_elem(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue, + uint16_t idx) +{ + uint64_t offset, len; + uint16_t vq_idx; + + if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) { + return; + } + + offset = offsetof(struct vring_used, ring[idx]); + len = sizeof(virtqueue->vring.used->ring[idx]); + vq_idx = virtqueue - vdev->virtqueue; + + rte_vhost_log_used_vring(vdev->vid, vq_idx, offset, len); +} + +static void +spdk_vhost_log_used_vring_idx(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue) +{ + uint64_t offset, len; + uint16_t vq_idx; + + if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) { + return; + } + + offset = offsetof(struct vring_used, idx); + len = sizeof(virtqueue->vring.used->idx); + vq_idx = virtqueue - vdev->virtqueue; + + rte_vhost_log_used_vring(vdev->vid, vq_idx, offset, len); +} + +/* + * Get available requests from avail ring. + */ +uint16_t +spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs, + uint16_t reqs_len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_avail *avail = vring->avail; + uint16_t size_mask = vring->size - 1; + uint16_t last_idx = vring->last_avail_idx, avail_idx = avail->idx; + uint16_t count, i; + + count = avail_idx - last_idx; + if (spdk_likely(count == 0)) { + return 0; + } + + if (spdk_unlikely(count > vring->size)) { + /* TODO: the queue is unrecoverably broken and should be marked so. + * For now we will fail silently and report there are no new avail entries. + */ + return 0; + } + + count = spdk_min(count, reqs_len); + vring->last_avail_idx += count; + for (i = 0; i < count; i++) { + reqs[i] = vring->avail->ring[(last_idx + i) & size_mask]; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n", + last_idx, avail_idx, count); + + return count; +} + +static bool +spdk_vhost_vring_desc_is_indirect(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_INDIRECT); +} + +int +spdk_vhost_vq_get_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size) +{ + if (spdk_unlikely(req_idx >= virtqueue->vring.size)) { + return -1; + } + + *desc = &virtqueue->vring.desc[req_idx]; + + if (spdk_vhost_vring_desc_is_indirect(*desc)) { + assert(spdk_vhost_dev_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)); + *desc_table_size = (*desc)->len / sizeof(**desc); + *desc_table = spdk_vhost_gpa_to_vva(vdev, (*desc)->addr, + sizeof(**desc) * *desc_table_size); + *desc = *desc_table; + if (*desc == NULL) { + return -1; + } + + return 0; + } + + *desc_table = virtqueue->vring.desc; + *desc_table_size = virtqueue->vring.size; + + return 0; +} + +int +spdk_vhost_vq_used_signal(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue) +{ + if (virtqueue->used_req_cnt == 0) { + return 0; + } + + virtqueue->req_cnt += virtqueue->used_req_cnt; + virtqueue->used_req_cnt = 0; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n", + virtqueue - vdev->virtqueue, virtqueue->vring.last_used_idx); + + eventfd_write(virtqueue->vring.callfd, (eventfd_t)1); + return 1; +} + + +static void +check_dev_io_stats(struct 
spdk_vhost_dev *vdev, uint64_t now) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint32_t irq_delay_base = vdev->coalescing_delay_time_base; + uint32_t io_threshold = vdev->coalescing_io_rate_threshold; + int32_t irq_delay; + uint32_t req_cnt; + uint16_t q_idx; + + if (now < vdev->next_stats_check_time) { + return; + } + + vdev->next_stats_check_time = now + vdev->stats_check_interval; + for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) { + virtqueue = &vdev->virtqueue[q_idx]; + + req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt; + if (req_cnt <= io_threshold) { + continue; + } + + irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold; + virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay); + + virtqueue->req_cnt = 0; + virtqueue->next_event_time = now; + } +} + +void +spdk_vhost_dev_used_signal(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint64_t now; + uint16_t q_idx; + + if (vdev->coalescing_delay_time_base == 0) { + for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) { + virtqueue = &vdev->virtqueue[q_idx]; + + if (virtqueue->vring.desc == NULL || + (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + continue; + } + + spdk_vhost_vq_used_signal(vdev, virtqueue); + } + } else { + now = spdk_get_ticks(); + check_dev_io_stats(vdev, now); + + for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) { + virtqueue = &vdev->virtqueue[q_idx]; + + /* No need for event right now */ + if (now < virtqueue->next_event_time || + (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + continue; + } + + if (!spdk_vhost_vq_used_signal(vdev, virtqueue)) { + continue; + } + + /* Syscall is quite long so update time */ + now = spdk_get_ticks(); + virtqueue->next_event_time = now + virtqueue->irq_delay_time; + } + } +} + +int +spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, + uint32_t iops_threshold) +{ + uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL; + uint32_t io_rate = iops_threshold * SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS / 1000U; + + if (delay_time_base >= UINT32_MAX) { + SPDK_ERRLOG("Delay time of %"PRIu32" is to big\n", delay_base_us); + return -EINVAL; + } else if (io_rate == 0) { + SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate, + 1000U / SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS); + return -EINVAL; + } + + vdev->coalescing_delay_time_base = delay_time_base; + vdev->coalescing_io_rate_threshold = io_rate; + + vdev->coalescing_delay_us = delay_base_us; + vdev->coalescing_iops_threshold = iops_threshold; + return 0; +} + +void +spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us, + uint32_t *iops_threshold) +{ + if (delay_base_us) { + *delay_base_us = vdev->coalescing_delay_us; + } + + if (iops_threshold) { + *iops_threshold = vdev->coalescing_iops_threshold; + } +} + +/* + * Enqueue id and len to used ring. 
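/*
 * Illustrative sketch, not from the upstream sources: the interrupt-coalescing
 * arithmetic performed by spdk_vhost_set_coalescing() and check_dev_io_stats()
 * above, written as a standalone program. The 2.3 GHz tick rate and the request
 * count are assumed example values, not anything queried from SPDK.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_STATS_CHECK_INTERVAL_MS 10	/* mirrors SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS */

int main(void)
{
	uint64_t ticks_hz = 2300000000ULL;	/* assumed spdk_get_ticks_hz() value */
	uint32_t delay_base_us = 20;		/* requested base IRQ delay */
	uint32_t iops_threshold = 60000;	/* requested coalescing threshold */

	/* Base delay converted from microseconds to CPU ticks. */
	uint64_t delay_time_base = delay_base_us * ticks_hz / 1000000ULL;

	/* IOPS threshold scaled down to the 10 ms stats-check window. */
	uint32_t io_rate = iops_threshold * EX_STATS_CHECK_INTERVAL_MS / 1000U;

	/* The IRQ delay grows linearly with how far the observed request rate
	 * exceeds the threshold, as in check_dev_io_stats() above. */
	uint32_t req_cnt = 900;			/* requests seen in the last window (example) */
	int32_t irq_delay = (int32_t)(delay_time_base * (req_cnt - io_rate) / io_rate);

	printf("delay_time_base=%llu ticks, io_rate=%u reqs/window, irq_delay=%d ticks\n",
	       (unsigned long long)delay_time_base, io_rate, irq_delay);
	return 0;
}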
+ */ +void +spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue, + uint16_t id, uint32_t len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_used *used = vring->used; + uint16_t last_idx = vring->last_used_idx & (vring->size - 1); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n", + virtqueue - vdev->virtqueue, vring->last_used_idx, id, len); + + spdk_vhost_log_req_desc(vdev, virtqueue, id); + + vring->last_used_idx++; + used->ring[last_idx].id = id; + used->ring[last_idx].len = len; + + /* Ensure the used ring is updated before we log it or increment used->idx. */ + spdk_smp_wmb(); + + spdk_vhost_log_used_vring_elem(vdev, virtqueue, last_idx); + * (volatile uint16_t *) &used->idx = vring->last_used_idx; + spdk_vhost_log_used_vring_idx(vdev, virtqueue); + + /* Ensure all our used ring changes are visible to the guest at the time + * of interrupt. + * TODO: this is currently an sfence on x86. For other architectures we + * will most likely need an smp_mb(), but smp_mb() is an overkill for x86. + */ + spdk_wmb(); + + virtqueue->used_req_cnt++; +} + +int +spdk_vhost_vring_desc_get_next(struct vring_desc **desc, + struct vring_desc *desc_table, uint32_t desc_table_size) +{ + struct vring_desc *old_desc = *desc; + uint16_t next_idx; + + if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) { + *desc = NULL; + return 0; + } + + next_idx = old_desc->next; + if (spdk_unlikely(next_idx >= desc_table_size)) { + *desc = NULL; + return -1; + } + + *desc = &desc_table[next_idx]; + return 0; +} + +bool +spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +#define _2MB_OFFSET(ptr) ((ptr) & (0x200000 - 1)) + +int +spdk_vhost_vring_desc_to_iov(struct spdk_vhost_dev *vdev, struct iovec *iov, + uint16_t *iov_index, const struct vring_desc *desc) +{ + uint32_t remaining = desc->len; + uint32_t to_boundary; + uint32_t len; + uintptr_t payload = desc->addr; + uintptr_t vva; + + while (remaining) { + if (*iov_index >= SPDK_VHOST_IOVS_MAX) { + SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX); + return -1; + } + vva = (uintptr_t)rte_vhost_gpa_to_vva(vdev->mem, payload); + if (vva == 0) { + SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload); + return -1; + } + to_boundary = 0x200000 - _2MB_OFFSET(payload); + if (spdk_likely(remaining <= to_boundary)) { + len = remaining; + } else { + /* + * Descriptor crosses a 2MB hugepage boundary. vhost memory regions are allocated + * from hugepage memory, so this means this descriptor may be described by + * discontiguous vhost memory regions. Do not blindly split on the 2MB boundary, + * only split it if the two sides of the boundary do not map to the same vhost + * memory region. This helps ensure we do not exceed the max number of IOVs + * defined by SPDK_VHOST_IOVS_MAX. 
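/*
 * Illustrative sketch, not from the upstream sources: the 2 MB hugepage
 * boundary arithmetic used by spdk_vhost_vring_desc_to_iov() here. It shows
 * how far a guest buffer address is from the next 2 MB boundary, which caps
 * the length of a single iovec entry when the mapping is not contiguous.
 * The address and length below are arbitrary example values.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_2MB			0x200000ULL
#define EX_2MB_OFFSET(p)	((p) & (EX_2MB - 1))

int main(void)
{
	uint64_t payload = 0x7f34561f8000ULL;	/* example buffer address */
	uint64_t remaining = 65536;		/* example descriptor length in bytes */

	uint64_t to_boundary = EX_2MB - EX_2MB_OFFSET(payload);
	uint64_t first_chunk = remaining <= to_boundary ? remaining : to_boundary;

	printf("offset in 2MB page: 0x%llx, bytes to boundary: %llu, first iovec len: %llu\n",
	       (unsigned long long)EX_2MB_OFFSET(payload),
	       (unsigned long long)to_boundary,
	       (unsigned long long)first_chunk);
	return 0;
}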
+ */ + len = to_boundary; + while (len < remaining) { + if (vva + len != (uintptr_t)rte_vhost_gpa_to_vva(vdev->mem, payload + len)) { + break; + } + len += spdk_min(remaining - len, 0x200000); + } + } + iov[*iov_index].iov_base = (void *)vva; + iov[*iov_index].iov_len = len; + remaining -= len; + payload += len; + (*iov_index)++; + } + + return 0; +} + +static struct spdk_vhost_dev * +spdk_vhost_dev_find_by_id(unsigned id) +{ + struct spdk_vhost_dev *vdev; + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + if (vdev->id == id) { + return vdev; + } + } + + return NULL; +} + +static struct spdk_vhost_dev * +spdk_vhost_dev_find_by_vid(int vid) +{ + struct spdk_vhost_dev *vdev; + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + if (vdev->vid == vid) { + return vdev; + } + } + + return NULL; +} + +#define SHIFT_2MB 21 +#define SIZE_2MB (1ULL << SHIFT_2MB) +#define FLOOR_2MB(x) (((uintptr_t)x) / SIZE_2MB) << SHIFT_2MB +#define CEIL_2MB(x) ((((uintptr_t)x) + SIZE_2MB - 1) / SIZE_2MB) << SHIFT_2MB + +static void +spdk_vhost_dev_mem_register(struct spdk_vhost_dev *vdev) +{ + struct rte_vhost_mem_region *region; + uint32_t i; + + for (i = 0; i < vdev->mem->nregions; i++) { + uint64_t start, end, len; + region = &vdev->mem->regions[i]; + start = FLOOR_2MB(region->mmap_addr); + end = CEIL_2MB(region->mmap_addr + region->mmap_size); + len = end - start; + SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n", + start, len); + + if (spdk_mem_register((void *)start, len) != 0) { + SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n", + i); + continue; + } + } +} + +static void +spdk_vhost_dev_mem_unregister(struct spdk_vhost_dev *vdev) +{ + struct rte_vhost_mem_region *region; + uint32_t i; + + for (i = 0; i < vdev->mem->nregions; i++) { + uint64_t start, end, len; + region = &vdev->mem->regions[i]; + start = FLOOR_2MB(region->mmap_addr); + end = CEIL_2MB(region->mmap_addr + region->mmap_size); + len = end - start; + + if (spdk_vtophys((void *) start) == SPDK_VTOPHYS_ERROR) { + continue; /* region has not been registered */ + } + + if (spdk_mem_unregister((void *)start, len) != 0) { + assert(false); + } + } + +} + +static void +spdk_vhost_free_reactor(uint32_t lcore) +{ + g_num_ctrlrs[lcore]--; +} + +struct spdk_vhost_dev * +spdk_vhost_dev_find(const char *ctrlr_name) +{ + struct spdk_vhost_dev *vdev; + size_t dev_dirname_len = strlen(dev_dirname); + + if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) { + ctrlr_name += dev_dirname_len; + } + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + if (strcmp(vdev->name, ctrlr_name) == 0) { + return vdev; + } + } + + return NULL; +} + +static int +spdk_vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask) +{ + int rc; + + if (cpumask == NULL) { + return -1; + } + + if (mask == NULL) { + spdk_cpuset_copy(cpumask, spdk_app_get_core_mask()); + return 0; + } + + rc = spdk_app_parse_core_mask(mask, cpumask); + if (rc < 0) { + SPDK_ERRLOG("invalid cpumask %s\n", mask); + return -1; + } + + if (spdk_cpuset_count(cpumask) == 0) { + SPDK_ERRLOG("no cpu is selected among reactor mask(=%s)\n", + spdk_cpuset_fmt(spdk_app_get_core_mask())); + return -1; + } + + return 0; +} + +static void * +_start_rte_driver(void *arg) +{ + char *path = arg; + + if (rte_vhost_driver_start(path) != 0) { + return NULL; + } + + return path; +} + +int +spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct 
spdk_vhost_dev_backend *backend) +{ + static unsigned ctrlr_num; + char path[PATH_MAX]; + struct stat file_stat; + struct spdk_cpuset *cpumask; + int rc; + + assert(vdev); + + /* We expect devices inside g_spdk_vhost_devices to be sorted in ascending + * order in regard of vdev->id. For now we always set vdev->id = ctrlr_num++ + * and append each vdev to the very end of g_spdk_vhost_devices list. + * This is required for foreach vhost events to work. + */ + if (ctrlr_num == UINT_MAX) { + assert(false); + return -EINVAL; + } + + if (name == NULL) { + SPDK_ERRLOG("Can't register controller with no name\n"); + return -EINVAL; + } + + cpumask = spdk_cpuset_alloc(); + if (!cpumask) { + SPDK_ERRLOG("spdk_cpuset_alloc failed\n"); + return -ENOMEM; + } + + if (spdk_vhost_parse_core_mask(mask_str, cpumask) != 0) { + SPDK_ERRLOG("cpumask %s is invalid (app mask is 0x%s)\n", + mask_str, spdk_cpuset_fmt(spdk_app_get_core_mask())); + rc = -EINVAL; + goto out; + } + + if (spdk_vhost_dev_find(name)) { + SPDK_ERRLOG("vhost controller %s already exists.\n", name); + rc = -EEXIST; + goto out; + } + + if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) { + SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname, + name); + rc = -EINVAL; + goto out; + } + + /* Register vhost driver to handle vhost messages. */ + if (stat(path, &file_stat) != -1) { + if (!S_ISSOCK(file_stat.st_mode)) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The file already exists and is not a socket.\n", + path); + rc = -EIO; + goto out; + } else if (unlink(path) != 0) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The socket already exists and failed to unlink.\n", + path); + rc = -EIO; + goto out; + } + } + + if (rte_vhost_driver_register(path, 0) != 0) { + SPDK_ERRLOG("Could not register controller %s with vhost library\n", name); + SPDK_ERRLOG("Check if domain socket %s already exists\n", path); + rc = -EIO; + goto out; + } + if (rte_vhost_driver_set_features(path, backend->virtio_features) || + rte_vhost_driver_disable_features(path, backend->disabled_features)) { + SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", name); + + rte_vhost_driver_unregister(path); + rc = -EIO; + goto out; + } + + if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) { + rte_vhost_driver_unregister(path); + SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", name); + rc = -EIO; + goto out; + } + + /* The following might start a POSIX thread that polls for incoming + * socket connections and calls backend->start/stop_device. These backend + * callbacks are also protected by the global SPDK vhost mutex, so we're + * safe with not initializing the vdev just yet. 
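/*
 * Illustrative sketch, not from the upstream sources: a standalone version of
 * the stale-socket check spdk_vhost_dev_register() performs above before
 * calling rte_vhost_driver_register(). Only POSIX calls are used; the path in
 * main() is an arbitrary example.
 */
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int
remove_stale_socket(const char *path)
{
	struct stat st;

	if (stat(path, &st) == -1) {
		return 0;	/* nothing at that path, safe to create the socket */
	}

	if (!S_ISSOCK(st.st_mode)) {
		fprintf(stderr, "%s exists and is not a socket\n", path);
		return -1;
	}

	if (unlink(path) != 0) {
		fprintf(stderr, "failed to unlink stale socket %s\n", path);
		return -1;
	}

	return 0;
}

int main(void)
{
	return remove_stale_socket("/var/tmp/vhost.0") == 0 ? 0 : 1;
}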
+ */ + if (spdk_call_unaffinitized(_start_rte_driver, path) == NULL) { + SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n", + name, errno, spdk_strerror(errno)); + rte_vhost_driver_unregister(path); + rc = -EIO; + goto out; + } + + vdev->name = strdup(name); + vdev->path = strdup(path); + vdev->id = ctrlr_num++; + vdev->vid = -1; + vdev->lcore = -1; + vdev->cpumask = cpumask; + vdev->registered = true; + vdev->backend = backend; + + spdk_vhost_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US, + SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD); + vdev->next_stats_check_time = 0; + vdev->stats_check_interval = SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS * spdk_get_ticks_hz() / + 1000UL; + + TAILQ_INSERT_TAIL(&g_spdk_vhost_devices, vdev, tailq); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name); + return 0; + +out: + spdk_cpuset_free(cpumask); + return rc; +} + +int +spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev) +{ + if (vdev->vid != -1) { + SPDK_ERRLOG("Controller %s has still valid connection.\n", vdev->name); + return -EBUSY; + } + + if (vdev->registered && rte_vhost_driver_unregister(vdev->path) != 0) { + SPDK_ERRLOG("Could not unregister controller %s with vhost library\n" + "Check if domain socket %s still exists\n", + vdev->name, vdev->path); + return -EIO; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name); + + free(vdev->name); + free(vdev->path); + spdk_cpuset_free(vdev->cpumask); + TAILQ_REMOVE(&g_spdk_vhost_devices, vdev, tailq); + return 0; +} + +static struct spdk_vhost_dev * +spdk_vhost_dev_next(unsigned i) +{ + struct spdk_vhost_dev *vdev; + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + if (vdev->id > i) { + return vdev; + } + } + + return NULL; +} + +const char * +spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return vdev->name; +} + +const struct spdk_cpuset * +spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return vdev->cpumask; +} + +static uint32_t +spdk_vhost_allocate_reactor(struct spdk_cpuset *cpumask) +{ + uint32_t i, selected_core; + uint32_t min_ctrlrs; + + min_ctrlrs = INT_MAX; + selected_core = spdk_env_get_first_core(); + + SPDK_ENV_FOREACH_CORE(i) { + if (!spdk_cpuset_get_cpu(cpumask, i)) { + continue; + } + + if (g_num_ctrlrs[i] < min_ctrlrs) { + selected_core = i; + min_ctrlrs = g_num_ctrlrs[i]; + } + } + + g_num_ctrlrs[selected_core]++; + return selected_core; +} + +void +spdk_vhost_dev_backend_event_done(void *event_ctx, int response) +{ + struct spdk_vhost_dev_event_ctx *ctx = event_ctx; + + ctx->response = response; + sem_post(&ctx->sem); +} + +static void +spdk_vhost_event_cb(void *arg1, void *arg2) +{ + struct spdk_vhost_dev_event_ctx *ctx = arg1; + + ctx->cb_fn(ctx->vdev, ctx); +} + +static void +spdk_vhost_event_async_fn(void *arg1, void *arg2) +{ + struct spdk_vhost_dev_event_ctx *ctx = arg1; + struct spdk_vhost_dev *vdev; + struct spdk_event *ev; + + if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) { + ev = spdk_event_allocate(spdk_env_get_current_core(), spdk_vhost_event_async_fn, arg1, arg2); + spdk_event_call(ev); + return; + } + + vdev = spdk_vhost_dev_find_by_id(ctx->vdev_id); + if (vdev != ctx->vdev) { + /* vdev has been changed after enqueuing this event */ + vdev = NULL; + } + + if (vdev != NULL && vdev->lcore >= 0 && + (uint32_t)vdev->lcore != spdk_env_get_current_core()) { + /* if vdev has been relocated to other core, it is no longer thread-safe + * to access its 
contents here. Even though we're running under global vhost + * mutex, the controller itself (and its pollers) are not. We need to chase + * the vdev thread as many times as necessary. + */ + ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_async_fn, arg1, arg2); + spdk_event_call(ev); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + ctx->cb_fn(vdev, arg2); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + + free(ctx); +} + +static void spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev, + spdk_vhost_event_fn fn, void *arg); + +static void +spdk_vhost_event_async_foreach_fn(void *arg1, void *arg2) +{ + struct spdk_vhost_dev_event_ctx *ctx = arg1; + struct spdk_vhost_dev *vdev; + struct spdk_event *ev; + + if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) { + ev = spdk_event_allocate(spdk_env_get_current_core(), + spdk_vhost_event_async_foreach_fn, arg1, arg2); + spdk_event_call(ev); + return; + } + + vdev = spdk_vhost_dev_find_by_id(ctx->vdev_id); + if (vdev != ctx->vdev) { + /* ctx->vdev is probably a dangling pointer at this point. + * It must have been removed in the meantime, so we just skip + * it in our foreach chain. */ + goto out_unlock_continue; + } + + /* the assert is just for static analyzers, vdev cannot be NULL here */ + assert(vdev != NULL); + if (vdev->lcore >= 0 && + (uint32_t)vdev->lcore != spdk_env_get_current_core()) { + /* if vdev has been relocated to other core, it is no longer thread-safe + * to access its contents here. Even though we're running under global vhost + * mutex, the controller itself (and its pollers) are not. We need to chase + * the vdev thread as many times as necessary. + */ + ev = spdk_event_allocate(vdev->lcore, + spdk_vhost_event_async_foreach_fn, arg1, arg2); + spdk_event_call(ev); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + ctx->cb_fn(vdev, arg2); + +out_unlock_continue: + vdev = spdk_vhost_dev_next(ctx->vdev_id); + spdk_vhost_external_event_foreach_continue(vdev, ctx->cb_fn, arg2); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + + free(ctx); +} + +static int +_spdk_vhost_event_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn, + unsigned timeout_sec, const char *errmsg) +{ + struct spdk_vhost_dev_event_ctx ev_ctx = {0}; + struct spdk_event *ev; + struct timespec timeout; + int rc; + + rc = sem_init(&ev_ctx.sem, 0, 0); + if (rc != 0) { + SPDK_ERRLOG("Failed to initialize semaphore for vhost timed event\n"); + return -errno; + } + + ev_ctx.vdev = vdev; + ev_ctx.cb_fn = cb_fn; + ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_cb, &ev_ctx, NULL); + assert(ev); + spdk_event_call(ev); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + + clock_gettime(CLOCK_REALTIME, &timeout); + timeout.tv_sec += timeout_sec; + + rc = sem_timedwait(&ev_ctx.sem, &timeout); + if (rc != 0) { + SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg); + sem_wait(&ev_ctx.sem); + } + + sem_destroy(&ev_ctx.sem); + pthread_mutex_lock(&g_spdk_vhost_mutex); + return ev_ctx.response; +} + +static int +spdk_vhost_event_async_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn, void *arg, + bool foreach) +{ + struct spdk_vhost_dev_event_ctx *ev_ctx; + struct spdk_event *ev; + spdk_event_fn fn; + + ev_ctx = calloc(1, sizeof(*ev_ctx)); + if (ev_ctx == NULL) { + SPDK_ERRLOG("Failed to alloc vhost event.\n"); + assert(false); + return -ENOMEM; + } + + ev_ctx->vdev = vdev; + ev_ctx->vdev_id = vdev->id; + ev_ctx->cb_fn = cb_fn; + + fn = foreach ? 
spdk_vhost_event_async_foreach_fn : spdk_vhost_event_async_fn; + ev = spdk_event_allocate(ev_ctx->vdev->lcore, fn, ev_ctx, arg); + assert(ev); + spdk_event_call(ev); + + return 0; +} + +static void +stop_device(int vid) +{ + struct spdk_vhost_dev *vdev; + struct rte_vhost_vring *q; + int rc; + uint16_t i; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't find device with vid %d to stop.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + if (vdev->lcore == -1) { + SPDK_ERRLOG("Controller %s is not loaded.\n", vdev->name); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + rc = _spdk_vhost_event_send(vdev, vdev->backend->stop_device, 3, "stop device"); + if (rc != 0) { + SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + for (i = 0; i < vdev->max_queues; i++) { + q = &vdev->virtqueue[i].vring; + if (q->desc == NULL) { + continue; + } + rte_vhost_set_vhost_vring_last_idx(vdev->vid, i, q->last_avail_idx, q->last_used_idx); + } + + spdk_vhost_dev_mem_unregister(vdev); + free(vdev->mem); + spdk_vhost_free_reactor(vdev->lcore); + vdev->lcore = -1; + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +static int +start_device(int vid) +{ + struct spdk_vhost_dev *vdev; + int rc = -1; + uint16_t i; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid); + goto out; + } + + if (vdev->lcore != -1) { + SPDK_ERRLOG("Controller %s already loaded.\n", vdev->name); + goto out; + } + + vdev->max_queues = 0; + memset(vdev->virtqueue, 0, sizeof(vdev->virtqueue)); + for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) { + if (rte_vhost_get_vhost_vring(vid, i, &vdev->virtqueue[i].vring)) { + continue; + } + + if (vdev->virtqueue[i].vring.desc == NULL || + vdev->virtqueue[i].vring.size == 0) { + continue; + } + + /* Disable notifications. */ + if (rte_vhost_enable_guest_notification(vid, i, 0) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to disable guest notification on queue %"PRIu16"\n", vid, i); + goto out; + } + + vdev->max_queues = i + 1; + } + + if (rte_vhost_get_negotiated_features(vid, &vdev->negotiated_features) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid); + goto out; + } + + if (rte_vhost_get_mem_table(vid, &vdev->mem) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid); + goto out; + } + + /* + * Not sure right now but this look like some kind of QEMU bug and guest IO + * might be frozed without kicking all queues after live-migration. This look like + * the previous vhost instance failed to effectively deliver all interrupts before + * the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts + * should be ignored by guest virtio driver. + * + * Tested on QEMU 2.10.91 and 2.11.50. 
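/*
 * Illustrative sketch, not from the upstream sources: the eventfd "kick" used
 * throughout this file to interrupt the guest (eventfd_write() on
 * vring.callfd), e.g. in the loop below that re-kicks every queue after a
 * connection. Here a locally created eventfd stands in for the callfd handed
 * over by the vhost-user master.
 */
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	eventfd_t val;

	if (efd < 0) {
		perror("eventfd");
		return 1;
	}

	/* Producer side: signal the consumer, incrementing the counter by 1. */
	eventfd_write(efd, (eventfd_t)1);

	/* Consumer side: the read returns the accumulated count and resets it. */
	eventfd_read(efd, &val);
	printf("received %llu kick(s)\n", (unsigned long long)val);

	close(efd);
	return 0;
}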
+ */ + for (i = 0; i < vdev->max_queues; i++) { + if (vdev->virtqueue[i].vring.callfd != -1) { + eventfd_write(vdev->virtqueue[i].vring.callfd, (eventfd_t)1); + } + } + + vdev->lcore = spdk_vhost_allocate_reactor(vdev->cpumask); + spdk_vhost_dev_mem_register(vdev); + rc = _spdk_vhost_event_send(vdev, vdev->backend->start_device, 3, "start device"); + if (rc != 0) { + spdk_vhost_dev_mem_unregister(vdev); + free(vdev->mem); + spdk_vhost_free_reactor(vdev->lcore); + vdev->lcore = -1; + } + +out: + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return rc; +} + +static int +get_config(int vid, uint8_t *config, uint32_t len) +{ + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid); + goto out; + } + + if (vdev->backend->vhost_get_config) { + rc = vdev->backend->vhost_get_config(vdev, config, len); + } + +out: + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return rc; +} + +static int +set_config(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags) +{ + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid); + goto out; + } + + if (vdev->backend->vhost_set_config) { + rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags); + } + +out: + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return rc; +} + +int +spdk_vhost_set_socket_path(const char *basename) +{ + int ret; + + if (basename && strlen(basename) > 0) { + ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename); + if (ret <= 0) { + return -EINVAL; + } + if ((size_t)ret >= sizeof(dev_dirname) - 2) { + SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret); + return -EINVAL; + } + + if (dev_dirname[ret - 1] != '/') { + dev_dirname[ret] = '/'; + dev_dirname[ret + 1] = '\0'; + } + } + + return 0; +} + +static void * +session_shutdown(void *arg) +{ + struct spdk_vhost_dev *vdev = NULL; + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + rte_vhost_driver_unregister(vdev->path); + vdev->registered = false; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n"); + spdk_event_call((struct spdk_event *)arg); + return NULL; +} + +void +spdk_vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + assert(vdev->backend->dump_info_json != NULL); + vdev->backend->dump_info_json(vdev, w); +} + +int +spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev) +{ + return vdev->backend->remove_device(vdev); +} + +static int +new_connection(int vid) +{ + struct spdk_vhost_dev *vdev; + char ifname[PATH_MAX]; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) { + SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return -1; + } + + vdev = spdk_vhost_dev_find(ifname); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return -1; + } + + /* since pollers are not running it safe not to use spdk_event here */ + if (vdev->vid != -1) { + SPDK_ERRLOG("Device with vid %d is already connected.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return -1; + } + + vdev->vid = vid; + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return 0; +} + +static void 
+destroy_connection(int vid) +{ + struct spdk_vhost_dev *vdev; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't find device with vid %d to destroy connection for.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + /* since pollers are not running it safe not to use spdk_event here */ + vdev->vid = -1; + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +void +spdk_vhost_call_external_event(const char *ctrlr_name, spdk_vhost_event_fn fn, void *arg) +{ + struct spdk_vhost_dev *vdev; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find(ctrlr_name); + + if (vdev == NULL) { + pthread_mutex_unlock(&g_spdk_vhost_mutex); + fn(NULL, arg); + return; + } + + if (vdev->lcore == -1) { + fn(vdev, arg); + } else { + spdk_vhost_event_async_send(vdev, fn, arg, false); + } + + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +static void +spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev, + spdk_vhost_event_fn fn, void *arg) +{ + if (vdev == NULL) { + fn(NULL, arg); + return; + } + + while (vdev->lcore == -1) { + fn(vdev, arg); + vdev = spdk_vhost_dev_next(vdev->id); + if (vdev == NULL) { + fn(NULL, arg); + return; + } + } + + spdk_vhost_event_async_send(vdev, fn, arg, true); +} + +void +spdk_vhost_call_external_event_foreach(spdk_vhost_event_fn fn, void *arg) +{ + struct spdk_vhost_dev *vdev; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = TAILQ_FIRST(&g_spdk_vhost_devices); + spdk_vhost_external_event_foreach_continue(vdev, fn, arg); + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +void +spdk_vhost_lock(void) +{ + pthread_mutex_lock(&g_spdk_vhost_mutex); +} + +void +spdk_vhost_unlock(void) +{ + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +int +spdk_vhost_init(void) +{ + uint32_t last_core; + size_t len; + int ret; + + if (dev_dirname[0] == '\0') { + if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) { + SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno)); + return -1; + } + + len = strlen(dev_dirname); + if (dev_dirname[len - 1] != '/') { + dev_dirname[len] = '/'; + dev_dirname[len + 1] = '\0'; + } + } + + last_core = spdk_env_get_last_core(); + g_num_ctrlrs = calloc(last_core + 1, sizeof(uint32_t)); + if (!g_num_ctrlrs) { + SPDK_ERRLOG("Could not allocate array size=%u for g_num_ctrlrs\n", + last_core + 1); + return -1; + } + + ret = spdk_vhost_scsi_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost controllers\n"); + return -1; + } + + ret = spdk_vhost_blk_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost block controllers\n"); + return -1; + } + + ret = spdk_vhost_nvme_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n"); + return -1; + } + + return 0; +} + +static int +_spdk_vhost_fini_remove_vdev_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + spdk_vhost_fini_cb fini_cb = arg; + + if (vdev != NULL) { + spdk_vhost_dev_remove(vdev); + return 0; + } + + /* All devices are removed now. 
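/*
 * Illustrative sketch, not from the upstream sources: the directory-prefix
 * normalization done by spdk_vhost_set_socket_path() and spdk_vhost_init()
 * above, which guarantees that dev_dirname always ends with '/'. Buffer size
 * and input path are arbitrary example values.
 */
#include <stdio.h>

static int
normalize_dirname(char *buf, size_t buflen, const char *dir)
{
	int ret = snprintf(buf, buflen - 1, "%s", dir);

	if (ret <= 0 || (size_t)ret >= buflen - 1) {
		return -1;	/* empty input or no room left for a trailing '/' */
	}

	if (buf[ret - 1] != '/') {
		buf[ret] = '/';
		buf[ret + 1] = '\0';
	}

	return 0;
}

int main(void)
{
	char dirname[128];

	if (normalize_dirname(dirname, sizeof(dirname), "/var/tmp") == 0) {
		printf("socket prefix: %s\n", dirname);	/* prints "/var/tmp/" */
	}
	return 0;
}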
*/ + free(g_num_ctrlrs); + fini_cb(); + return 0; +} + +static void +_spdk_vhost_fini(void *arg1, void *arg2) +{ + spdk_vhost_fini_cb fini_cb = arg1; + + spdk_vhost_call_external_event_foreach(_spdk_vhost_fini_remove_vdev_cb, fini_cb); +} + +void +spdk_vhost_fini(spdk_vhost_fini_cb fini_cb) +{ + pthread_t tid; + int rc; + struct spdk_event *fini_ev; + + fini_ev = spdk_event_allocate(spdk_env_get_current_core(), _spdk_vhost_fini, fini_cb, NULL); + + /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK + * ops for stopping a device or removing a connection, we need to call it from + * a separate thread to avoid deadlock. + */ + rc = pthread_create(&tid, NULL, &session_shutdown, fini_ev); + if (rc < 0) { + SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc)); + abort(); + } + pthread_detach(tid); +} + +struct spdk_vhost_write_config_json_ctx { + struct spdk_json_write_ctx *w; + struct spdk_event *done_ev; +}; + +static int +spdk_vhost_config_json_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct spdk_vhost_write_config_json_ctx *ctx = arg; + uint32_t delay_base_us; + uint32_t iops_threshold; + + if (vdev == NULL) { + spdk_json_write_array_end(ctx->w); + spdk_event_call(ctx->done_ev); + free(ctx); + return 0; + } + + vdev->backend->write_config_json(vdev, ctx->w); + + spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold); + if (delay_base_us) { + spdk_json_write_object_begin(ctx->w); + spdk_json_write_named_string(ctx->w, "method", "set_vhost_controller_coalescing"); + + spdk_json_write_named_object_begin(ctx->w, "params"); + spdk_json_write_named_string(ctx->w, "ctrlr", vdev->name); + spdk_json_write_named_uint32(ctx->w, "delay_base_us", delay_base_us); + spdk_json_write_named_uint32(ctx->w, "iops_threshold", iops_threshold); + spdk_json_write_object_end(ctx->w); + + spdk_json_write_object_end(ctx->w); + } + + return 0; +} + +void +spdk_vhost_config_json(struct spdk_json_write_ctx *w, struct spdk_event *done_ev) +{ + struct spdk_vhost_write_config_json_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_event_call(done_ev); + return; + } + + ctx->w = w; + ctx->done_ev = done_ev; + + spdk_json_write_array_begin(w); + + spdk_vhost_call_external_event_foreach(spdk_vhost_config_json_cb, ctx); +} + +SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST) +SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING) diff --git a/src/spdk/lib/vhost/vhost_blk.c b/src/spdk/lib/vhost/vhost_blk.c new file mode 100644 index 00000000..6a9a1896 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_blk.c @@ -0,0 +1,901 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/virtio_blk.h> + +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/vhost.h" + +#include "vhost_internal.h" + +struct spdk_vhost_blk_task { + struct spdk_bdev_io *bdev_io; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_vhost_virtqueue *vq; + + volatile uint8_t *status; + + uint16_t req_idx; + + /* for io wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + + /* If set, the task is currently used for I/O processing. */ + bool used; + + /** Number of bytes that were written. */ + uint32_t used_len; + uint16_t iovcnt; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; +}; + +struct spdk_vhost_blk_dev { + struct spdk_vhost_dev vdev; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *bdev_io_channel; + struct spdk_poller *requestq_poller; + struct spdk_vhost_dev_destroy_ctx destroy_ctx; + bool readonly; +}; + +/* forward declaration */ +static const struct spdk_vhost_dev_backend vhost_blk_device_backend; + +static int +process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev, + struct spdk_vhost_virtqueue *vq); + +static void +blk_task_finish(struct spdk_vhost_blk_task *task) +{ + assert(task->bvdev->vdev.task_cnt > 0); + task->bvdev->vdev.task_cnt--; + task->used = false; +} + +static void +invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status) +{ + if (task->status) { + *task->status = status; + } + + spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx, + task->used_len); + blk_task_finish(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status); +} + +/* + * Process task's descriptor chain and setup data related fields. + * Return + * total size of suplied buffers + * + * FIXME: Make this function return to rd_cnt and wr_cnt + */ +static int +blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uint16_t req_idx, + struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) +{ + struct vring_desc *desc, *desc_table; + uint16_t out_cnt = 0, cnt = 0; + uint32_t desc_table_size, len = 0; + int rc; + + rc = spdk_vhost_vq_get_desc(vdev, vq, req_idx, &desc, &desc_table, &desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); + return -1; + } + + while (1) { + /* + * Maximum cnt reached? + * Should not happen if request is well formatted, otherwise this is a BUG. 
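/*
 * Illustrative sketch, not from the upstream sources: the layout of a
 * virtio-blk request as assembled by the guest and parsed by blk_iovs_setup()
 * and process_blk_request() below - a read-only 16-byte header, the payload
 * buffers, and a single writable status byte at the end of the chain. The
 * struct mirrors struct virtio_blk_outhdr from <linux/virtio_blk.h>.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct example_virtio_blk_outhdr {
	uint32_t type;		/* VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, VIRTIO_BLK_T_GET_ID, ... */
	uint32_t ioprio;
	uint64_t sector;	/* request offset in 512-byte sectors */
};

/*
 * After blk_iovs_setup() the descriptor chain looks like:
 *   iovs[0]            header, iov_len == 16, device-readable
 *   iovs[1..iovcnt-2]  payload buffers (writable for reads, readable for writes)
 *   iovs[iovcnt-1]     1-byte status, device-writable (VIRTIO_BLK_S_*)
 */
int main(void)
{
	assert(sizeof(struct example_virtio_blk_outhdr) == 16);
	printf("virtio-blk request header is %zu bytes\n",
	       sizeof(struct example_virtio_blk_outhdr));
	return 0;
}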
+ */ + if (spdk_unlikely(cnt == *iovs_cnt)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Max IOVs in request reached (req_idx = %"PRIu16").\n", + req_idx); + return -1; + } + + if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &cnt, desc))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", + req_idx, cnt); + return -1; + } + + len += desc->len; + + out_cnt += spdk_vhost_vring_desc_is_wr(desc); + + rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: Descriptor chain at index %"PRIu16" terminated unexpectedly.\n", + vdev->name, req_idx); + return -1; + } else if (desc == NULL) { + break; + } + } + + /* + * There must be least two descriptors. + * First contain request so it must be readable. + * Last descriptor contain buffer for response so it must be writable. + */ + if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { + return -1; + } + + *length = len; + *iovs_cnt = cnt; + return 0; +} + +static void +blk_request_finish(bool success, struct spdk_vhost_blk_task *task) +{ + *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; + spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx, + task->used_len); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task, + task->req_idx, success ? "OK" : "FAIL"); + blk_task_finish(task); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_blk_task *task = cb_arg; + + spdk_bdev_free_io(bdev_io); + blk_request_finish(success, task); +} + +static void +blk_request_resubmit(void *arg) +{ + struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg; + int rc = 0; + + rc = process_blk_request(task, task->bvdev, task->vq); + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task); + } +} + +static inline void +blk_request_queue_io(struct spdk_vhost_blk_task *task) +{ + int rc; + struct spdk_vhost_blk_dev *bvdev = task->bvdev; + struct spdk_bdev *bdev = bvdev->bdev; + + task->bdev_io_wait.bdev = bdev; + task->bdev_io_wait.cb_fn = blk_request_resubmit; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(bdev, bvdev->bdev_io_channel, &task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + } +} + +static int +process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev, + struct spdk_vhost_virtqueue *vq) +{ + const struct virtio_blk_outhdr *req; + struct iovec *iov; + uint32_t type; + uint32_t payload_len; + int rc; + + if (blk_iovs_setup(&bvdev->vdev, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); + /* Only READ and WRITE are supported for now. 
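/*
 * Illustrative sketch, not from the upstream sources: the length and offset
 * bookkeeping process_blk_request() performs below - strip the header and the
 * status byte from the total buffer size, reject payloads that are not a
 * multiple of 512 bytes, and convert the sector index from the header into a
 * byte offset for spdk_bdev_readv()/spdk_bdev_writev(). The numbers are
 * example values.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t total_len = 16 + 8192 + 1;	/* header + payload + status, from blk_iovs_setup() */
	uint64_t sector = 2048;			/* req->sector from the virtio-blk header */

	uint32_t payload_len = total_len - 16 - 1;
	uint64_t byte_offset = sector * 512;

	if ((payload_len & (512 - 1)) != 0) {
		printf("payload of %u bytes is not 512-byte aligned, rejected\n", payload_len);
		return 1;
	}

	printf("I/O of %u bytes at byte offset %llu (sector %llu)\n",
	       payload_len, (unsigned long long)byte_offset, (unsigned long long)sector);
	return 0;
}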
*/ + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + iov = &task->iovs[0]; + if (spdk_unlikely(iov->iov_len != sizeof(*req))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n", + iov->iov_len, sizeof(*req), task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + req = iov->iov_base; + + iov = &task->iovs[task->iovcnt - 1]; + if (spdk_unlikely(iov->iov_len != 1)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n", + iov->iov_len, 1, task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + task->status = iov->iov_base; + payload_len -= sizeof(*req) + sizeof(*task->status); + task->iovcnt -= 2; + + type = req->type; +#ifdef VIRTIO_BLK_T_BARRIER + /* Don't care about barier for now (as QEMU's virtio-blk do). */ + type &= ~VIRTIO_BLK_T_BARRIER; +#endif + + switch (type) { + case VIRTIO_BLK_T_IN: + case VIRTIO_BLK_T_OUT: + if (spdk_unlikely((payload_len & (512 - 1)) != 0)) { + SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n", + type ? "WRITE" : "READ", task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + if (type == VIRTIO_BLK_T_IN) { + task->used_len = payload_len + sizeof(*task->status); + rc = spdk_bdev_readv(bvdev->bdev_desc, bvdev->bdev_io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else if (!bvdev->readonly) { + task->used_len = sizeof(*task->status); + rc = spdk_bdev_writev(bvdev->bdev_desc, bvdev->bdev_io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n"); + rc = -1; + } + + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_GET_ID: + if (!task->iovcnt || !payload_len) { + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len); + spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev), + task->used_len, ' '); + blk_request_finish(true, task); + break; + default: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + return 0; +} + +static void +process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_blk_task *task; + int rc; + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + + reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + if (!reqs_cnt) { + return; + } + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + bvdev->vdev.name, reqs[i], vq->vring.size); + spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0); + continue; + } + + task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + bvdev->vdev.name, 
reqs[i]); + spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0); + continue; + } + + bvdev->vdev.task_cnt++; + + task->used = true; + task->iovcnt = SPDK_COUNTOF(task->iovs); + task->status = NULL; + task->used_len = 0; + + rc = process_blk_request(task, bvdev, vq); + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task, + reqs[i]); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]); + } + } +} + +static int +vdev_worker(void *arg) +{ + struct spdk_vhost_blk_dev *bvdev = arg; + uint16_t q_idx; + + for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) { + process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]); + } + + spdk_vhost_dev_used_signal(&bvdev->vdev); + + return -1; +} + +static void +no_bdev_process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq) +{ + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; + uint32_t length; + uint16_t iovcnt, req_idx; + + if (spdk_vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) { + return; + } + + iovcnt = SPDK_COUNTOF(iovs); + if (blk_iovs_setup(&bvdev->vdev, vq, req_idx, iovs, &iovcnt, &length) == 0) { + *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx); + } + + spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, req_idx, 0); +} + +static int +no_bdev_vdev_worker(void *arg) +{ + struct spdk_vhost_blk_dev *bvdev = arg; + uint16_t q_idx; + + for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) { + no_bdev_process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]); + } + + spdk_vhost_dev_used_signal(&bvdev->vdev); + + if (bvdev->vdev.task_cnt == 0 && bvdev->bdev_io_channel) { + spdk_put_io_channel(bvdev->bdev_io_channel); + bvdev->bdev_io_channel = NULL; + } + + return -1; +} + +static struct spdk_vhost_blk_dev * +to_blk_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev == NULL) { + return NULL; + } + + if (vdev->backend != &vhost_blk_device_backend) { + SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev); +} + +struct spdk_bdev * +spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + + assert(bvdev != NULL); + return bvdev->bdev; +} + +static int +_bdev_remove_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct spdk_vhost_blk_dev *bvdev = arg; + + SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n", + bvdev->vdev.name); + if (bvdev->requestq_poller) { + spdk_poller_unregister(&bvdev->requestq_poller); + bvdev->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvdev, 0); + } + + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + bvdev->bdev = NULL; + return 0; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_blk_dev *bvdev = remove_ctx; + + spdk_vhost_call_external_event(bvdev->vdev.name, _bdev_remove_cb, bvdev); +} + +static void +free_task_pool(struct spdk_vhost_blk_dev *bvdev) +{ + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < bvdev->vdev.max_queues; i++) { + vq = &bvdev->vdev.virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_dma_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_blk_dev *bvdev) +{ + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_blk_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i 
< bvdev->vdev.max_queues; i++) { + vq = &bvdev->vdev.virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; + if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", + bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(bvdev); + return -1; + } + vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL); + if (vq->tasks == NULL) { + SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + bvdev->vdev.name, task_cnt, i); + free_task_pool(bvdev); + return -1; + } + + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_blk_task *)vq->tasks)[j]; + task->bvdev = bvdev; + task->req_idx = j; + task->vq = vq; + } + } + + return 0; +} + +/* + * A new device is added to a data core. First the device is added to the main linked list + * and then allocated to a specific data core. + * + */ +static int +spdk_vhost_blk_start(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_blk_dev *bvdev; + int i, rc = 0; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n"); + rc = -1; + goto out; + } + + /* validate all I/O queues are in a contiguous index range */ + for (i = 0; i < vdev->max_queues; i++) { + if (vdev->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(bvdev); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name); + goto out; + } + + if (bvdev->bdev) { + bvdev->bdev_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); + if (!bvdev->bdev_io_channel) { + free_task_pool(bvdev); + SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name); + rc = -1; + goto out; + } + } + + bvdev->requestq_poller = spdk_poller_register(bvdev->bdev ? 
vdev_worker : no_bdev_vdev_worker, + bvdev, 0); + SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n", + vdev->name, vdev->lcore); +out: + spdk_vhost_dev_backend_event_done(event_ctx, rc); + return rc; +} + +static int +destroy_device_poller_cb(void *arg) +{ + struct spdk_vhost_blk_dev *bvdev = arg; + int i; + + if (bvdev->vdev.task_cnt > 0) { + return -1; + } + + for (i = 0; i < bvdev->vdev.max_queues; i++) { + bvdev->vdev.virtqueue[i].next_event_time = 0; + spdk_vhost_vq_used_signal(&bvdev->vdev, &bvdev->vdev.virtqueue[i]); + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", bvdev->vdev.name); + + if (bvdev->bdev_io_channel) { + spdk_put_io_channel(bvdev->bdev_io_channel); + bvdev->bdev_io_channel = NULL; + } + + free_task_pool(bvdev); + spdk_poller_unregister(&bvdev->destroy_ctx.poller); + spdk_vhost_dev_backend_event_done(bvdev->destroy_ctx.event_ctx, 0); + + return -1; +} + +static int +spdk_vhost_blk_stop(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n"); + goto err; + } + + bvdev->destroy_ctx.event_ctx = event_ctx; + spdk_poller_unregister(&bvdev->requestq_poller); + bvdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb, + bvdev, 1000); + return 0; + +err: + spdk_vhost_dev_backend_event_done(event_ctx, -1); + return -1; +} + +static void +spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev); + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + return; + } + + assert(bvdev != NULL); + spdk_json_write_name(w, "block"); + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "readonly"); + spdk_json_write_bool(w, bvdev->readonly); + + spdk_json_write_name(w, "bdev"); + if (bdev) { + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + } else { + spdk_json_write_null(w); + } + + spdk_json_write_object_end(w); +} + +static void +spdk_vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + return; + } + + if (!bvdev->bdev) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "construct_vhost_blk_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev)); + spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask)); + spdk_json_write_named_bool(w, "readonly", bvdev->readonly); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static int spdk_vhost_blk_destroy(struct spdk_vhost_dev *dev); + +static int +spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t len) +{ + struct virtio_blk_config *blkcfg = (struct virtio_blk_config *)config; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_bdev *bdev; + uint32_t blk_size; + uint64_t blkcnt; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + SPDK_ERRLOG("Trying to get virito_blk configuration failed\n"); + return -1; + } + + if (len < sizeof(*blkcfg)) { + return -1; + } + + bdev = bvdev->bdev; + if (bdev == NULL) { + /* We can't just return -1 here as this GET_CONFIG message might + * be caused 
by a QEMU VM reboot. Returning -1 will indicate an + * error to QEMU, who might then decide to terminate itself. + * We don't want that. A simple reboot shouldn't break the system. + * + * Presenting a block device with block size 0 and block count 0 + * doesn't cause any problems on QEMU side and the virtio-pci + * device is even still available inside the VM, but there will + * be no block device created for it - the kernel drivers will + * silently reject it. + */ + blk_size = 0; + blkcnt = 0; + } else { + blk_size = spdk_bdev_get_block_size(bdev); + blkcnt = spdk_bdev_get_num_blocks(bdev); + } + + memset(blkcfg, 0, sizeof(*blkcfg)); + blkcfg->blk_size = blk_size; + /* minimum I/O size in blocks */ + blkcfg->min_io_size = 1; + /* expressed in 512 Bytes sectors */ + blkcfg->capacity = (blkcnt * blk_size) / 512; + blkcfg->size_max = 131072; + /* -2 for REQ and RESP and -1 for region boundary splitting */ + blkcfg->seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1; + /* QEMU can overwrite this value when started */ + blkcfg->num_queues = SPDK_VHOST_MAX_VQUEUES; + + return 0; +} + +static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { + .virtio_features = SPDK_VHOST_FEATURES | + (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | + (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) | + (1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) | + (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | + (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | + (1ULL << VIRTIO_BLK_F_MQ), + .disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) | + (1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | + (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI), + .start_device = spdk_vhost_blk_start, + .stop_device = spdk_vhost_blk_stop, + .vhost_get_config = spdk_vhost_blk_get_config, + .dump_info_json = spdk_vhost_blk_dump_info_json, + .write_config_json = spdk_vhost_blk_write_config_json, + .remove_device = spdk_vhost_blk_destroy, +}; + +int +spdk_vhost_blk_controller_construct(void) +{ + struct spdk_conf_section *sp; + unsigned ctrlr_num; + char *bdev_name; + char *cpumask; + char *name; + bool readonly; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false); + + bdev_name = spdk_conf_section_get_val(sp, "Dev"); + if (bdev_name == NULL) { + continue; + } + + if (spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) { + return -1; + } + } + + return 0; +} + +int +spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly) +{ + struct spdk_vhost_blk_dev *bvdev = NULL; + struct spdk_bdev *bdev; + int ret = 0; + + spdk_vhost_lock(); + bdev = spdk_bdev_get_by_name(dev_name); + if (bdev == NULL) { + SPDK_ERRLOG("Controller %s: bdev '%s' not found\n", + name, dev_name); + ret = -ENODEV; + goto out; + } + + bvdev = spdk_dma_zmalloc(sizeof(*bvdev), 
SPDK_CACHE_LINE_SIZE, NULL); + if (bvdev == NULL) { + ret = -ENOMEM; + goto out; + } + + ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc); + if (ret != 0) { + SPDK_ERRLOG("Controller %s: could not open bdev '%s', error=%d\n", + name, dev_name, ret); + goto out; + } + + bvdev->bdev = bdev; + bvdev->readonly = readonly; + ret = spdk_vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend); + if (ret != 0) { + spdk_bdev_close(bvdev->bdev_desc); + goto out; + } + + if (readonly && rte_vhost_driver_enable_features(bvdev->vdev.path, (1ULL << VIRTIO_BLK_F_RO))) { + SPDK_ERRLOG("Controller %s: failed to set as a readonly\n", name); + spdk_bdev_close(bvdev->bdev_desc); + + if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) { + SPDK_ERRLOG("Controller %s: failed to remove controller\n", name); + } + + ret = -1; + goto out; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name); +out: + if (ret != 0 && bvdev) { + spdk_dma_free(bvdev); + } + spdk_vhost_unlock(); + return ret; +} + +static int +spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + int rc; + + if (!bvdev) { + return -EINVAL; + } + + rc = spdk_vhost_dev_unregister(&bvdev->vdev); + if (rc != 0) { + return rc; + } + + if (bvdev->bdev_desc) { + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + } + bvdev->bdev = NULL; + + spdk_dma_free(bvdev); + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK) +SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA) diff --git a/src/spdk/lib/vhost/vhost_internal.h b/src/spdk/lib/vhost/vhost_internal.h new file mode 100644 index 00000000..9c0ad211 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_internal.h @@ -0,0 +1,277 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_VHOST_INTERNAL_H +#define SPDK_VHOST_INTERNAL_H + +#include "spdk/stdinc.h" + +#include <rte_vhost.h> + +#include "spdk_internal/log.h" +#include "spdk/event.h" +#include "spdk/rpc.h" + +#define SPDK_CACHE_LINE_SIZE RTE_CACHE_LINE_SIZE + +#ifndef VHOST_USER_F_PROTOCOL_FEATURES +#define VHOST_USER_F_PROTOCOL_FEATURES 30 +#endif + +#ifndef VIRTIO_F_VERSION_1 +#define VIRTIO_F_VERSION_1 32 +#endif + +#ifndef VIRTIO_BLK_F_MQ +#define VIRTIO_BLK_F_MQ 12 /* support more than one vq */ +#endif + +#ifndef VIRTIO_BLK_F_CONFIG_WCE +#define VIRTIO_BLK_F_CONFIG_WCE 11 +#endif + +#define SPDK_VHOST_MAX_VQUEUES 256 +#define SPDK_VHOST_MAX_VQ_SIZE 1024 + +#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8 + +#define SPDK_VHOST_IOVS_MAX 129 + +/* + * Rate at which stats are checked for interrupt coalescing. + */ +#define SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS 10 +/* + * Default threshold at which interrupts start to be coalesced. + */ +#define SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD 60000 + +/* + * Currently coalescing is not used by default. + * Setting this to value > 0 here or by RPC will enable coalescing. + */ +#define SPDK_VHOST_COALESCING_DELAY_BASE_US 0 + + +#define SPDK_VHOST_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC)) + +#define SPDK_VHOST_DISABLED_FEATURES ((1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY)) + +struct spdk_vhost_virtqueue { + struct rte_vhost_vring vring; + void *tasks; + + /* Request count from last stats check */ + uint32_t req_cnt; + + /* Request count from last event */ + uint16_t used_req_cnt; + + /* How long interrupt is delayed */ + uint32_t irq_delay_time; + + /* Next time when we need to send event */ + uint64_t next_event_time; + +} __attribute((aligned(SPDK_CACHE_LINE_SIZE))); + +struct spdk_vhost_dev_backend { + uint64_t virtio_features; + uint64_t disabled_features; + + /** + * Callbacks for starting and pausing the device. + * The first param is struct spdk_vhost_dev *. + * The second one is event context that has to be + * passed to spdk_vhost_dev_backend_event_done(). + */ + spdk_vhost_event_fn start_device; + spdk_vhost_event_fn stop_device; + + int (*vhost_get_config)(struct spdk_vhost_dev *vdev, uint8_t *config, uint32_t len); + int (*vhost_set_config)(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t offset, uint32_t size, uint32_t flags); + + void (*dump_info_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + void (*write_config_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + int (*remove_device)(struct spdk_vhost_dev *vdev); +}; + +struct spdk_vhost_dev { + struct rte_vhost_memory *mem; + char *name; + char *path; + + /* Unique device ID. */ + unsigned id; + + /* rte_vhost device ID. */ + int vid; + int task_cnt; + int32_t lcore; + struct spdk_cpuset *cpumask; + bool registered; + + const struct spdk_vhost_dev_backend *backend; + + /* Saved orginal values used to setup coalescing to avoid integer + * rounding issues during save/load config. + */ + uint32_t coalescing_delay_us; + uint32_t coalescing_iops_threshold; + + uint32_t coalescing_delay_time_base; + + /* Threshold when event coalescing for virtqueue will be turned on. */ + uint32_t coalescing_io_rate_threshold; + + /* Next time when stats for event coalescing will be checked. 
*/ + uint64_t next_stats_check_time; + + /* Interval used for event coalescing checking. */ + uint64_t stats_check_interval; + + uint16_t max_queues; + + uint64_t negotiated_features; + + struct spdk_vhost_virtqueue virtqueue[SPDK_VHOST_MAX_VQUEUES]; + + TAILQ_ENTRY(spdk_vhost_dev) tailq; +}; + +struct spdk_vhost_dev_destroy_ctx { + struct spdk_poller *poller; + void *event_ctx; +}; + +struct spdk_vhost_dev *spdk_vhost_dev_find(const char *ctrlr_name); + +void *spdk_vhost_gpa_to_vva(struct spdk_vhost_dev *vdev, uint64_t addr, uint64_t len); + +uint16_t spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *vq, uint16_t *reqs, + uint16_t reqs_len); + +/** + * Get a virtio descriptor at given index in given virtqueue. + * The descriptor will provide access to the entire descriptor + * chain. The subsequent descriptors are accesible via + * \c spdk_vhost_vring_desc_get_next. + * \param vdev vhost device + * \param vq virtqueue + * \param req_idx descriptor index + * \param desc pointer to be set to the descriptor + * \param desc_table descriptor table to be used with + * \c spdk_vhost_vring_desc_get_next. This might be either + * default virtqueue descriptor table or per-chain indirect + * table. + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid. + * If -1 is returned, the content of params is undefined. + */ +int spdk_vhost_vq_get_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size); + +/** + * Send IRQ/call client (if pending) for \c vq. + * \param vdev vhost device + * \param vq virtqueue + * \return + * 0 - if no interrupt was signalled + * 1 - if interrupt was signalled + */ +int spdk_vhost_vq_used_signal(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq); + + +/** + * Send IRQs for all queues that need to be signaled. + * \param vdev vhost device + * \param vq virtqueue + */ +void spdk_vhost_dev_used_signal(struct spdk_vhost_dev *vdev); + +void spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, + uint16_t id, uint32_t len); + +/** + * Get subsequent descriptor from given table. + * \param desc current descriptor, will be set to the + * next descriptor (NULL in case this is the last + * descriptor in the chain or the next desc is invalid) + * \param desc_table descriptor table + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid + * The *desc* param will be set regardless of the + * return value. 
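+ *
+ * Example (illustrative sketch only, with abbreviated error handling;
+ * iovs[] is a caller-provided array of up to SPDK_VHOST_IOVS_MAX
+ * struct iovec entries, and vdev/vq/req_idx come from the caller):
+ *
+ *   struct vring_desc *desc, *desc_table;
+ *   uint32_t desc_table_size;
+ *   uint16_t iovcnt = 0;
+ *
+ *   if (spdk_vhost_vq_get_desc(vdev, vq, req_idx, &desc,
+ *                              &desc_table, &desc_table_size) != 0) {
+ *           return; // invalid request index
+ *   }
+ *   while (desc != NULL) {
+ *           if (spdk_vhost_vring_desc_to_iov(vdev, iovs, &iovcnt, desc) != 0) {
+ *                   break; // mapping failed
+ *           }
+ *           if (spdk_vhost_vring_desc_get_next(&desc, desc_table,
+ *                                              desc_table_size) != 0) {
+ *                   break; // invalid chain
+ *           }
+ *   }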
+ */ +int spdk_vhost_vring_desc_get_next(struct vring_desc **desc, + struct vring_desc *desc_table, uint32_t desc_table_size); +bool spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc); + +int spdk_vhost_vring_desc_to_iov(struct spdk_vhost_dev *vdev, struct iovec *iov, + uint16_t *iov_index, const struct vring_desc *desc); + +static inline bool __attribute__((always_inline)) +spdk_vhost_dev_has_feature(struct spdk_vhost_dev *vdev, unsigned feature_id) +{ + return vdev->negotiated_features & (1ULL << feature_id); +} + +int spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct spdk_vhost_dev_backend *backend); +int spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev); + +int spdk_vhost_scsi_controller_construct(void); +int spdk_vhost_blk_controller_construct(void); +void spdk_vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); +void spdk_vhost_dev_backend_event_done(void *event_ctx, int response); +void spdk_vhost_lock(void); +void spdk_vhost_unlock(void); +int spdk_remove_vhost_controller(struct spdk_vhost_dev *vdev); +int spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf); +int spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd); +int spdk_vhost_nvme_get_cap(int vid, uint64_t *cap); +int spdk_vhost_nvme_controller_construct(void); +int spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues); +int spdk_vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev); +int spdk_vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, + const char *bdev_name); + +#endif /* SPDK_VHOST_INTERNAL_H */ diff --git a/src/spdk/lib/vhost/vhost_nvme.c b/src/spdk/lib/vhost/vhost_nvme.c new file mode 100644 index 00000000..35015d93 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_nvme.c @@ -0,0 +1,1465 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "spdk/bdev.h" +#include "spdk/version.h" +#include "spdk/nvme_spec.h" +#include "spdk/likely.h" + +#include "vhost_internal.h" + +#define MAX_IO_QUEUES 31 +#define MAX_IOVS 64 +#define MAX_NAMESPACE 8 +#define MAX_QUEUE_ENTRIES_SUPPORTED 256 +#define MAX_BATCH_IO 8 + +struct spdk_vhost_nvme_sq { + uint16_t sqid; + uint16_t size; + uint16_t cqid; + bool valid; + struct spdk_nvme_cmd *sq_cmd; + uint16_t sq_head; + uint16_t sq_tail; +}; + +struct spdk_vhost_nvme_cq { + uint8_t phase; + uint16_t size; + uint16_t cqid; + bool valid; + volatile struct spdk_nvme_cpl *cq_cqe; + uint16_t cq_head; + uint16_t guest_signaled_cq_head; + uint32_t need_signaled_cnt; + STAILQ_HEAD(, spdk_vhost_nvme_task) cq_full_waited_tasks; + bool irq_enabled; + int virq; +}; + +struct spdk_vhost_nvme_ns { + struct spdk_bdev *bdev; + uint32_t block_size; + uint64_t capacity; + uint32_t nsid; + uint32_t active_ns; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *bdev_io_channel; + struct spdk_nvme_ns_data nsdata; +}; + +struct spdk_vhost_nvme_task { + struct spdk_nvme_cmd cmd; + struct spdk_vhost_nvme_dev *nvme; + uint16_t sqid; + uint16_t cqid; + + /** array of iovecs to transfer. */ + struct iovec iovs[MAX_IOVS]; + + /** Number of iovecs in iovs array. */ + int iovcnt; + + /** Current iovec position. */ + int iovpos; + + /** Offset in current iovec. */ + uint32_t iov_offset; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_ns *ns; + + /* parent pointer. */ + struct spdk_vhost_nvme_task *parent; + uint8_t dnr; + uint8_t sct; + uint8_t sc; + uint32_t num_children; + STAILQ_ENTRY(spdk_vhost_nvme_task) stailq; +}; + +struct spdk_vhost_nvme_dev { + struct spdk_vhost_dev vdev; + + uint32_t num_io_queues; + union spdk_nvme_cap_register cap; + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + struct spdk_nvme_ctrlr_data cdata; + + uint32_t num_sqs; + uint32_t num_cqs; + + uint32_t num_ns; + struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE]; + + volatile uint32_t *dbbuf_dbs; + volatile uint32_t *dbbuf_eis; + struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1]; + struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1]; + + TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq; + STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks; + struct spdk_poller *requestq_poller; + struct spdk_vhost_dev_destroy_ctx destroy_ctx; +}; + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend; + +/* + * Report the SPDK version as the firmware revision. + * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. 
+ */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +static int +spdk_nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task); + +static struct spdk_vhost_nvme_dev * +to_nvme_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev->backend != &spdk_vhost_nvme_device_backend) { + SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev); +} + +static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs); + +static inline unsigned int sq_offset(unsigned int qid, uint32_t db_stride) +{ + return qid * 2 * db_stride; +} + +static inline unsigned int cq_offset(unsigned int qid, uint32_t db_stride) +{ + return (qid * 2 + 1) * db_stride; +} + +static void +nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq) +{ + cq->cq_head++; + if (cq->cq_head >= cq->size) { + cq->cq_head = 0; + cq->phase = !cq->phase; + } +} + +static bool +nvme_cq_is_full(struct spdk_vhost_nvme_cq *cq) +{ + return ((cq->cq_head + 1) % cq->size == cq->guest_signaled_cq_head); +} + +static void +nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq) +{ + sq->sq_head = (sq->sq_head + 1) % sq->size; +} + +static struct spdk_vhost_nvme_sq * +spdk_vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->sq_queue[qid]; +} + +static struct spdk_vhost_nvme_cq * +spdk_vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->cq_queue[qid]; +} + +static int +spdk_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd, + struct spdk_vhost_nvme_task *task, uint32_t len) +{ + uint64_t prp1, prp2; + void *vva; + uint32_t i; + uint32_t residue_len, nents, mps = 4096; + uint64_t *prp_list; + + prp1 = cmd->dptr.prp.prp1; + prp2 = cmd->dptr.prp.prp2; + + /* PRP1 may started with unaligned page address */ + residue_len = mps - (prp1 % mps); + residue_len = spdk_min(len, residue_len); + + vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp1, residue_len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("GPA to VVA failed\n"); + return -1; + } + task->iovs[0].iov_base = vva; + task->iovs[0].iov_len = residue_len; + len -= residue_len; + + if (len) { + if (spdk_unlikely(prp2 == 0)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PRP2=0 in command\n"); + return -1; + } + + if (len <= mps) { + /* 2 PRP used */ + task->iovcnt = 2; + vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp2, len); + if (spdk_unlikely(vva == NULL)) { + return -1; + } + task->iovs[1].iov_base = vva; + task->iovs[1].iov_len = len; + } else { + /* PRP list used */ + nents = (len + mps - 1) / mps; + vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp2, nents * sizeof(*prp_list)); + if (spdk_unlikely(vva == NULL)) { + return -1; + } + prp_list = vva; + i = 0; + while (len != 0) { + residue_len = spdk_min(len, mps); + vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp_list[i], residue_len); + if (spdk_unlikely(vva == NULL)) { + return -1; + } + task->iovs[i + 1].iov_base = vva; + task->iovs[i + 1].iov_len = residue_len; + len -= residue_len; + i++; + } + task->iovcnt = i + 1; + } + } else { + /* 1 PRP used */ + task->iovcnt = 1; + } + + return 0; +} + +static void +spdk_nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_cq *cq; + 
uint32_t qid, cq_head; + + assert(nvme != NULL); + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq || !cq->valid) { + continue; + } + + cq_head = nvme->dbbuf_dbs[cq_offset(qid, 1)]; + if (cq->irq_enabled && cq->need_signaled_cnt && (cq->cq_head != cq_head)) { + eventfd_write(cq->virq, (eventfd_t)1); + cq->need_signaled_cnt = 0; + } + } +} + +static void +spdk_vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_dev *nvme = task->nvme; + struct spdk_nvme_cpl cqe = {0}; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq *sq; + struct spdk_nvme_cmd *cmd = &task->cmd; + uint16_t cqid = task->cqid; + uint16_t sqid = task->sqid; + + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, cqid); + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, sqid); + if (spdk_unlikely(!cq || !sq)) { + return; + } + + cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(cqid, 1)]; + if (spdk_unlikely(nvme_cq_is_full(cq))) { + STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq); + return; + } + + cqe.sqid = sqid; + cqe.sqhd = sq->sq_head; + cqe.cid = cmd->cid; + cqe.status.dnr = task->dnr; + cqe.status.sct = task->sct; + cqe.status.sc = task->sc; + cqe.status.p = !cq->phase; + cq->cq_cqe[cq->cq_head] = cqe; + spdk_smp_wmb(); + cq->cq_cqe[cq->cq_head].status.p = cq->phase; + + nvme_inc_cq_head(cq); + cq->need_signaled_cnt++; + + /* MMIO Controll */ + nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1); + + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *task = cb_arg; + struct spdk_nvme_cmd *cmd = &task->cmd; + int sc, sct; + + assert(bdev_io != NULL); + + spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc); + spdk_bdev_free_io(bdev_io); + + task->dnr = !success; + task->sct = sct; + task->sc = sc; + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("I/O error, sector %u\n", cmd->cdw10); + } + + spdk_vhost_nvme_task_complete(task); +} + +static void +blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *child = cb_arg; + struct spdk_vhost_nvme_task *task = child->parent; + struct spdk_vhost_nvme_dev *nvme = task->nvme; + int sct, sc; + + assert(bdev_io != NULL); + + task->num_children--; + if (!success) { + task->dnr = 1; + spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc); + task->sct = sct; + task->sc = sc; + } + + spdk_bdev_free_io(bdev_io); + + if (!task->num_children) { + spdk_vhost_nvme_task_complete(task); + } + + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); +} + +static struct spdk_vhost_nvme_ns * +spdk_vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid) +{ + if (spdk_unlikely(!nsid || nsid > dev->num_ns)) { + return NULL; + } + + return &dev->ns[nsid - 1]; +} + +static void +vhost_nvme_resubmit_task(void *arg) +{ + struct spdk_vhost_nvme_task *task = (struct spdk_vhost_nvme_task *)arg; + int rc; + + rc = spdk_nvme_process_sq(task->nvme, task->sq, task); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "vhost_nvme: task resubmit failed, rc = %d.\n", rc); + } +} + +static int +vhost_nvme_queue_task(struct spdk_vhost_nvme_task *task) +{ + int rc; + + task->bdev_io_wait.bdev = task->ns->bdev; + task->bdev_io_wait.cb_fn = vhost_nvme_resubmit_task; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(task->ns->bdev, task->ns->bdev_io_channel, 
&task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vhost_nvme_queue_task, rc=%d.\n", rc); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_vhost_nvme_task_complete(task); + } + + return rc; +} + +static int +spdk_nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_task *child; + struct spdk_nvme_cmd *cmd = &task->cmd; + struct spdk_vhost_nvme_ns *ns; + int ret = -1; + uint32_t len, nlba, block_size; + uint64_t slba; + struct spdk_nvme_dsm_range *range; + uint16_t i, num_ranges = 0; + + task->nvme = nvme; + task->dnr = 0; + task->sct = 0; + task->sc = 0; + + ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid); + if (spdk_unlikely(!ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + spdk_vhost_nvme_task_complete(task); + return -1; + } + + block_size = ns->block_size; + task->num_children = 0; + task->cqid = sq->cqid; + task->sqid = sq->sqid; + + task->ns = ns; + + if (spdk_unlikely(!ns->active_ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + spdk_vhost_nvme_task_complete(task); + return -1; + } + + /* valid only for Read/Write commands */ + nlba = (cmd->cdw12 & 0xffff) + 1; + slba = cmd->cdw11; + slba = (slba << 32) | cmd->cdw10; + + if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE || + cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + if (cmd->psdt != SPDK_NVME_PSDT_PRP) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PSDT %u%ub in command\n", + cmd->psdt >> 1, cmd->psdt & 1u); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + spdk_vhost_nvme_task_complete(task); + return -1; + } + + if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + num_ranges = (cmd->cdw10 & 0xff) + 1; + len = num_ranges * sizeof(struct spdk_nvme_dsm_range); + } else { + len = nlba * block_size; + } + + ret = spdk_nvme_map_prps(nvme, cmd, task, len); + if (spdk_unlikely(ret != 0)) { + SPDK_ERRLOG("nvme command map prps failed\n"); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + spdk_vhost_nvme_task_complete(task); + return -1; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_WRITE: + ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_FLUSH: + ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel, + 0, ns->capacity, + blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base; + for (i = 0; i < num_ranges; i++) { + if (!STAILQ_EMPTY(&nvme->free_tasks)) { + child = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + SPDK_ERRLOG("No free task now\n"); + ret = -1; + break; + } + task->num_children++; + child->parent = task; + ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel, + range[i].starting_lba * block_size, + range[i].length * block_size, + blk_unmap_complete_cb, child); + if (ret) { + STAILQ_INSERT_TAIL(&nvme->free_tasks, 
child, stailq); + break; + } + } + break; + default: + ret = -1; + break; + } + + if (spdk_unlikely(ret)) { + if (ret == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "No memory, start to queue io.\n"); + task->sq = sq; + ret = vhost_nvme_queue_task(task); + } else { + /* post error status to cqe */ + SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_vhost_nvme_task_complete(task); + } + } + + return ret; +} + +static int +nvme_worker(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_task *task; + uint32_t qid, dbbuf_sq; + int ret; + int count = -1; + + if (spdk_unlikely(!nvme->num_sqs)) { + return -1; + } + + /* worker thread can't start before the admin doorbell + * buffer config command + */ + if (spdk_unlikely(!nvme->dbbuf_dbs)) { + return -1; + } + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq->valid) { + continue; + } + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, sq->cqid); + if (spdk_unlikely(!cq)) { + return -1; + } + cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(sq->cqid, 1)]; + if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) && + !nvme_cq_is_full(cq))) { + task = STAILQ_FIRST(&cq->cq_full_waited_tasks); + STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq); + spdk_vhost_nvme_task_complete(task); + } + + dbbuf_sq = nvme->dbbuf_dbs[sq_offset(qid, 1)]; + sq->sq_tail = (uint16_t)dbbuf_sq; + count = 0; + + while (sq->sq_head != sq->sq_tail) { + if (spdk_unlikely(!sq->sq_cmd)) { + break; + } + if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + return -1; + } + + task->cmd = sq->sq_cmd[sq->sq_head]; + nvme_inc_sq_head(sq); + + /* processing IO */ + ret = spdk_nvme_process_sq(nvme, sq, task); + if (spdk_unlikely(ret)) { + SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, task->cmd.cid, sq->sq_head, + sq->sq_tail); + } + + /* MMIO Control */ + nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1); + + /* Maximum batch I/Os to pick up at once */ + if (count++ == MAX_BATCH_IO) { + break; + } + } + } + + /* Completion Queue */ + spdk_nvme_cq_signal_fd(nvme); + + return count; +} + +static int +vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint64_t dbs_dma_addr, eis_dma_addr; + + dbs_dma_addr = cmd->dptr.prp.prp1; + eis_dma_addr = cmd->dptr.prp.prp2; + + if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) { + return -1; + } + /* Guest Physical Address to Host Virtual Address */ + nvme->dbbuf_dbs = spdk_vhost_gpa_to_vva(&nvme->vdev, dbs_dma_addr, 4096); + nvme->dbbuf_eis = spdk_vhost_gpa_to_vva(&nvme->vdev, eis_dma_addr, 4096); + if (!nvme->dbbuf_dbs || !nvme->dbbuf_eis) { + return -1; + } + /* zeroed the doorbell buffer memory */ + memset((void *)nvme->dbbuf_dbs, 0, 4096); + memset((void *)nvme->dbbuf_eis, 0, 4096); + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid, qsize, cqid; + uint64_t dma_addr; + uint64_t requested_len; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq 
*sq; + + /* physical contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + cqid = (cmd->cdw11 >> 16) & 0xffff; + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid); + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, cqid); + if (!sq || !cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u or CQID %u\n", + qid, cqid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + + sq->sqid = qid; + sq->cqid = cqid; + sq->size = qsize + 1; + sq->sq_head = sq->sq_tail = 0; + requested_len = sizeof(struct spdk_nvme_cmd) * sq->size; + sq->sq_cmd = spdk_vhost_gpa_to_vva(&nvme->vdev, dma_addr, requested_len); + if (!sq->sq_cmd) { + return -1; + } + nvme->num_sqs++; + sq->valid = true; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_sq *sq; + + qid = cmd->cdw10 & 0xffff; + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq) { + return -1; + } + + /* We didn't see scenarios when deleting submission + * queue while I/O is running against the submisson + * queue for now, otherwise, we must ensure the poller + * will not run with this submission queue. + */ + nvme->num_sqs--; + sq->valid = false; + + memset(sq, 0, sizeof(*sq)); + sq->sq_cmd = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + + return 0; +} + +static int +vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qsize, qid; + uint64_t dma_addr; + struct spdk_vhost_nvme_cq *cq; + uint64_t requested_len; + + /* physical contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + cq->cqid = qid; + cq->size = qsize + 1; + cq->phase = 1; + cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1; + /* Setup virq through vhost messages */ + cq->virq = -1; + cq->cq_head = 0; + cq->guest_signaled_cq_head = 0; + cq->need_signaled_cnt = 0; + requested_len = sizeof(struct spdk_nvme_cpl) * cq->size; + cq->cq_cqe = spdk_vhost_gpa_to_vva(&nvme->vdev, dma_addr, requested_len); + if (!cq->cq_cqe) { + return -1; + } + nvme->num_cqs++; + cq->valid = true; + STAILQ_INIT(&cq->cq_full_waited_tasks); + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_cq *cq; + + qid = cmd->cdw10 & 0xffff; + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + nvme->num_cqs--; + cq->valid = false; + + memset(cq, 0, sizeof(*cq)); + cq->cq_cqe = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static struct spdk_vhost_nvme_dev * +spdk_vhost_nvme_get_by_name(int vid) +{ + struct spdk_vhost_nvme_dev *nvme; + + TAILQ_FOREACH(nvme, 
&g_nvme_ctrlrs, tailq) { + if (nvme->vdev.vid == vid) { + return nvme; + } + } + + return NULL; +} + +int +spdk_vhost_nvme_get_cap(int vid, uint64_t *cap) +{ + struct spdk_vhost_nvme_dev *nvme; + + nvme = spdk_vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + *cap = nvme->cap.raw; + return 0; +} + +int +spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf) +{ + struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd; + struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe; + struct spdk_vhost_nvme_ns *ns; + int ret = 0; + struct spdk_vhost_nvme_dev *nvme; + uint32_t cq_head, sq_tail; + + nvme = spdk_vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc); + switch (req->opc) { + case SPDK_NVME_OPC_IDENTIFY: + if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) { + memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data)); + + } else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) { + ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, req->nsid); + if (!ns) { + cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE; + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + break; + } + memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data)); + } + /* successfully */ + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + case SPDK_NVME_OPC_CREATE_IO_CQ: + ret = vhost_nvme_create_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_CQ: + ret = vhost_nvme_delete_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_CREATE_IO_SQ: + ret = vhost_nvme_create_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_SQ: + ret = vhost_nvme_delete_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_GET_FEATURES: + case SPDK_NVME_OPC_SET_FEATURES: + if (req->cdw10 == SPDK_NVME_FEAT_NUMBER_OF_QUEUES) { + cpl->status.sc = 0; + cpl->status.sct = 0; + cpl->cdw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16); + } else { + cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD; + cpl->status.sct = SPDK_NVME_SCT_GENERIC; + } + break; + case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: + ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl); + break; + case SPDK_NVME_OPC_ABORT: + sq_tail = nvme->dbbuf_dbs[sq_offset(1, 1)] & 0xffffu; + cq_head = nvme->dbbuf_dbs[cq_offset(1, 1)] & 0xffffu; + SPDK_NOTICELOG("ABORT: CID %u, SQ_TAIL %u, CQ_HEAD %u\n", + (req->cdw10 >> 16) & 0xffffu, sq_tail, cq_head); + /* TODO: ABORT failed fow now */ + cpl->cdw0 = 1; + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + } + + if (ret) { + SPDK_ERRLOG("Admin Passthrough Faild with %u\n", req->opc); + } + + return 0; +} + +int +spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd) +{ + struct spdk_vhost_nvme_dev *nvme; + struct spdk_vhost_nvme_cq *cq; + + nvme = spdk_vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + if (cq->irq_enabled) { + cq->virq = fd; + } else { + SPDK_ERRLOG("NVMe Qid %d Disabled IRQ\n", qid); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_task *task; + + while (!STAILQ_EMPTY(&nvme->free_tasks)) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + spdk_dma_free(task); + } +} + +static int +alloc_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + uint32_t entries, i; + struct spdk_vhost_nvme_task *task; + + entries = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED; + + for (i = 0; i 
< entries; i++) { + task = spdk_dma_zmalloc(sizeof(struct spdk_vhost_nvme_task), + SPDK_CACHE_LINE_SIZE, NULL); + if (task == NULL) { + SPDK_ERRLOG("Controller %s alloc task pool failed\n", + nvme->vdev.name); + free_task_pool(nvme); + return -1; + } + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); + } + + return 0; +} + +/* new device means enable the + * virtual NVMe controller + */ +static int +spdk_vhost_nvme_start_device(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return -1; + } + + if (alloc_task_pool(nvme)) { + return -1; + } + + SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vdev->vid, + vdev->path, vdev->lcore); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc); + if (!ns_dev->bdev_io_channel) { + return -1; + } + } + + /* Start the NVMe Poller */ + nvme->requestq_poller = spdk_poller_register(nvme_worker, nvme, 0); + + spdk_vhost_dev_backend_event_done(event_ctx, 0); + return 0; +} + +static void +spdk_vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns) +{ + ns->active_ns = 0; + spdk_bdev_close(ns->bdev_desc); + ns->bdev_desc = NULL; + ns->bdev = NULL; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_nvme_ns *ns = remove_ctx; + + SPDK_NOTICELOG("Removing NS %u, Block Device %s\n", + ns->nsid, spdk_bdev_get_name(ns->bdev)); + + spdk_vhost_nvme_deactive_ns(ns); +} + +static int +destroy_device_poller_cb(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = arg; + struct spdk_vhost_nvme_dev *dev, *tmp; + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n"); + + TAILQ_FOREACH_SAFE(dev, &g_nvme_ctrlrs, tailq, tmp) { + if (dev == nvme) { + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (ns_dev->bdev_io_channel) { + spdk_put_io_channel(ns_dev->bdev_io_channel); + ns_dev->bdev_io_channel = NULL; + } + } + nvme->num_sqs = 0; + nvme->num_cqs = 0; + nvme->dbbuf_dbs = NULL; + nvme->dbbuf_eis = NULL; + } + } + + spdk_poller_unregister(&nvme->destroy_ctx.poller); + spdk_vhost_dev_backend_event_done(nvme->destroy_ctx.event_ctx, 0); + + return -1; +} + +/* Disable NVMe controller + */ +static int +spdk_vhost_nvme_stop_device(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + + if (nvme == NULL) { + return -1; + } + + free_task_pool(nvme); + SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vdev->vid, vdev->path); + + nvme->destroy_ctx.event_ctx = event_ctx; + spdk_poller_unregister(&nvme->requestq_poller); + nvme->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb, nvme, 1000); + + return 0; +} + +static void +spdk_vhost_nvme_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return; + } + + spdk_json_write_named_array_begin(w, "namespaces"); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_uint32(w, "nsid", ns_dev->nsid); + spdk_json_write_named_string(w, "bdev", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +static void 
+spdk_vhost_nvme_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "construct_vhost_nvme_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_uint32(w, "io_queues", nvme->num_io_queues); + spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(nvme->vdev.cpumask)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "add_vhost_nvme_ns"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = { + .start_device = spdk_vhost_nvme_start_device, + .stop_device = spdk_vhost_nvme_stop_device, + .dump_info_json = spdk_vhost_nvme_dump_info_json, + .write_config_json = spdk_vhost_nvme_write_config_json, + .remove_device = spdk_vhost_nvme_dev_remove, +}; + +static int +spdk_vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + struct spdk_nvme_ns_data *nsdata; + uint64_t num_blocks; + uint32_t i; + + /* Identify Namespace */ + cdata->nn = dev->num_ns; + for (i = 0; i < dev->num_ns; i++) { + nsdata = &dev->ns[i].nsdata; + if (dev->ns[i].active_ns) { + num_blocks = spdk_bdev_get_num_blocks(dev->ns[i].bdev); + nsdata->nsze = num_blocks; + /* ncap must be non-zero for active Namespace */ + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(dev->ns[i].bdev)); + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(dev->ns[i].bdev); + dev->ns[i].block_size = spdk_bdev_get_block_size(dev->ns[i].bdev); + dev->ns[i].capacity = num_blocks * dev->ns[i].block_size; + } else { + memset(nsdata, 0, sizeof(*nsdata)); + } + } + return 0; +} + +static int +spdk_vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + char sn[20]; + + /* Controller Capabilities */ + dev->cap.bits.cqr = 1; + dev->cap.bits.to = 1; + dev->cap.bits.dstrd = 0; + dev->cap.bits.css = SPDK_NVME_CAP_CSS_NVM; + dev->cap.bits.mpsmin = 0; + dev->cap.bits.mpsmax = 0; + /* MQES is 0 based value */ + dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1; + + /* Controller Configuration */ + dev->cc.bits.en = 0; + + /* Controller Status */ + dev->csts.bits.rdy = 0; + + /* Identify Controller */ + spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + cdata->vid = 0x8086; + cdata->ssvid = 0x8086; + spdk_strcpy_pad(cdata->mn, "SPDK Virtual NVMe Controller", sizeof(cdata->mn), ' '); + snprintf(sn, sizeof(sn), "NVMe_%s", dev->vdev.name); + spdk_strcpy_pad(cdata->sn, sn, sizeof(cdata->sn), ' '); + cdata->ieee[0] = 0xe4; + cdata->ieee[1] = 0xd2; + cdata->ieee[2] = 0x5c; + cdata->ver.bits.mjr = 1; + cdata->ver.bits.mnr = 0; + cdata->mdts = 5; /* 128 KiB */ + 
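+ /*
+ * MDTS is a power of two in units of the controller's minimum memory
+ * page size: with CAP.MPSMIN set to 0 above, the page size is
+ * 2^(12 + 0) = 4 KiB, so the advertised maximum data transfer size is
+ * 2^5 * 4 KiB = 128 KiB.
+ */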
cdata->rab = 6; + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->oncs.dsm = 1; + /* Emulated NVMe controller */ + cdata->oacs.doorbell_buffer_config = 1; + + spdk_vhost_nvme_ns_identify_update(dev); + + return 0; +} + +int +spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues) +{ + struct spdk_vhost_nvme_dev *dev = spdk_dma_zmalloc(sizeof(struct spdk_vhost_nvme_dev), + SPDK_CACHE_LINE_SIZE, NULL); + int rc; + + if (dev == NULL) { + return -ENOMEM; + } + + if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) { + spdk_dma_free(dev); + return -EINVAL; + } + + spdk_vhost_lock(); + rc = spdk_vhost_dev_register(&dev->vdev, name, cpumask, + &spdk_vhost_nvme_device_backend); + + if (rc) { + spdk_dma_free(dev); + spdk_vhost_unlock(); + return rc; + } + + dev->num_io_queues = num_io_queues; + STAILQ_INIT(&dev->free_tasks); + TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq); + + spdk_vhost_nvme_ctrlr_identify_update(dev); + + SPDK_NOTICELOG("Controller %s: Constructed\n", name); + spdk_vhost_unlock(); + return rc; +} + +int +spdk_vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_dev *dev, *tmp; + struct spdk_vhost_nvme_ns *ns; + int rc; + uint32_t i; + + if (nvme == NULL) { + return -EINVAL; + } + + TAILQ_FOREACH_SAFE(dev, &g_nvme_ctrlrs, tailq, tmp) { + if (dev == nvme) { + TAILQ_REMOVE(&g_nvme_ctrlrs, dev, tailq); + for (i = 0; i < nvme->num_ns; i++) { + ns = &nvme->ns[i]; + if (ns->active_ns) { + spdk_vhost_nvme_deactive_ns(ns); + } + } + } + } + + rc = spdk_vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + + spdk_dma_free(nvme); + return 0; +} + +int +spdk_vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns; + struct spdk_bdev *bdev; + int rc = -1; + + if (nvme == NULL) { + return -ENODEV; + } + + if (nvme->num_ns == MAX_NAMESPACE) { + SPDK_ERRLOG("Can't support %d Namespaces\n", nvme->num_ns); + return -ENOSPC; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("could not find bdev %s\n", bdev_name); + return -ENODEV; + } + + ns = &nvme->ns[nvme->num_ns]; + rc = spdk_bdev_open(bdev, true, bdev_remove_cb, ns, &nvme->ns[nvme->num_ns].bdev_desc); + if (rc != 0) { + SPDK_ERRLOG("Could not open bdev '%s', error=%d\n", + bdev_name, rc); + return rc; + } + + nvme->ns[nvme->num_ns].bdev = bdev; + nvme->ns[nvme->num_ns].active_ns = 1; + nvme->ns[nvme->num_ns].nsid = nvme->num_ns + 1; + nvme->num_ns++; + + spdk_vhost_nvme_ns_identify_update(nvme); + + return rc; +} + +int +spdk_vhost_nvme_controller_construct(void) +{ + struct spdk_conf_section *sp; + const char *name; + const char *bdev_name; + const char *cpumask; + int rc, i = 0; + struct spdk_vhost_dev *vdev; + uint32_t ctrlr_num, io_queues; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostNvme")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostNvme%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + rc = spdk_conf_section_get_intval(sp, 
"NumberOfQueues"); + if (rc > 0) { + io_queues = rc; + } else { + io_queues = 1; + } + + rc = spdk_vhost_nvme_dev_construct(name, cpumask, io_queues); + if (rc < 0) { + SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num); + return -1; + } + + vdev = spdk_vhost_dev_find(name); + if (!vdev) { + return -1; + } + + for (i = 0; spdk_conf_section_get_nval(sp, "Namespace", i) != NULL; i++) { + bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0); + if (!bdev_name) { + SPDK_ERRLOG("namespace configuration missing bdev name\n"); + break; + } + rc = spdk_vhost_nvme_dev_add_ns(vdev, bdev_name); + if (rc < 0) { + SPDK_WARNLOG("VhostNvme%u: Construct Namespace with %s failed\n", + ctrlr_num, bdev_name); + break; + } + } + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME) diff --git a/src/spdk/lib/vhost/vhost_rpc.c b/src/spdk/lib/vhost/vhost_rpc.c new file mode 100644 index 00000000..0e546c36 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_rpc.c @@ -0,0 +1,814 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/env.h" + +#include "spdk/scsi.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" +#include "spdk/bdev.h" + +struct rpc_vhost_scsi_ctrlr { + char *ctrlr; + char *cpumask; +}; + +static void +free_rpc_vhost_scsi_ctrlr(struct rpc_vhost_scsi_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static const struct spdk_json_object_decoder rpc_construct_vhost_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr, ctrlr), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_scsi_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_construct_vhost_scsi_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_scsi_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_ctrlr), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_construct(req.ctrlr, req.cpumask); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_scsi_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_scsi_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("construct_vhost_scsi_controller", spdk_rpc_construct_vhost_scsi_controller, + SPDK_RPC_RUNTIME) + +struct rpc_add_vhost_scsi_ctrlr_lun { + char *ctrlr; + uint32_t scsi_target_num; + char *bdev_name; + + struct spdk_jsonrpc_request *request; +}; + +static void +free_rpc_add_vhost_scsi_ctrlr_lun(struct rpc_add_vhost_scsi_ctrlr_lun *req) +{ + free(req->ctrlr); + free(req->bdev_name); + free(req); +} + +static const struct spdk_json_object_decoder rpc_vhost_add_lun[] = { + {"ctrlr", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, ctrlr), spdk_json_decode_string }, + {"scsi_target_num", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, scsi_target_num), spdk_json_decode_uint32}, + {"bdev_name", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, bdev_name), spdk_json_decode_string }, +}; + +static int +spdk_rpc_add_vhost_scsi_lun_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_add_vhost_scsi_ctrlr_lun *rpc = arg; + struct spdk_jsonrpc_request *request = rpc->request; + struct spdk_json_write_ctx *w; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_add_tgt(vdev, rpc->scsi_target_num, rpc->bdev_name); + if (rc < 0) { + goto invalid; + } + + free_rpc_add_vhost_scsi_ctrlr_lun(rpc); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return -1; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; + +invalid: + free_rpc_add_vhost_scsi_ctrlr_lun(rpc); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + return rc; +} + +static void +spdk_rpc_add_vhost_scsi_lun(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_add_vhost_scsi_ctrlr_lun *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto invalid; + } + + req->request = request; + if 
(spdk_json_decode_object(params, rpc_vhost_add_lun, + SPDK_COUNTOF(rpc_vhost_add_lun), + req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + if (req->ctrlr == NULL) { + SPDK_ERRLOG("No controller name\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_add_vhost_scsi_lun_cb, req); + + return; + +invalid: + if (req) { + free_rpc_add_vhost_scsi_ctrlr_lun(req); + } + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("add_vhost_scsi_lun", spdk_rpc_add_vhost_scsi_lun, SPDK_RPC_RUNTIME) + +struct rpc_remove_vhost_scsi_ctrlr_target { + char *ctrlr; + uint32_t scsi_target_num; + + struct spdk_jsonrpc_request *request; +}; + +static void +free_rpc_remove_vhost_scsi_ctrlr_target(struct rpc_remove_vhost_scsi_ctrlr_target *req) +{ + free(req->ctrlr); + free(req); +} + +static const struct spdk_json_object_decoder rpc_vhost_remove_target[] = { + {"ctrlr", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, ctrlr), spdk_json_decode_string }, + {"scsi_target_num", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, scsi_target_num), spdk_json_decode_uint32}, +}; + +static int +spdk_rpc_remove_vhost_scsi_target_finish_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_remove_vhost_scsi_ctrlr_target *rpc = arg; + struct spdk_jsonrpc_request *request = rpc->request; + struct spdk_json_write_ctx *w; + + free_rpc_remove_vhost_scsi_ctrlr_target(rpc); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return -1; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; +} + +static int +spdk_rpc_remove_vhost_scsi_target_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_remove_vhost_scsi_ctrlr_target *rpc = arg; + struct spdk_jsonrpc_request *request = rpc->request; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_remove_tgt(vdev, rpc->scsi_target_num, + spdk_rpc_remove_vhost_scsi_target_finish_cb, rpc); + if (rc < 0) { + goto invalid; + } + + return 0; + +invalid: + free_rpc_remove_vhost_scsi_ctrlr_target(rpc); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + return rc; +} + +static void +spdk_rpc_remove_vhost_scsi_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_vhost_scsi_ctrlr_target *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto invalid; + } + + req->request = request; + if (spdk_json_decode_object(params, rpc_vhost_remove_target, + SPDK_COUNTOF(rpc_vhost_remove_target), + req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_remove_vhost_scsi_target_cb, req); + + return; + +invalid: + if (req) { + free_rpc_remove_vhost_scsi_ctrlr_target(req); + } + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} + +SPDK_RPC_REGISTER("remove_vhost_scsi_target", spdk_rpc_remove_vhost_scsi_target, SPDK_RPC_RUNTIME) + +struct rpc_vhost_blk_ctrlr { + char *ctrlr; + char *dev_name; + char *cpumask; + bool readonly; +}; + +static const struct spdk_json_object_decoder rpc_construct_vhost_blk_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_blk_ctrlr, ctrlr), spdk_json_decode_string }, + {"dev_name", 
offsetof(struct rpc_vhost_blk_ctrlr, dev_name), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_blk_ctrlr, cpumask), spdk_json_decode_string, true}, + {"readonly", offsetof(struct rpc_vhost_blk_ctrlr, readonly), spdk_json_decode_bool, true}, +}; + +static void +free_rpc_vhost_blk_ctrlr(struct rpc_vhost_blk_ctrlr *req) +{ + free(req->ctrlr); + free(req->dev_name); + free(req->cpumask); +} + +static void +spdk_rpc_construct_vhost_blk_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_blk_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_blk_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_blk_ctrlr), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_blk_construct(req.ctrlr, req.cpumask, req.dev_name, req.readonly); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_blk_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_blk_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("construct_vhost_blk_controller", spdk_rpc_construct_vhost_blk_controller, + SPDK_RPC_RUNTIME) + +struct rpc_remove_vhost_ctrlr { + char *ctrlr; + + struct spdk_jsonrpc_request *request; +}; + +static const struct spdk_json_object_decoder rpc_remove_vhost_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_remove_vhost_ctrlr, ctrlr), spdk_json_decode_string }, +}; + +static void +free_rpc_remove_vhost_ctrlr(struct rpc_remove_vhost_ctrlr *req) +{ + free(req->ctrlr); + free(req); +} + +static int +spdk_rpc_remove_vhost_controller_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_remove_vhost_ctrlr *ctx = arg; + struct spdk_jsonrpc_request *request = ctx->request; + struct spdk_json_write_ctx *w; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_dev_remove(vdev); + if (rc < 0) { + goto invalid; + } + + free_rpc_remove_vhost_ctrlr(ctx); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return 0; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; + +invalid: + free_rpc_remove_vhost_ctrlr(ctx); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + return -1; +} + +static void +spdk_rpc_remove_vhost_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_vhost_ctrlr *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto invalid; + } + + req->request = request; + if (spdk_json_decode_object(params, rpc_remove_vhost_ctrlr, + SPDK_COUNTOF(rpc_remove_vhost_ctrlr), req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_remove_vhost_controller_cb, req); + return; + +invalid: + if (req) { + free_rpc_remove_vhost_ctrlr(req); + } + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("remove_vhost_controller", spdk_rpc_remove_vhost_controller, SPDK_RPC_RUNTIME) + +struct rpc_get_vhost_ctrlrs { + char *name; + struct spdk_json_write_ctx *w; + struct 
spdk_jsonrpc_request *request; +}; + +static void +_spdk_rpc_get_vhost_controller(struct spdk_json_write_ctx *w, struct spdk_vhost_dev *vdev) +{ + uint32_t delay_base_us, iops_threshold; + + spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold); + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "ctrlr", spdk_vhost_dev_get_name(vdev)); + spdk_json_write_named_string_fmt(w, "cpumask", "0x%s", spdk_cpuset_fmt(vdev->cpumask)); + spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us); + spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold); + spdk_json_write_named_string(w, "socket", vdev->path); + + spdk_json_write_named_object_begin(w, "backend_specific"); + spdk_vhost_dump_info_json(vdev, w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static int +spdk_rpc_get_vhost_controllers_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_get_vhost_ctrlrs *ctx = arg; + + assert(ctx->name == NULL); + + if (vdev == NULL) { + spdk_json_write_array_end(ctx->w); + spdk_jsonrpc_end_result(ctx->request, ctx->w); + free(ctx); + return 0; + } + + _spdk_rpc_get_vhost_controller(ctx->w, vdev); + return 0; +} + +static int +spdk_rpc_get_vhost_controller_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_get_vhost_ctrlrs *ctx = arg; + + assert(ctx->name != NULL); + + if (vdev == NULL) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(ENODEV)); + goto free_name_ctx; + } + + ctx->w = spdk_jsonrpc_begin_result(ctx->request); + if (ctx->w == NULL) { + goto free_name_ctx; + } + + spdk_json_write_array_begin(ctx->w); + _spdk_rpc_get_vhost_controller(ctx->w, vdev); + spdk_json_write_array_end(ctx->w); + + spdk_jsonrpc_end_result(ctx->request, ctx->w); + +free_name_ctx: + free(ctx->name); + free(ctx); + return 0; +} + +static const struct spdk_json_object_decoder rpc_get_vhost_ctrlrs_decoders[] = { + {"name", offsetof(struct rpc_get_vhost_ctrlrs, name), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_get_vhost_controllers(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_vhost_ctrlrs *ctx; + struct spdk_json_write_ctx *w; + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(ENOMEM)); + return; + } + + if (params && spdk_json_decode_object(params, rpc_get_vhost_ctrlrs_decoders, + SPDK_COUNTOF(rpc_get_vhost_ctrlrs_decoders), ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + free(ctx); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + if (ctx->name) { + ctx->request = request; + spdk_vhost_call_external_event(ctx->name, spdk_rpc_get_vhost_controller_cb, ctx); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free(ctx); + return; + } + + spdk_json_write_array_begin(w); + + ctx->w = w; + ctx->request = request; + spdk_vhost_call_external_event_foreach(spdk_rpc_get_vhost_controllers_cb, ctx); +} +SPDK_RPC_REGISTER("get_vhost_controllers", spdk_rpc_get_vhost_controllers, SPDK_RPC_RUNTIME) + + +struct rpc_vhost_ctrlr_coalescing { + char *ctrlr; + uint32_t delay_base_us; + uint32_t iops_threshold; + struct spdk_jsonrpc_request *request; +}; + +static const struct spdk_json_object_decoder rpc_set_vhost_ctrlr_coalescing[] = { + {"ctrlr", offsetof(struct rpc_vhost_ctrlr_coalescing, ctrlr), spdk_json_decode_string }, + 
{"delay_base_us", offsetof(struct rpc_vhost_ctrlr_coalescing, delay_base_us), spdk_json_decode_uint32}, + {"iops_threshold", offsetof(struct rpc_vhost_ctrlr_coalescing, iops_threshold), spdk_json_decode_uint32}, +}; + +static void +free_rpc_set_vhost_controllers_event_coalescing(struct rpc_vhost_ctrlr_coalescing *req) +{ + if (!req) { + return; + } + + free(req->ctrlr); + free(req); +} + +static int +spdk_rpc_set_vhost_controller_coalescing_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_vhost_ctrlr_coalescing *req = arg; + struct spdk_json_write_ctx *w; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_set_coalescing(vdev, req->delay_base_us, req->iops_threshold); + if (rc) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(req->request); + if (w != NULL) { + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(req->request, w); + } + + free_rpc_set_vhost_controllers_event_coalescing(req); + return 0; + +invalid: + spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_set_vhost_controllers_event_coalescing(req); + return 0; +} + +static void +spdk_rpc_set_vhost_controller_coalescing(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_ctrlr_coalescing *req; + int rc; + + req = calloc(1, sizeof(struct rpc_vhost_ctrlr_coalescing)); + if (!req) { + rc = -ENOMEM; + goto invalid; + } + + if (spdk_json_decode_object(params, rpc_set_vhost_ctrlr_coalescing, + SPDK_COUNTOF(rpc_set_vhost_ctrlr_coalescing), req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + req->request = request; + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_set_vhost_controller_coalescing_cb, req); + return; + +invalid: + free_rpc_set_vhost_controllers_event_coalescing(req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("set_vhost_controller_coalescing", spdk_rpc_set_vhost_controller_coalescing, + SPDK_RPC_RUNTIME) + +struct rpc_vhost_nvme_ctrlr { + char *ctrlr; + uint32_t io_queues; + char *cpumask; +}; + +static const struct spdk_json_object_decoder rpc_construct_vhost_nvme_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr, ctrlr), spdk_json_decode_string }, + {"io_queues", offsetof(struct rpc_vhost_nvme_ctrlr, io_queues), spdk_json_decode_uint32}, + {"cpumask", offsetof(struct rpc_vhost_nvme_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +free_rpc_vhost_nvme_ctrlr(struct rpc_vhost_nvme_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static void +spdk_rpc_construct_vhost_nvme_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_nvme_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_nvme_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_nvme_ctrlr), + &req)) { + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_nvme_dev_construct(req.ctrlr, req.cpumask, req.io_queues); + if (rc < 0) { + free_rpc_vhost_nvme_ctrlr(&req); + goto invalid; + } + + free_rpc_vhost_nvme_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + 
+} +SPDK_RPC_REGISTER("construct_vhost_nvme_controller", spdk_rpc_construct_vhost_nvme_controller, + SPDK_RPC_RUNTIME) + +struct rpc_add_vhost_nvme_ctrlr_ns { + char *ctrlr; + char *bdev_name; + struct spdk_jsonrpc_request *request; +}; + +static void +free_rpc_add_vhost_nvme_ctrlr_ns(struct rpc_add_vhost_nvme_ctrlr_ns *req) +{ + free(req->ctrlr); + free(req->bdev_name); + free(req); +} + +static const struct spdk_json_object_decoder rpc_vhost_nvme_add_ns[] = { + {"ctrlr", offsetof(struct rpc_add_vhost_nvme_ctrlr_ns, ctrlr), spdk_json_decode_string }, + {"bdev_name", offsetof(struct rpc_add_vhost_nvme_ctrlr_ns, bdev_name), spdk_json_decode_string }, +}; + +static int +spdk_rpc_add_vhost_nvme_ns_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_add_vhost_nvme_ctrlr_ns *rpc = arg; + struct spdk_jsonrpc_request *request = rpc->request; + struct spdk_json_write_ctx *w; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_nvme_dev_add_ns(vdev, rpc->bdev_name); + if (rc < 0) { + goto invalid; + } + free_rpc_add_vhost_nvme_ctrlr_ns(rpc); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return -1; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; + +invalid: + free_rpc_add_vhost_nvme_ctrlr_ns(rpc); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + return rc; +} + +static void +spdk_rpc_add_vhost_nvme_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_add_vhost_nvme_ctrlr_ns *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto invalid; + } + + req->request = request; + if (spdk_json_decode_object(params, rpc_vhost_nvme_add_ns, + SPDK_COUNTOF(rpc_vhost_nvme_add_ns), + req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_add_vhost_nvme_ns_cb, req); + return; + +invalid: + if (req) { + free_rpc_add_vhost_nvme_ctrlr_ns(req); + } + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("add_vhost_nvme_ns", spdk_rpc_add_vhost_nvme_ns, SPDK_RPC_RUNTIME) + + +SPDK_LOG_REGISTER_COMPONENT("vhost_rpc", SPDK_LOG_VHOST_RPC) diff --git a/src/spdk/lib/vhost/vhost_scsi.c b/src/spdk/lib/vhost/vhost_scsi.c new file mode 100644 index 00000000..aefa4c45 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_scsi.c @@ -0,0 +1,1271 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <linux/virtio_scsi.h> + +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/scsi.h" +#include "spdk/scsi_spec.h" +#include "spdk/conf.h" +#include "spdk/event.h" +#include "spdk/util.h" +#include "spdk/likely.h" + +#include "spdk/vhost.h" +#include "vhost_internal.h" + +/* Features supported by SPDK VHOST lib. */ +#define SPDK_VHOST_SCSI_FEATURES (SPDK_VHOST_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_INOUT) | \ + (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \ + (1ULL << VIRTIO_SCSI_F_CHANGE ) | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +/* Features that are specified in VIRTIO SCSI but currently not supported: + * - Live migration not supported yet + * - T10 PI + */ +#define SPDK_VHOST_SCSI_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +#define MGMT_POLL_PERIOD_US (1000 * 5) + +#define VIRTIO_SCSI_CONTROLQ 0 +#define VIRTIO_SCSI_EVENTQ 1 +#define VIRTIO_SCSI_REQUESTQ 2 + +struct spdk_scsi_dev_vhost_state { + bool removed; + spdk_vhost_event_fn remove_cb; + void *remove_ctx; +}; + +struct spdk_vhost_scsi_dev { + struct spdk_vhost_dev vdev; + struct spdk_scsi_dev *scsi_dev[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; + struct spdk_scsi_dev_vhost_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; + + struct spdk_poller *requestq_poller; + struct spdk_poller *mgmt_poller; + struct spdk_vhost_dev_destroy_ctx destroy_ctx; +} __rte_cache_aligned; + +struct spdk_vhost_scsi_task { + struct spdk_scsi_task scsi; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; + + union { + struct virtio_scsi_cmd_resp *resp; + struct virtio_scsi_ctrl_tmf_resp *tmf_resp; + }; + + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev *scsi_dev; + + /** Number of bytes that were written. */ + uint32_t used_len; + + int req_idx; + + /* If set, the task is currently used for I/O processing. 
*/ + bool used; + + struct spdk_vhost_virtqueue *vq; +}; + +static int spdk_vhost_scsi_start(struct spdk_vhost_dev *, void *); +static int spdk_vhost_scsi_stop(struct spdk_vhost_dev *, void *); +static void spdk_vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static void spdk_vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static int spdk_vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev); + +const struct spdk_vhost_dev_backend spdk_vhost_scsi_device_backend = { + .virtio_features = SPDK_VHOST_SCSI_FEATURES, + .disabled_features = SPDK_VHOST_SCSI_DISABLED_FEATURES, + .start_device = spdk_vhost_scsi_start, + .stop_device = spdk_vhost_scsi_stop, + .dump_info_json = spdk_vhost_scsi_dump_info_json, + .write_config_json = spdk_vhost_scsi_write_config_json, + .remove_device = spdk_vhost_scsi_dev_remove, +}; + +static void +spdk_vhost_scsi_task_put(struct spdk_vhost_scsi_task *task) +{ + spdk_scsi_task_put(&task->scsi); +} + +static void +spdk_vhost_scsi_task_free_cb(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + assert(task->svdev->vdev.task_cnt > 0); + task->svdev->vdev.task_cnt--; + task->used = false; +} + +static void +process_removed_devs(struct spdk_vhost_scsi_dev *svdev) +{ + struct spdk_scsi_dev *dev; + struct spdk_scsi_dev_vhost_state *state; + int i; + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + dev = svdev->scsi_dev[i]; + state = &svdev->scsi_dev_state[i]; + + if (dev && state->removed && !spdk_scsi_dev_has_pending_tasks(dev)) { + spdk_scsi_dev_free_io_channels(dev); + svdev->scsi_dev[i] = NULL; + spdk_scsi_dev_destruct(dev); + if (state->remove_cb) { + state->remove_cb(&svdev->vdev, state->remove_ctx); + state->remove_cb = NULL; + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: hot-detached device 'Dev %u'.\n", + svdev->vdev.name, i); + } + } +} + +static void +eventq_enqueue(struct spdk_vhost_scsi_dev *svdev, unsigned scsi_dev_num, uint32_t event, + uint32_t reason) +{ + struct spdk_vhost_virtqueue *vq; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_event *desc_ev; + uint32_t desc_table_size, req_size = 0; + uint16_t req; + int rc; + + assert(scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + vq = &svdev->vdev.virtqueue[VIRTIO_SCSI_EVENTQ]; + + if (spdk_vhost_vq_avail_ring_get(vq, &req, 1) != 1) { + SPDK_ERRLOG("Controller %s: Failed to send virtio event (no avail ring entries?).\n", + svdev->vdev.name); + return; + } + + rc = spdk_vhost_vq_get_desc(&svdev->vdev, vq, req, &desc, &desc_table, &desc_table_size); + if (rc != 0 || desc->len < sizeof(*desc_ev)) { + SPDK_ERRLOG("Controller %s: Invalid eventq descriptor at index %"PRIu16".\n", + svdev->vdev.name, req); + goto out; + } + + desc_ev = spdk_vhost_gpa_to_vva(&svdev->vdev, desc->addr, sizeof(*desc_ev)); + if (desc_ev == NULL) { + SPDK_ERRLOG("Controller %s: Eventq descriptor at index %"PRIu16" points to unmapped guest memory address %p.\n", + svdev->vdev.name, req, (void *)(uintptr_t)desc->addr); + goto out; + } + + desc_ev->event = event; + desc_ev->lun[0] = 1; + desc_ev->lun[1] = scsi_dev_num; + /* virtio LUN id 0 can refer either to the entire device + * or actual LUN 0 (the only supported by vhost for now) + */ + desc_ev->lun[2] = 0 >> 8; + desc_ev->lun[3] = 0 & 0xFF; + /* virtio doesn't specify any strict format for LUN id (bytes 2 and 3) + * current implementation relies on linux kernel sources + */ + memset(&desc_ev->lun[4], 
0, 4); + desc_ev->reason = reason; + req_size = sizeof(*desc_ev); + +out: + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, req, req_size); +} + +static void +submit_completion(struct spdk_vhost_scsi_task *task) +{ + spdk_vhost_vq_used_ring_enqueue(&task->svdev->vdev, task->vq, task->req_idx, + task->used_len); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Finished task (%p) req_idx=%d\n", task, task->req_idx); + + spdk_vhost_scsi_task_put(task); +} + +static void +spdk_vhost_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + submit_completion(task); +} + +static void +spdk_vhost_scsi_task_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + /* The SCSI task has completed. Do final processing and then post + notification to the virtqueue's "used" ring. + */ + task->resp->status = task->scsi.status; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + memcpy(task->resp->sense, task->scsi.sense_data, task->scsi.sense_data_len); + task->resp->sense_len = task->scsi.sense_data_len; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Task (%p) req_idx=%d failed - status=%u\n", task, task->req_idx, + task->scsi.status); + } + assert(task->scsi.transfer_len == task->scsi.length); + task->resp->resid = task->scsi.length - task->scsi.data_transferred; + + submit_completion(task); +} + +static void +task_submit(struct spdk_vhost_scsi_task *task) +{ + task->resp->response = VIRTIO_SCSI_S_OK; + spdk_scsi_dev_queue_task(task->scsi_dev, &task->scsi); +} + +static void +mgmt_task_submit(struct spdk_vhost_scsi_task *task, enum spdk_scsi_task_func func) +{ + task->tmf_resp->response = VIRTIO_SCSI_S_OK; + spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, &task->scsi, func); +} + +static void +invalid_request(struct spdk_vhost_scsi_task *task) +{ + spdk_vhost_vq_used_ring_enqueue(&task->svdev->vdev, task->vq, task->req_idx, + task->used_len); + spdk_vhost_scsi_task_put(task); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Invalid request (status=%" PRIu8")\n", + task->resp ? task->resp->response : -1); +} + +static int +spdk_vhost_scsi_task_init_target(struct spdk_vhost_scsi_task *task, const __u8 *lun) +{ + struct spdk_scsi_dev *dev; + uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF; + + SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN", lun, 8); + + /* First byte must be 1 and second is target */ + if (lun[0] != 1 || lun[1] >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + return -1; + } + + dev = task->svdev->scsi_dev[lun[1]]; + task->scsi_dev = dev; + if (dev == NULL || task->svdev->scsi_dev_state[lun[1]].removed) { + /* If dev has been hotdetached, return 0 to allow sending + * additional hotremove event via sense codes. + */ + return task->svdev->scsi_dev_state[lun[1]].removed ? 
0 : -1; + } + + task->scsi.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0); + task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_id); + return 0; +} + +static void +process_ctrl_request(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_dev *vdev = &task->svdev->vdev; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_ctrl_tmf_req *ctrl_req; + struct virtio_scsi_ctrl_an_resp *an_resp; + uint32_t desc_table_size, used_len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, spdk_vhost_scsi_task_mgmt_cpl, spdk_vhost_scsi_task_free_cb); + rc = spdk_vhost_vq_get_desc(vdev, task->vq, task->req_idx, &desc, &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("%s: Invalid controlq descriptor at index %d.\n", + vdev->name, task->req_idx); + goto out; + } + + ctrl_req = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*ctrl_req)); + if (ctrl_req == NULL) { + SPDK_ERRLOG("%s: Invalid task management request at index %d.\n", + vdev->name, task->req_idx); + goto out; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, + "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; kickfd %d; size %d\n", + task->req_idx, desc, (void *)desc->addr, desc->len, desc->flags, task->vq->vring.last_used_idx, + task->vq->vring.kickfd, task->vq->vring.size); + SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "Request descriptor", (uint8_t *)ctrl_req, + desc->len); + + spdk_vhost_scsi_task_init_target(task, ctrl_req->lun); + + spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (spdk_unlikely(desc == NULL)) { + SPDK_ERRLOG("%s: No response descriptor for controlq request %d.\n", + vdev->name, task->req_idx); + goto out; + } + + /* Process the TMF request */ + switch (ctrl_req->type) { + case VIRTIO_SCSI_T_TMF: + task->tmf_resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*task->tmf_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_tmf_resp) || task->tmf_resp == NULL)) { + SPDK_ERRLOG("%s: TMF response descriptor at index %d points to invalid guest memory region\n", + vdev->name, task->req_idx); + goto out; + } + + /* Check if we are processing a valid request */ + if (task->scsi_dev == NULL) { + task->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET; + break; + } + + switch (ctrl_req->subtype) { + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + /* Handle LUN reset */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN reset\n"); + + mgmt_task_submit(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + return; + default: + task->tmf_resp->response = VIRTIO_SCSI_S_ABORTED; + /* Unsupported command */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "Unsupported TMF command %x\n", ctrl_req->subtype); + break; + } + break; + case VIRTIO_SCSI_T_AN_QUERY: + case VIRTIO_SCSI_T_AN_SUBSCRIBE: { + an_resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*an_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_an_resp) || an_resp == NULL)) { + SPDK_WARNLOG("%s: Asynchronous response descriptor points to invalid guest memory region\n", + vdev->name); + goto out; + } + + an_resp->response = VIRTIO_SCSI_S_ABORTED; + break; + } + default: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "Unsupported control command %x\n", ctrl_req->type); + break; + } + + used_len = sizeof(struct virtio_scsi_ctrl_tmf_resp); +out: + spdk_vhost_vq_used_ring_enqueue(vdev, task->vq, task->req_idx, used_len); + spdk_vhost_scsi_task_put(task); +} + +/* + * Process task's descriptor chain and setup data related fields. 
+ * Return + * -1 if request is invalid and must be aborted, + * 0 if all data are set. + */ +static int +task_data_setup(struct spdk_vhost_scsi_task *task, + struct virtio_scsi_cmd_req **req) +{ + struct spdk_vhost_dev *vdev = &task->svdev->vdev; + struct vring_desc *desc, *desc_table; + struct iovec *iovs = task->iovs; + uint16_t iovcnt = 0; + uint32_t desc_table_len, len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, spdk_vhost_scsi_task_cpl, spdk_vhost_scsi_task_free_cb); + + rc = spdk_vhost_vq_get_desc(vdev, task->vq, task->req_idx, &desc, &desc_table, &desc_table_len); + /* First descriptor must be readable */ + if (spdk_unlikely(rc != 0 || spdk_vhost_vring_desc_is_wr(desc) || + desc->len < sizeof(struct virtio_scsi_cmd_req))) { + SPDK_WARNLOG("%s: invalid first (request) descriptor at index %"PRIu16".\n", + vdev->name, task->req_idx); + goto invalid_task; + } + + *req = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(**req)); + if (spdk_unlikely(*req == NULL)) { + SPDK_WARNLOG("%s: Request descriptor at index %d points to invalid guest memory region\n", + vdev->name, task->req_idx); + goto invalid_task; + } + + /* Each request must have at least 2 descriptors (e.g. request and response) */ + spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (desc == NULL) { + SPDK_WARNLOG("%s: Descriptor chain at index %d contains neither payload nor response buffer.\n", + vdev->name, task->req_idx); + goto invalid_task; + } + task->scsi.dxfer_dir = spdk_vhost_vring_desc_is_wr(desc) ? SPDK_SCSI_DIR_FROM_DEV : + SPDK_SCSI_DIR_TO_DEV; + task->scsi.iovs = iovs; + + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + /* + * FROM_DEV (READ): [RD_req][WR_resp][WR_buf0]...[WR_bufN] + */ + task->resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: Response descriptor at index %d points to invalid guest memory region\n", + vdev->name, task->req_idx); + goto invalid_task; + } + rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid descriptor chain at request index %d (descriptor id overflow?).\n", + vdev->name, task->req_idx); + goto invalid_task; + } + + if (desc == NULL) { + /* + * TEST UNIT READY command and some others might not contain any payload and this is not an error. + */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, + "No payload descriptors for FROM DEV command req_idx=%"PRIu16".\n", task->req_idx); + SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_DATA, "CDB=", (*req)->cdb, VIRTIO_SCSI_CDB_SIZE); + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + task->scsi.iovcnt = 1; + task->scsi.iovs[0].iov_len = 0; + task->scsi.length = 0; + task->scsi.transfer_len = 0; + return 0; + } + + /* All remaining descriptors are data. 
*/ + while (desc) { + if (spdk_unlikely(!spdk_vhost_vring_desc_is_wr(desc))) { + SPDK_WARNLOG("FROM DEV cmd: descriptor nr %" PRIu16" in payload chain is read only.\n", iovcnt); + goto invalid_task; + } + + if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid payload in descriptor chain starting at index %d.\n", + vdev->name, task->req_idx); + goto invalid_task; + } + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp) + len; + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "TO DEV"); + /* + * TO_DEV (WRITE):[RD_req][RD_buf0]...[RD_bufN][WR_resp] + * No need to check descriptor WR flag as this is done while setting scsi.dxfer_dir. + */ + + /* Process descriptors up to response. */ + while (!spdk_vhost_vring_desc_is_wr(desc)) { + if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(desc == NULL)) { + SPDK_WARNLOG("TO_DEV cmd: no response descriptor.\n"); + goto invalid_task; + } + } + + task->resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: Response descriptor at index %d points to invalid guest memory region\n", + vdev->name, task->req_idx); + goto invalid_task; + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + } + + task->scsi.iovcnt = iovcnt; + task->scsi.length = len; + task->scsi.transfer_len = len; + return 0; + +invalid_task: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "%s: Invalid task at index %"PRIu16".\n", + vdev->name, task->req_idx); + return -1; +} + +static int +process_request(struct spdk_vhost_scsi_task *task) +{ + struct virtio_scsi_cmd_req *req; + int result; + + result = task_data_setup(task, &req); + if (result) { + return result; + } + + result = spdk_vhost_scsi_task_init_target(task, req->lun); + if (spdk_unlikely(result != 0)) { + task->resp->response = VIRTIO_SCSI_S_BAD_TARGET; + return -1; + } + + task->scsi.cdb = req->cdb; + SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_DATA, "request CDB", req->cdb, VIRTIO_SCSI_CDB_SIZE); + + if (spdk_unlikely(task->scsi.lun == NULL)) { + spdk_scsi_task_process_null_lun(&task->scsi); + task->resp->response = VIRTIO_SCSI_S_OK; + return 1; + } + + return 0; +} + +static void +process_controlq(struct spdk_vhost_scsi_dev *svdev, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_scsi_task *task; + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + + reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + for (i = 0; i < reqs_cnt; i++) { + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: invalid entry in avail ring. Buffer '%"PRIu16"' exceeds virtqueue size (%"PRIu16")\n", + svdev->vdev.name, reqs[i], vq->vring.size); + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0); + continue; + } + + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[reqs[i]]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: invalid entry in avail ring. 
Buffer '%"PRIu16"' is still in use!\n", + svdev->vdev.name, reqs[i]); + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0); + continue; + } + + svdev->vdev.task_cnt++; + memset(&task->scsi, 0, sizeof(task->scsi)); + task->tmf_resp = NULL; + task->used = true; + process_ctrl_request(task); + } +} + +static void +process_requestq(struct spdk_vhost_scsi_dev *svdev, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_scsi_task *task; + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + int result; + + reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + assert(reqs_cnt <= 32); + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + svdev->vdev.name, reqs[i], vq->vring.size); + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0); + continue; + } + + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[reqs[i]]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + svdev->vdev.name, reqs[i]); + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0); + continue; + } + + svdev->vdev.task_cnt++; + memset(&task->scsi, 0, sizeof(task->scsi)); + task->resp = NULL; + task->used = true; + task->used_len = 0; + result = process_request(task); + if (likely(result == 0)) { + task_submit(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d submitted ======\n", task, + task->req_idx); + } else if (result > 0) { + spdk_vhost_scsi_task_cpl(&task->scsi); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d finished early ======\n", task, + task->req_idx); + } else { + invalid_request(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d failed ======\n", task, + task->req_idx); + } + } +} + +static int +vdev_mgmt_worker(void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + + process_removed_devs(svdev); + spdk_vhost_vq_used_signal(&svdev->vdev, &svdev->vdev.virtqueue[VIRTIO_SCSI_EVENTQ]); + + process_controlq(svdev, &svdev->vdev.virtqueue[VIRTIO_SCSI_CONTROLQ]); + spdk_vhost_vq_used_signal(&svdev->vdev, &svdev->vdev.virtqueue[VIRTIO_SCSI_CONTROLQ]); + + return -1; +} + +static int +vdev_worker(void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + uint32_t q_idx; + + for (q_idx = VIRTIO_SCSI_REQUESTQ; q_idx < svdev->vdev.max_queues; q_idx++) { + process_requestq(svdev, &svdev->vdev.virtqueue[q_idx]); + } + + spdk_vhost_dev_used_signal(&svdev->vdev); + + return -1; +} + +static struct spdk_vhost_scsi_dev * +to_scsi_dev(struct spdk_vhost_dev *ctrlr) +{ + if (ctrlr == NULL) { + return NULL; + } + + if (ctrlr->backend != &spdk_vhost_scsi_device_backend) { + SPDK_ERRLOG("%s: not a vhost-scsi device.\n", ctrlr->name); + return NULL; + } + + return SPDK_CONTAINEROF(ctrlr, struct spdk_vhost_scsi_dev, vdev); +} + +int +spdk_vhost_scsi_dev_construct(const char *name, const char *cpumask) +{ + struct spdk_vhost_scsi_dev *svdev = spdk_dma_zmalloc(sizeof(struct spdk_vhost_scsi_dev), + SPDK_CACHE_LINE_SIZE, NULL); + int rc; + + if (svdev == NULL) { + return -ENOMEM; + } + + spdk_vhost_lock(); + rc = spdk_vhost_dev_register(&svdev->vdev, name, cpumask, + &spdk_vhost_scsi_device_backend); + + if (rc) { + spdk_dma_free(svdev); + } + + spdk_vhost_unlock(); + return rc; +} + +static int +spdk_vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct 
spdk_vhost_scsi_dev *svdev = to_scsi_dev(vdev); + int rc, i; + + if (svdev == NULL) { + return -EINVAL; + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + if (svdev->scsi_dev[i]) { + if (vdev->registered) { + SPDK_ERRLOG("Trying to remove non-empty controller: %s.\n", vdev->name); + return -EBUSY; + } + + rc = spdk_vhost_scsi_dev_remove_tgt(vdev, i, NULL, NULL); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to force-remove target %d\n", vdev->name, i); + return rc; + } + } + } + + rc = spdk_vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + + spdk_dma_free(svdev); + return 0; +} + +struct spdk_scsi_dev * +spdk_vhost_scsi_dev_get_tgt(struct spdk_vhost_dev *vdev, uint8_t num) +{ + struct spdk_vhost_scsi_dev *svdev; + + assert(num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + svdev = to_scsi_dev(vdev); + + return svdev ? svdev->scsi_dev[num] : NULL; +} + +static void +spdk_vhost_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + const struct spdk_scsi_dev *scsi_dev; + unsigned scsi_dev_num; + + assert(lun != NULL); + assert(svdev != NULL); + if (svdev->vdev.lcore != -1 && + !spdk_vhost_dev_has_feature(&svdev->vdev, VIRTIO_SCSI_F_HOTPLUG)) { + SPDK_WARNLOG("%s: hotremove is not enabled for this controller.\n", svdev->vdev.name); + return; + } + + scsi_dev = spdk_scsi_lun_get_dev(lun); + for (scsi_dev_num = 0; scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_dev_num++) { + if (svdev->scsi_dev[scsi_dev_num] == scsi_dev) { + break; + } + } + + if (scsi_dev_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + /* The entire device has been already removed. */ + return; + } + + /* remove entire device */ + spdk_vhost_scsi_dev_remove_tgt(&svdev->vdev, scsi_dev_num, NULL, NULL); +} + +int +spdk_vhost_scsi_dev_add_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num, + const char *bdev_name) +{ + struct spdk_vhost_scsi_dev *svdev; + char target_name[SPDK_SCSI_DEV_MAX_NAME]; + int lun_id_list[1]; + const char *bdev_names_list[1]; + + svdev = to_scsi_dev(vdev); + if (svdev == NULL) { + return -EINVAL; + } + + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("Controller %d target number too big (max %d)\n", scsi_tgt_num, + SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + return -EINVAL; + } + + if (bdev_name == NULL) { + SPDK_ERRLOG("No lun name specified\n"); + return -EINVAL; + } + + if (svdev->scsi_dev[scsi_tgt_num] != NULL) { + SPDK_ERRLOG("Controller %s target %u already occupied\n", vdev->name, scsi_tgt_num); + return -EEXIST; + } + + /* + * At this stage only one LUN per target + */ + snprintf(target_name, sizeof(target_name), "Target %u", scsi_tgt_num); + lun_id_list[0] = 0; + bdev_names_list[0] = (char *)bdev_name; + + svdev->scsi_dev_state[scsi_tgt_num].removed = false; + svdev->scsi_dev[scsi_tgt_num] = spdk_scsi_dev_construct(target_name, bdev_names_list, lun_id_list, + 1, + SPDK_SPC_PROTOCOL_IDENTIFIER_SAS, spdk_vhost_scsi_lun_hotremove, svdev); + + if (svdev->scsi_dev[scsi_tgt_num] == NULL) { + SPDK_ERRLOG("Couldn't create spdk SCSI target '%s' using bdev '%s' in controller: %s\n", + target_name, bdev_name, vdev->name); + return -EINVAL; + } + spdk_scsi_dev_add_port(svdev->scsi_dev[scsi_tgt_num], 0, "vhost"); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: defined target '%s' using bdev '%s'\n", + vdev->name, target_name, bdev_name); + + if (vdev->lcore == -1) { + /* All done. 
*/ + return 0; + } + + spdk_scsi_dev_allocate_io_channels(svdev->scsi_dev[scsi_tgt_num]); + + if (spdk_vhost_dev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) { + eventq_enqueue(svdev, scsi_tgt_num, VIRTIO_SCSI_T_TRANSPORT_RESET, + VIRTIO_SCSI_EVT_RESET_RESCAN); + } else { + SPDK_NOTICELOG("Device %s does not support hotplug. " + "Please restart the driver or perform a rescan.\n", + vdev->name); + } + + return 0; +} + +int +spdk_vhost_scsi_dev_remove_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num, + spdk_vhost_event_fn cb_fn, void *cb_arg) +{ + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev *scsi_dev; + struct spdk_scsi_dev_vhost_state *scsi_dev_state; + int rc = 0; + + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: invalid target number %d\n", vdev->name, scsi_tgt_num); + return -EINVAL; + } + + svdev = to_scsi_dev(vdev); + if (svdev == NULL) { + return -ENODEV; + } + + scsi_dev = svdev->scsi_dev[scsi_tgt_num]; + if (scsi_dev == NULL) { + SPDK_ERRLOG("Controller %s target %u is not occupied\n", vdev->name, scsi_tgt_num); + return -ENODEV; + } + + if (svdev->vdev.lcore == -1) { + /* controller is not in use, remove dev and exit */ + svdev->scsi_dev[scsi_tgt_num] = NULL; + spdk_scsi_dev_destruct(scsi_dev); + if (cb_fn) { + rc = cb_fn(vdev, cb_arg); + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: removed target 'Target %u'\n", + vdev->name, scsi_tgt_num); + return rc; + } + + if (!spdk_vhost_dev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) { + SPDK_WARNLOG("%s: 'Target %u' is in use and hot-detach is not enabled for this controller.\n", + svdev->vdev.name, scsi_tgt_num); + return -ENOTSUP; + } + + scsi_dev_state = &svdev->scsi_dev_state[scsi_tgt_num]; + if (scsi_dev_state->removed) { + SPDK_WARNLOG("%s: 'Target %u' has been already marked to hotremove.\n", svdev->vdev.name, + scsi_tgt_num); + return -EBUSY; + } + + scsi_dev_state->remove_cb = cb_fn; + scsi_dev_state->remove_ctx = cb_arg; + scsi_dev_state->removed = true; + eventq_enqueue(svdev, scsi_tgt_num, VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: queued 'Target %u' for hot-detach.\n", vdev->name, scsi_tgt_num); + return 0; +} + +int +spdk_vhost_scsi_controller_construct(void) +{ + struct spdk_conf_section *sp = spdk_conf_first_section(NULL); + struct spdk_vhost_dev *vdev; + int i, dev_num; + unsigned ctrlr_num = 0; + char *bdev_name, *tgt_num_str; + char *cpumask; + char *name; + char *tgt = NULL; + + while (sp != NULL) { + if (!spdk_conf_section_match_prefix(sp, "VhostScsi")) { + sp = spdk_conf_next_section(sp); + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostScsi%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + + if (spdk_vhost_scsi_dev_construct(name, cpumask) < 0) { + return -1; + } + + vdev = spdk_vhost_dev_find(name); + assert(vdev); + + for (i = 0; ; i++) { + + tgt = spdk_conf_section_get_nval(sp, "Target", i); + if (tgt == NULL) { + break; + } + + tgt_num_str = spdk_conf_section_get_nmval(sp, "Target", i, 0); + if (tgt_num_str == NULL) { + SPDK_ERRLOG("%s: Invalid or missing target number\n", name); + return -1; + } + + dev_num = (int)strtol(tgt_num_str, NULL, 10); + bdev_name = spdk_conf_section_get_nmval(sp, "Target", i, 1); + if (bdev_name == NULL) { + SPDK_ERRLOG("%s: Invalid or missing bdev name for target %d\n", name, dev_num); + 
return -1; + } else if (spdk_conf_section_get_nmval(sp, "Target", i, 2)) { + SPDK_ERRLOG("%s: Only one LUN per vhost SCSI device supported\n", name); + return -1; + } + + if (spdk_vhost_scsi_dev_add_tgt(vdev, dev_num, bdev_name) < 0) { + return -1; + } + } + + sp = spdk_conf_next_section(sp); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_scsi_dev *svdev) +{ + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < svdev->vdev.max_queues; i++) { + vq = &svdev->vdev.virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_dma_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_scsi_dev *svdev) +{ + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_scsi_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i < svdev->vdev.max_queues; i++) { + vq = &svdev->vdev.virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; + if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", + svdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(svdev); + return -1; + } + vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_scsi_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL); + if (vq->tasks == NULL) { + SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + svdev->vdev.name, task_cnt, i); + free_task_pool(svdev); + return -1; + } + + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[j]; + task->svdev = svdev; + task->vq = vq; + task->req_idx = j; + } + } + + return 0; +} + +/* + * A new device is added to a data core. First the device is added to the main linked list + * and then allocated to a specific data core. 
+ */ +static int +spdk_vhost_scsi_start(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_scsi_dev *svdev; + uint32_t i; + int rc; + + svdev = to_scsi_dev(vdev); + if (svdev == NULL) { + SPDK_ERRLOG("Trying to start non-scsi controller as a scsi one.\n"); + rc = -1; + goto out; + } + + /* validate all I/O queues are in a contiguous index range */ + for (i = VIRTIO_SCSI_REQUESTQ; i < vdev->max_queues; i++) { + if (vdev->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(svdev); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", vdev->name); + goto out; + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + if (svdev->scsi_dev[i] == NULL) { + continue; + } + spdk_scsi_dev_allocate_io_channels(svdev->scsi_dev[i]); + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n", + vdev->name, vdev->lcore); + + svdev->requestq_poller = spdk_poller_register(vdev_worker, svdev, 0); + if (vdev->virtqueue[VIRTIO_SCSI_CONTROLQ].vring.desc && + vdev->virtqueue[VIRTIO_SCSI_EVENTQ].vring.desc) { + svdev->mgmt_poller = spdk_poller_register(vdev_mgmt_worker, svdev, + MGMT_POLL_PERIOD_US); + } +out: + spdk_vhost_dev_backend_event_done(event_ctx, rc); + return rc; +} + +static int +destroy_device_poller_cb(void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + uint32_t i; + + if (svdev->vdev.task_cnt > 0) { + return -1; + } + + + for (i = 0; i < svdev->vdev.max_queues; i++) { + spdk_vhost_vq_used_signal(&svdev->vdev, &svdev->vdev.virtqueue[i]); + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + if (svdev->scsi_dev[i] == NULL) { + continue; + } + spdk_scsi_dev_free_io_channels(svdev->scsi_dev[i]); + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", svdev->vdev.name); + + free_task_pool(svdev); + + spdk_poller_unregister(&svdev->destroy_ctx.poller); + spdk_vhost_dev_backend_event_done(svdev->destroy_ctx.event_ctx, 0); + + return -1; +} + +static int +spdk_vhost_scsi_stop(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_scsi_dev *svdev; + + svdev = to_scsi_dev(vdev); + if (svdev == NULL) { + SPDK_ERRLOG("Trying to stop non-scsi controller as a scsi one.\n"); + goto err; + } + + svdev->destroy_ctx.event_ctx = event_ctx; + spdk_poller_unregister(&svdev->requestq_poller); + spdk_poller_unregister(&svdev->mgmt_poller); + svdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb, svdev, + 1000); + + return 0; + +err: + spdk_vhost_dev_backend_event_done(event_ctx, -1); + return -1; +} + +static void +spdk_vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_scsi_dev *sdev; + struct spdk_scsi_lun *lun; + uint32_t dev_idx; + uint32_t lun_idx; + + assert(vdev != NULL); + spdk_json_write_name(w, "scsi"); + spdk_json_write_array_begin(w); + for (dev_idx = 0; dev_idx < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; dev_idx++) { + sdev = spdk_vhost_scsi_dev_get_tgt(vdev, dev_idx); + if (!sdev) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "scsi_dev_num"); + spdk_json_write_uint32(w, dev_idx); + + spdk_json_write_name(w, "id"); + spdk_json_write_int32(w, spdk_scsi_dev_get_id(sdev)); + + spdk_json_write_name(w, "target_name"); + spdk_json_write_string(w, spdk_scsi_dev_get_name(sdev)); + + spdk_json_write_name(w, "luns"); + spdk_json_write_array_begin(w); + + for (lun_idx = 0; lun_idx < 
SPDK_SCSI_DEV_MAX_LUN; lun_idx++) {
+		lun = spdk_scsi_dev_get_lun(sdev, lun_idx);
+		if (!lun) {
+			continue;
+		}
+
+		spdk_json_write_object_begin(w);
+
+		spdk_json_write_name(w, "id");
+		spdk_json_write_int32(w, spdk_scsi_lun_get_id(lun));
+
+		spdk_json_write_name(w, "bdev_name");
+		spdk_json_write_string(w, spdk_scsi_lun_get_bdev_name(lun));
+
+		spdk_json_write_object_end(w);
+		}
+
+		spdk_json_write_array_end(w);
+		spdk_json_write_object_end(w);
+	}
+
+	spdk_json_write_array_end(w);
+}
+
+static void
+spdk_vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	struct spdk_vhost_scsi_dev *svdev;
+	struct spdk_scsi_lun *lun;
+	uint32_t i;
+
+	svdev = to_scsi_dev(vdev);
+	if (!svdev) {
+		return;
+	}
+
+	spdk_json_write_object_begin(w);
+	spdk_json_write_named_string(w, "method", "construct_vhost_scsi_controller");
+
+	spdk_json_write_named_object_begin(w, "params");
+	spdk_json_write_named_string(w, "ctrlr", vdev->name);
+	spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask));
+	spdk_json_write_object_end(w);
+
+	spdk_json_write_object_end(w);
+
+	for (i = 0; i < SPDK_COUNTOF(svdev->scsi_dev); i++) {
+		if (svdev->scsi_dev[i] == NULL || svdev->scsi_dev_state[i].removed) {
+			continue;
+		}
+
+		lun = spdk_scsi_dev_get_lun(svdev->scsi_dev[i], 0);
+
+		spdk_json_write_object_begin(w);
+		spdk_json_write_named_string(w, "method", "add_vhost_scsi_lun");
+
+		spdk_json_write_named_object_begin(w, "params");
+		spdk_json_write_named_string(w, "ctrlr", vdev->name);
+		spdk_json_write_named_uint32(w, "scsi_target_num", i);
+
+		spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun));
+		spdk_json_write_object_end(w);
+
+		spdk_json_write_object_end(w);
+	}
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi", SPDK_LOG_VHOST_SCSI)
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_queue", SPDK_LOG_VHOST_SCSI_QUEUE)
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_data", SPDK_LOG_VHOST_SCSI_DATA)
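
For readers following the LUN handling in the vhost_scsi.c hunk above: eventq_enqueue() and spdk_vhost_scsi_task_init_target() agree on a simple 8-byte virtio-scsi LUN layout (byte 0 is always 1, byte 1 is the vhost-scsi target number, bytes 2-3 carry a 14-bit LUN id, bytes 4-7 are zero). The standalone sketch below is not part of the patch; it has no SPDK dependencies and the helper names are hypothetical. It only illustrates that packing and unpacking under those assumptions.

/*
 * Illustrative sketch only (not part of the patch above): a minimal,
 * SPDK-free program showing the 8-byte virtio-scsi LUN layout that
 * eventq_enqueue() and spdk_vhost_scsi_task_init_target() agree on.
 * The helper names below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pack a target number and a 14-bit LUN id the way eventq_enqueue() does:
 * byte 0 is always 1, byte 1 is the vhost-scsi target, bytes 2-3 hold the
 * LUN id, bytes 4-7 are zeroed. */
static void
pack_virtio_scsi_lun(uint8_t lun[8], uint8_t target, uint16_t lun_id)
{
	memset(lun, 0, 8);
	lun[0] = 1;
	lun[1] = target;               /* the patch requires this to be < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS */
	lun[2] = (lun_id >> 8) & 0x3F; /* upper bits of the 14-bit LUN id */
	lun[3] = lun_id & 0xFF;        /* lower byte of the LUN id */
}

/* Mirror of the decode in spdk_vhost_scsi_task_init_target():
 * (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF */
static uint16_t
unpack_virtio_scsi_lun_id(const uint8_t lun[8])
{
	return (uint16_t)((((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF);
}

int
main(void)
{
	uint8_t lun[8];

	/* Target 2, LUN 0 - the only LUN vhost-scsi serves per target today. */
	pack_virtio_scsi_lun(lun, 2, 0);
	printf("valid=%d target=%u lun_id=%u\n",
	       lun[0] == 1, (unsigned)lun[1], (unsigned)unpack_virtio_scsi_lun_id(lun));
	return 0;
}

Because only LUN 0 is served per target for now, the patch writes the id bytes as the literal expressions 0 >> 8 and 0 & 0xFF, and the 0x3FFF mask in the decoder simply restricts the value to the 14-bit id range described in the eventq comment.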