Diffstat (limited to 'src/spdk/lib/virtio')
-rw-r--r-- | src/spdk/lib/virtio/Makefile | 46
-rw-r--r-- | src/spdk/lib/virtio/spdk_virtio.map | 33
-rw-r--r-- | src/spdk/lib/virtio/vhost_user.c | 489
-rw-r--r-- | src/spdk/lib/virtio/vhost_user.h | 69
-rw-r--r-- | src/spdk/lib/virtio/virtio.c | 717
-rw-r--r-- | src/spdk/lib/virtio/virtio_pci.c | 599
-rw-r--r-- | src/spdk/lib/virtio/virtio_user.c | 628
7 files changed, 2581 insertions, 0 deletions
diff --git a/src/spdk/lib/virtio/Makefile b/src/spdk/lib/virtio/Makefile new file mode 100644 index 000000000..8ea173c3b --- /dev/null +++ b/src/spdk/lib/virtio/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += $(ENV_CFLAGS) +C_SRCS = virtio.c virtio_user.c virtio_pci.c vhost_user.c +LIBNAME = virtio + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_virtio.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/virtio/spdk_virtio.map b/src/spdk/lib/virtio/spdk_virtio.map new file mode 100644 index 000000000..76e02cff8 --- /dev/null +++ b/src/spdk/lib/virtio/spdk_virtio.map @@ -0,0 +1,33 @@ +{ + global: + + # internal functions in spdk_internal/virtio.h + virtio_recv_pkts; + virtqueue_req_start; + virtqueue_req_flush; + virtqueue_req_abort; + virtqueue_req_add_iovs; + virtio_dev_construct; + virtio_dev_reset; + virtio_dev_start; + virtio_dev_stop; + virtio_dev_destruct; + virtio_dev_acquire_queue; + virtio_dev_find_and_acquire_queue; + virtio_dev_queue_get_thread; + virtio_dev_queue_is_acquired; + virtio_dev_release_queue; + virtio_dev_get_status; + virtio_dev_set_status; + virtio_dev_write_dev_config; + virtio_dev_read_dev_config; + virtio_dev_backend_ops; + virtio_dev_has_feature; + virtio_dev_dump_json_info; + virtio_pci_dev_enumerate; + virtio_pci_dev_attach; + virtio_user_dev_init; + virtio_pci_dev_init; + + local: *; +}; diff --git a/src/spdk/lib/virtio/vhost_user.c b/src/spdk/lib/virtio/vhost_user.c new file mode 100644 index 000000000..b3da9d988 --- /dev/null +++ b/src/spdk/lib/virtio/vhost_user.c @@ -0,0 +1,489 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "vhost_user.h" + +#include "spdk/string.h" +#include "spdk_internal/vhost_user.h" + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + +static int +vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num) +{ + int r; + struct msghdr msgh; + struct iovec iov; + size_t fd_size = fd_num * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + struct cmsghdr *cmsg; + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = (uint8_t *)buf; + iov.iov_len = len; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fd_size); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + r = sendmsg(fd, &msgh, 0); + } while (r < 0 && errno == EINTR); + + if (r == -1) { + return -errno; + } + + return 0; +} + +static int +vhost_user_read(int fd, struct vhost_user_msg *msg) +{ + uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; + ssize_t ret; + size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload; + + ret = recv(fd, (void *)msg, sz_hdr, 0); + if ((size_t)ret != sz_hdr) { + SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n", + ret, sz_hdr); + if (ret == -1) { + return -errno; + } else { + return -EBUSY; + } + } + + /* validate msg flags */ + if (msg->flags != (valid_flags)) { + SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n", + msg->flags, valid_flags); + return -EIO; + } + + sz_payload = msg->size; + + if (sz_payload > VHOST_USER_PAYLOAD_SIZE) { + SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n", + sz_payload, VHOST_USER_PAYLOAD_SIZE); + return -EIO; + } + + if (sz_payload) { + ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0); + if 
((size_t)ret != sz_payload) { + SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n", + ret, msg->size); + if (ret == -1) { + return -errno; + } else { + return -EBUSY; + } + } + } + + return 0; +} + +struct hugepage_file_info { + uint64_t addr; /**< virtual addr */ + size_t size; /**< the file size */ + char path[PATH_MAX]; /**< path to backing file */ +}; + +/* Two possible options: + * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file + * array. This is simple but cannot be used in secondary process because + * secondary process will close and munmap that file. + * 2. Match HUGEFILE_FMT to find hugepage files directly. + * + * We choose option 2. + */ +static int +get_hugepage_file_info(struct hugepage_file_info huges[], int max) +{ + int idx, rc; + FILE *f; + char buf[BUFSIZ], *tmp, *tail; + char *str_underline, *str_start; + int huge_index; + uint64_t v_start, v_end; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + SPDK_ERRLOG("cannot open /proc/self/maps\n"); + rc = -errno; + assert(rc < 0); /* scan-build hack */ + return rc; + } + + idx = 0; + while (fgets(buf, sizeof(buf), f) != NULL) { + if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) { + SPDK_ERRLOG("Failed to parse address\n"); + rc = -EIO; + goto out; + } + + tmp = strchr(buf, ' ') + 1; /** skip address */ + tmp = strchr(tmp, ' ') + 1; /** skip perm */ + tmp = strchr(tmp, ' ') + 1; /** skip offset */ + tmp = strchr(tmp, ' ') + 1; /** skip dev */ + tmp = strchr(tmp, ' ') + 1; /** skip inode */ + while (*tmp == ' ') { /** skip spaces */ + tmp++; + } + tail = strrchr(tmp, '\n'); /** remove newline if exists */ + if (tail) { + *tail = '\0'; + } + + /* Match HUGEFILE_FMT, aka "%s/%smap_%d", + * which is defined in eal_filesystem.h + */ + str_underline = strrchr(tmp, '_'); + if (!str_underline) { + continue; + } + + str_start = str_underline - strlen("map"); + if (str_start < tmp) { + continue; + } + + if (sscanf(str_start, "map_%d", &huge_index) != 1) { + continue; + } + + if (idx >= max) { + SPDK_ERRLOG("Exceed maximum of %d\n", max); + rc = -ENOSPC; + goto out; + } + + if (idx > 0 && + strncmp(tmp, huges[idx - 1].path, PATH_MAX) == 0 && + v_start == huges[idx - 1].addr + huges[idx - 1].size) { + huges[idx - 1].size += (v_end - v_start); + continue; + } + + huges[idx].addr = v_start; + huges[idx].size = v_end - v_start; + snprintf(huges[idx].path, PATH_MAX, "%s", tmp); + idx++; + } + + rc = idx; +out: + fclose(f); + return rc; +} + +static int +prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[]) +{ + int i, num; + struct hugepage_file_info huges[VHOST_USER_MEMORY_MAX_NREGIONS]; + + num = get_hugepage_file_info(huges, VHOST_USER_MEMORY_MAX_NREGIONS); + if (num < 0) { + SPDK_ERRLOG("Failed to prepare memory for vhost-user\n"); + return num; + } + + for (i = 0; i < num; ++i) { + /* the memory regions are unaligned */ + msg->payload.memory.regions[i].guest_phys_addr = huges[i].addr; /* use vaddr! 
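 * In virtio-user the "guest" is this process itself, so a region's
 * guest-physical address is simply its virtual address. The vring
 * descriptors are likewise filled with virtual addresses (see
 * virtqueue_req_add_iovs(), which uses iov_base directly when the
 * device is not hardware), letting the backend translate both through
 * this one table.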
*/ + msg->payload.memory.regions[i].userspace_addr = huges[i].addr; + msg->payload.memory.regions[i].memory_size = huges[i].size; + msg->payload.memory.regions[i].flags_padding = 0; + fds[i] = open(huges[i].path, O_RDWR); + } + + msg->payload.memory.nregions = num; + msg->payload.memory.padding = 0; + + return 0; +} + +static const char *const vhost_msg_strings[VHOST_USER_MAX] = { + [VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER", + [VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES", + [VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES", + [VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM", + [VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG", + [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG", +}; + +static int +vhost_user_sock(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg) +{ + struct vhost_user_msg msg; + struct vhost_vring_file *file = 0; + int need_reply = 0; + int fds[VHOST_USER_MEMORY_MAX_NREGIONS]; + int fd_num = 0; + int i, len, rc; + int vhostfd = dev->vhostfd; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_USER, "sent message %d = %s\n", req, vhost_msg_strings[req]); + + msg.request = req; + msg.flags = VHOST_USER_VERSION; + msg.size = 0; + + switch (req) { + case VHOST_USER_GET_FEATURES: + case VHOST_USER_GET_PROTOCOL_FEATURES: + case VHOST_USER_GET_QUEUE_NUM: + need_reply = 1; + break; + + case VHOST_USER_SET_FEATURES: + case VHOST_USER_SET_LOG_BASE: + case VHOST_USER_SET_PROTOCOL_FEATURES: + msg.payload.u64 = *((__u64 *)arg); + msg.size = sizeof(msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + case VHOST_USER_RESET_OWNER: + break; + + case VHOST_USER_SET_MEM_TABLE: + rc = prepare_vhost_memory_user(&msg, fds); + if (rc < 0) { + return rc; + } + fd_num = msg.payload.memory.nregions; + msg.size = sizeof(msg.payload.memory.nregions); + msg.size += sizeof(msg.payload.memory.padding); + msg.size += fd_num * sizeof(struct vhost_memory_region); + break; + + case VHOST_USER_SET_LOG_FD: + fds[fd_num++] = *((int *)arg); + break; + + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(msg.payload.state); + break; + + case VHOST_USER_GET_VRING_BASE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(msg.payload.state); + need_reply = 1; + break; + + case VHOST_USER_SET_VRING_ADDR: + memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr)); + msg.size = sizeof(msg.payload.addr); + break; + + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + file = arg; + msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK; + msg.size = sizeof(msg.payload.u64); + if (file->fd > 0) { + fds[fd_num++] = file->fd; + } else { + msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; + } + break; + + case VHOST_USER_GET_CONFIG: + 
memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg)); + msg.size = sizeof(msg.payload.cfg); + need_reply = 1; + break; + + case VHOST_USER_SET_CONFIG: + memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg)); + msg.size = sizeof(msg.payload.cfg); + break; + + default: + SPDK_ERRLOG("trying to send unknown msg\n"); + return -EINVAL; + } + + len = VHOST_USER_HDR_SIZE + msg.size; + rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num); + if (rc < 0) { + SPDK_ERRLOG("%s failed: %s\n", + vhost_msg_strings[req], spdk_strerror(-rc)); + return rc; + } + + if (req == VHOST_USER_SET_MEM_TABLE) + for (i = 0; i < fd_num; ++i) { + close(fds[i]); + } + + if (need_reply) { + rc = vhost_user_read(vhostfd, &msg); + if (rc < 0) { + SPDK_WARNLOG("Received msg failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + if (req != msg.request) { + SPDK_WARNLOG("Received unexpected msg type\n"); + return -EIO; + } + + switch (req) { + case VHOST_USER_GET_FEATURES: + case VHOST_USER_GET_PROTOCOL_FEATURES: + case VHOST_USER_GET_QUEUE_NUM: + if (msg.size != sizeof(msg.payload.u64)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + *((__u64 *)arg) = msg.payload.u64; + break; + case VHOST_USER_GET_VRING_BASE: + if (msg.size != sizeof(msg.payload.state)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + memcpy(arg, &msg.payload.state, + sizeof(struct vhost_vring_state)); + break; + case VHOST_USER_GET_CONFIG: + if (msg.size != sizeof(msg.payload.cfg)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg)); + break; + default: + SPDK_WARNLOG("Received unexpected msg type\n"); + return -EBADMSG; + } + } + + return 0; +} + +/** + * Set up environment to talk with a vhost user backend. + * + * @return + * - (-1) if fail; + * - (0) if succeed. + */ +static int +vhost_user_setup(struct virtio_user_dev *dev) +{ + int fd; + int flag; + struct sockaddr_un un; + ssize_t rc; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno)); + return -errno; + } + + flag = fcntl(fd, F_GETFD); + if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) { + SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno)); + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path); + if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) { + SPDK_ERRLOG("socket path too long\n"); + close(fd); + if (rc < 0) { + return -errno; + } else { + return -EINVAL; + } + } + if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno)); + close(fd); + return -errno; + } + + dev->vhostfd = fd; + return 0; +} + +struct virtio_user_backend_ops ops_user = { + .setup = vhost_user_setup, + .send_request = vhost_user_sock, +}; + +SPDK_LOG_REGISTER_COMPONENT("virtio_user", SPDK_LOG_VIRTIO_USER) diff --git a/src/spdk/lib/virtio/vhost_user.h b/src/spdk/lib/virtio/vhost_user.h new file mode 100644 index 000000000..0caf51ebc --- /dev/null +++ b/src/spdk/lib/virtio/vhost_user.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_H +#define _VHOST_H + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/virtio.h" +#include "spdk_internal/vhost_user.h" + +struct virtio_user_backend_ops; + +struct virtio_user_dev { + int vhostfd; + + int callfds[SPDK_VIRTIO_MAX_VIRTQUEUES]; + int kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES]; + uint32_t queue_size; + + uint8_t status; + char path[PATH_MAX]; + uint64_t protocol_features; + struct vring vrings[SPDK_VIRTIO_MAX_VIRTQUEUES]; + struct virtio_user_backend_ops *ops; + struct spdk_mem_map *mem_map; +}; + +struct virtio_user_backend_ops { + int (*setup)(struct virtio_user_dev *dev); + int (*send_request)(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg); +}; + +extern struct virtio_user_backend_ops ops_user; + +#endif diff --git a/src/spdk/lib/virtio/virtio.c b/src/spdk/lib/virtio/virtio.c new file mode 100644 index 000000000..03866040a --- /dev/null +++ b/src/spdk/lib/virtio/virtio.c @@ -0,0 +1,717 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk/barrier.h" + +#include "spdk_internal/virtio.h" + +/* We use SMP memory barrier variants as all virtio_pci devices + * are purely virtual. All MMIO is executed on a CPU core, so + * there's no need to do full MMIO synchronization. + */ +#define virtio_mb() spdk_smp_mb() +#define virtio_rmb() spdk_smp_rmb() +#define virtio_wmb() spdk_smp_wmb() + +/* Chain all the descriptors in the ring with an END */ +static inline void +vring_desc_init(struct vring_desc *dp, uint16_t n) +{ + uint16_t i; + + for (i = 0; i < n - 1; i++) { + dp[i].next = (uint16_t)(i + 1); + } + dp[i].next = VQ_RING_DESC_CHAIN_END; +} + +static void +virtio_init_vring(struct virtqueue *vq) +{ + int size = vq->vq_nentries; + struct vring *vr = &vq->vq_ring; + uint8_t *ring_mem = vq->vq_ring_virt_mem; + + /* + * Reinitialise since virtio port might have been stopped and restarted + */ + memset(ring_mem, 0, vq->vq_ring_size); + vring_init(vr, size, ring_mem, VIRTIO_PCI_VRING_ALIGN); + vq->vq_used_cons_idx = 0; + vq->vq_desc_head_idx = 0; + vq->vq_avail_idx = 0; + vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1); + vq->vq_free_cnt = vq->vq_nentries; + vq->req_start = VQ_RING_DESC_CHAIN_END; + vq->req_end = VQ_RING_DESC_CHAIN_END; + vq->reqs_finished = 0; + memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries); + + vring_desc_init(vr->desc, size); + + /* Tell the backend not to interrupt us. + * If F_EVENT_IDX is negotiated, we will always set incredibly high + * used event idx, so that we will practically never receive an + * interrupt. 
See virtqueue_req_flush() + */ + if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + vring_used_event(&vq->vq_ring) = UINT16_MAX; + } else { + vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + } +} + +static int +virtio_init_queue(struct virtio_dev *dev, uint16_t vtpci_queue_idx) +{ + unsigned int vq_size, size; + struct virtqueue *vq; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "setting up queue: %"PRIu16"\n", vtpci_queue_idx); + + /* + * Read the virtqueue size from the Queue Size field + * Always power of 2 and if 0 virtqueue does not exist + */ + vq_size = virtio_dev_backend_ops(dev)->get_queue_size(dev, vtpci_queue_idx); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq_size: %u\n", vq_size); + if (vq_size == 0) { + SPDK_ERRLOG("virtqueue %"PRIu16" does not exist\n", vtpci_queue_idx); + return -EINVAL; + } + + if (!spdk_u32_is_pow2(vq_size)) { + SPDK_ERRLOG("virtqueue %"PRIu16" size (%u) is not powerof 2\n", + vtpci_queue_idx, vq_size); + return -EINVAL; + } + + size = sizeof(*vq) + vq_size * sizeof(struct vq_desc_extra); + + if (posix_memalign((void **)&vq, SPDK_CACHE_LINE_SIZE, size)) { + SPDK_ERRLOG("can not allocate vq\n"); + return -ENOMEM; + } + memset(vq, 0, size); + dev->vqs[vtpci_queue_idx] = vq; + + vq->vdev = dev; + vq->vq_queue_index = vtpci_queue_idx; + vq->vq_nentries = vq_size; + + /* + * Reserve a memzone for vring elements + */ + size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN); + vq->vq_ring_size = SPDK_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vring_size: %u, rounded_vring_size: %u\n", + size, vq->vq_ring_size); + + vq->owner_thread = NULL; + + rc = virtio_dev_backend_ops(dev)->setup_queue(dev, vq); + if (rc < 0) { + SPDK_ERRLOG("setup_queue failed\n"); + free(vq); + dev->vqs[vtpci_queue_idx] = NULL; + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_mem: 0x%" PRIx64 "\n", + vq->vq_ring_mem); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_virt_mem: 0x%" PRIx64 "\n", + (uint64_t)(uintptr_t)vq->vq_ring_virt_mem); + + virtio_init_vring(vq); + return 0; +} + +static void +virtio_free_queues(struct virtio_dev *dev) +{ + uint16_t nr_vq = dev->max_queues; + struct virtqueue *vq; + uint16_t i; + + if (dev->vqs == NULL) { + return; + } + + for (i = 0; i < nr_vq; i++) { + vq = dev->vqs[i]; + if (!vq) { + continue; + } + + virtio_dev_backend_ops(dev)->del_queue(dev, vq); + + free(vq); + dev->vqs[i] = NULL; + } + + free(dev->vqs); + dev->vqs = NULL; +} + +static int +virtio_alloc_queues(struct virtio_dev *dev, uint16_t request_vq_num, uint16_t fixed_vq_num) +{ + uint16_t nr_vq; + uint16_t i; + int ret; + + nr_vq = request_vq_num + fixed_vq_num; + if (nr_vq == 0) { + /* perfectly fine to have a device with no virtqueues. */ + return 0; + } + + assert(dev->vqs == NULL); + dev->vqs = calloc(1, sizeof(struct virtqueue *) * nr_vq); + if (!dev->vqs) { + SPDK_ERRLOG("failed to allocate %"PRIu16" vqs\n", nr_vq); + return -ENOMEM; + } + + for (i = 0; i < nr_vq; i++) { + ret = virtio_init_queue(dev, i); + if (ret < 0) { + virtio_free_queues(dev); + return ret; + } + } + + dev->max_queues = nr_vq; + dev->fixed_queues_num = fixed_vq_num; + return 0; +} + +/** + * Negotiate virtio features. For virtio_user this will also set + * dev->modern flag if VIRTIO_F_VERSION_1 flag is negotiated. 
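 * Feature negotiation is one step of the virtio 1.0 initialization
 * sequence implemented by virtio_dev_reset() and virtio_dev_start()
 * below: set ACKNOWLEDGE, then DRIVER, negotiate features and verify
 * that FEATURES_OK sticks, set up the virtqueues, and finally set
 * DRIVER_OK.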
+ */ +static int +virtio_negotiate_features(struct virtio_dev *dev, uint64_t req_features) +{ + uint64_t host_features = virtio_dev_backend_ops(dev)->get_features(dev); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "guest features = %" PRIx64 "\n", req_features); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "device features = %" PRIx64 "\n", host_features); + + rc = virtio_dev_backend_ops(dev)->set_features(dev, req_features & host_features); + if (rc != 0) { + SPDK_ERRLOG("failed to negotiate device features.\n"); + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "negotiated features = %" PRIx64 "\n", + dev->negotiated_features); + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_FEATURES_OK)) { + SPDK_ERRLOG("failed to set FEATURES_OK status!\n"); + /* either the device failed, or we offered some features that + * depend on other, not offered features. + */ + return -EINVAL; + } + + return 0; +} + +int +virtio_dev_construct(struct virtio_dev *vdev, const char *name, + const struct virtio_dev_ops *ops, void *ctx) +{ + int rc; + + vdev->name = strdup(name); + if (vdev->name == NULL) { + return -ENOMEM; + } + + rc = pthread_mutex_init(&vdev->mutex, NULL); + if (rc != 0) { + free(vdev->name); + return -rc; + } + + vdev->backend_ops = ops; + vdev->ctx = ctx; + + return 0; +} + +int +virtio_dev_reset(struct virtio_dev *dev, uint64_t req_features) +{ + req_features |= (1ULL << VIRTIO_F_VERSION_1); + + virtio_dev_stop(dev); + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_ACKNOWLEDGE)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_ACKNOWLEDGE status.\n"); + return -EIO; + } + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_DRIVER); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_DRIVER)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER status.\n"); + return -EIO; + } + + return virtio_negotiate_features(dev, req_features); +} + +int +virtio_dev_start(struct virtio_dev *vdev, uint16_t max_queues, uint16_t fixed_queue_num) +{ + int ret; + + ret = virtio_alloc_queues(vdev, max_queues, fixed_queue_num); + if (ret < 0) { + return ret; + } + + virtio_dev_set_status(vdev, VIRTIO_CONFIG_S_DRIVER_OK); + if (!(virtio_dev_get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER_OK status.\n"); + return -1; + } + + return 0; +} + +void +virtio_dev_destruct(struct virtio_dev *dev) +{ + virtio_dev_backend_ops(dev)->destruct_dev(dev); + pthread_mutex_destroy(&dev->mutex); + free(dev->name); +} + +static void +vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx) +{ + struct vring_desc *dp, *dp_tail; + struct vq_desc_extra *dxp; + uint16_t desc_idx_last = desc_idx; + + dp = &vq->vq_ring.desc[desc_idx]; + dxp = &vq->vq_descx[desc_idx]; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs); + if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) { + while (dp->flags & VRING_DESC_F_NEXT) { + desc_idx_last = dp->next; + dp = &vq->vq_ring.desc[dp->next]; + } + } + dxp->ndescs = 0; + + /* + * We must append the existing free chain, if any, to the end of + * newly freed chain. If the virtqueue was completely used, then + * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above). 
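 * In other words: if the free list is empty (tail is
 * VQ_RING_DESC_CHAIN_END), the freed chain becomes the new head;
 * otherwise it is linked in after the current tail. Either way the
 * tail ends up pointing at the last descriptor of the freed chain.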
+ */ + if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) { + vq->vq_desc_head_idx = desc_idx; + } else { + dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx]; + dp_tail->next = desc_idx; + } + + vq->vq_desc_tail_idx = desc_idx_last; + dp->next = VQ_RING_DESC_CHAIN_END; +} + +static uint16_t +virtqueue_dequeue_burst_rx(struct virtqueue *vq, void **rx_pkts, + uint32_t *len, uint16_t num) +{ + struct vring_used_elem *uep; + void *cookie; + uint16_t used_idx, desc_idx; + uint16_t i; + + /* Caller does the check */ + for (i = 0; i < num ; i++) { + used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1)); + uep = &vq->vq_ring.used->ring[used_idx]; + desc_idx = (uint16_t) uep->id; + len[i] = uep->len; + cookie = vq->vq_descx[desc_idx].cookie; + + if (spdk_unlikely(cookie == NULL)) { + SPDK_WARNLOG("vring descriptor with no mbuf cookie at %"PRIu16"\n", + vq->vq_used_cons_idx); + break; + } + + __builtin_prefetch(cookie); + + rx_pkts[i] = cookie; + vq->vq_used_cons_idx++; + vq_ring_free_chain(vq, desc_idx); + vq->vq_descx[desc_idx].cookie = NULL; + } + + return i; +} + +static void +finish_req(struct virtqueue *vq) +{ + struct vring_desc *desc; + uint16_t avail_idx; + + desc = &vq->vq_ring.desc[vq->req_end]; + desc->flags &= ~VRING_DESC_F_NEXT; + + /* + * Place the head of the descriptor chain into the next slot and make + * it usable to the host. The chain is made available now rather than + * deferring to virtqueue_req_flush() in the hopes that if the host is + * currently running on another CPU, we can keep it processing the new + * descriptor. + */ + avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1)); + vq->vq_ring.avail->ring[avail_idx] = vq->req_start; + vq->vq_avail_idx++; + vq->req_end = VQ_RING_DESC_CHAIN_END; + virtio_wmb(); + vq->vq_ring.avail->idx = vq->vq_avail_idx; + vq->reqs_finished++; +} + +int +virtqueue_req_start(struct virtqueue *vq, void *cookie, int iovcnt) +{ + struct vq_desc_extra *dxp; + + if (iovcnt > vq->vq_free_cnt) { + return iovcnt > vq->vq_nentries ? -EINVAL : -ENOMEM; + } + + if (vq->req_end != VQ_RING_DESC_CHAIN_END) { + finish_req(vq); + } + + vq->req_start = vq->vq_desc_head_idx; + dxp = &vq->vq_descx[vq->req_start]; + dxp->cookie = cookie; + dxp->ndescs = 0; + + return 0; +} + +void +virtqueue_req_flush(struct virtqueue *vq) +{ + uint16_t reqs_finished; + + if (vq->req_end == VQ_RING_DESC_CHAIN_END) { + /* no non-empty requests have been started */ + return; + } + + finish_req(vq); + virtio_mb(); + + reqs_finished = vq->reqs_finished; + vq->reqs_finished = 0; + + if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + /* Set used event idx to a value the device will never reach. + * This effectively disables interrupts. 
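 * For reference, with EVENT_IDX the device decides whether to signal
 * using the comparison from the virtio spec:
 *   vring_need_event(event_idx, new_idx, old_idx) =
 *     (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx)
 * Setting the used event index to vq_used_cons_idx - vq_nentries - 1
 * makes the left-hand side at least vq_nentries, which always exceeds
 * the number of newly used entries, so the device never signals.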
+ */ + vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx - vq->vq_nentries - 1; + + if (!vring_need_event(vring_avail_event(&vq->vq_ring), + vq->vq_avail_idx, + vq->vq_avail_idx - reqs_finished)) { + return; + } + } else if (vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) { + return; + } + + virtio_dev_backend_ops(vq->vdev)->notify_queue(vq->vdev, vq); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "Notified backend after xmit\n"); +} + +void +virtqueue_req_abort(struct virtqueue *vq) +{ + struct vring_desc *desc; + + if (vq->req_start == VQ_RING_DESC_CHAIN_END) { + /* no requests have been started */ + return; + } + + desc = &vq->vq_ring.desc[vq->req_end]; + desc->flags &= ~VRING_DESC_F_NEXT; + + vq_ring_free_chain(vq, vq->req_start); + vq->req_start = VQ_RING_DESC_CHAIN_END; +} + +void +virtqueue_req_add_iovs(struct virtqueue *vq, struct iovec *iovs, uint16_t iovcnt, + enum spdk_virtio_desc_type desc_type) +{ + struct vring_desc *desc; + struct vq_desc_extra *dxp; + uint16_t i, prev_head, new_head; + + assert(vq->req_start != VQ_RING_DESC_CHAIN_END); + assert(iovcnt <= vq->vq_free_cnt); + + /* TODO use indirect descriptors if iovcnt is high enough + * or the caller specifies SPDK_VIRTIO_DESC_F_INDIRECT + */ + + prev_head = vq->req_end; + new_head = vq->vq_desc_head_idx; + for (i = 0; i < iovcnt; ++i) { + desc = &vq->vq_ring.desc[new_head]; + + if (!vq->vdev->is_hw) { + desc->addr = (uintptr_t)iovs[i].iov_base; + } else { + desc->addr = spdk_vtophys(iovs[i].iov_base, NULL); + } + + desc->len = iovs[i].iov_len; + /* always set NEXT flag. unset it on the last descriptor + * in the request-ending function. + */ + desc->flags = desc_type | VRING_DESC_F_NEXT; + + prev_head = new_head; + new_head = desc->next; + } + + dxp = &vq->vq_descx[vq->req_start]; + dxp->ndescs += iovcnt; + + vq->req_end = prev_head; + vq->vq_desc_head_idx = new_head; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - iovcnt); + if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END) { + assert(vq->vq_free_cnt == 0); + vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END; + } +} + +#define DESC_PER_CACHELINE (SPDK_CACHE_LINE_SIZE / sizeof(struct vring_desc)) +uint16_t +virtio_recv_pkts(struct virtqueue *vq, void **io, uint32_t *len, uint16_t nb_pkts) +{ + uint16_t nb_used, num; + + nb_used = vq->vq_ring.used->idx - vq->vq_used_cons_idx; + virtio_rmb(); + + num = (uint16_t)(spdk_likely(nb_used <= nb_pkts) ? 
nb_used : nb_pkts); + if (spdk_likely(num > DESC_PER_CACHELINE)) { + num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE); + } + + return virtqueue_dequeue_burst_rx(vq, io, len, num); +} + +int +virtio_dev_acquire_queue(struct virtio_dev *vdev, uint16_t index) +{ + struct virtqueue *vq = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("requested vq index %"PRIu16" exceeds max queue count %"PRIu16".\n", + index, vdev->max_queues); + return -1; + } + + pthread_mutex_lock(&vdev->mutex); + vq = vdev->vqs[index]; + if (vq == NULL || vq->owner_thread != NULL) { + pthread_mutex_unlock(&vdev->mutex); + return -1; + } + + vq->owner_thread = spdk_get_thread(); + pthread_mutex_unlock(&vdev->mutex); + return 0; +} + +int32_t +virtio_dev_find_and_acquire_queue(struct virtio_dev *vdev, uint16_t start_index) +{ + struct virtqueue *vq = NULL; + uint16_t i; + + pthread_mutex_lock(&vdev->mutex); + for (i = start_index; i < vdev->max_queues; ++i) { + vq = vdev->vqs[i]; + if (vq != NULL && vq->owner_thread == NULL) { + break; + } + } + + if (vq == NULL || i == vdev->max_queues) { + SPDK_ERRLOG("no more unused virtio queues with idx >= %"PRIu16".\n", start_index); + pthread_mutex_unlock(&vdev->mutex); + return -1; + } + + vq->owner_thread = spdk_get_thread(); + pthread_mutex_unlock(&vdev->mutex); + return i; +} + +struct spdk_thread * +virtio_dev_queue_get_thread(struct virtio_dev *vdev, uint16_t index) +{ + struct spdk_thread *thread = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16"\n", + index, vdev->max_queues); + abort(); /* This is not recoverable */ + } + + pthread_mutex_lock(&vdev->mutex); + thread = vdev->vqs[index]->owner_thread; + pthread_mutex_unlock(&vdev->mutex); + + return thread; +} + +bool +virtio_dev_queue_is_acquired(struct virtio_dev *vdev, uint16_t index) +{ + return virtio_dev_queue_get_thread(vdev, index) != NULL; +} + +void +virtio_dev_release_queue(struct virtio_dev *vdev, uint16_t index) +{ + struct virtqueue *vq = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16".\n", + index, vdev->max_queues); + return; + } + + pthread_mutex_lock(&vdev->mutex); + vq = vdev->vqs[index]; + if (vq == NULL) { + SPDK_ERRLOG("virtqueue at index %"PRIu16" is not initialized.\n", index); + pthread_mutex_unlock(&vdev->mutex); + return; + } + + assert(vq->owner_thread == spdk_get_thread()); + vq->owner_thread = NULL; + pthread_mutex_unlock(&vdev->mutex); +} + +int +virtio_dev_read_dev_config(struct virtio_dev *dev, size_t offset, + void *dst, int length) +{ + return virtio_dev_backend_ops(dev)->read_dev_cfg(dev, offset, dst, length); +} + +int +virtio_dev_write_dev_config(struct virtio_dev *dev, size_t offset, + const void *src, int length) +{ + return virtio_dev_backend_ops(dev)->write_dev_cfg(dev, offset, src, length); +} + +void +virtio_dev_stop(struct virtio_dev *dev) +{ + virtio_dev_backend_ops(dev)->set_status(dev, VIRTIO_CONFIG_S_RESET); + /* flush status write */ + virtio_dev_backend_ops(dev)->get_status(dev); + virtio_free_queues(dev); +} + +void +virtio_dev_set_status(struct virtio_dev *dev, uint8_t status) +{ + if (status != VIRTIO_CONFIG_S_RESET) { + status |= virtio_dev_backend_ops(dev)->get_status(dev); + } + + virtio_dev_backend_ops(dev)->set_status(dev, status); +} + +uint8_t +virtio_dev_get_status(struct virtio_dev *dev) +{ + return virtio_dev_backend_ops(dev)->get_status(dev); +} + +const struct virtio_dev_ops * 
+virtio_dev_backend_ops(struct virtio_dev *dev) +{ + return dev->backend_ops; +} + +void +virtio_dev_dump_json_info(struct virtio_dev *hw, struct spdk_json_write_ctx *w) +{ + spdk_json_write_named_object_begin(w, "virtio"); + + spdk_json_write_named_uint32(w, "vq_count", hw->max_queues); + + spdk_json_write_named_uint32(w, "vq_size", + virtio_dev_backend_ops(hw)->get_queue_size(hw, 0)); + + virtio_dev_backend_ops(hw)->dump_json_info(hw, w); + + spdk_json_write_object_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_dev", SPDK_LOG_VIRTIO_DEV) diff --git a/src/spdk/lib/virtio/virtio_pci.c b/src/spdk/lib/virtio/virtio_pci.c new file mode 100644 index 000000000..646f77c1a --- /dev/null +++ b/src/spdk/lib/virtio/virtio_pci.c @@ -0,0 +1,599 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/memory.h" +#include "spdk/mmio.h" +#include "spdk/string.h" +#include "spdk/env.h" + +#include "spdk_internal/virtio.h" + +struct virtio_hw { + uint8_t use_msix; + uint32_t notify_off_multiplier; + uint8_t *isr; + uint16_t *notify_base; + + struct { + /** Mem-mapped resources from given PCI BAR */ + void *vaddr; + + /** Length of the address space */ + uint32_t len; + } pci_bar[6]; + + struct virtio_pci_common_cfg *common_cfg; + struct spdk_pci_device *pci_dev; + + /** Device-specific PCI config space */ + void *dev_cfg; +}; + +struct virtio_pci_probe_ctx { + virtio_pci_create_cb enum_cb; + void *enum_ctx; + uint16_t device_id; +}; + +/* + * Following macros are derived from linux/pci_regs.h, however, + * we can't simply include that header here, as there is no such + * file for non-Linux platform. + */ +#define PCI_CAPABILITY_LIST 0x34 +#define PCI_CAP_ID_VNDR 0x09 +#define PCI_CAP_ID_MSIX 0x11 + +static inline int +check_vq_phys_addr_ok(struct virtqueue *vq) +{ + /* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit, + * and only accepts 32 bit page frame number. 
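 * (Assuming the usual VIRTIO_PCI_QUEUE_ADDR_SHIFT of 12: a 32-bit page
 * frame number covers 2^32 pages of 2^12 bytes each, i.e. 2^44 bytes =
 * 16TB, which is where the limit below comes from.)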
+ * Check if the allocated physical memory exceeds 16TB. + */ + if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >> + (VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) { + SPDK_ERRLOG("vring address shouldn't be above 16TB!\n"); + return 0; + } + + return 1; +} + +static void +free_virtio_hw(struct virtio_hw *hw) +{ + unsigned i; + + for (i = 0; i < 6; ++i) { + if (hw->pci_bar[i].vaddr == NULL) { + continue; + } + + spdk_pci_device_unmap_bar(hw->pci_dev, i, hw->pci_bar[i].vaddr); + } + + free(hw); +} + +static void +pci_dump_json_info(struct virtio_dev *dev, struct spdk_json_write_ctx *w) +{ + struct virtio_hw *hw = dev->ctx; + struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr((struct spdk_pci_device *)hw->pci_dev); + char addr[32]; + + spdk_json_write_name(w, "type"); + if (dev->modern) { + spdk_json_write_string(w, "pci-modern"); + } else { + spdk_json_write_string(w, "pci-legacy"); + } + + spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr); + spdk_json_write_named_string(w, "pci_address", addr); +} + +static void +pci_write_json_config(struct virtio_dev *dev, struct spdk_json_write_ctx *w) +{ + struct virtio_hw *hw = dev->ctx; + struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(hw->pci_dev); + char addr[32]; + + spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr); + + spdk_json_write_named_string(w, "trtype", "pci"); + spdk_json_write_named_string(w, "traddr", addr); +} + +static inline void +io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi) +{ + spdk_mmio_write_4(lo, val & ((1ULL << 32) - 1)); + spdk_mmio_write_4(hi, val >> 32); +} + +static int +modern_read_dev_config(struct virtio_dev *dev, size_t offset, + void *dst, int length) +{ + struct virtio_hw *hw = dev->ctx; + int i; + uint8_t *p; + uint8_t old_gen, new_gen; + + do { + old_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation); + + p = dst; + for (i = 0; i < length; i++) { + *p++ = spdk_mmio_read_1((uint8_t *)hw->dev_cfg + offset + i); + } + + new_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation); + } while (old_gen != new_gen); + + return 0; +} + +static int +modern_write_dev_config(struct virtio_dev *dev, size_t offset, + const void *src, int length) +{ + struct virtio_hw *hw = dev->ctx; + int i; + const uint8_t *p = src; + + for (i = 0; i < length; i++) { + spdk_mmio_write_1(((uint8_t *)hw->dev_cfg) + offset + i, *p++); + } + + return 0; +} + +static uint64_t +modern_get_features(struct virtio_dev *dev) +{ + struct virtio_hw *hw = dev->ctx; + uint32_t features_lo, features_hi; + + spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 0); + features_lo = spdk_mmio_read_4(&hw->common_cfg->device_feature); + + spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 1); + features_hi = spdk_mmio_read_4(&hw->common_cfg->device_feature); + + return ((uint64_t)features_hi << 32) | features_lo; +} + +static int +modern_set_features(struct virtio_dev *dev, uint64_t features) +{ + struct virtio_hw *hw = dev->ctx; + + if ((features & (1ULL << VIRTIO_F_VERSION_1)) == 0) { + SPDK_ERRLOG("VIRTIO_F_VERSION_1 feature is not enabled.\n"); + return -EINVAL; + } + + spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 0); + spdk_mmio_write_4(&hw->common_cfg->guest_feature, features & ((1ULL << 32) - 1)); + + spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 1); + spdk_mmio_write_4(&hw->common_cfg->guest_feature, features >> 32); + + dev->negotiated_features = features; + + return 0; +} + +static void +modern_destruct_dev(struct virtio_dev *vdev) +{ + struct virtio_hw *hw = vdev->ctx; + struct spdk_pci_device 
*pci_dev = hw->pci_dev; + + free_virtio_hw(hw); + spdk_pci_device_detach(pci_dev); +} + +static uint8_t +modern_get_status(struct virtio_dev *dev) +{ + struct virtio_hw *hw = dev->ctx; + + return spdk_mmio_read_1(&hw->common_cfg->device_status); +} + +static void +modern_set_status(struct virtio_dev *dev, uint8_t status) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_1(&hw->common_cfg->device_status, status); +} + +static uint16_t +modern_get_queue_size(struct virtio_dev *dev, uint16_t queue_id) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_2(&hw->common_cfg->queue_select, queue_id); + return spdk_mmio_read_2(&hw->common_cfg->queue_size); +} + +static int +modern_setup_queue(struct virtio_dev *dev, struct virtqueue *vq) +{ + struct virtio_hw *hw = dev->ctx; + uint64_t desc_addr, avail_addr, used_addr; + uint16_t notify_off; + void *queue_mem; + uint64_t queue_mem_phys_addr; + + /* To ensure physical address contiguity we make the queue occupy + * only a single hugepage (2MB). As of Virtio 1.0, the queue size + * always falls within this limit. + */ + if (vq->vq_ring_size > VALUE_2MB) { + return -ENOMEM; + } + + queue_mem = spdk_zmalloc(vq->vq_ring_size, VALUE_2MB, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (queue_mem == NULL) { + return -ENOMEM; + } + + queue_mem_phys_addr = spdk_vtophys(queue_mem, NULL); + if (queue_mem_phys_addr == SPDK_VTOPHYS_ERROR) { + spdk_free(queue_mem); + return -EFAULT; + } + + vq->vq_ring_mem = queue_mem_phys_addr; + vq->vq_ring_virt_mem = queue_mem; + + if (!check_vq_phys_addr_ok(vq)) { + spdk_free(queue_mem); + return -ENOMEM; + } + + desc_addr = vq->vq_ring_mem; + avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc); + used_addr = (avail_addr + offsetof(struct vring_avail, ring[vq->vq_nentries]) + + VIRTIO_PCI_VRING_ALIGN - 1) & ~(VIRTIO_PCI_VRING_ALIGN - 1); + + spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index); + + io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + + notify_off = spdk_mmio_read_2(&hw->common_cfg->queue_notify_off); + vq->notify_addr = (void *)((uint8_t *)hw->notify_base + + notify_off * hw->notify_off_multiplier); + + spdk_mmio_write_2(&hw->common_cfg->queue_enable, 1); + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "queue %"PRIu16" addresses:\n", vq->vq_queue_index); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t desc_addr: %" PRIx64 "\n", desc_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t aval_addr: %" PRIx64 "\n", avail_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t used_addr: %" PRIx64 "\n", used_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t notify addr: %p (notify offset: %"PRIu16")\n", + vq->notify_addr, notify_off); + + return 0; +} + +static void +modern_del_queue(struct virtio_dev *dev, struct virtqueue *vq) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index); + + io_write64_twopart(0, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(0, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(0, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + + spdk_mmio_write_2(&hw->common_cfg->queue_enable, 0); + + spdk_free(vq->vq_ring_virt_mem); +} + +static void +modern_notify_queue(struct 
virtio_dev *dev, struct virtqueue *vq) +{ + spdk_mmio_write_2(vq->notify_addr, vq->vq_queue_index); +} + +static const struct virtio_dev_ops modern_ops = { + .read_dev_cfg = modern_read_dev_config, + .write_dev_cfg = modern_write_dev_config, + .get_status = modern_get_status, + .set_status = modern_set_status, + .get_features = modern_get_features, + .set_features = modern_set_features, + .destruct_dev = modern_destruct_dev, + .get_queue_size = modern_get_queue_size, + .setup_queue = modern_setup_queue, + .del_queue = modern_del_queue, + .notify_queue = modern_notify_queue, + .dump_json_info = pci_dump_json_info, + .write_json_config = pci_write_json_config, +}; + +static void * +get_cfg_addr(struct virtio_hw *hw, struct virtio_pci_cap *cap) +{ + uint8_t bar = cap->bar; + uint32_t length = cap->length; + uint32_t offset = cap->offset; + + if (bar > 5) { + SPDK_ERRLOG("invalid bar: %"PRIu8"\n", bar); + return NULL; + } + + if (offset + length < offset) { + SPDK_ERRLOG("offset(%"PRIu32") + length(%"PRIu32") overflows\n", + offset, length); + return NULL; + } + + if (offset + length > hw->pci_bar[bar].len) { + SPDK_ERRLOG("invalid cap: overflows bar space: %"PRIu32" > %"PRIu32"\n", + offset + length, hw->pci_bar[bar].len); + return NULL; + } + + if (hw->pci_bar[bar].vaddr == NULL) { + SPDK_ERRLOG("bar %"PRIu8" base addr is NULL\n", bar); + return NULL; + } + + return hw->pci_bar[bar].vaddr + offset; +} + +static int +virtio_read_caps(struct virtio_hw *hw) +{ + uint8_t pos; + struct virtio_pci_cap cap; + int ret; + + ret = spdk_pci_device_cfg_read(hw->pci_dev, &pos, 1, PCI_CAPABILITY_LIST); + if (ret < 0) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "failed to read pci capability list\n"); + return ret; + } + + while (pos) { + ret = spdk_pci_device_cfg_read(hw->pci_dev, &cap, sizeof(cap), pos); + if (ret < 0) { + SPDK_ERRLOG("failed to read pci cap at pos: %"PRIx8"\n", pos); + break; + } + + if (cap.cap_vndr == PCI_CAP_ID_MSIX) { + hw->use_msix = 1; + } + + if (cap.cap_vndr != PCI_CAP_ID_VNDR) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, + "[%2"PRIx8"] skipping non VNDR cap id: %02"PRIx8"\n", + pos, cap.cap_vndr); + goto next; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, + "[%2"PRIx8"] cfg type: %"PRIu8", bar: %"PRIu8", offset: %04"PRIx32", len: %"PRIu32"\n", + pos, cap.cfg_type, cap.bar, cap.offset, cap.length); + + switch (cap.cfg_type) { + case VIRTIO_PCI_CAP_COMMON_CFG: + hw->common_cfg = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_NOTIFY_CFG: + spdk_pci_device_cfg_read(hw->pci_dev, &hw->notify_off_multiplier, + 4, pos + sizeof(cap)); + hw->notify_base = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_DEVICE_CFG: + hw->dev_cfg = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_ISR_CFG: + hw->isr = get_cfg_addr(hw, &cap); + break; + } + +next: + pos = cap.cap_next; + } + + if (hw->common_cfg == NULL || hw->notify_base == NULL || + hw->dev_cfg == NULL || hw->isr == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "no modern virtio pci device found.\n"); + if (ret < 0) { + return ret; + } else { + return -EINVAL; + } + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "found modern virtio pci device.\n"); + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "common cfg mapped at: %p\n", hw->common_cfg); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "device cfg mapped at: %p\n", hw->dev_cfg); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "isr cfg mapped at: %p\n", hw->isr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "notify base: %p, notify off multiplier: %u\n", + hw->notify_base, hw->notify_off_multiplier); + + return 0; +} + 
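/* For context, a sketch of the vendor-specific capability layout that
 * virtio_read_caps() walks above, as defined in the virtio 1.0 spec
 * (section 4.1.4); the definition actually used by this file comes from
 * spdk_internal/virtio.h.
 */
struct virtio_pci_cap {
	uint8_t  cap_vndr;	/* generic PCI field: PCI_CAP_ID_VNDR */
	uint8_t  cap_next;	/* generic PCI field: offset of next capability */
	uint8_t  cap_len;	/* length of this capability structure */
	uint8_t  cfg_type;	/* VIRTIO_PCI_CAP_{COMMON,NOTIFY,ISR,DEVICE}_CFG */
	uint8_t  bar;		/* which PCI BAR holds the structure (0..5) */
	uint8_t  padding[3];
	uint32_t offset;	/* offset of the structure within that BAR */
	uint32_t length;	/* length of the structure */
};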
+static int +virtio_pci_dev_probe(struct spdk_pci_device *pci_dev, struct virtio_pci_probe_ctx *ctx) +{ + struct virtio_hw *hw; + uint8_t *bar_vaddr; + uint64_t bar_paddr, bar_len; + int rc; + unsigned i; + char bdf[32]; + struct spdk_pci_addr addr; + + addr = spdk_pci_device_get_addr(pci_dev); + rc = spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr); + if (rc != 0) { + SPDK_ERRLOG("Ignoring a device with non-parseable PCI address\n"); + return -1; + } + + hw = calloc(1, sizeof(*hw)); + if (hw == NULL) { + SPDK_ERRLOG("%s: calloc failed\n", bdf); + return -1; + } + + hw->pci_dev = pci_dev; + + for (i = 0; i < 6; ++i) { + rc = spdk_pci_device_map_bar(pci_dev, i, (void *) &bar_vaddr, &bar_paddr, + &bar_len); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to memmap PCI BAR %u\n", bdf, i); + free_virtio_hw(hw); + return -1; + } + + hw->pci_bar[i].vaddr = bar_vaddr; + hw->pci_bar[i].len = bar_len; + } + + /* Virtio PCI caps exist only on modern PCI devices. + * Legacy devices are not supported. + */ + if (virtio_read_caps(hw) != 0) { + SPDK_NOTICELOG("Ignoring legacy PCI device at %s\n", bdf); + free_virtio_hw(hw); + return -1; + } + + rc = ctx->enum_cb((struct virtio_pci_ctx *)hw, ctx->enum_ctx); + if (rc != 0) { + free_virtio_hw(hw); + } + + return rc; +} + +static int +virtio_pci_dev_probe_cb(void *probe_ctx, struct spdk_pci_device *pci_dev) +{ + struct virtio_pci_probe_ctx *ctx = probe_ctx; + uint16_t pci_device_id = spdk_pci_device_get_device_id(pci_dev); + + if (pci_device_id != ctx->device_id) { + return 1; + } + + return virtio_pci_dev_probe(pci_dev, ctx); +} + +int +virtio_pci_dev_enumerate(virtio_pci_create_cb enum_cb, void *enum_ctx, + uint16_t pci_device_id) +{ + struct virtio_pci_probe_ctx ctx; + + if (!spdk_process_is_primary()) { + SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n"); + return 0; + } + + ctx.enum_cb = enum_cb; + ctx.enum_ctx = enum_ctx; + ctx.device_id = pci_device_id; + + return spdk_pci_enumerate(spdk_pci_virtio_get_driver(), + virtio_pci_dev_probe_cb, &ctx); +} + +int +virtio_pci_dev_attach(virtio_pci_create_cb enum_cb, void *enum_ctx, + uint16_t pci_device_id, struct spdk_pci_addr *pci_address) +{ + struct virtio_pci_probe_ctx ctx; + + if (!spdk_process_is_primary()) { + SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n"); + return 0; + } + + ctx.enum_cb = enum_cb; + ctx.enum_ctx = enum_ctx; + ctx.device_id = pci_device_id; + + return spdk_pci_device_attach(spdk_pci_virtio_get_driver(), + virtio_pci_dev_probe_cb, &ctx, pci_address); +} + +int +virtio_pci_dev_init(struct virtio_dev *vdev, const char *name, + struct virtio_pci_ctx *pci_ctx) +{ + int rc; + + rc = virtio_dev_construct(vdev, name, &modern_ops, pci_ctx); + if (rc != 0) { + return rc; + } + + vdev->is_hw = 1; + vdev->modern = 1; + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_pci", SPDK_LOG_VIRTIO_PCI) diff --git a/src/spdk/lib/virtio/virtio_user.c b/src/spdk/lib/virtio/virtio_user.c new file mode 100644 index 000000000..4f4932db9 --- /dev/null +++ b/src/spdk/lib/virtio/virtio_user.c @@ -0,0 +1,628 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <sys/eventfd.h> + +#include "vhost_user.h" +#include "spdk/string.h" +#include "spdk/config.h" + +#include "spdk_internal/virtio.h" + +#define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \ + ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) + +static int +virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + + /* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_CALL come + * firstly because vhost depends on this msg to allocate virtqueue + * pair. + */ + struct vhost_vring_file file; + + file.index = queue_sel; + file.fd = dev->callfds[queue_sel]; + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_CALL, &file); +} + +static int +virtio_user_set_vring_addr(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vring *vring = &dev->vrings[queue_sel]; + struct vhost_vring_addr addr = { + .index = queue_sel, + .desc_user_addr = (uint64_t)(uintptr_t)vring->desc, + .avail_user_addr = (uint64_t)(uintptr_t)vring->avail, + .used_user_addr = (uint64_t)(uintptr_t)vring->used, + .log_guest_addr = 0, + .flags = 0, /* disable log */ + }; + + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_ADDR, &addr); +} + +static int +virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_file file; + struct vhost_vring_state state; + struct vring *vring = &dev->vrings[queue_sel]; + int rc; + + state.index = queue_sel; + state.num = vring->num; + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_NUM, &state); + if (rc < 0) { + return rc; + } + + state.index = queue_sel; + state.num = 0; /* no reservation */ + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_BASE, &state); + if (rc < 0) { + return rc; + } + + virtio_user_set_vring_addr(vdev, queue_sel); + + /* Of all per virtqueue MSGs, make sure VHOST_USER_SET_VRING_KICK comes + * lastly because vhost depends on this msg to judge if + * virtio is ready. 
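 * The resulting per-queue message order is therefore: SET_VRING_CALL
 * (sent at queue creation), then SET_VRING_NUM, SET_VRING_BASE and
 * SET_VRING_ADDR, with SET_VRING_KICK last.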
+ */
+ file.index = queue_sel;
+ file.fd = dev->kickfds[queue_sel];
+ return dev->ops->send_request(dev, VHOST_USER_SET_VRING_KICK, &file);
+}
+
+static int
+virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_vring_state state;
+
+ state.index = queue_sel;
+ state.num = 0;
+
+ return dev->ops->send_request(dev, VHOST_USER_GET_VRING_BASE, &state);
+}
+
+static int
+virtio_user_queue_setup(struct virtio_dev *vdev,
+ int (*fn)(struct virtio_dev *, uint32_t))
+{
+ uint32_t i;
+ int rc;
+
+ for (i = 0; i < vdev->max_queues; ++i) {
+ rc = fn(vdev, i);
+ if (rc < 0) {
+ SPDK_ERRLOG("queue setup failed: %"PRIu32".\n", i);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static int
+virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map,
+ enum spdk_mem_map_notify_action action,
+ void *vaddr, size_t size)
+{
+ struct virtio_dev *vdev = cb_ctx;
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t features;
+ int ret;
+
+ /* We have to resend all mappings anyway, so don't bother with any
+ * page tracking.
+ */
+ ret = dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ /* Our internal rte_vhost lib requires SET_VRING_ADDR to flush a pending
+ * SET_MEM_TABLE. On the other hand, the upstream rte_vhost will invalidate
+ * the entire queue upon receiving the SET_VRING_ADDR message, so we mustn't
+ * send it here. Both behaviors are strictly implementation specific, and
+ * this message isn't required by the spec, so send it only if vhost is
+ * compiled with our internal lib.
+ */
+ ret = virtio_user_queue_setup(vdev, virtio_user_set_vring_addr);
+ if (ret < 0) {
+ return ret;
+ }
+#endif
+
+ /* Since we might want to use that mapping straight away, we have to
+ * make sure the vhost target has already processed our SET_MEM_TABLE
+ * message. F_REPLY_ACK is just a feature and the target is not obliged
+ * to support it, so we send a simple message that always has a response
+ * and we wait for that response. Messages are always processed in order.
+ */
+ return dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features);
+}
+
+static int
+virtio_user_register_mem(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ const struct spdk_mem_map_ops virtio_user_map_ops = {
+ .notify_cb = virtio_user_map_notify,
+ .are_contiguous = NULL
+ };
+
+ dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev);
+ if (dev->mem_map == NULL) {
+ SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+virtio_user_unregister_mem(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ spdk_mem_map_free(&dev->mem_map);
+}
+
+static int
+virtio_user_start_device(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t host_max_queues;
+ int ret;
+
+ if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 &&
+ vdev->max_queues > 1 + vdev->fixed_queues_num) {
+ SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the "
+ "host doesn't support VHOST_USER_PROTOCOL_F_MQ. "
+ "Only one request queue will be used.\n",
+ vdev->name, vdev->max_queues - vdev->fixed_queues_num);
+ vdev->max_queues = 1 + vdev->fixed_queues_num;
+ }
+
+ /* Negotiate the number of I/O queues.
+ */
+ ret = dev->ops->send_request(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) {
+ SPDK_WARNLOG("%s: requested %"PRIu16" request queues, "
+ "but only %"PRIu64" are available\n",
+ vdev->name, vdev->max_queues - vdev->fixed_queues_num,
+ host_max_queues);
+ vdev->max_queues = host_max_queues;
+ }
+
+ /* tell vhost to create queues */
+ ret = virtio_user_queue_setup(vdev, virtio_user_create_queue);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = virtio_user_register_mem(vdev);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return virtio_user_queue_setup(vdev, virtio_user_kick_queue);
+}
+
+static int
+virtio_user_stop_device(struct virtio_dev *vdev)
+{
+ int ret;
+
+ ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue);
+ /* A queue might fail to stop for various reasons, e.g. the socket
+ * connection going down, but this mustn't prevent us from freeing
+ * the mem map.
+ */
+ virtio_user_unregister_mem(vdev);
+ return ret;
+}
+
+static int
+virtio_user_dev_setup(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint16_t i;
+
+ dev->vhostfd = -1;
+
+ for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) {
+ dev->callfds[i] = -1;
+ dev->kickfds[i] = -1;
+ }
+
+ dev->ops = &ops_user;
+
+ return dev->ops->setup(dev);
+}
+
+static int
+virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset,
+ void *dst, int length)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_user_config cfg = {0};
+ int rc;
+
+ if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
+ return -ENOTSUP;
+ }
+
+ cfg.offset = 0;
+ cfg.size = VHOST_USER_MAX_CONFIG_SIZE;
+
+ rc = dev->ops->send_request(dev, VHOST_USER_GET_CONFIG, &cfg);
+ if (rc < 0) {
+ SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ memcpy(dst, cfg.region + offset, length);
+ return 0;
+}
+
+static int
+virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset,
+ const void *src, int length)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_user_config cfg = {0};
+ int rc;
+
+ if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
+ return -ENOTSUP;
+ }
+
+ cfg.offset = offset;
+ cfg.size = length;
+ memcpy(cfg.region, src, length);
+
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_CONFIG, &cfg);
+ if (rc < 0) {
+ SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+virtio_user_set_status(struct virtio_dev *vdev, uint8_t status)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ int rc = 0;
+
+ if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) &&
+ status != VIRTIO_CONFIG_S_RESET) {
+ rc = -1;
+ } else if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ rc = virtio_user_start_device(vdev);
+ } else if (status == VIRTIO_CONFIG_S_RESET &&
+ (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ rc = virtio_user_stop_device(vdev);
+ }
+
+ if (rc != 0) {
+ dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET;
+ } else {
+ dev->status = status;
+ }
+}
+
+static uint8_t
+virtio_user_get_status(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ return dev->status;
+}
+
+static uint64_t
+virtio_user_get_features(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t features;
+ int rc;
+
+ rc = dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features);
+ if (rc < 0) {
+ SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc));
+ return 0;
+ }
+
+ return features;
+}
+
+static int
+virtio_user_set_features(struct virtio_dev *vdev, uint64_t features)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t protocol_features;
+ int ret;
+
+ ret = dev->ops->send_request(dev, VHOST_USER_SET_FEATURES, &features);
+ if (ret < 0) {
+ return ret;
+ }
+
+ vdev->negotiated_features = features;
+ vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1);
+
+ if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ /* nothing else to do */
+ return 0;
+ }
+
+ ret = dev->ops->send_request(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features);
+ if (ret < 0) {
+ return ret;
+ }
+
+ protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES;
+ ret = dev->ops->send_request(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features);
+ if (ret < 0) {
+ return ret;
+ }
+
+ dev->protocol_features = protocol_features;
+ return 0;
+}
+
+static uint16_t
+virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ /* Currently each queue has the same queue size */
+ return dev->queue_size;
+}
+
+static int
+virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_vring_state state;
+ uint16_t queue_idx = vq->vq_queue_index;
+ void *queue_mem;
+ uint64_t desc_addr, avail_addr, used_addr;
+ int callfd, kickfd, rc;
+
+ if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) {
+ SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx);
+ return -EEXIST;
+ }
+
+ /* We could pass invalid fds, but some backends use the kickfd and
+ * callfd as criteria to judge whether the device is alive, so we
+ * use real eventfds.
+ */
+ callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+ if (callfd < 0) {
+ rc = -errno;
+ SPDK_ERRLOG("callfd error: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+ if (kickfd < 0) {
+ rc = -errno;
+ SPDK_ERRLOG("kickfd error: %s\n", spdk_strerror(-rc));
+ close(callfd);
+ return rc;
+ }
+
+ queue_mem = spdk_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (queue_mem == NULL) {
+ close(kickfd);
+ close(callfd);
+ return -ENOMEM;
+ }
+
+ vq->vq_ring_mem = SPDK_VTOPHYS_ERROR;
+ vq->vq_ring_virt_mem = queue_mem;
+
+ state.index = vq->vq_queue_index;
+ state.num = 0;
+
+ if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_ENABLE, &state);
+ if (rc < 0) {
+ SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n",
+ spdk_strerror(-rc));
+ close(kickfd);
+ close(callfd);
+ spdk_free(queue_mem);
+ return rc;
+ }
+ }
+
+ dev->callfds[queue_idx] = callfd;
+ dev->kickfds[queue_idx] = kickfd;
+
+ desc_addr = (uintptr_t)vq->vq_ring_virt_mem;
+ avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+ used_addr = SPDK_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+ ring[vq->vq_nentries]),
+ VIRTIO_PCI_VRING_ALIGN);
+
+ dev->vrings[queue_idx].num = vq->vq_nentries;
+ dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr;
+ dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr;
+ dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr;
+
+ return 0;
+}
+
+static void
+virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq)
+{
+ /* For legacy devices, writing 0 to the VIRTIO_PCI_QUEUE_PFN port makes
+ * QEMU stop the ioeventfds and reset the device status.
+ * For modern devices, setting the queue desc, avail, and used addresses
+ * in the PCI BAR to 0 triggers no further behavior in QEMU.
+ *
+ * Here we only care about what information to deliver to vhost-user,
+ * so we just close the ioeventfds for now.
+ */
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ close(dev->callfds[vq->vq_queue_index]);
+ close(dev->kickfds[vq->vq_queue_index]);
+ dev->callfds[vq->vq_queue_index] = -1;
+ dev->kickfds[vq->vq_queue_index] = -1;
+
+ spdk_free(vq->vq_ring_virt_mem);
+}
+
+static void
+virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq)
+{
+ uint64_t buf = 1;
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) {
+ SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno));
+ }
+}
+
+static void
+virtio_user_destroy(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ close(dev->vhostfd);
+ free(dev);
+}
+
+static void
+virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ spdk_json_write_named_string(w, "type", "user");
+ spdk_json_write_named_string(w, "socket", dev->path);
+}
+
+static void
+virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ spdk_json_write_named_string(w, "trtype", "user");
+ spdk_json_write_named_string(w, "traddr", dev->path);
+ spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num);
+ spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0));
+}
+
+static const struct virtio_dev_ops virtio_user_ops = {
+ .read_dev_cfg = virtio_user_read_dev_config,
+ .write_dev_cfg = virtio_user_write_dev_config,
+ .get_status = virtio_user_get_status,
+ .set_status = virtio_user_set_status,
+ .get_features = virtio_user_get_features,
+ .set_features = virtio_user_set_features,
+ .destruct_dev = virtio_user_destroy,
+ .get_queue_size = virtio_user_get_queue_size,
+ .setup_queue = virtio_user_setup_queue,
+ .del_queue = virtio_user_del_queue,
+ .notify_queue = virtio_user_notify_queue,
+ .dump_json_info = virtio_user_dump_json_info,
+ .write_json_config = virtio_user_write_json_config,
+};
+
+int
+virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path,
+ uint32_t queue_size)
+{
+ struct virtio_user_dev *dev;
+ int rc;
+
+ if (name == NULL) {
+ SPDK_ERRLOG("No name given for controller: %s\n", path);
+ return -EINVAL;
+ }
+
+ dev = calloc(1, sizeof(*dev));
+ if (dev == NULL) {
+ return -ENOMEM;
+ }
+
+ rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to init device: %s\n", path);
+ free(dev);
+ return rc;
+ }
+
+ vdev->is_hw = 0;
+
+ snprintf(dev->path, PATH_MAX, "%s", path);
+ dev->queue_size = queue_size;
+
+ rc = virtio_user_dev_setup(vdev);
+ if (rc < 0) {
+ SPDK_ERRLOG("backend setup failed\n");
+ goto err;
+ }
+
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL);
+ if (rc < 0) {
+ SPDK_ERRLOG("set_owner failed: %s\n", spdk_strerror(-rc));
+ goto err;
+ }
+
+ return 0;
+
+err:
+ virtio_dev_destruct(vdev);
+ return rc;
+}
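For context, here is a minimal usage sketch of the API this file adds, based only on the signatures visible in the diff above. The function name example_virtio_user_attach, the controller name, the socket path, and the queue size are made up for illustration; virtio_dev_reset()/virtio_dev_start() are only mentioned in a comment because their exact signatures are not shown in this diff.

/* Hypothetical caller: attach to a vhost-user target over a UNIX socket. */
#include "spdk_internal/virtio.h"

static int
example_virtio_user_attach(struct virtio_dev *vdev)
{
	int rc;

	/* Constructs the device with the virtio_user_ops backend defined above,
	 * connects to the vhost-user socket and sends VHOST_USER_SET_OWNER.
	 */
	rc = virtio_user_dev_init(vdev, "VirtioUser0", "/var/tmp/vhost.0", 512);
	if (rc != 0) {
		return rc;
	}

	/* A real caller would now negotiate features and start the queues,
	 * e.g. via virtio_dev_reset() and virtio_dev_start() (signatures
	 * assumed, not shown in this diff). On teardown, release the device:
	 */
	virtio_dev_destruct(vdev);
	return 0;
}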