path: root/src/spdk/lib/virtio
author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 18:45:59 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 18:45:59 +0000
commit     19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree       42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/lib/virtio
parent     Initial commit. (diff)
Adding upstream version 16.2.11+ds. (refs: upstream/16.2.11+ds, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/virtio')
-rw-r--r--  src/spdk/lib/virtio/Makefile           46
-rw-r--r--  src/spdk/lib/virtio/spdk_virtio.map    33
-rw-r--r--  src/spdk/lib/virtio/vhost_user.c      489
-rw-r--r--  src/spdk/lib/virtio/vhost_user.h       69
-rw-r--r--  src/spdk/lib/virtio/virtio.c          717
-rw-r--r--  src/spdk/lib/virtio/virtio_pci.c      599
-rw-r--r--  src/spdk/lib/virtio/virtio_user.c     628
7 files changed, 2581 insertions, 0 deletions
diff --git a/src/spdk/lib/virtio/Makefile b/src/spdk/lib/virtio/Makefile
new file mode 100644
index 000000000..8ea173c3b
--- /dev/null
+++ b/src/spdk/lib/virtio/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += $(ENV_CFLAGS)
+C_SRCS = virtio.c virtio_user.c virtio_pci.c vhost_user.c
+LIBNAME = virtio
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_virtio.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/virtio/spdk_virtio.map b/src/spdk/lib/virtio/spdk_virtio.map
new file mode 100644
index 000000000..76e02cff8
--- /dev/null
+++ b/src/spdk/lib/virtio/spdk_virtio.map
@@ -0,0 +1,33 @@
+{
+ global:
+
+ # internal functions in spdk_internal/virtio.h
+ virtio_recv_pkts;
+ virtqueue_req_start;
+ virtqueue_req_flush;
+ virtqueue_req_abort;
+ virtqueue_req_add_iovs;
+ virtio_dev_construct;
+ virtio_dev_reset;
+ virtio_dev_start;
+ virtio_dev_stop;
+ virtio_dev_destruct;
+ virtio_dev_acquire_queue;
+ virtio_dev_find_and_acquire_queue;
+ virtio_dev_queue_get_thread;
+ virtio_dev_queue_is_acquired;
+ virtio_dev_release_queue;
+ virtio_dev_get_status;
+ virtio_dev_set_status;
+ virtio_dev_write_dev_config;
+ virtio_dev_read_dev_config;
+ virtio_dev_backend_ops;
+ virtio_dev_has_feature;
+ virtio_dev_dump_json_info;
+ virtio_pci_dev_enumerate;
+ virtio_pci_dev_attach;
+ virtio_user_dev_init;
+ virtio_pci_dev_init;
+
+ local: *;
+};
diff --git a/src/spdk/lib/virtio/vhost_user.c b/src/spdk/lib/virtio/vhost_user.c
new file mode 100644
index 000000000..b3da9d988
--- /dev/null
+++ b/src/spdk/lib/virtio/vhost_user.c
@@ -0,0 +1,489 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "vhost_user.h"
+
+#include "spdk/string.h"
+#include "spdk_internal/vhost_user.h"
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION 0x1
+
+static int
+vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num)
+{
+ int r;
+ struct msghdr msgh;
+ struct iovec iov;
+ size_t fd_size = fd_num * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+ struct cmsghdr *cmsg;
+
+ memset(&msgh, 0, sizeof(msgh));
+ memset(control, 0, sizeof(control));
+
+ iov.iov_base = (uint8_t *)buf;
+ iov.iov_len = len;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+
+ if (fds && fd_num > 0) {
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fd_size);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fd_size);
+ } else {
+ msgh.msg_control = NULL;
+ msgh.msg_controllen = 0;
+ }
+
+ do {
+ r = sendmsg(fd, &msgh, 0);
+ } while (r < 0 && errno == EINTR);
+
+ if (r == -1) {
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int
+vhost_user_read(int fd, struct vhost_user_msg *msg)
+{
+ uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
+ ssize_t ret;
+ size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload;
+
+ ret = recv(fd, (void *)msg, sz_hdr, 0);
+ if ((size_t)ret != sz_hdr) {
+ SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n",
+ ret, sz_hdr);
+ if (ret == -1) {
+ return -errno;
+ } else {
+ return -EBUSY;
+ }
+ }
+
+ /* validate msg flags */
+ if (msg->flags != (valid_flags)) {
+ SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n",
+ msg->flags, valid_flags);
+ return -EIO;
+ }
+
+ sz_payload = msg->size;
+
+ if (sz_payload > VHOST_USER_PAYLOAD_SIZE) {
+ SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n",
+ sz_payload, VHOST_USER_PAYLOAD_SIZE);
+ return -EIO;
+ }
+
+ if (sz_payload) {
+ ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0);
+ if ((size_t)ret != sz_payload) {
+ SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n",
+ ret, msg->size);
+ if (ret == -1) {
+ return -errno;
+ } else {
+ return -EBUSY;
+ }
+ }
+ }
+
+ return 0;
+}
+
+struct hugepage_file_info {
+ uint64_t addr; /**< virtual addr */
+ size_t size; /**< the file size */
+ char path[PATH_MAX]; /**< path to backing file */
+};
+
+/* Two possible options:
+ * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file
+ * array. This is simple but cannot be used in secondary process because
+ * secondary process will close and munmap that file.
+ * 2. Match HUGEFILE_FMT to find hugepage files directly.
+ *
+ * We choose option 2.
+ */
+static int
+get_hugepage_file_info(struct hugepage_file_info huges[], int max)
+{
+ int idx, rc;
+ FILE *f;
+ char buf[BUFSIZ], *tmp, *tail;
+ char *str_underline, *str_start;
+ int huge_index;
+ uint64_t v_start, v_end;
+
+ f = fopen("/proc/self/maps", "r");
+ if (!f) {
+ SPDK_ERRLOG("cannot open /proc/self/maps\n");
+ rc = -errno;
+ assert(rc < 0); /* scan-build hack */
+ return rc;
+ }
+
+ idx = 0;
+ while (fgets(buf, sizeof(buf), f) != NULL) {
+ if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) {
+ SPDK_ERRLOG("Failed to parse address\n");
+ rc = -EIO;
+ goto out;
+ }
+
+ tmp = strchr(buf, ' ') + 1; /** skip address */
+ tmp = strchr(tmp, ' ') + 1; /** skip perm */
+ tmp = strchr(tmp, ' ') + 1; /** skip offset */
+ tmp = strchr(tmp, ' ') + 1; /** skip dev */
+ tmp = strchr(tmp, ' ') + 1; /** skip inode */
+ while (*tmp == ' ') { /** skip spaces */
+ tmp++;
+ }
+ tail = strrchr(tmp, '\n'); /** remove newline if exists */
+ if (tail) {
+ *tail = '\0';
+ }
+
+ /* Match HUGEFILE_FMT, aka "%s/%smap_%d",
+ * which is defined in eal_filesystem.h
+ */
+ str_underline = strrchr(tmp, '_');
+ if (!str_underline) {
+ continue;
+ }
+
+ str_start = str_underline - strlen("map");
+ if (str_start < tmp) {
+ continue;
+ }
+
+ if (sscanf(str_start, "map_%d", &huge_index) != 1) {
+ continue;
+ }
+
+ if (idx >= max) {
+ SPDK_ERRLOG("Exceed maximum of %d\n", max);
+ rc = -ENOSPC;
+ goto out;
+ }
+
+ if (idx > 0 &&
+ strncmp(tmp, huges[idx - 1].path, PATH_MAX) == 0 &&
+ v_start == huges[idx - 1].addr + huges[idx - 1].size) {
+ huges[idx - 1].size += (v_end - v_start);
+ continue;
+ }
+
+ huges[idx].addr = v_start;
+ huges[idx].size = v_end - v_start;
+ snprintf(huges[idx].path, PATH_MAX, "%s", tmp);
+ idx++;
+ }
+
+ rc = idx;
+out:
+ fclose(f);
+ return rc;
+}
+
+static int
+prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
+{
+ int i, num;
+ struct hugepage_file_info huges[VHOST_USER_MEMORY_MAX_NREGIONS];
+
+ num = get_hugepage_file_info(huges, VHOST_USER_MEMORY_MAX_NREGIONS);
+ if (num < 0) {
+ SPDK_ERRLOG("Failed to prepare memory for vhost-user\n");
+ return num;
+ }
+
+ for (i = 0; i < num; ++i) {
+ /* the memory regions are unaligned */
+ msg->payload.memory.regions[i].guest_phys_addr = huges[i].addr; /* use vaddr! */
+ msg->payload.memory.regions[i].userspace_addr = huges[i].addr;
+ msg->payload.memory.regions[i].memory_size = huges[i].size;
+ msg->payload.memory.regions[i].flags_padding = 0;
+ fds[i] = open(huges[i].path, O_RDWR);
+ }
+
+ msg->payload.memory.nregions = num;
+ msg->payload.memory.padding = 0;
+
+ return 0;
+}
+
+static const char *const vhost_msg_strings[VHOST_USER_MAX] = {
+ [VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER",
+ [VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER",
+ [VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES",
+ [VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES",
+ [VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL",
+ [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
+ [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
+ [VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM",
+ [VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE",
+ [VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE",
+ [VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR",
+ [VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK",
+ [VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE",
+ [VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE",
+ [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
+ [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
+ [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
+};
+
+static int
+vhost_user_sock(struct virtio_user_dev *dev,
+ enum vhost_user_request req,
+ void *arg)
+{
+ struct vhost_user_msg msg;
+ struct vhost_vring_file *file = 0;
+ int need_reply = 0;
+ int fds[VHOST_USER_MEMORY_MAX_NREGIONS];
+ int fd_num = 0;
+ int i, len, rc;
+ int vhostfd = dev->vhostfd;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_USER, "sent message %d = %s\n", req, vhost_msg_strings[req]);
+
+ msg.request = req;
+ msg.flags = VHOST_USER_VERSION;
+ msg.size = 0;
+
+ switch (req) {
+ case VHOST_USER_GET_FEATURES:
+ case VHOST_USER_GET_PROTOCOL_FEATURES:
+ case VHOST_USER_GET_QUEUE_NUM:
+ need_reply = 1;
+ break;
+
+ case VHOST_USER_SET_FEATURES:
+ case VHOST_USER_SET_LOG_BASE:
+ case VHOST_USER_SET_PROTOCOL_FEATURES:
+ msg.payload.u64 = *((__u64 *)arg);
+ msg.size = sizeof(msg.payload.u64);
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ case VHOST_USER_RESET_OWNER:
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ rc = prepare_vhost_memory_user(&msg, fds);
+ if (rc < 0) {
+ return rc;
+ }
+ fd_num = msg.payload.memory.nregions;
+ msg.size = sizeof(msg.payload.memory.nregions);
+ msg.size += sizeof(msg.payload.memory.padding);
+ msg.size += fd_num * sizeof(struct vhost_memory_region);
+ break;
+
+ case VHOST_USER_SET_LOG_FD:
+ fds[fd_num++] = *((int *)arg);
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_ENABLE:
+ memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
+ msg.size = sizeof(msg.payload.state);
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
+ msg.size = sizeof(msg.payload.state);
+ need_reply = 1;
+ break;
+
+ case VHOST_USER_SET_VRING_ADDR:
+ memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
+ msg.size = sizeof(msg.payload.addr);
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+ case VHOST_USER_SET_VRING_CALL:
+ case VHOST_USER_SET_VRING_ERR:
+ file = arg;
+ msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
+ msg.size = sizeof(msg.payload.u64);
+ if (file->fd > 0) {
+ fds[fd_num++] = file->fd;
+ } else {
+ msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
+ }
+ break;
+
+ case VHOST_USER_GET_CONFIG:
+ memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
+ msg.size = sizeof(msg.payload.cfg);
+ need_reply = 1;
+ break;
+
+ case VHOST_USER_SET_CONFIG:
+ memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
+ msg.size = sizeof(msg.payload.cfg);
+ break;
+
+ default:
+ SPDK_ERRLOG("trying to send unknown msg\n");
+ return -EINVAL;
+ }
+
+ len = VHOST_USER_HDR_SIZE + msg.size;
+ rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num);
+ if (rc < 0) {
+ SPDK_ERRLOG("%s failed: %s\n",
+ vhost_msg_strings[req], spdk_strerror(-rc));
+ return rc;
+ }
+
+ if (req == VHOST_USER_SET_MEM_TABLE)
+ for (i = 0; i < fd_num; ++i) {
+ close(fds[i]);
+ }
+
+ if (need_reply) {
+ rc = vhost_user_read(vhostfd, &msg);
+ if (rc < 0) {
+ SPDK_WARNLOG("Received msg failed: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ if (req != msg.request) {
+ SPDK_WARNLOG("Received unexpected msg type\n");
+ return -EIO;
+ }
+
+ switch (req) {
+ case VHOST_USER_GET_FEATURES:
+ case VHOST_USER_GET_PROTOCOL_FEATURES:
+ case VHOST_USER_GET_QUEUE_NUM:
+ if (msg.size != sizeof(msg.payload.u64)) {
+ SPDK_WARNLOG("Received bad msg size\n");
+ return -EIO;
+ }
+ *((__u64 *)arg) = msg.payload.u64;
+ break;
+ case VHOST_USER_GET_VRING_BASE:
+ if (msg.size != sizeof(msg.payload.state)) {
+ SPDK_WARNLOG("Received bad msg size\n");
+ return -EIO;
+ }
+ memcpy(arg, &msg.payload.state,
+ sizeof(struct vhost_vring_state));
+ break;
+ case VHOST_USER_GET_CONFIG:
+ if (msg.size != sizeof(msg.payload.cfg)) {
+ SPDK_WARNLOG("Received bad msg size\n");
+ return -EIO;
+ }
+ memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg));
+ break;
+ default:
+ SPDK_WARNLOG("Received unexpected msg type\n");
+ return -EBADMSG;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Set up environment to talk with a vhost user backend.
+ *
+ * @return
+ * - (-1) if fail;
+ * - (0) if succeed.
+ */
+static int
+vhost_user_setup(struct virtio_user_dev *dev)
+{
+ int fd;
+ int flag;
+ struct sockaddr_un un;
+ ssize_t rc;
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0) {
+ SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno));
+ return -errno;
+ }
+
+ flag = fcntl(fd, F_GETFD);
+ if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) {
+ SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno));
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path);
+ if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) {
+ SPDK_ERRLOG("socket path too long\n");
+ close(fd);
+ if (rc < 0) {
+ return -errno;
+ } else {
+ return -EINVAL;
+ }
+ }
+ if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+ SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno));
+ close(fd);
+ return -errno;
+ }
+
+ dev->vhostfd = fd;
+ return 0;
+}
+
+struct virtio_user_backend_ops ops_user = {
+ .setup = vhost_user_setup,
+ .send_request = vhost_user_sock,
+};
+
+SPDK_LOG_REGISTER_COMPONENT("virtio_user", SPDK_LOG_VIRTIO_USER)
diff --git a/src/spdk/lib/virtio/vhost_user.h b/src/spdk/lib/virtio/vhost_user.h
new file mode 100644
index 000000000..0caf51ebc
--- /dev/null
+++ b/src/spdk/lib/virtio/vhost_user.h
@@ -0,0 +1,69 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_H
+#define _VHOST_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/log.h"
+#include "spdk_internal/virtio.h"
+#include "spdk_internal/vhost_user.h"
+
+struct virtio_user_backend_ops;
+
+struct virtio_user_dev {
+ int vhostfd;
+
+ int callfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
+ int kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
+ uint32_t queue_size;
+
+ uint8_t status;
+ char path[PATH_MAX];
+ uint64_t protocol_features;
+ struct vring vrings[SPDK_VIRTIO_MAX_VIRTQUEUES];
+ struct virtio_user_backend_ops *ops;
+ struct spdk_mem_map *mem_map;
+};
+
+struct virtio_user_backend_ops {
+ int (*setup)(struct virtio_user_dev *dev);
+ int (*send_request)(struct virtio_user_dev *dev,
+ enum vhost_user_request req,
+ void *arg);
+};
+
+extern struct virtio_user_backend_ops ops_user;
+
+#endif
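
For orientation only, and not part of the commit: a minimal sketch of how the ops_user backend declared above might be driven, connecting with setup() and then issuing a VHOST_USER_GET_FEATURES request through send_request(). The helper name and socket path are placeholders, and error handling is reduced to the minimum.

/* Illustrative sketch, not part of the commit: negotiate features over an
 * existing vhost-user socket using the backend declared in vhost_user.h.
 */
#include "vhost_user.h"

static int
example_get_features(const char *socket_path, uint64_t *features)
{
	struct virtio_user_dev dev = {0};
	int rc;

	snprintf(dev.path, sizeof(dev.path), "%s", socket_path);
	dev.ops = &ops_user;

	/* connect() to the AF_UNIX socket; the fd is stored in dev.vhostfd */
	rc = dev.ops->setup(&dev);
	if (rc != 0) {
		return rc;
	}

	/* GET_FEATURES expects a reply; the 64-bit feature mask is written
	 * back through the arg pointer.
	 */
	return dev.ops->send_request(&dev, VHOST_USER_GET_FEATURES, features);
}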
diff --git a/src/spdk/lib/virtio/virtio.c b/src/spdk/lib/virtio/virtio.c
new file mode 100644
index 000000000..03866040a
--- /dev/null
+++ b/src/spdk/lib/virtio/virtio.c
@@ -0,0 +1,717 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/util.h"
+#include "spdk/barrier.h"
+
+#include "spdk_internal/virtio.h"
+
+/* We use SMP memory barrier variants as all virtio_pci devices
+ * are purely virtual. All MMIO is executed on a CPU core, so
+ * there's no need to do full MMIO synchronization.
+ */
+#define virtio_mb() spdk_smp_mb()
+#define virtio_rmb() spdk_smp_rmb()
+#define virtio_wmb() spdk_smp_wmb()
+
+/* Chain all the descriptors in the ring with an END */
+static inline void
+vring_desc_init(struct vring_desc *dp, uint16_t n)
+{
+ uint16_t i;
+
+ for (i = 0; i < n - 1; i++) {
+ dp[i].next = (uint16_t)(i + 1);
+ }
+ dp[i].next = VQ_RING_DESC_CHAIN_END;
+}
+
+static void
+virtio_init_vring(struct virtqueue *vq)
+{
+ int size = vq->vq_nentries;
+ struct vring *vr = &vq->vq_ring;
+ uint8_t *ring_mem = vq->vq_ring_virt_mem;
+
+ /*
+ * Reinitialise since virtio port might have been stopped and restarted
+ */
+ memset(ring_mem, 0, vq->vq_ring_size);
+ vring_init(vr, size, ring_mem, VIRTIO_PCI_VRING_ALIGN);
+ vq->vq_used_cons_idx = 0;
+ vq->vq_desc_head_idx = 0;
+ vq->vq_avail_idx = 0;
+ vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1);
+ vq->vq_free_cnt = vq->vq_nentries;
+ vq->req_start = VQ_RING_DESC_CHAIN_END;
+ vq->req_end = VQ_RING_DESC_CHAIN_END;
+ vq->reqs_finished = 0;
+ memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries);
+
+ vring_desc_init(vr->desc, size);
+
+ /* Tell the backend not to interrupt us.
+ * If F_EVENT_IDX is negotiated, we will always set incredibly high
+ * used event idx, so that we will practically never receive an
+ * interrupt. See virtqueue_req_flush()
+ */
+ if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
+ vring_used_event(&vq->vq_ring) = UINT16_MAX;
+ } else {
+ vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+ }
+}
+
+static int
+virtio_init_queue(struct virtio_dev *dev, uint16_t vtpci_queue_idx)
+{
+ unsigned int vq_size, size;
+ struct virtqueue *vq;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "setting up queue: %"PRIu16"\n", vtpci_queue_idx);
+
+ /*
+ * Read the virtqueue size from the Queue Size field
+ * Always power of 2 and if 0 virtqueue does not exist
+ */
+ vq_size = virtio_dev_backend_ops(dev)->get_queue_size(dev, vtpci_queue_idx);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq_size: %u\n", vq_size);
+ if (vq_size == 0) {
+ SPDK_ERRLOG("virtqueue %"PRIu16" does not exist\n", vtpci_queue_idx);
+ return -EINVAL;
+ }
+
+ if (!spdk_u32_is_pow2(vq_size)) {
+ SPDK_ERRLOG("virtqueue %"PRIu16" size (%u) is not powerof 2\n",
+ vtpci_queue_idx, vq_size);
+ return -EINVAL;
+ }
+
+ size = sizeof(*vq) + vq_size * sizeof(struct vq_desc_extra);
+
+ if (posix_memalign((void **)&vq, SPDK_CACHE_LINE_SIZE, size)) {
+ SPDK_ERRLOG("can not allocate vq\n");
+ return -ENOMEM;
+ }
+ memset(vq, 0, size);
+ dev->vqs[vtpci_queue_idx] = vq;
+
+ vq->vdev = dev;
+ vq->vq_queue_index = vtpci_queue_idx;
+ vq->vq_nentries = vq_size;
+
+ /*
+ * Reserve a memzone for vring elements
+ */
+ size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN);
+ vq->vq_ring_size = SPDK_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vring_size: %u, rounded_vring_size: %u\n",
+ size, vq->vq_ring_size);
+
+ vq->owner_thread = NULL;
+
+ rc = virtio_dev_backend_ops(dev)->setup_queue(dev, vq);
+ if (rc < 0) {
+ SPDK_ERRLOG("setup_queue failed\n");
+ free(vq);
+ dev->vqs[vtpci_queue_idx] = NULL;
+ return rc;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_mem: 0x%" PRIx64 "\n",
+ vq->vq_ring_mem);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_virt_mem: 0x%" PRIx64 "\n",
+ (uint64_t)(uintptr_t)vq->vq_ring_virt_mem);
+
+ virtio_init_vring(vq);
+ return 0;
+}
+
+static void
+virtio_free_queues(struct virtio_dev *dev)
+{
+ uint16_t nr_vq = dev->max_queues;
+ struct virtqueue *vq;
+ uint16_t i;
+
+ if (dev->vqs == NULL) {
+ return;
+ }
+
+ for (i = 0; i < nr_vq; i++) {
+ vq = dev->vqs[i];
+ if (!vq) {
+ continue;
+ }
+
+ virtio_dev_backend_ops(dev)->del_queue(dev, vq);
+
+ free(vq);
+ dev->vqs[i] = NULL;
+ }
+
+ free(dev->vqs);
+ dev->vqs = NULL;
+}
+
+static int
+virtio_alloc_queues(struct virtio_dev *dev, uint16_t request_vq_num, uint16_t fixed_vq_num)
+{
+ uint16_t nr_vq;
+ uint16_t i;
+ int ret;
+
+ nr_vq = request_vq_num + fixed_vq_num;
+ if (nr_vq == 0) {
+ /* perfectly fine to have a device with no virtqueues. */
+ return 0;
+ }
+
+ assert(dev->vqs == NULL);
+ dev->vqs = calloc(1, sizeof(struct virtqueue *) * nr_vq);
+ if (!dev->vqs) {
+ SPDK_ERRLOG("failed to allocate %"PRIu16" vqs\n", nr_vq);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_vq; i++) {
+ ret = virtio_init_queue(dev, i);
+ if (ret < 0) {
+ virtio_free_queues(dev);
+ return ret;
+ }
+ }
+
+ dev->max_queues = nr_vq;
+ dev->fixed_queues_num = fixed_vq_num;
+ return 0;
+}
+
+/**
+ * Negotiate virtio features. For virtio_user this will also set
+ * dev->modern flag if VIRTIO_F_VERSION_1 flag is negotiated.
+ */
+static int
+virtio_negotiate_features(struct virtio_dev *dev, uint64_t req_features)
+{
+ uint64_t host_features = virtio_dev_backend_ops(dev)->get_features(dev);
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "guest features = %" PRIx64 "\n", req_features);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "device features = %" PRIx64 "\n", host_features);
+
+ rc = virtio_dev_backend_ops(dev)->set_features(dev, req_features & host_features);
+ if (rc != 0) {
+ SPDK_ERRLOG("failed to negotiate device features.\n");
+ return rc;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "negotiated features = %" PRIx64 "\n",
+ dev->negotiated_features);
+
+ virtio_dev_set_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
+ if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_FEATURES_OK)) {
+ SPDK_ERRLOG("failed to set FEATURES_OK status!\n");
+ /* either the device failed, or we offered some features that
+ * depend on other, not offered features.
+ */
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int
+virtio_dev_construct(struct virtio_dev *vdev, const char *name,
+ const struct virtio_dev_ops *ops, void *ctx)
+{
+ int rc;
+
+ vdev->name = strdup(name);
+ if (vdev->name == NULL) {
+ return -ENOMEM;
+ }
+
+ rc = pthread_mutex_init(&vdev->mutex, NULL);
+ if (rc != 0) {
+ free(vdev->name);
+ return -rc;
+ }
+
+ vdev->backend_ops = ops;
+ vdev->ctx = ctx;
+
+ return 0;
+}
+
+int
+virtio_dev_reset(struct virtio_dev *dev, uint64_t req_features)
+{
+ req_features |= (1ULL << VIRTIO_F_VERSION_1);
+
+ virtio_dev_stop(dev);
+
+ virtio_dev_set_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+ if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_ACKNOWLEDGE)) {
+ SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_ACKNOWLEDGE status.\n");
+ return -EIO;
+ }
+
+ virtio_dev_set_status(dev, VIRTIO_CONFIG_S_DRIVER);
+ if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_DRIVER)) {
+ SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER status.\n");
+ return -EIO;
+ }
+
+ return virtio_negotiate_features(dev, req_features);
+}
+
+int
+virtio_dev_start(struct virtio_dev *vdev, uint16_t max_queues, uint16_t fixed_queue_num)
+{
+ int ret;
+
+ ret = virtio_alloc_queues(vdev, max_queues, fixed_queue_num);
+ if (ret < 0) {
+ return ret;
+ }
+
+ virtio_dev_set_status(vdev, VIRTIO_CONFIG_S_DRIVER_OK);
+ if (!(virtio_dev_get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER_OK status.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+void
+virtio_dev_destruct(struct virtio_dev *dev)
+{
+ virtio_dev_backend_ops(dev)->destruct_dev(dev);
+ pthread_mutex_destroy(&dev->mutex);
+ free(dev->name);
+}
+
+static void
+vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
+{
+ struct vring_desc *dp, *dp_tail;
+ struct vq_desc_extra *dxp;
+ uint16_t desc_idx_last = desc_idx;
+
+ dp = &vq->vq_ring.desc[desc_idx];
+ dxp = &vq->vq_descx[desc_idx];
+ vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs);
+ if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
+ while (dp->flags & VRING_DESC_F_NEXT) {
+ desc_idx_last = dp->next;
+ dp = &vq->vq_ring.desc[dp->next];
+ }
+ }
+ dxp->ndescs = 0;
+
+ /*
+ * We must append the existing free chain, if any, to the end of
+ * newly freed chain. If the virtqueue was completely used, then
+ * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above).
+ */
+ if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) {
+ vq->vq_desc_head_idx = desc_idx;
+ } else {
+ dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx];
+ dp_tail->next = desc_idx;
+ }
+
+ vq->vq_desc_tail_idx = desc_idx_last;
+ dp->next = VQ_RING_DESC_CHAIN_END;
+}
+
+static uint16_t
+virtqueue_dequeue_burst_rx(struct virtqueue *vq, void **rx_pkts,
+ uint32_t *len, uint16_t num)
+{
+ struct vring_used_elem *uep;
+ void *cookie;
+ uint16_t used_idx, desc_idx;
+ uint16_t i;
+
+ /* Caller does the check */
+ for (i = 0; i < num ; i++) {
+ used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+ uep = &vq->vq_ring.used->ring[used_idx];
+ desc_idx = (uint16_t) uep->id;
+ len[i] = uep->len;
+ cookie = vq->vq_descx[desc_idx].cookie;
+
+ if (spdk_unlikely(cookie == NULL)) {
+ SPDK_WARNLOG("vring descriptor with no mbuf cookie at %"PRIu16"\n",
+ vq->vq_used_cons_idx);
+ break;
+ }
+
+ __builtin_prefetch(cookie);
+
+ rx_pkts[i] = cookie;
+ vq->vq_used_cons_idx++;
+ vq_ring_free_chain(vq, desc_idx);
+ vq->vq_descx[desc_idx].cookie = NULL;
+ }
+
+ return i;
+}
+
+static void
+finish_req(struct virtqueue *vq)
+{
+ struct vring_desc *desc;
+ uint16_t avail_idx;
+
+ desc = &vq->vq_ring.desc[vq->req_end];
+ desc->flags &= ~VRING_DESC_F_NEXT;
+
+ /*
+ * Place the head of the descriptor chain into the next slot and make
+ * it usable to the host. The chain is made available now rather than
+ * deferring to virtqueue_req_flush() in the hopes that if the host is
+ * currently running on another CPU, we can keep it processing the new
+ * descriptor.
+ */
+ avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+ vq->vq_ring.avail->ring[avail_idx] = vq->req_start;
+ vq->vq_avail_idx++;
+ vq->req_end = VQ_RING_DESC_CHAIN_END;
+ virtio_wmb();
+ vq->vq_ring.avail->idx = vq->vq_avail_idx;
+ vq->reqs_finished++;
+}
+
+int
+virtqueue_req_start(struct virtqueue *vq, void *cookie, int iovcnt)
+{
+ struct vq_desc_extra *dxp;
+
+ if (iovcnt > vq->vq_free_cnt) {
+ return iovcnt > vq->vq_nentries ? -EINVAL : -ENOMEM;
+ }
+
+ if (vq->req_end != VQ_RING_DESC_CHAIN_END) {
+ finish_req(vq);
+ }
+
+ vq->req_start = vq->vq_desc_head_idx;
+ dxp = &vq->vq_descx[vq->req_start];
+ dxp->cookie = cookie;
+ dxp->ndescs = 0;
+
+ return 0;
+}
+
+void
+virtqueue_req_flush(struct virtqueue *vq)
+{
+ uint16_t reqs_finished;
+
+ if (vq->req_end == VQ_RING_DESC_CHAIN_END) {
+ /* no non-empty requests have been started */
+ return;
+ }
+
+ finish_req(vq);
+ virtio_mb();
+
+ reqs_finished = vq->reqs_finished;
+ vq->reqs_finished = 0;
+
+ if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
+ /* Set used event idx to a value the device will never reach.
+ * This effectively disables interrupts.
+ */
+ vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx - vq->vq_nentries - 1;
+
+ if (!vring_need_event(vring_avail_event(&vq->vq_ring),
+ vq->vq_avail_idx,
+ vq->vq_avail_idx - reqs_finished)) {
+ return;
+ }
+ } else if (vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) {
+ return;
+ }
+
+ virtio_dev_backend_ops(vq->vdev)->notify_queue(vq->vdev, vq);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "Notified backend after xmit\n");
+}
+
+void
+virtqueue_req_abort(struct virtqueue *vq)
+{
+ struct vring_desc *desc;
+
+ if (vq->req_start == VQ_RING_DESC_CHAIN_END) {
+ /* no requests have been started */
+ return;
+ }
+
+ desc = &vq->vq_ring.desc[vq->req_end];
+ desc->flags &= ~VRING_DESC_F_NEXT;
+
+ vq_ring_free_chain(vq, vq->req_start);
+ vq->req_start = VQ_RING_DESC_CHAIN_END;
+}
+
+void
+virtqueue_req_add_iovs(struct virtqueue *vq, struct iovec *iovs, uint16_t iovcnt,
+ enum spdk_virtio_desc_type desc_type)
+{
+ struct vring_desc *desc;
+ struct vq_desc_extra *dxp;
+ uint16_t i, prev_head, new_head;
+
+ assert(vq->req_start != VQ_RING_DESC_CHAIN_END);
+ assert(iovcnt <= vq->vq_free_cnt);
+
+ /* TODO use indirect descriptors if iovcnt is high enough
+ * or the caller specifies SPDK_VIRTIO_DESC_F_INDIRECT
+ */
+
+ prev_head = vq->req_end;
+ new_head = vq->vq_desc_head_idx;
+ for (i = 0; i < iovcnt; ++i) {
+ desc = &vq->vq_ring.desc[new_head];
+
+ if (!vq->vdev->is_hw) {
+ desc->addr = (uintptr_t)iovs[i].iov_base;
+ } else {
+ desc->addr = spdk_vtophys(iovs[i].iov_base, NULL);
+ }
+
+ desc->len = iovs[i].iov_len;
+ /* always set NEXT flag. unset it on the last descriptor
+ * in the request-ending function.
+ */
+ desc->flags = desc_type | VRING_DESC_F_NEXT;
+
+ prev_head = new_head;
+ new_head = desc->next;
+ }
+
+ dxp = &vq->vq_descx[vq->req_start];
+ dxp->ndescs += iovcnt;
+
+ vq->req_end = prev_head;
+ vq->vq_desc_head_idx = new_head;
+ vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - iovcnt);
+ if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END) {
+ assert(vq->vq_free_cnt == 0);
+ vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
+ }
+}
+
+#define DESC_PER_CACHELINE (SPDK_CACHE_LINE_SIZE / sizeof(struct vring_desc))
+uint16_t
+virtio_recv_pkts(struct virtqueue *vq, void **io, uint32_t *len, uint16_t nb_pkts)
+{
+ uint16_t nb_used, num;
+
+ nb_used = vq->vq_ring.used->idx - vq->vq_used_cons_idx;
+ virtio_rmb();
+
+ num = (uint16_t)(spdk_likely(nb_used <= nb_pkts) ? nb_used : nb_pkts);
+ if (spdk_likely(num > DESC_PER_CACHELINE)) {
+ num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE);
+ }
+
+ return virtqueue_dequeue_burst_rx(vq, io, len, num);
+}
+
+int
+virtio_dev_acquire_queue(struct virtio_dev *vdev, uint16_t index)
+{
+ struct virtqueue *vq = NULL;
+
+ if (index >= vdev->max_queues) {
+ SPDK_ERRLOG("requested vq index %"PRIu16" exceeds max queue count %"PRIu16".\n",
+ index, vdev->max_queues);
+ return -1;
+ }
+
+ pthread_mutex_lock(&vdev->mutex);
+ vq = vdev->vqs[index];
+ if (vq == NULL || vq->owner_thread != NULL) {
+ pthread_mutex_unlock(&vdev->mutex);
+ return -1;
+ }
+
+ vq->owner_thread = spdk_get_thread();
+ pthread_mutex_unlock(&vdev->mutex);
+ return 0;
+}
+
+int32_t
+virtio_dev_find_and_acquire_queue(struct virtio_dev *vdev, uint16_t start_index)
+{
+ struct virtqueue *vq = NULL;
+ uint16_t i;
+
+ pthread_mutex_lock(&vdev->mutex);
+ for (i = start_index; i < vdev->max_queues; ++i) {
+ vq = vdev->vqs[i];
+ if (vq != NULL && vq->owner_thread == NULL) {
+ break;
+ }
+ }
+
+ if (vq == NULL || i == vdev->max_queues) {
+ SPDK_ERRLOG("no more unused virtio queues with idx >= %"PRIu16".\n", start_index);
+ pthread_mutex_unlock(&vdev->mutex);
+ return -1;
+ }
+
+ vq->owner_thread = spdk_get_thread();
+ pthread_mutex_unlock(&vdev->mutex);
+ return i;
+}
+
+struct spdk_thread *
+virtio_dev_queue_get_thread(struct virtio_dev *vdev, uint16_t index)
+{
+ struct spdk_thread *thread = NULL;
+
+ if (index >= vdev->max_queues) {
+ SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16"\n",
+ index, vdev->max_queues);
+ abort(); /* This is not recoverable */
+ }
+
+ pthread_mutex_lock(&vdev->mutex);
+ thread = vdev->vqs[index]->owner_thread;
+ pthread_mutex_unlock(&vdev->mutex);
+
+ return thread;
+}
+
+bool
+virtio_dev_queue_is_acquired(struct virtio_dev *vdev, uint16_t index)
+{
+ return virtio_dev_queue_get_thread(vdev, index) != NULL;
+}
+
+void
+virtio_dev_release_queue(struct virtio_dev *vdev, uint16_t index)
+{
+ struct virtqueue *vq = NULL;
+
+ if (index >= vdev->max_queues) {
+ SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16".\n",
+ index, vdev->max_queues);
+ return;
+ }
+
+ pthread_mutex_lock(&vdev->mutex);
+ vq = vdev->vqs[index];
+ if (vq == NULL) {
+ SPDK_ERRLOG("virtqueue at index %"PRIu16" is not initialized.\n", index);
+ pthread_mutex_unlock(&vdev->mutex);
+ return;
+ }
+
+ assert(vq->owner_thread == spdk_get_thread());
+ vq->owner_thread = NULL;
+ pthread_mutex_unlock(&vdev->mutex);
+}
+
+int
+virtio_dev_read_dev_config(struct virtio_dev *dev, size_t offset,
+ void *dst, int length)
+{
+ return virtio_dev_backend_ops(dev)->read_dev_cfg(dev, offset, dst, length);
+}
+
+int
+virtio_dev_write_dev_config(struct virtio_dev *dev, size_t offset,
+ const void *src, int length)
+{
+ return virtio_dev_backend_ops(dev)->write_dev_cfg(dev, offset, src, length);
+}
+
+void
+virtio_dev_stop(struct virtio_dev *dev)
+{
+ virtio_dev_backend_ops(dev)->set_status(dev, VIRTIO_CONFIG_S_RESET);
+ /* flush status write */
+ virtio_dev_backend_ops(dev)->get_status(dev);
+ virtio_free_queues(dev);
+}
+
+void
+virtio_dev_set_status(struct virtio_dev *dev, uint8_t status)
+{
+ if (status != VIRTIO_CONFIG_S_RESET) {
+ status |= virtio_dev_backend_ops(dev)->get_status(dev);
+ }
+
+ virtio_dev_backend_ops(dev)->set_status(dev, status);
+}
+
+uint8_t
+virtio_dev_get_status(struct virtio_dev *dev)
+{
+ return virtio_dev_backend_ops(dev)->get_status(dev);
+}
+
+const struct virtio_dev_ops *
+virtio_dev_backend_ops(struct virtio_dev *dev)
+{
+ return dev->backend_ops;
+}
+
+void
+virtio_dev_dump_json_info(struct virtio_dev *hw, struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_named_object_begin(w, "virtio");
+
+ spdk_json_write_named_uint32(w, "vq_count", hw->max_queues);
+
+ spdk_json_write_named_uint32(w, "vq_size",
+ virtio_dev_backend_ops(hw)->get_queue_size(hw, 0));
+
+ virtio_dev_backend_ops(hw)->dump_json_info(hw, w);
+
+ spdk_json_write_object_end(w);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("virtio_dev", SPDK_LOG_VIRTIO_DEV)
diff --git a/src/spdk/lib/virtio/virtio_pci.c b/src/spdk/lib/virtio/virtio_pci.c
new file mode 100644
index 000000000..646f77c1a
--- /dev/null
+++ b/src/spdk/lib/virtio/virtio_pci.c
@@ -0,0 +1,599 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/memory.h"
+#include "spdk/mmio.h"
+#include "spdk/string.h"
+#include "spdk/env.h"
+
+#include "spdk_internal/virtio.h"
+
+struct virtio_hw {
+ uint8_t use_msix;
+ uint32_t notify_off_multiplier;
+ uint8_t *isr;
+ uint16_t *notify_base;
+
+ struct {
+ /** Mem-mapped resources from given PCI BAR */
+ void *vaddr;
+
+ /** Length of the address space */
+ uint32_t len;
+ } pci_bar[6];
+
+ struct virtio_pci_common_cfg *common_cfg;
+ struct spdk_pci_device *pci_dev;
+
+ /** Device-specific PCI config space */
+ void *dev_cfg;
+};
+
+struct virtio_pci_probe_ctx {
+ virtio_pci_create_cb enum_cb;
+ void *enum_ctx;
+ uint16_t device_id;
+};
+
+/*
+ * Following macros are derived from linux/pci_regs.h, however,
+ * we can't simply include that header here, as there is no such
+ * file for non-Linux platform.
+ */
+#define PCI_CAPABILITY_LIST 0x34
+#define PCI_CAP_ID_VNDR 0x09
+#define PCI_CAP_ID_MSIX 0x11
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+ /* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit,
+ * and only accepts 32 bit page frame number.
+ * Check if the allocated physical memory exceeds 16TB.
+ */
+ if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+ (VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+ SPDK_ERRLOG("vring address shouldn't be above 16TB!\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static void
+free_virtio_hw(struct virtio_hw *hw)
+{
+ unsigned i;
+
+ for (i = 0; i < 6; ++i) {
+ if (hw->pci_bar[i].vaddr == NULL) {
+ continue;
+ }
+
+ spdk_pci_device_unmap_bar(hw->pci_dev, i, hw->pci_bar[i].vaddr);
+ }
+
+ free(hw);
+}
+
+static void
+pci_dump_json_info(struct virtio_dev *dev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_hw *hw = dev->ctx;
+ struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr((struct spdk_pci_device *)hw->pci_dev);
+ char addr[32];
+
+ spdk_json_write_name(w, "type");
+ if (dev->modern) {
+ spdk_json_write_string(w, "pci-modern");
+ } else {
+ spdk_json_write_string(w, "pci-legacy");
+ }
+
+ spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr);
+ spdk_json_write_named_string(w, "pci_address", addr);
+}
+
+static void
+pci_write_json_config(struct virtio_dev *dev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_hw *hw = dev->ctx;
+ struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(hw->pci_dev);
+ char addr[32];
+
+ spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr);
+
+ spdk_json_write_named_string(w, "trtype", "pci");
+ spdk_json_write_named_string(w, "traddr", addr);
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+ spdk_mmio_write_4(lo, val & ((1ULL << 32) - 1));
+ spdk_mmio_write_4(hi, val >> 32);
+}
+
+static int
+modern_read_dev_config(struct virtio_dev *dev, size_t offset,
+ void *dst, int length)
+{
+ struct virtio_hw *hw = dev->ctx;
+ int i;
+ uint8_t *p;
+ uint8_t old_gen, new_gen;
+
+ do {
+ old_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation);
+
+ p = dst;
+ for (i = 0; i < length; i++) {
+ *p++ = spdk_mmio_read_1((uint8_t *)hw->dev_cfg + offset + i);
+ }
+
+ new_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation);
+ } while (old_gen != new_gen);
+
+ return 0;
+}
+
+static int
+modern_write_dev_config(struct virtio_dev *dev, size_t offset,
+ const void *src, int length)
+{
+ struct virtio_hw *hw = dev->ctx;
+ int i;
+ const uint8_t *p = src;
+
+ for (i = 0; i < length; i++) {
+ spdk_mmio_write_1(((uint8_t *)hw->dev_cfg) + offset + i, *p++);
+ }
+
+ return 0;
+}
+
+static uint64_t
+modern_get_features(struct virtio_dev *dev)
+{
+ struct virtio_hw *hw = dev->ctx;
+ uint32_t features_lo, features_hi;
+
+ spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 0);
+ features_lo = spdk_mmio_read_4(&hw->common_cfg->device_feature);
+
+ spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 1);
+ features_hi = spdk_mmio_read_4(&hw->common_cfg->device_feature);
+
+ return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static int
+modern_set_features(struct virtio_dev *dev, uint64_t features)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ if ((features & (1ULL << VIRTIO_F_VERSION_1)) == 0) {
+ SPDK_ERRLOG("VIRTIO_F_VERSION_1 feature is not enabled.\n");
+ return -EINVAL;
+ }
+
+ spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 0);
+ spdk_mmio_write_4(&hw->common_cfg->guest_feature, features & ((1ULL << 32) - 1));
+
+ spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 1);
+ spdk_mmio_write_4(&hw->common_cfg->guest_feature, features >> 32);
+
+ dev->negotiated_features = features;
+
+ return 0;
+}
+
+static void
+modern_destruct_dev(struct virtio_dev *vdev)
+{
+ struct virtio_hw *hw = vdev->ctx;
+ struct spdk_pci_device *pci_dev = hw->pci_dev;
+
+ free_virtio_hw(hw);
+ spdk_pci_device_detach(pci_dev);
+}
+
+static uint8_t
+modern_get_status(struct virtio_dev *dev)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ return spdk_mmio_read_1(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_dev *dev, uint8_t status)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ spdk_mmio_write_1(&hw->common_cfg->device_status, status);
+}
+
+static uint16_t
+modern_get_queue_size(struct virtio_dev *dev, uint16_t queue_id)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_select, queue_id);
+ return spdk_mmio_read_2(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_dev *dev, struct virtqueue *vq)
+{
+ struct virtio_hw *hw = dev->ctx;
+ uint64_t desc_addr, avail_addr, used_addr;
+ uint16_t notify_off;
+ void *queue_mem;
+ uint64_t queue_mem_phys_addr;
+
+ /* To ensure physical address contiguity we make the queue occupy
+ * only a single hugepage (2MB). As of Virtio 1.0, the queue size
+ * always falls within this limit.
+ */
+ if (vq->vq_ring_size > VALUE_2MB) {
+ return -ENOMEM;
+ }
+
+ queue_mem = spdk_zmalloc(vq->vq_ring_size, VALUE_2MB, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (queue_mem == NULL) {
+ return -ENOMEM;
+ }
+
+ queue_mem_phys_addr = spdk_vtophys(queue_mem, NULL);
+ if (queue_mem_phys_addr == SPDK_VTOPHYS_ERROR) {
+ spdk_free(queue_mem);
+ return -EFAULT;
+ }
+
+ vq->vq_ring_mem = queue_mem_phys_addr;
+ vq->vq_ring_virt_mem = queue_mem;
+
+ if (!check_vq_phys_addr_ok(vq)) {
+ spdk_free(queue_mem);
+ return -ENOMEM;
+ }
+
+ desc_addr = vq->vq_ring_mem;
+ avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+ used_addr = (avail_addr + offsetof(struct vring_avail, ring[vq->vq_nentries])
+ + VIRTIO_PCI_VRING_ALIGN - 1) & ~(VIRTIO_PCI_VRING_ALIGN - 1);
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index);
+
+ io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+ &hw->common_cfg->queue_desc_hi);
+ io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+ &hw->common_cfg->queue_avail_hi);
+ io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+ &hw->common_cfg->queue_used_hi);
+
+ notify_off = spdk_mmio_read_2(&hw->common_cfg->queue_notify_off);
+ vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+ notify_off * hw->notify_off_multiplier);
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_enable, 1);
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "queue %"PRIu16" addresses:\n", vq->vq_queue_index);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t desc_addr: %" PRIx64 "\n", desc_addr);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t aval_addr: %" PRIx64 "\n", avail_addr);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t used_addr: %" PRIx64 "\n", used_addr);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t notify addr: %p (notify offset: %"PRIu16")\n",
+ vq->notify_addr, notify_off);
+
+ return 0;
+}
+
+static void
+modern_del_queue(struct virtio_dev *dev, struct virtqueue *vq)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index);
+
+ io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+ &hw->common_cfg->queue_desc_hi);
+ io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+ &hw->common_cfg->queue_avail_hi);
+ io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+ &hw->common_cfg->queue_used_hi);
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_enable, 0);
+
+ spdk_free(vq->vq_ring_virt_mem);
+}
+
+static void
+modern_notify_queue(struct virtio_dev *dev, struct virtqueue *vq)
+{
+ spdk_mmio_write_2(vq->notify_addr, vq->vq_queue_index);
+}
+
+static const struct virtio_dev_ops modern_ops = {
+ .read_dev_cfg = modern_read_dev_config,
+ .write_dev_cfg = modern_write_dev_config,
+ .get_status = modern_get_status,
+ .set_status = modern_set_status,
+ .get_features = modern_get_features,
+ .set_features = modern_set_features,
+ .destruct_dev = modern_destruct_dev,
+ .get_queue_size = modern_get_queue_size,
+ .setup_queue = modern_setup_queue,
+ .del_queue = modern_del_queue,
+ .notify_queue = modern_notify_queue,
+ .dump_json_info = pci_dump_json_info,
+ .write_json_config = pci_write_json_config,
+};
+
+static void *
+get_cfg_addr(struct virtio_hw *hw, struct virtio_pci_cap *cap)
+{
+ uint8_t bar = cap->bar;
+ uint32_t length = cap->length;
+ uint32_t offset = cap->offset;
+
+ if (bar > 5) {
+ SPDK_ERRLOG("invalid bar: %"PRIu8"\n", bar);
+ return NULL;
+ }
+
+ if (offset + length < offset) {
+ SPDK_ERRLOG("offset(%"PRIu32") + length(%"PRIu32") overflows\n",
+ offset, length);
+ return NULL;
+ }
+
+ if (offset + length > hw->pci_bar[bar].len) {
+ SPDK_ERRLOG("invalid cap: overflows bar space: %"PRIu32" > %"PRIu32"\n",
+ offset + length, hw->pci_bar[bar].len);
+ return NULL;
+ }
+
+ if (hw->pci_bar[bar].vaddr == NULL) {
+ SPDK_ERRLOG("bar %"PRIu8" base addr is NULL\n", bar);
+ return NULL;
+ }
+
+ return hw->pci_bar[bar].vaddr + offset;
+}
+
+static int
+virtio_read_caps(struct virtio_hw *hw)
+{
+ uint8_t pos;
+ struct virtio_pci_cap cap;
+ int ret;
+
+ ret = spdk_pci_device_cfg_read(hw->pci_dev, &pos, 1, PCI_CAPABILITY_LIST);
+ if (ret < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "failed to read pci capability list\n");
+ return ret;
+ }
+
+ while (pos) {
+ ret = spdk_pci_device_cfg_read(hw->pci_dev, &cap, sizeof(cap), pos);
+ if (ret < 0) {
+ SPDK_ERRLOG("failed to read pci cap at pos: %"PRIx8"\n", pos);
+ break;
+ }
+
+ if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+ hw->use_msix = 1;
+ }
+
+ if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI,
+ "[%2"PRIx8"] skipping non VNDR cap id: %02"PRIx8"\n",
+ pos, cap.cap_vndr);
+ goto next;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI,
+ "[%2"PRIx8"] cfg type: %"PRIu8", bar: %"PRIu8", offset: %04"PRIx32", len: %"PRIu32"\n",
+ pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+ switch (cap.cfg_type) {
+ case VIRTIO_PCI_CAP_COMMON_CFG:
+ hw->common_cfg = get_cfg_addr(hw, &cap);
+ break;
+ case VIRTIO_PCI_CAP_NOTIFY_CFG:
+ spdk_pci_device_cfg_read(hw->pci_dev, &hw->notify_off_multiplier,
+ 4, pos + sizeof(cap));
+ hw->notify_base = get_cfg_addr(hw, &cap);
+ break;
+ case VIRTIO_PCI_CAP_DEVICE_CFG:
+ hw->dev_cfg = get_cfg_addr(hw, &cap);
+ break;
+ case VIRTIO_PCI_CAP_ISR_CFG:
+ hw->isr = get_cfg_addr(hw, &cap);
+ break;
+ }
+
+next:
+ pos = cap.cap_next;
+ }
+
+ if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+ hw->dev_cfg == NULL || hw->isr == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "no modern virtio pci device found.\n");
+ if (ret < 0) {
+ return ret;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "found modern virtio pci device.\n");
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "common cfg mapped at: %p\n", hw->common_cfg);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "device cfg mapped at: %p\n", hw->dev_cfg);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "isr cfg mapped at: %p\n", hw->isr);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "notify base: %p, notify off multiplier: %u\n",
+ hw->notify_base, hw->notify_off_multiplier);
+
+ return 0;
+}
+
+static int
+virtio_pci_dev_probe(struct spdk_pci_device *pci_dev, struct virtio_pci_probe_ctx *ctx)
+{
+ struct virtio_hw *hw;
+ uint8_t *bar_vaddr;
+ uint64_t bar_paddr, bar_len;
+ int rc;
+ unsigned i;
+ char bdf[32];
+ struct spdk_pci_addr addr;
+
+ addr = spdk_pci_device_get_addr(pci_dev);
+ rc = spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr);
+ if (rc != 0) {
+ SPDK_ERRLOG("Ignoring a device with non-parseable PCI address\n");
+ return -1;
+ }
+
+ hw = calloc(1, sizeof(*hw));
+ if (hw == NULL) {
+ SPDK_ERRLOG("%s: calloc failed\n", bdf);
+ return -1;
+ }
+
+ hw->pci_dev = pci_dev;
+
+ for (i = 0; i < 6; ++i) {
+ rc = spdk_pci_device_map_bar(pci_dev, i, (void *) &bar_vaddr, &bar_paddr,
+ &bar_len);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to memmap PCI BAR %u\n", bdf, i);
+ free_virtio_hw(hw);
+ return -1;
+ }
+
+ hw->pci_bar[i].vaddr = bar_vaddr;
+ hw->pci_bar[i].len = bar_len;
+ }
+
+ /* Virtio PCI caps exist only on modern PCI devices.
+ * Legacy devices are not supported.
+ */
+ if (virtio_read_caps(hw) != 0) {
+ SPDK_NOTICELOG("Ignoring legacy PCI device at %s\n", bdf);
+ free_virtio_hw(hw);
+ return -1;
+ }
+
+ rc = ctx->enum_cb((struct virtio_pci_ctx *)hw, ctx->enum_ctx);
+ if (rc != 0) {
+ free_virtio_hw(hw);
+ }
+
+ return rc;
+}
+
+static int
+virtio_pci_dev_probe_cb(void *probe_ctx, struct spdk_pci_device *pci_dev)
+{
+ struct virtio_pci_probe_ctx *ctx = probe_ctx;
+ uint16_t pci_device_id = spdk_pci_device_get_device_id(pci_dev);
+
+ if (pci_device_id != ctx->device_id) {
+ return 1;
+ }
+
+ return virtio_pci_dev_probe(pci_dev, ctx);
+}
+
+int
+virtio_pci_dev_enumerate(virtio_pci_create_cb enum_cb, void *enum_ctx,
+ uint16_t pci_device_id)
+{
+ struct virtio_pci_probe_ctx ctx;
+
+ if (!spdk_process_is_primary()) {
+ SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n");
+ return 0;
+ }
+
+ ctx.enum_cb = enum_cb;
+ ctx.enum_ctx = enum_ctx;
+ ctx.device_id = pci_device_id;
+
+ return spdk_pci_enumerate(spdk_pci_virtio_get_driver(),
+ virtio_pci_dev_probe_cb, &ctx);
+}
+
+int
+virtio_pci_dev_attach(virtio_pci_create_cb enum_cb, void *enum_ctx,
+ uint16_t pci_device_id, struct spdk_pci_addr *pci_address)
+{
+ struct virtio_pci_probe_ctx ctx;
+
+ if (!spdk_process_is_primary()) {
+ SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n");
+ return 0;
+ }
+
+ ctx.enum_cb = enum_cb;
+ ctx.enum_ctx = enum_ctx;
+ ctx.device_id = pci_device_id;
+
+ return spdk_pci_device_attach(spdk_pci_virtio_get_driver(),
+ virtio_pci_dev_probe_cb, &ctx, pci_address);
+}
+
+int
+virtio_pci_dev_init(struct virtio_dev *vdev, const char *name,
+ struct virtio_pci_ctx *pci_ctx)
+{
+ int rc;
+
+ rc = virtio_dev_construct(vdev, name, &modern_ops, pci_ctx);
+ if (rc != 0) {
+ return rc;
+ }
+
+ vdev->is_hw = 1;
+ vdev->modern = 1;
+
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("virtio_pci", SPDK_LOG_VIRTIO_PCI)
diff --git a/src/spdk/lib/virtio/virtio_user.c b/src/spdk/lib/virtio/virtio_user.c
new file mode 100644
index 000000000..4f4932db9
--- /dev/null
+++ b/src/spdk/lib/virtio/virtio_user.c
@@ -0,0 +1,628 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include <sys/eventfd.h>
+
+#include "vhost_user.h"
+#include "spdk/string.h"
+#include "spdk/config.h"
+
+#include "spdk_internal/virtio.h"
+
+#define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \
+ ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_CONFIG))
+
+static int
+virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ /* Of all per-virtqueue messages, make sure VHOST_USER_SET_VRING_CALL is
+ * sent first, because vhost depends on this message to allocate the
+ * virtqueue pair.
+ */
+ struct vhost_vring_file file;
+
+ file.index = queue_sel;
+ file.fd = dev->callfds[queue_sel];
+ return dev->ops->send_request(dev, VHOST_USER_SET_VRING_CALL, &file);
+}
+
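+/* Send the descriptor, avail and used ring addresses of the selected queue
+ * to the vhost-user backend.
+ */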
+static int
+virtio_user_set_vring_addr(struct virtio_dev *vdev, uint32_t queue_sel)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vring *vring = &dev->vrings[queue_sel];
+ struct vhost_vring_addr addr = {
+ .index = queue_sel,
+ .desc_user_addr = (uint64_t)(uintptr_t)vring->desc,
+ .avail_user_addr = (uint64_t)(uintptr_t)vring->avail,
+ .used_user_addr = (uint64_t)(uintptr_t)vring->used,
+ .log_guest_addr = 0,
+ .flags = 0, /* disable log */
+ };
+
+ return dev->ops->send_request(dev, VHOST_USER_SET_VRING_ADDR, &addr);
+}
+
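+/* Finish per-queue setup: set the ring size, reset the base index, send the
+ * ring addresses and finally pass the kick eventfd to the backend.
+ */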
+static int
+virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_vring_file file;
+ struct vhost_vring_state state;
+ struct vring *vring = &dev->vrings[queue_sel];
+ int rc;
+
+ state.index = queue_sel;
+ state.num = vring->num;
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_NUM, &state);
+ if (rc < 0) {
+ return rc;
+ }
+
+ state.index = queue_sel;
+ state.num = 0; /* no reservation */
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_BASE, &state);
+ if (rc < 0) {
+ return rc;
+ }
+
+ virtio_user_set_vring_addr(vdev, queue_sel);
+
+ /* Of all per-virtqueue messages, make sure VHOST_USER_SET_VRING_KICK is
+ * sent last, because vhost uses this message to judge whether
+ * virtio is ready.
+ */
+ file.index = queue_sel;
+ file.fd = dev->kickfds[queue_sel];
+ return dev->ops->send_request(dev, VHOST_USER_SET_VRING_KICK, &file);
+}
+
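+/* Stop a single queue; in vhost-user, requesting the vring base makes the
+ * backend stop processing that ring.
+ */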
+static int
+virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_vring_state state;
+
+ state.index = queue_sel;
+ state.num = 0;
+
+ return dev->ops->send_request(dev, VHOST_USER_GET_VRING_BASE, &state);
+}
+
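+/* Apply the given per-queue callback to every virtqueue of the device. */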
+static int
+virtio_user_queue_setup(struct virtio_dev *vdev,
+ int (*fn)(struct virtio_dev *, uint32_t))
+{
+ uint32_t i;
+ int rc;
+
+ for (i = 0; i < vdev->max_queues; ++i) {
+ rc = fn(vdev, i);
+ if (rc < 0) {
+ SPDK_ERRLOG("setup of virtqueue %"PRIu32" failed.\n", i);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
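+/* spdk_mem_map notify callback: invoked whenever DMA-capable memory is
+ * registered or unregistered, so the backend's memory table can be refreshed.
+ */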
+static int
+virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map,
+ enum spdk_mem_map_notify_action action,
+ void *vaddr, size_t size)
+{
+ struct virtio_dev *vdev = cb_ctx;
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t features;
+ int ret;
+
+ /* We have to resend all mappings anyway, so don't bother with any
+ * page tracking.
+ */
+ ret = dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ /* Our internal rte_vhost lib requires SET_VRING_ADDR to flush a pending
+ * SET_MEM_TABLE. On the other hand, the upstream rte_vhost will invalidate
+ * the entire queue upon receiving SET_VRING_ADDR message, so we mustn't
+ * send it here. Both behaviors are strictly implementation specific, but
+ * this message isn't required by the spec, so send it only
+ * if vhost is compiled with our internal lib.
+ */
+ ret = virtio_user_queue_setup(vdev, virtio_user_set_vring_addr);
+ if (ret < 0) {
+ return ret;
+ }
+#endif
+
+ /* Since we might want to use that mapping straight away, we have to
+ * make sure the backend has already processed our SET_MEM_TABLE message.
+ * F_REPLY_ACK is just a feature and the backend is not obliged to
+ * support it, so we send a simple message that always has a response
+ * and we wait for that response. Messages are always processed in order.
+ */
+ return dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features);
+}
+
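+/* Register a memory map whose notify callback keeps the backend's memory
+ * table in sync with SPDK memory registrations.
+ */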
+static int
+virtio_user_register_mem(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ const struct spdk_mem_map_ops virtio_user_map_ops = {
+ .notify_cb = virtio_user_map_notify,
+ .are_contiguous = NULL
+ };
+
+ dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev);
+ if (dev->mem_map == NULL) {
+ SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+virtio_user_unregister_mem(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ spdk_mem_map_free(&dev->mem_map);
+}
+
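+/* Bring the device up: negotiate the number of queues with the backend,
+ * create the queues, register memory and kick each queue.
+ */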
+static int
+virtio_user_start_device(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t host_max_queues;
+ int ret;
+
+ if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 &&
+ vdev->max_queues > 1 + vdev->fixed_queues_num) {
+ SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the "
+ "host doesn't support VHOST_USER_PROTOCOL_F_MQ. "
+ "Only one request queue will be used.\n",
+ vdev->name, vdev->max_queues - vdev->fixed_queues_num);
+ vdev->max_queues = 1 + vdev->fixed_queues_num;
+ }
+
+ /* negotiate the number of I/O queues. */
+ ret = dev->ops->send_request(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) {
+ SPDK_WARNLOG("%s: requested %"PRIu16" request queues, "
+ "but only %"PRIu64" are available\n",
+ vdev->name, vdev->max_queues - vdev->fixed_queues_num,
+ host_max_queues);
+ vdev->max_queues = host_max_queues;
+ }
+
+ /* tell vhost to create queues */
+ ret = virtio_user_queue_setup(vdev, virtio_user_create_queue);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = virtio_user_register_mem(vdev);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return virtio_user_queue_setup(vdev, virtio_user_kick_queue);
+}
+
+static int
+virtio_user_stop_device(struct virtio_dev *vdev)
+{
+ int ret;
+
+ ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue);
+ /* a queue might fail to stop for various reasons, e.g. socket
+ * connection going down, but this mustn't prevent us from freeing
+ * the mem map.
+ */
+ virtio_user_unregister_mem(vdev);
+ return ret;
+}
+
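+/* Initialize the per-queue eventfd slots and let the vhost-user backend
+ * set up its connection.
+ */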
+static int
+virtio_user_dev_setup(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint16_t i;
+
+ dev->vhostfd = -1;
+
+ for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) {
+ dev->callfds[i] = -1;
+ dev->kickfds[i] = -1;
+ }
+
+ dev->ops = &ops_user;
+
+ return dev->ops->setup(dev);
+}
+
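+/* Read the device-specific config space via VHOST_USER_GET_CONFIG; requires
+ * the backend to support the CONFIG protocol feature.
+ */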
+static int
+virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset,
+ void *dst, int length)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_user_config cfg = {0};
+ int rc;
+
+ if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
+ return -ENOTSUP;
+ }
+
+ cfg.offset = 0;
+ cfg.size = VHOST_USER_MAX_CONFIG_SIZE;
+
+ rc = dev->ops->send_request(dev, VHOST_USER_GET_CONFIG, &cfg);
+ if (rc < 0) {
+ SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ memcpy(dst, cfg.region + offset, length);
+ return 0;
+}
+
+static int
+virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset,
+ const void *src, int length)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_user_config cfg = {0};
+ int rc;
+
+ if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
+ return -ENOTSUP;
+ }
+
+ cfg.offset = offset;
+ cfg.size = length;
+ memcpy(cfg.region, src, length);
+
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_CONFIG, &cfg);
+ if (rc < 0) {
+ SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ return 0;
+}
+
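+/* Track the virtio status byte locally and start or stop the device on the
+ * DRIVER_OK and reset transitions.
+ */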
+static void
+virtio_user_set_status(struct virtio_dev *vdev, uint8_t status)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ int rc = 0;
+
+ if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) &&
+ status != VIRTIO_CONFIG_S_RESET) {
+ rc = -1;
+ } else if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ rc = virtio_user_start_device(vdev);
+ } else if (status == VIRTIO_CONFIG_S_RESET &&
+ (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ rc = virtio_user_stop_device(vdev);
+ }
+
+ if (rc != 0) {
+ dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET;
+ } else {
+ dev->status = status;
+ }
+}
+
+static uint8_t
+virtio_user_get_status(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ return dev->status;
+}
+
+static uint64_t
+virtio_user_get_features(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t features;
+ int rc;
+
+ rc = dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features);
+ if (rc < 0) {
+ SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc));
+ return 0;
+ }
+
+ return features;
+}
+
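+/* Negotiate virtio device features and, if VHOST_USER_F_PROTOCOL_FEATURES
+ * was negotiated, the vhost-user protocol features as well.
+ */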
+static int
+virtio_user_set_features(struct virtio_dev *vdev, uint64_t features)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t protocol_features;
+ int ret;
+
+ ret = dev->ops->send_request(dev, VHOST_USER_SET_FEATURES, &features);
+ if (ret < 0) {
+ return ret;
+ }
+
+ vdev->negotiated_features = features;
+ vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1);
+
+ if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ /* nothing else to do */
+ return 0;
+ }
+
+ ret = dev->ops->send_request(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features);
+ if (ret < 0) {
+ return ret;
+ }
+
+ protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES;
+ ret = dev->ops->send_request(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features);
+ if (ret < 0) {
+ return ret;
+ }
+
+ dev->protocol_features = protocol_features;
+ return 0;
+}
+
+static uint16_t
+virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ /* Currently every queue has the same queue size. */
+ return dev->queue_size;
+}
+
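+/* Allocate DMA-safe memory for the ring, create the call/kick eventfds and
+ * lay out the descriptor, avail and used rings within that memory.
+ */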
+static int
+virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_vring_state state;
+ uint16_t queue_idx = vq->vq_queue_index;
+ void *queue_mem;
+ uint64_t desc_addr, avail_addr, used_addr;
+ int callfd, kickfd, rc;
+
+ if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) {
+ SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx);
+ return -EEXIST;
+ }
+
+ /* We could pass an invalid fd here, but some backends use the kickfd
+ * and callfd as criteria to judge whether the device is alive, so we
+ * use real eventfds.
+ */
+ callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+ if (callfd < 0) {
+ SPDK_ERRLOG("callfd error, %s\n", spdk_strerror(errno));
+ return -errno;
+ }
+
+ kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+ if (kickfd < 0) {
+ SPDK_ERRLOG("kickfd error, %s\n", spdk_strerror(errno));
+ close(callfd);
+ return -errno;
+ }
+
+ queue_mem = spdk_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (queue_mem == NULL) {
+ close(kickfd);
+ close(callfd);
+ return -ENOMEM;
+ }
+
+ vq->vq_ring_mem = SPDK_VTOPHYS_ERROR;
+ vq->vq_ring_virt_mem = queue_mem;
+
+ state.index = vq->vq_queue_index;
+ state.num = 0;
+
+ if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_ENABLE, &state);
+ if (rc < 0) {
+ SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n",
+ spdk_strerror(-rc));
+ close(kickfd);
+ close(callfd);
+ spdk_free(queue_mem);
+ return -rc;
+ }
+ }
+
+ dev->callfds[queue_idx] = callfd;
+ dev->kickfds[queue_idx] = kickfd;
+
+ desc_addr = (uintptr_t)vq->vq_ring_virt_mem;
+ avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+ used_addr = SPDK_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+ ring[vq->vq_nentries]),
+ VIRTIO_PCI_VRING_ALIGN);
+
+ dev->vrings[queue_idx].num = vq->vq_nentries;
+ dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr;
+ dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr;
+ dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr;
+
+ return 0;
+}
+
+static void
+virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq)
+{
+ /* For legacy devices, writing 0 to the VIRTIO_PCI_QUEUE_PFN port makes
+ * QEMU stop the corresponding ioeventfds and reset the status of
+ * the device.
+ * For modern devices, the queue desc, avail and used addresses in the
+ * PCI bar are set to 0, with no further action observed in QEMU.
+ *
+ * Here we just care about what information to deliver to vhost-user,
+ * so we simply close the ioeventfds for now.
+ */
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ close(dev->callfds[vq->vq_queue_index]);
+ close(dev->kickfds[vq->vq_queue_index]);
+ dev->callfds[vq->vq_queue_index] = -1;
+ dev->kickfds[vq->vq_queue_index] = -1;
+
+ spdk_free(vq->vq_ring_virt_mem);
+}
+
+static void
+virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq)
+{
+ uint64_t buf = 1;
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) {
+ SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno));
+ }
+}
+
+static void
+virtio_user_destroy(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ close(dev->vhostfd);
+ free(dev);
+}
+
+static void
+virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ spdk_json_write_named_string(w, "type", "user");
+ spdk_json_write_named_string(w, "socket", dev->path);
+}
+
+static void
+virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ spdk_json_write_named_string(w, "trtype", "user");
+ spdk_json_write_named_string(w, "traddr", dev->path);
+ spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num);
+ spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0));
+}
+
+static const struct virtio_dev_ops virtio_user_ops = {
+ .read_dev_cfg = virtio_user_read_dev_config,
+ .write_dev_cfg = virtio_user_write_dev_config,
+ .get_status = virtio_user_get_status,
+ .set_status = virtio_user_set_status,
+ .get_features = virtio_user_get_features,
+ .set_features = virtio_user_set_features,
+ .destruct_dev = virtio_user_destroy,
+ .get_queue_size = virtio_user_get_queue_size,
+ .setup_queue = virtio_user_setup_queue,
+ .del_queue = virtio_user_del_queue,
+ .notify_queue = virtio_user_notify_queue,
+ .dump_json_info = virtio_user_dump_json_info,
+ .write_json_config = virtio_user_write_json_config,
+};
+
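+/* Create a virtio device backed by the vhost-user socket at the given path,
+ * connect to the backend and claim ownership with VHOST_USER_SET_OWNER.
+ */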
+int
+virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path,
+ uint32_t queue_size)
+{
+ struct virtio_user_dev *dev;
+ int rc;
+
+ if (name == NULL) {
+ SPDK_ERRLOG("No name given for controller: %s\n", path);
+ return -EINVAL;
+ }
+
+ dev = calloc(1, sizeof(*dev));
+ if (dev == NULL) {
+ return -ENOMEM;
+ }
+
+ rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to init device: %s\n", path);
+ free(dev);
+ return rc;
+ }
+
+ vdev->is_hw = 0;
+
+ snprintf(dev->path, PATH_MAX, "%s", path);
+ dev->queue_size = queue_size;
+
+ rc = virtio_user_dev_setup(vdev);
+ if (rc < 0) {
+ SPDK_ERRLOG("backend setup failed\n");
+ goto err;
+ }
+
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL);
+ if (rc < 0) {
+ SPDK_ERRLOG("set_owner failed: %s\n", spdk_strerror(-rc));
+ goto err;
+ }
+
+ return 0;
+
+err:
+ virtio_dev_destruct(vdev);
+ return rc;
+}