Adding upstream version 18.2.2.upstream/18.2.2

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
commit: e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree: 64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/lib/rte_vhost
parent: Initial commit. (diff)
download: ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
9 files changed, 4387 insertions, 0 deletions
diff --git a/src/spdk/lib/rte_vhost/Makefile b/src/spdk/lib/rte_vhost/Makefile
new file mode 100644
index 000000000..aa073c6ca
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/Makefile
@@ -0,0 +1,50 @@
+#
+#  BSD LICENSE
+#
+#  Copyright (c) Intel Corporation.
+#  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Intel Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += -I.
+CFLAGS += $(ENV_CFLAGS)
+CFLAGS += -include rte_config.h
+CFLAGS += -Wno-address-of-packed-member
+
+# These are the DPDK vhost files copied (for now) into SPDK
+C_SRCS += fd_man.c socket.c vhost_user.c vhost.c
+
+LIBNAME = rte_vhost
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/rte_vhost/fd_man.c b/src/spdk/lib/rte_vhost/fd_man.c
new file mode 100644
index 000000000..2ceacc9ab
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/fd_man.c
@@ -0,0 +1,300 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "fd_man.h"
+
+#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL)
+
+static int
+get_last_valid_idx(struct fdset *pfdset, int last_valid_idx)
+{
+	int i;
+
+	for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--)
+		;
+
+	return i;
+}
+
+static void
+fdset_move(struct fdset *pfdset, int dst, int src)
+{
+	pfdset->fd[dst]    = pfdset->fd[src];
+	pfdset->rwfds[dst] = pfdset->rwfds[src];
+}
+
+static void
+fdset_shrink_nolock(struct fdset *pfdset)
+{
+	int i;
+	int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1);
+
+	for (i = 0; i < last_valid_idx; i++) {
+		if (pfdset->fd[i].fd != -1)
+			continue;
+
+		fdset_move(pfdset, i, last_valid_idx);
+		last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1);
+	}
+	pfdset->num = last_valid_idx + 1;
+}
+
+/*
+ * Find deleted fd entries and remove them
+ */
+static void
+fdset_shrink(struct fdset *pfdset)
+{
+	pthread_mutex_lock(&pfdset->fd_mutex);
+	fdset_shrink_nolock(pfdset);
+	pthread_mutex_unlock(&pfdset->fd_mutex);
+}
+
+/**
+ * Returns the index in the fdset for a given fd.
+ * @return
+ *   index for the fd, or -1 if fd isn't in the fdset.
+ */
+static int
+fdset_find_fd(struct fdset *pfdset, int fd)
+{
+	int i;
+
+	for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++)
+		;
+
+	return i == pfdset->num ? -1 : i;
+}
+
+static void
+fdset_add_fd(struct fdset *pfdset, int idx, int fd,
+	fd_cb rcb, fd_cb wcb, void *dat)
+{
+	struct fdentry *pfdentry = &pfdset->fd[idx];
+	struct pollfd *pfd = &pfdset->rwfds[idx];
+
+	pfdentry->fd  = fd;
+	pfdentry->rcb = rcb;
+	pfdentry->wcb = wcb;
+	pfdentry->dat = dat;
+
+	pfd->fd = fd;
+	pfd->events  = rcb ? POLLIN : 0;
+	pfd->events |= wcb ? POLLOUT : 0;
+	pfd->revents = 0;
+}
+
+void
+fdset_init(struct fdset *pfdset)
+{
+	int i;
+
+	if (pfdset == NULL)
+		return;
+
+	for (i = 0; i < MAX_FDS; i++) {
+		pfdset->fd[i].fd = -1;
+		pfdset->fd[i].dat = NULL;
+	}
+	pfdset->num = 0;
+}
+
+/**
+ * Register the fd in the fdset with read/write handler and context.
+ */
+int
+fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat)
+{
+	int i;
+
+	if (pfdset == NULL || fd == -1)
+		return -1;
+
+	pthread_mutex_lock(&pfdset->fd_mutex);
+	i = pfdset->num < MAX_FDS ? pfdset->num++ : -1;
+	if (i == -1) {
+		fdset_shrink_nolock(pfdset);
+		i = pfdset->num < MAX_FDS ? pfdset->num++ : -1;
+		if (i == -1) {
+			pthread_mutex_unlock(&pfdset->fd_mutex);
+			return -2;
+		}
+	}
+
+	fdset_add_fd(pfdset, i, fd, rcb, wcb, dat);
+	pthread_mutex_unlock(&pfdset->fd_mutex);
+
+	return 0;
+}
+
+/**
+ *  Unregister the fd from the fdset.
+ *  Returns context of a given fd or NULL.
+ */
+void *
+fdset_del(struct fdset *pfdset, int fd)
+{
+	int i;
+	void *dat = NULL;
+
+	if (pfdset == NULL || fd == -1)
+		return NULL;
+
+	do {
+		pthread_mutex_lock(&pfdset->fd_mutex);
+
+		i = fdset_find_fd(pfdset, fd);
+		if (i != -1 && pfdset->fd[i].busy == 0) {
+			/* busy indicates r/wcb is executing! */
+			dat = pfdset->fd[i].dat;
+			pfdset->fd[i].fd = -1;
+			pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL;
+			pfdset->fd[i].dat = NULL;
+			i = -1;
+		}
+		pthread_mutex_unlock(&pfdset->fd_mutex);
+	} while (i != -1);
+
+	return dat;
+}
+
+
+/**
+ * This functions runs in infinite blocking loop until there is no fd in
+ * pfdset. It calls corresponding r/w handler if there is event on the fd.
+ *
+ * Before the callback is called, we set the flag to busy status; If other
+ * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it
+ * will wait until the flag is reset to zero(which indicates the callback is
+ * finished), then it could free the context after fdset_del.
+ */
+void *
+fdset_event_dispatch(void *arg)
+{
+	int i;
+	struct pollfd *pfd;
+	struct fdentry *pfdentry;
+	fd_cb rcb, wcb;
+	void *dat;
+	int fd, numfds;
+	int remove1, remove2;
+	int need_shrink;
+	struct fdset *pfdset = arg;
+
+	if (pfdset == NULL)
+		return NULL;
+
+	while (1) {
+
+		/*
+		 * When poll is blocked, other threads might unregister
+		 * listenfds from and register new listenfds into fdset.
+		 * When poll returns, the entries for listenfds in the fdset
+		 * might have been updated. It is ok if there is unwanted call
+		 * for new listenfds.
+		 */
+		pthread_mutex_lock(&pfdset->fd_mutex);
+		numfds = pfdset->num;
+		pthread_mutex_unlock(&pfdset->fd_mutex);
+
+		poll(pfdset->rwfds, numfds, 1000 /* millisecs */);
+
+		need_shrink = 0;
+		for (i = 0; i < numfds; i++) {
+			pthread_mutex_lock(&pfdset->fd_mutex);
+
+			pfdentry = &pfdset->fd[i];
+			fd = pfdentry->fd;
+			pfd = &pfdset->rwfds[i];
+
+			if (fd < 0) {
+				need_shrink = 1;
+				pthread_mutex_unlock(&pfdset->fd_mutex);
+				continue;
+			}
+
+			if (!pfd->revents) {
+				pthread_mutex_unlock(&pfdset->fd_mutex);
+				continue;
+			}
+
+			remove1 = remove2 = 0;
+
+			rcb = pfdentry->rcb;
+			wcb = pfdentry->wcb;
+			dat = pfdentry->dat;
+			pfdentry->busy = 1;
+
+			pthread_mutex_unlock(&pfdset->fd_mutex);
+
+			if (rcb && pfd->revents & (POLLIN | FDPOLLERR))
+				rcb(fd, dat, &remove1);
+			if (wcb && pfd->revents & (POLLOUT | FDPOLLERR))
+				wcb(fd, dat, &remove2);
+			pfdentry->busy = 0;
+			/*
+			 * fdset_del needs to check busy flag.
+			 * We don't allow fdset_del to be called in callback
+			 * directly.
+			 */
+			/*
+			 * When we are to clean up the fd from fdset,
+			 * because the fd is closed in the cb,
+			 * the old fd val could be reused by when creates new
+			 * listen fd in another thread, we couldn't call
+			 * fd_set_del.
+			 */
+			if (remove1 || remove2) {
+				pfdentry->fd = -1;
+				need_shrink = 1;
+			}
+		}
+
+		if (need_shrink)
+			fdset_shrink(pfdset);
+	}
+
+	return NULL;
+}
diff --git a/src/spdk/lib/rte_vhost/fd_man.h b/src/spdk/lib/rte_vhost/fd_man.h
new file mode 100644
index 000000000..3a9d269b3
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/fd_man.h
@@ -0,0 +1,69 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FD_MAN_H_
+#define _FD_MAN_H_
+#include <stdint.h>
+#include <pthread.h>
+#include <poll.h>
+
+#define MAX_FDS 1024
+
+typedef void (*fd_cb)(int fd, void *dat, int *remove);
+
+struct fdentry {
+	int fd;		/* -1 indicates this entry is empty */
+	fd_cb rcb;	/* callback when this fd is readable. */
+	fd_cb wcb;	/* callback when this fd is writeable. */
+	void *dat;	/* fd context */
+	int busy;	/* whether this entry is being used in cb. */
+};
+
+struct fdset {
+	struct pollfd rwfds[MAX_FDS];
+	struct fdentry fd[MAX_FDS];
+	pthread_mutex_t fd_mutex;
+	int num;	/* current fd number of this fdset */
+};
+
+
+void fdset_init(struct fdset *pfdset);
+
+int fdset_add(struct fdset *pfdset, int fd,
+	fd_cb rcb, fd_cb wcb, void *dat);
+
+void *fdset_del(struct fdset *pfdset, int fd);
+
+void *fdset_event_dispatch(void *arg);
+
+#endif
diff --git a/src/spdk/lib/rte_vhost/rte_vhost.h b/src/spdk/lib/rte_vhost/rte_vhost.h
new file mode 100644
index 000000000..b1b7f2cd8
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/rte_vhost.h
@@ -0,0 +1,635 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_VHOST_H_
+#define _RTE_VHOST_H_
+
+/**
+ * @file
+ * Interface to vhost-user
+ */
+
+#include <stdint.h>
+#include <linux/vhost.h>
+#include <linux/virtio_ring.h>
+#include <sys/eventfd.h>
+
+#include <rte_config.h>
+#include <rte_memory.h>
+#include <rte_mempool.h>
+
+#define RTE_VHOST_USER_CLIENT		(1ULL << 0)
+#define RTE_VHOST_USER_NO_RECONNECT	(1ULL << 1)
+#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
+
+/**
+ * Information relating to memory regions including offsets to
+ * addresses in QEMUs memory file.
+ */
+struct rte_vhost_mem_region {
+	uint64_t guest_phys_addr;
+	uint64_t guest_user_addr;
+	uint64_t host_user_addr;
+	uint64_t size;
+	void	 *mmap_addr;
+	uint64_t mmap_size;
+	int fd;
+};
+
+/**
+ * Memory structure includes region and mapping information.
+ */
+struct rte_vhost_memory {
+	uint32_t nregions;
+	struct rte_vhost_mem_region regions[0];
+};
+
+struct rte_vhost_inflight_desc_split {
+	uint8_t inflight;
+	uint8_t padding[5];
+	uint16_t next;
+	uint64_t counter;
+};
+
+struct rte_vhost_inflight_info_split {
+	uint64_t features;
+	uint16_t version;
+	uint16_t desc_num;
+	uint16_t last_inflight_io;
+	uint16_t used_idx;
+	struct rte_vhost_inflight_desc_split desc[0];
+};
+
+struct rte_vhost_resubmit_desc {
+	uint16_t index;
+	uint64_t counter;
+};
+
+struct rte_vhost_resubmit_info {
+	struct rte_vhost_resubmit_desc *resubmit_list;
+	uint16_t resubmit_num;
+};
+
+struct rte_vhost_ring_inflight {
+	struct rte_vhost_inflight_info_split *inflight_split;
+	struct rte_vhost_resubmit_info *resubmit_inflight;
+};
+
+struct rte_vhost_vring {
+	union {
+		struct vring_desc *desc;
+		struct vring_packed_desc *desc_packed;
+	};
+	union {
+		struct vring_avail *avail;
+		struct vring_packed_desc_event *driver_event;
+	};
+	union {
+		struct vring_used *used;
+		struct vring_packed_desc_event *device_event;
+	};
+	uint64_t		log_guest_addr;
+
+	int			callfd;
+	int			kickfd;
+	uint16_t		size;
+};
+
+/**
+ * Device and vring operations.
+ */
+struct vhost_device_ops {
+	int (*new_device)(int vid);		/**< Add device. */
+	void (*destroy_device)(int vid);	/**< Remove device. */
+
+	int (*vring_state_changed)(int vid, uint16_t queue_id, int enable);	/**< triggered when a vring is enabled or disabled */
+
+	/**
+	 * Features could be changed after the feature negotiation.
+	 * For example, VHOST_F_LOG_ALL will be set/cleared at the
+	 * start/end of live migration, respectively. This callback
+	 * is used to inform the application on such change.
+	 */
+	int (*features_changed)(int vid, uint64_t features);
+	int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf);
+	int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd);
+	int (*vhost_nvme_set_bar_mr)(int vid, void *bar_addr, uint64_t bar_size);
+	int (*vhost_nvme_get_cap)(int vid, uint64_t *cap);
+
+	int (*new_connection)(int vid);
+	void (*destroy_connection)(int vid);
+
+	int (*get_config)(int vid, uint8_t *config, uint32_t config_len);
+	int (*set_config)(int vid, uint8_t *config, uint32_t offset,
+			  uint32_t len, uint32_t flags);
+
+	void *reserved[2]; /**< Reserved for future extension */
+};
+
+/**
+ * Convert guest physical address to host virtual address
+ *
+ * @param mem
+ *  the guest memory regions
+ * @param gpa
+ *  the guest physical address for querying
+ * @return
+ *  the host virtual address on success, 0 on failure
+ */
+static inline uint64_t __attribute__((always_inline))
+rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
+{
+	struct rte_vhost_mem_region *reg;
+	uint32_t i;
+
+	for (i = 0; i < mem->nregions; i++) {
+		reg = &mem->regions[i];
+		if (gpa >= reg->guest_phys_addr &&
+		    gpa <  reg->guest_phys_addr + reg->size) {
+			return gpa - reg->guest_phys_addr +
+			       reg->host_user_addr;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Convert guest physical address to host virtual address safely
+ *
+ * This variant of rte_vhost_gpa_to_vva() takes care all the
+ * requested length is mapped and contiguous in process address
+ * space.
+ *
+ * @param mem
+ *  the guest memory regions
+ * @param gpa
+ *  the guest physical address for querying
+ * @param len
+ *  the size of the requested area to map,
+ *  updated with actual size mapped
+ * @return
+ *  the host virtual address on success, 0 on failure  */
+static inline uint64_t
+rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
+	uint64_t gpa, uint64_t *len)
+{
+	struct rte_vhost_mem_region *r;
+	uint32_t i;
+
+	for (i = 0; i < mem->nregions; i++) {
+		r = &mem->regions[i];
+		if (gpa >= r->guest_phys_addr &&
+		    gpa <  r->guest_phys_addr + r->size) {
+
+			if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
+				*len = r->guest_phys_addr + r->size - gpa;
+
+			return gpa - r->guest_phys_addr +
+			       r->host_user_addr;
+		}
+	}
+	*len = 0;
+
+	return 0;
+}
+
+#define RTE_VHOST_NEED_LOG(features)	((features) & (1ULL << VHOST_F_LOG_ALL))
+
+/**
+ * Log the memory write start with given address.
+ *
+ * This function only need be invoked when the live migration starts.
+ * Therefore, we won't need call it at all in the most of time. For
+ * making the performance impact be minimum, it's suggested to do a
+ * check before calling it:
+ *
+ *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ *                rte_vhost_log_write(vid, addr, len);
+ *
+ * @param vid
+ *  vhost device ID
+ * @param addr
+ *  the starting address for write
+ * @param len
+ *  the length to write
+ */
+void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
+
+/**
+ * Log the used ring update start at given offset.
+ *
+ * Same as rte_vhost_log_write, it's suggested to do a check before
+ * calling it:
+ *
+ *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ *                rte_vhost_log_used_vring(vid, vring_idx, offset, len);
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  the vring index
+ * @param offset
+ *  the offset inside the used ring
+ * @param len
+ *  the length to write
+ */
+void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
+			      uint64_t offset, uint64_t len);
+
+int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
+
+/**
+ * Register vhost driver. path could be different for multiple
+ * instance support.
+ */
+int rte_vhost_driver_register(const char *path, uint64_t flags);
+
+/* Unregister vhost driver. This is only meaningful to vhost user. */
+int rte_vhost_driver_unregister(const char *path);
+
+/**
+ * Set the feature bits the vhost-user driver supports.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_set_features(const char *path, uint64_t features);
+
+/**
+ * Enable vhost-user driver features.
+ *
+ * Note that
+ * - the param @features should be a subset of the feature bits provided
+ *   by rte_vhost_driver_set_features().
+ * - it must be invoked before vhost-user negotiation starts.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param features
+ *  Features to enable
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_enable_features(const char *path, uint64_t features);
+
+/**
+ * Disable vhost-user driver features.
+ *
+ * The two notes at rte_vhost_driver_enable_features() also apply here.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param features
+ *  Features to disable
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_disable_features(const char *path, uint64_t features);
+
+/**
+ * Get the feature bits before feature negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param features
+ *  A pointer to store the queried feature bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_get_features(const char *path, uint64_t *features);
+
+/**
+ * Get the feature bits after negotiation
+ *
+ * @param vid
+ *  Vhost device ID
+ * @param features
+ *  A pointer to store the queried feature bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
+
+/* Register callbacks. */
+int rte_vhost_driver_callback_register(const char *path,
+	struct vhost_device_ops const * const ops);
+
+/**
+ *
+ * Start the vhost-user driver.
+ *
+ * This function triggers the vhost-user negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_start(const char *path);
+
+/**
+ * Get the MTU value of the device if set in QEMU.
+ *
+ * @param vid
+ *  virtio-net device ID
+ * @param mtu
+ *  The variable to store the MTU value
+ *
+ * @return
+ *  0: success
+ *  -EAGAIN: device not yet started
+ *  -ENOTSUP: device does not support MTU feature
+ */
+int rte_vhost_get_mtu(int vid, uint16_t *mtu);
+
+/**
+ * Get the numa node from which the virtio net device's memory
+ * is allocated.
+ *
+ * @param vid
+ *  vhost device ID
+ *
+ * @return
+ *  The numa node, -1 on failure
+ */
+int rte_vhost_get_numa_node(int vid);
+
+/**
+ * Get the virtio net device's ifname, which is the vhost-user socket
+ * file path.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param buf
+ *  The buffer to stored the queried ifname
+ * @param len
+ *  The length of buf
+ *
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_get_ifname(int vid, char *buf, size_t len);
+
+/**
+ * Get how many avail entries are left in the queue
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index
+ *
+ * @return
+ *  num of avail entires left
+ */
+uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
+
+struct rte_mbuf;
+struct rte_mempool;
+/**
+ * This function adds buffers to the virtio devices RX virtqueue. Buffers can
+ * be received from the physical port or from another virtual device. A packet
+ * count is returned to indicate the number of packets that were succesfully
+ * added to the RX queue.
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index in mq case
+ * @param pkts
+ *  array to contain packets to be enqueued
+ * @param count
+ *  packets num to be enqueued
+ * @return
+ *  num of packets enqueued
+ */
+uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count);
+
+/**
+ * This function gets guest buffers from the virtio device TX virtqueue,
+ * construct host mbufs, copies guest buffer content to host mbufs and
+ * store them in pkts to be processed.
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index in mq case
+ * @param mbuf_pool
+ *  mbuf_pool where host mbuf is allocated.
+ * @param pkts
+ *  array to contain packets to be dequeued
+ * @param count
+ *  packets num to be dequeued
+ * @return
+ *  num of packets dequeued
+ */
+uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
+
+/**
+ * Get guest mem table: a list of memory regions.
+ *
+ * An rte_vhost_vhost_memory object will be allocated internaly, to hold the
+ * guest memory regions. Application should free it at destroy_device()
+ * callback.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param mem
+ *  To store the returned mem regions
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
+
+/**
+ * Get guest vring info, including the vring address, vring size, etc.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @param vring
+ *  the structure to hold the requested vring info
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
+			      struct rte_vhost_vring *vring);
+
+/**
+ * Set id of the last descriptors in avail and used guest vrings.
+ *
+ * In case user application operates directly on buffers, it should use this
+ * function on device destruction to retrieve the same values later on in device
+ * creation via rte_vhost_get_vhost_vring(int, uint16_t, struct rte_vhost_vring *)
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @param last_avail_idx
+ *  id of the last descriptor in avail ring to be set
+ * @param last_used_idx
+ *  id of the last descriptor in used ring to be set
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+		uint16_t last_avail_idx, uint16_t last_used_idx);
+
+int rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+		uint16_t *last_avail_idx, uint16_t *last_used_idx);
+
+/**
+ * Notify the guest that used descriptors have been added to the vring.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_vring_call(int vid, uint16_t vring_idx);
+
+/**
+ * Get guest inflight vring info, including inflight ring and resubmit list.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @param vring
+ *  the structure to hold the requested inflight vring info
+ * @return
+ *  0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
+	struct rte_vhost_ring_inflight *vring);
+
+/**
+ * Set split inflight descriptor.
+ *
+ * This function save descriptors that has been comsumed in available
+ * ring
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @param idx
+ *  inflight entry index
+ * @return
+ *  0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
+	uint16_t idx);
+
+/**
+ * Save the head of list that the last batch of used descriptors.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @param idx
+ *  descriptor entry index
+ * @return
+ *  0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_set_last_inflight_io_split(int vid,
+	uint16_t vring_idx, uint16_t idx);
+
+/**
+ * Clear the split inflight status.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @param last_used_idx
+ *  last used idx of used ring
+ * @param idx
+ *  inflight entry index
+ * @return
+ *  0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
+	uint16_t last_used_idx, uint16_t idx);
+
+/**
+ * Save the head of list that the last batch of used descriptors.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @param idx
+ *  descriptor entry index
+ * @return
+ *  0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_set_last_inflight_io_split(int vid,
+	uint16_t vring_idx, uint16_t idx);
+
+/**
+ * Clear the split inflight status.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @param last_used_idx
+ *  last used idx of used ring
+ * @param idx
+ *  inflight entry index
+ * @return
+ *  0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
+	uint16_t last_used_idx, uint16_t idx);
+#endif /* _RTE_VHOST_H_ */
diff --git a/src/spdk/lib/rte_vhost/socket.c b/src/spdk/lib/rte_vhost/socket.c
new file mode 100644
index 000000000..ec923518b
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/socket.c
@@ -0,0 +1,841 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/queue.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <rte_log.h>
+
+#include "fd_man.h"
+#include "vhost.h"
+#include "vhost_user.h"
+
+
+TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);
+
+/*
+ * Every time rte_vhost_driver_register() is invoked, an associated
+ * vhost_user_socket struct will be created.
+ */
+struct vhost_user_socket {
+	struct vhost_user_connection_list conn_list;
+	pthread_mutex_t conn_mutex;
+	char *path;
+	int socket_fd;
+	struct sockaddr_un un;
+	bool is_server;
+	bool reconnect;
+	bool dequeue_zero_copy;
+
+	/*
+	 * The "supported_features" indicates the feature bits the
+	 * vhost driver supports. The "features" indicates the feature
+	 * bits after the rte_vhost_driver_features_disable/enable().
+	 * It is also the final feature bits used for vhost-user
+	 * features negotiation.
+	 */
+	uint64_t supported_features;
+	uint64_t features;
+
+	struct vhost_device_ops const *notify_ops;
+};
+
+struct vhost_user_connection {
+	struct vhost_user_socket *vsocket;
+	int connfd;
+	int vid;
+
+	TAILQ_ENTRY(vhost_user_connection) next;
+};
+
+#define MAX_VHOST_SOCKET 1024
+struct vhost_user {
+	struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
+	struct fdset fdset;
+	int vsocket_cnt;
+	pthread_mutex_t mutex;
+};
+
+#define MAX_VIRTIO_BACKLOG 128
+
+static void vhost_user_server_new_connection(int fd, void *data, int *remove);
+static void vhost_user_read_cb(int fd, void *dat, int *remove);
+static int create_unix_socket(struct vhost_user_socket *vsocket);
+static int vhost_user_start_client(struct vhost_user_socket *vsocket);
+
+static struct vhost_user vhost_user = {
+	.fdset = {
+		.fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
+		.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
+		.num = 0
+	},
+	.vsocket_cnt = 0,
+	.mutex = PTHREAD_MUTEX_INITIALIZER,
+};
+
+/* return bytes# of read on success or negative val on failure. */
+int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+	struct iovec iov;
+	struct msghdr msgh;
+	size_t fdsize = fd_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+	int ret;
+
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = buf;
+	iov.iov_len  = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	ret = recvmsg(sockfd, &msgh, 0);
+	if (ret <= 0) {
+		if (ret)
+			RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed, %s\n", strerror(errno));
+		else
+			RTE_LOG(INFO, VHOST_CONFIG, "peer closed\n");
+		return ret;
+	}
+
+	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+		RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
+		return -1;
+	}
+
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			memcpy(fds, CMSG_DATA(cmsg), fdsize);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+
+	struct iovec iov;
+	struct msghdr msgh;
+	size_t fdsize = fd_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+	int ret;
+
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = buf;
+	iov.iov_len = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+
+	if (fds && fd_num > 0) {
+		msgh.msg_control = control;
+		msgh.msg_controllen = sizeof(control);
+		cmsg = CMSG_FIRSTHDR(&msgh);
+		if (cmsg == NULL) {
+			RTE_LOG(ERR, VHOST_CONFIG,  "cmsg == NULL\n");
+			errno = EINVAL;
+			return -1;
+		}
+		cmsg->cmsg_len = CMSG_LEN(fdsize);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		memcpy(CMSG_DATA(cmsg), fds, fdsize);
+	} else {
+		msgh.msg_control = NULL;
+		msgh.msg_controllen = 0;
+	}
+
+	do {
+		ret = sendmsg(sockfd, &msgh, 0);
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
+		return ret;
+	}
+
+	return ret;
+}
+
+static void
+vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
+{
+	int vid;
+	size_t size;
+	struct vhost_user_connection *conn;
+	int ret;
+
+	conn = malloc(sizeof(*conn));
+	if (conn == NULL) {
+		close(fd);
+		return;
+	}
+
+	vid = vhost_new_device(vsocket->features, vsocket->notify_ops);
+	if (vid == -1) {
+		goto err;
+	}
+
+	size = strnlen(vsocket->path, PATH_MAX);
+	vhost_set_ifname(vid, vsocket->path, size);
+
+	if (vsocket->dequeue_zero_copy)
+		vhost_enable_dequeue_zero_copy(vid);
+
+	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
+
+	if (vsocket->notify_ops->new_connection) {
+		ret = vsocket->notify_ops->new_connection(vid);
+		if (ret < 0) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"failed to add vhost user connection with fd %d\n",
+				fd);
+			goto err;
+		}
+	}
+
+	conn->connfd = fd;
+	conn->vsocket = vsocket;
+	conn->vid = vid;
+	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
+			NULL, conn);
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to add fd %d into vhost server fdset\n",
+			fd);
+
+		if (vsocket->notify_ops->destroy_connection)
+			vsocket->notify_ops->destroy_connection(conn->vid);
+
+		goto err;
+	}
+
+	pthread_mutex_lock(&vsocket->conn_mutex);
+	TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
+	pthread_mutex_unlock(&vsocket->conn_mutex);
+	return;
+
+err:
+	free(conn);
+	close(fd);
+}
+
+/* call back when there is new vhost-user connection from client  */
+static void
+vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
+{
+	struct vhost_user_socket *vsocket = dat;
+
+	fd = accept(fd, NULL, NULL);
+	if (fd < 0)
+		return;
+
+	RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
+	vhost_user_add_connection(fd, vsocket);
+}
+
+static void
+vhost_user_read_cb(int connfd, void *dat, int *remove)
+{
+	struct vhost_user_connection *conn = dat;
+	struct vhost_user_socket *vsocket = conn->vsocket;
+	int ret;
+
+	ret = vhost_user_msg_handler(conn->vid, connfd);
+	if (ret < 0) {
+		*remove = 1;
+		vhost_destroy_device(conn->vid);
+
+		if (vsocket->notify_ops->destroy_connection)
+			vsocket->notify_ops->destroy_connection(conn->vid);
+
+		pthread_mutex_lock(&vsocket->conn_mutex);
+		TAILQ_REMOVE(&vsocket->conn_list, conn, next);
+		if (conn->connfd != -1) {
+			close(conn->connfd);
+			conn->connfd = -1;
+		}
+		pthread_mutex_unlock(&vsocket->conn_mutex);
+
+		free(conn);
+
+		if (vsocket->reconnect) {
+			create_unix_socket(vsocket);
+			vhost_user_start_client(vsocket);
+		}
+	}
+}
+
+static int
+create_unix_socket(struct vhost_user_socket *vsocket)
+{
+	int fd;
+	struct sockaddr_un *un = &vsocket->un;
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd < 0)
+		return -1;
+	RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
+		vsocket->is_server ? "server" : "client", fd);
+
+	if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"vhost-user: can't set nonblocking mode for socket, fd: "
+			"%d (%s)\n", fd, strerror(errno));
+		close(fd);
+		return -1;
+	}
+
+	memset(un, 0, sizeof(*un));
+	un->sun_family = AF_UNIX;
+	strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
+	un->sun_path[sizeof(un->sun_path) - 1] = '\0';
+
+	vsocket->socket_fd = fd;
+	return 0;
+}
+
+static int
+vhost_user_start_server(struct vhost_user_socket *vsocket)
+{
+	int ret;
+	int fd = vsocket->socket_fd;
+	const char *path = vsocket->path;
+
+	ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to bind to %s: %s; remove it and try again\n",
+			path, strerror(errno));
+		goto err;
+	}
+	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+	ret = listen(fd, MAX_VIRTIO_BACKLOG);
+	if (ret < 0)
+		goto err;
+
+	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
+		  NULL, vsocket);
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to add listen fd %d to vhost server fdset\n",
+			fd);
+		goto err;
+	}
+
+	return 0;
+
+err:
+	close(fd);
+	return -1;
+}
+
+struct vhost_user_reconnect {
+	struct sockaddr_un un;
+	int fd;
+	struct vhost_user_socket *vsocket;
+
+	TAILQ_ENTRY(vhost_user_reconnect) next;
+};
+
+TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
+struct vhost_user_reconnect_list {
+	struct vhost_user_reconnect_tailq_list head;
+	pthread_mutex_t mutex;
+};
+
+static struct vhost_user_reconnect_list reconn_list;
+static pthread_t reconn_tid;
+
+static int
+vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
+{
+	int ret, flags;
+
+	ret = connect(fd, un, sz);
+	if (ret < 0 && errno != EISCONN)
+		return -1;
+
+	flags = fcntl(fd, F_GETFL, 0);
+	if (flags < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"can't get flags for connfd %d\n", fd);
+		return -2;
+	}
+	if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+				"can't disable nonblocking on fd %d\n", fd);
+		return -2;
+	}
+	return 0;
+}
+
+static void *
+vhost_user_client_reconnect(void *arg __rte_unused)
+{
+	int ret;
+	struct vhost_user_reconnect *reconn, *next;
+
+	while (1) {
+		pthread_mutex_lock(&reconn_list.mutex);
+
+		/*
+		 * An equal implementation of TAILQ_FOREACH_SAFE,
+		 * which does not exist on all platforms.
+		 */
+		for (reconn = TAILQ_FIRST(&reconn_list.head);
+		     reconn != NULL; reconn = next) {
+			next = TAILQ_NEXT(reconn, next);
+
+			ret = vhost_user_connect_nonblock(reconn->fd,
+						(struct sockaddr *)&reconn->un,
+						sizeof(reconn->un));
+			if (ret == -2) {
+				close(reconn->fd);
+				RTE_LOG(ERR, VHOST_CONFIG,
+					"reconnection for fd %d failed\n",
+					reconn->fd);
+				goto remove_fd;
+			}
+			if (ret == -1)
+				continue;
+
+			RTE_LOG(INFO, VHOST_CONFIG,
+				"%s: connected\n", reconn->vsocket->path);
+			vhost_user_add_connection(reconn->fd, reconn->vsocket);
+remove_fd:
+			TAILQ_REMOVE(&reconn_list.head, reconn, next);
+			free(reconn);
+		}
+
+		pthread_mutex_unlock(&reconn_list.mutex);
+		sleep(1);
+	}
+
+	return NULL;
+}
+
+static int
+vhost_user_reconnect_init(void)
+{
+	int ret;
+
+	pthread_mutex_init(&reconn_list.mutex, NULL);
+	TAILQ_INIT(&reconn_list.head);
+
+	ret = pthread_create(&reconn_tid, NULL,
+			     vhost_user_client_reconnect, NULL);
+	if (ret < 0)
+		RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
+
+	return ret;
+}
+
+static int
+vhost_user_start_client(struct vhost_user_socket *vsocket)
+{
+	int ret;
+	int fd = vsocket->socket_fd;
+	const char *path = vsocket->path;
+	struct vhost_user_reconnect *reconn;
+
+	ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un,
+					  sizeof(vsocket->un));
+	if (ret == 0) {
+		vhost_user_add_connection(fd, vsocket);
+		return 0;
+	}
+
+	RTE_LOG(WARNING, VHOST_CONFIG,
+		"failed to connect to %s: %s\n",
+		path, strerror(errno));
+
+	if (ret == -2 || !vsocket->reconnect) {
+		close(fd);
+		return -1;
+	}
+
+	RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path);
+	reconn = malloc(sizeof(*reconn));
+	if (reconn == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to allocate memory for reconnect\n");
+		close(fd);
+		return -1;
+	}
+	reconn->un = vsocket->un;
+	reconn->fd = fd;
+	reconn->vsocket = vsocket;
+	pthread_mutex_lock(&reconn_list.mutex);
+	TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
+	pthread_mutex_unlock(&reconn_list.mutex);
+
+	return 0;
+}
+
+static struct vhost_user_socket *
+find_vhost_user_socket(const char *path)
+{
+	int i;
+
+	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+		if (!strcmp(vsocket->path, path))
+			return vsocket;
+	}
+
+	return NULL;
+}
+
+int
+rte_vhost_driver_disable_features(const char *path, uint64_t features)
+{
+	struct vhost_user_socket *vsocket;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+	vsocket = find_vhost_user_socket(path);
+	if (vsocket)
+		vsocket->features &= ~features;
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_enable_features(const char *path, uint64_t features)
+{
+	struct vhost_user_socket *vsocket;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+	vsocket = find_vhost_user_socket(path);
+	if (vsocket) {
+		if ((vsocket->supported_features & features) != features) {
+			/*
+			 * trying to enable features the driver doesn't
+			 * support.
+			 */
+			pthread_mutex_unlock(&vhost_user.mutex);
+			return -1;
+		}
+		vsocket->features |= features;
+	}
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_set_features(const char *path, uint64_t features)
+{
+	struct vhost_user_socket *vsocket;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+	vsocket = find_vhost_user_socket(path);
+	if (vsocket) {
+		vsocket->supported_features = features;
+		vsocket->features = features;
+	}
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_features(const char *path, uint64_t *features)
+{
+	struct vhost_user_socket *vsocket;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+	vsocket = find_vhost_user_socket(path);
+	if (vsocket)
+		*features = vsocket->features;
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	if (!vsocket) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"socket file %s is not registered yet.\n", path);
+		return -1;
+	} else {
+		return 0;
+	}
+}
+
+/*
+ * Register a new vhost-user socket; here we could act as server
+ * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
+ * is set.
+ */
+int
+rte_vhost_driver_register(const char *path, uint64_t flags)
+{
+	int ret = -1;
+	struct vhost_user_socket *vsocket;
+
+	if (!path)
+		return -1;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+
+	if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"error: the number of vhost sockets reaches maximum\n");
+		goto out;
+	}
+
+	vsocket = malloc(sizeof(struct vhost_user_socket));
+	if (!vsocket)
+		goto out;
+	memset(vsocket, 0, sizeof(struct vhost_user_socket));
+	vsocket->path = strdup(path);
+	if (!vsocket->path) {
+		free(vsocket);
+		goto out;
+	}
+	TAILQ_INIT(&vsocket->conn_list);
+	vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
+
+	/*
+	 * Set the supported features correctly for the builtin vhost-user
+	 * net driver.
+	 *
+	 * Applications know nothing about features the builtin virtio net
+	 * driver (virtio_net.c) supports, thus it's not possible for them
+	 * to invoke rte_vhost_driver_set_features(). To workaround it, here
+	 * we set it unconditionally. If the application want to implement
+	 * another vhost-user driver (say SCSI), it should call the
+	 * rte_vhost_driver_set_features(), which will overwrite following
+	 * two values.
+	 */
+	vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
+	vsocket->features           = VIRTIO_NET_SUPPORTED_FEATURES;
+
+	if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
+		vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
+		if (vsocket->reconnect && reconn_tid == 0) {
+			if (vhost_user_reconnect_init() < 0) {
+				free(vsocket->path);
+				free(vsocket);
+				goto out;
+			}
+		}
+	} else {
+		vsocket->is_server = true;
+	}
+	ret = create_unix_socket(vsocket);
+	if (ret < 0) {
+		free(vsocket->path);
+		free(vsocket);
+		goto out;
+	}
+
+	pthread_mutex_init(&vsocket->conn_mutex, NULL);
+	vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
+
+out:
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	return ret;
+}
+
+static bool
+vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
+{
+	int found = false;
+	struct vhost_user_reconnect *reconn, *next;
+
+	pthread_mutex_lock(&reconn_list.mutex);
+
+	for (reconn = TAILQ_FIRST(&reconn_list.head);
+	     reconn != NULL; reconn = next) {
+		next = TAILQ_NEXT(reconn, next);
+
+		if (reconn->vsocket == vsocket) {
+			TAILQ_REMOVE(&reconn_list.head, reconn, next);
+			close(reconn->fd);
+			free(reconn);
+			found = true;
+			break;
+		}
+	}
+	pthread_mutex_unlock(&reconn_list.mutex);
+	return found;
+}
+
+/**
+ * Unregister the specified vhost socket
+ */
+int
+rte_vhost_driver_unregister(const char *path)
+{
+	int i;
+	int count;
+	struct vhost_user_connection *conn;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+
+	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+		if (!strcmp(vsocket->path, path)) {
+			if (vsocket->is_server) {
+				fdset_del(&vhost_user.fdset, vsocket->socket_fd);
+				close(vsocket->socket_fd);
+				unlink(path);
+			} else if (vsocket->reconnect) {
+				vhost_user_remove_reconnect(vsocket);
+			}
+
+			pthread_mutex_lock(&vsocket->conn_mutex);
+			TAILQ_FOREACH(conn, &vsocket->conn_list, next) {
+				close(conn->connfd);
+				conn->connfd = -1;
+			}
+			pthread_mutex_unlock(&vsocket->conn_mutex);
+
+			do {
+				pthread_mutex_lock(&vsocket->conn_mutex);
+				conn = TAILQ_FIRST(&vsocket->conn_list);
+				pthread_mutex_unlock(&vsocket->conn_mutex);
+			} while (conn != NULL);
+
+			free(vsocket->path);
+			free(vsocket);
+
+			count = --vhost_user.vsocket_cnt;
+			vhost_user.vsockets[i] = vhost_user.vsockets[count];
+			vhost_user.vsockets[count] = NULL;
+			pthread_mutex_unlock(&vhost_user.mutex);
+
+			return 0;
+		}
+	}
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	return -1;
+}
+
+/*
+ * Register ops so that we can add/remove device to data core.
+ */
+int
+rte_vhost_driver_callback_register(const char *path,
+	struct vhost_device_ops const * const ops)
+{
+	struct vhost_user_socket *vsocket;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+	vsocket = find_vhost_user_socket(path);
+	if (vsocket)
+		vsocket->notify_ops = ops;
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	return vsocket ? 0 : -1;
+}
+
+struct vhost_device_ops const *
+vhost_driver_callback_get(const char *path)
+{
+	struct vhost_user_socket *vsocket;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+	vsocket = find_vhost_user_socket(path);
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	return vsocket ? vsocket->notify_ops : NULL;
+}
+
+int
+rte_vhost_driver_start(const char *path)
+{
+	struct vhost_user_socket *vsocket;
+	static pthread_t fdset_tid;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+	vsocket = find_vhost_user_socket(path);
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	if (!vsocket)
+		return -1;
+
+	if (fdset_tid == 0) {
+		rte_cpuset_t orig_cpuset;
+		rte_cpuset_t tmp_cpuset;
+		long num_cores, i;
+		int ret;
+
+		CPU_ZERO(&tmp_cpuset);
+		num_cores = sysconf(_SC_NPROCESSORS_CONF);
+		/* Create a mask containing all CPUs */
+		for (i = 0; i < num_cores; i++) {
+			CPU_SET(i, &tmp_cpuset);
+		}
+
+		rte_thread_get_affinity(&orig_cpuset);
+		rte_thread_set_affinity(&tmp_cpuset);
+		ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch,
+				     &vhost_user.fdset);
+		rte_thread_set_affinity(&orig_cpuset);
+		if (ret < 0)
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"failed to create fdset handling thread");
+	}
+
+	if (vsocket->is_server)
+		return vhost_user_start_server(vsocket);
+	else
+		return vhost_user_start_client(vsocket);
+}
diff --git a/src/spdk/lib/rte_vhost/vhost.c b/src/spdk/lib/rte_vhost/vhost.c
new file mode 100644
index 000000000..8e875c585
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/vhost.c
@@ -0,0 +1,565 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_vhost.h>
+
+#include "vhost.h"
+
+struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+struct virtio_net *
+get_device(int vid)
+{
+	struct virtio_net *dev = vhost_devices[vid];
+
+	if (unlikely(!dev)) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"(%d) device not found.\n", vid);
+	}
+
+	return dev;
+}
+
+static void
+cleanup_vq(struct vhost_virtqueue *vq, int destroy)
+{
+	if ((vq->callfd >= 0) && (destroy != 0))
+		close(vq->callfd);
+	if (vq->kickfd >= 0)
+		close(vq->kickfd);
+}
+
+/*
+ * Unmap any memory, close any file descriptors and
+ * free any memory owned by a device.
+ */
+void
+cleanup_device(struct virtio_net *dev, int destroy)
+{
+	uint32_t i;
+
+	vhost_backend_cleanup(dev);
+
+	for (i = 0; i < dev->nr_vring; i++)
+		cleanup_vq(dev->virtqueue[i], destroy);
+}
+
+/*
+ * Release virtqueues and device memory.
+ */
+static void
+free_device(struct virtio_net *dev)
+{
+	uint32_t i;
+	struct vhost_virtqueue *vq;
+
+	for (i = 0; i < dev->nr_vring; i++) {
+		vq = dev->virtqueue[i];
+
+		rte_free(vq->shadow_used_ring);
+
+		rte_free(vq);
+	}
+
+	rte_free(dev);
+}
+
+static void
+init_vring_queue(struct vhost_virtqueue *vq)
+{
+	memset(vq, 0, sizeof(struct vhost_virtqueue));
+
+	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+	vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+	/* Backends are set to -1 indicating an inactive device. */
+	vq->backend = -1;
+
+	/*
+	 * always set the vq to enabled; this is to keep compatibility
+	 * with the old QEMU, whereas there is no SET_VRING_ENABLE message.
+	 */
+	vq->enabled = 1;
+
+	TAILQ_INIT(&vq->zmbuf_list);
+}
+
+static void
+reset_vring_queue(struct vhost_virtqueue *vq)
+{
+	int callfd;
+
+	callfd = vq->callfd;
+	init_vring_queue(vq);
+	vq->callfd = callfd;
+}
+
+int
+alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
+{
+	struct vhost_virtqueue *vq;
+
+	vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0);
+	if (vq == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to allocate memory for vring:%u.\n", vring_idx);
+		return -1;
+	}
+
+	dev->virtqueue[vring_idx] = vq;
+	init_vring_queue(vq);
+
+	dev->nr_vring += 1;
+
+	return 0;
+}
+
+/*
+ * Reset some variables in device structure, while keeping few
+ * others untouched, such as vid, ifname, nr_vring: they
+ * should be same unless the device is removed.
+ */
+void
+reset_device(struct virtio_net *dev)
+{
+	uint32_t i;
+
+	dev->negotiated_features = 0;
+	dev->protocol_features = 0;
+	dev->flags = 0;
+
+	for (i = 0; i < dev->nr_vring; i++)
+		reset_vring_queue(dev->virtqueue[i]);
+}
+
+/*
+ * Invoked when there is a new vhost-user connection established (when
+ * there is a new virtio device being attached).
+ */
+int
+vhost_new_device(uint64_t features, struct vhost_device_ops const *ops)
+{
+	struct virtio_net *dev;
+	int i;
+
+	dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
+	if (dev == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to allocate memory for new dev.\n");
+		return -1;
+	}
+
+	for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+		if (vhost_devices[i] == NULL)
+			break;
+	}
+	if (i == MAX_VHOST_DEVICE) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to find a free slot for new device.\n");
+		rte_free(dev);
+		return -1;
+	}
+
+	vhost_devices[i] = dev;
+	dev->vid = i;
+	dev->features = features;
+	dev->notify_ops = ops;
+
+	return i;
+}
+
+/*
+ * Invoked when there is the vhost-user connection is broken (when
+ * the virtio device is being detached).
+ */
+void
+vhost_destroy_device(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		dev->notify_ops->destroy_device(vid);
+	}
+
+	cleanup_device(dev, 1);
+	free_device(dev);
+
+	vhost_devices[vid] = NULL;
+}
+
+void
+vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
+{
+	struct virtio_net *dev;
+	unsigned int len;
+
+	dev = get_device(vid);
+	if (dev == NULL)
+		return;
+
+	len = if_len > sizeof(dev->ifname) ?
+		sizeof(dev->ifname) : if_len;
+
+	strncpy(dev->ifname, if_name, len);
+	dev->ifname[sizeof(dev->ifname) - 1] = '\0';
+}
+
+void
+vhost_enable_dequeue_zero_copy(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	dev->dequeue_zero_copy = 1;
+}
+
+int
+rte_vhost_get_mtu(int vid, uint16_t *mtu)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (!dev)
+		return -ENODEV;
+
+	if (!(dev->flags & VIRTIO_DEV_READY))
+		return -EAGAIN;
+
+	if (!(dev->negotiated_features & VIRTIO_NET_F_MTU))
+		return -ENOTSUP;
+
+	*mtu = dev->mtu;
+
+	return 0;
+}
+
+int
+rte_vhost_get_numa_node(int vid)
+{
+#ifdef RTE_LIBRTE_VHOST_NUMA
+	struct virtio_net *dev = get_device(vid);
+	int numa_node;
+	int ret;
+
+	if (dev == NULL)
+		return -1;
+
+	ret = get_mempolicy(&numa_node, NULL, 0, dev,
+			    MPOL_F_NODE | MPOL_F_ADDR);
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"(%d) failed to query numa node: %d\n", vid, ret);
+		return -1;
+	}
+
+	return numa_node;
+#else
+	RTE_SET_USED(vid);
+	return -1;
+#endif
+}
+
+int
+rte_vhost_get_ifname(int vid, char *buf, size_t len)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return -1;
+
+	len = RTE_MIN(len, sizeof(dev->ifname));
+
+	strncpy(buf, dev->ifname, len);
+	buf[len - 1] = '\0';
+
+	return 0;
+}
+
+int
+rte_vhost_get_negotiated_features(int vid, uint64_t *features)
+{
+	struct virtio_net *dev;
+
+	dev = get_device(vid);
+	if (!dev)
+		return -1;
+
+	*features = dev->negotiated_features;
+	return 0;
+}
+
+int
+rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem)
+{
+	struct virtio_net *dev;
+	struct rte_vhost_memory *m;
+	size_t size;
+
+	dev = get_device(vid);
+	if (!dev)
+		return -1;
+
+	size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region);
+	m = malloc(sizeof(struct rte_vhost_memory) + size);
+	if (!m)
+		return -1;
+
+	m->nregions = dev->mem->nregions;
+	memcpy(m->regions, dev->mem->regions, size);
+	*mem = m;
+
+	return 0;
+}
+
+int
+rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
+			  struct rte_vhost_vring *vring)
+{
+	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
+
+	dev = get_device(vid);
+	if (!dev)
+		return -1;
+
+	if (vring_idx >= VHOST_MAX_VRING)
+		return -1;
+
+	vq = dev->virtqueue[vring_idx];
+	if (!vq)
+		return -1;
+
+	vring->desc  = vq->desc;
+	vring->avail = vq->avail;
+	vring->used  = vq->used;
+	vring->log_guest_addr  = vq->log_guest_addr;
+
+	vring->callfd  = vq->callfd;
+	vring->kickfd  = vq->kickfd;
+	vring->size    = vq->size;
+
+	return 0;
+}
+
+uint16_t
+rte_vhost_avail_entries(int vid, uint16_t queue_id)
+{
+	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	vq = dev->virtqueue[queue_id];
+	if (!vq->enabled)
+		return 0;
+
+	return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
+}
+
+int
+rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return -1;
+
+	if (enable) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"guest notification isn't supported.\n");
+		return -1;
+	}
+
+	dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY;
+	return 0;
+}
+
+void
+rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	vhost_log_write(dev, addr, len);
+}
+
+void
+rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
+			 uint64_t offset, uint64_t len)
+{
+	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
+
+	dev = get_device(vid);
+	if (dev == NULL)
+		return;
+
+	if (vring_idx >= VHOST_MAX_VRING)
+		return;
+	vq = dev->virtqueue[vring_idx];
+	if (!vq)
+		return;
+
+	vhost_log_used_vring(dev, vq, offset, len);
+}
+
+int
+rte_vhost_set_vring_base(int vid, uint16_t vring_idx,
+		uint16_t last_avail_idx, uint16_t last_used_idx)
+{
+	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
+
+	dev = get_device(vid);
+	if (!dev)
+		return -1;
+
+	if (vring_idx >= VHOST_MAX_VRING)
+		return -1;
+
+	vq = dev->virtqueue[vring_idx];
+	if (!vq)
+		return -1;
+
+	vq->last_avail_idx = last_avail_idx;
+	vq->last_used_idx = last_used_idx;
+
+	return 0;
+}
+
+int
+rte_vhost_get_vring_base(int vid, uint16_t vring_idx,
+		uint16_t *last_avail_idx, uint16_t *last_used_idx)
+{
+	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
+
+	dev = get_device(vid);
+	if (!dev)
+		return -1;
+
+	if (vring_idx >= VHOST_MAX_VRING)
+		return -1;
+
+	vq = dev->virtqueue[vring_idx];
+	if (!vq)
+		return -1;
+
+	*last_avail_idx = vq->last_avail_idx;
+	*last_used_idx = vq->last_used_idx;
+
+	return 0;
+}
+
+int
+rte_vhost_vring_call(int vid, uint16_t vring_idx)
+{
+	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
+
+	dev = get_device(vid);
+	if(!dev)
+		return -1;
+
+	if (vring_idx >= VHOST_MAX_VRING)
+		return -1;
+
+	vq = dev->virtqueue[vring_idx];
+	if (!vq)
+		return -1;
+
+	/* Ensure all our used ring changes are visible to the guest at the time
+	 * of interrupt.
+	 * TODO: this is currently an sfence on x86. For other architectures we
+	 * will most likely need an smp_mb(), but smp_mb() is an overkill for x86.
+	 */
+	rte_wmb();
+
+	if (vq->callfd != -1) {
+		eventfd_write(vq->callfd, (eventfd_t)1);
+		return 0;
+	}
+
+	return -1;
+}
+
+int
+rte_vhost_set_last_inflight_io_split(int vid, uint16_t vring_idx,
+				     uint16_t idx)
+{
+	return 0;
+}
+
+int
+rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
+				  uint16_t last_used_idx, uint16_t idx)
+{
+	return 0;
+}
+
+int
+rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
+				  uint16_t idx)
+{
+	return 0;
+}
+
+int
+rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
+				  struct rte_vhost_ring_inflight *vring)
+{
+	return 0;
+}
diff --git a/src/spdk/lib/rte_vhost/vhost.h b/src/spdk/lib/rte_vhost/vhost.h
new file mode 100644
index 000000000..d738dba7f
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/vhost.h
@@ -0,0 +1,330 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_CDEV_H_
+#define _VHOST_NET_CDEV_H_
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <unistd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <sys/socket.h>
+#include <linux/if.h>
+
+#include <rte_log.h>
+#include <rte_ether.h>
+
+#include "rte_vhost.h"
+#include "vhost_user.h"
+
+/* Used to indicate that the device is running on a data core */
+#define VIRTIO_DEV_RUNNING 1
+/* Used to indicate that the device is ready to operate */
+#define VIRTIO_DEV_READY 2
+
+/* Backend value set by guest. */
+#define VIRTIO_DEV_STOPPED -1
+
+#define BUF_VECTOR_MAX 256
+
+/**
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+ */
+struct buf_vector {
+	uint64_t buf_addr;
+	uint32_t buf_len;
+	uint32_t desc_idx;
+};
+
+/*
+ * A structure to hold some fields needed in zero copy code path,
+ * mainly for associating an mbuf with the right desc_idx.
+ */
+struct zcopy_mbuf {
+	struct rte_mbuf *mbuf;
+	uint32_t desc_idx;
+	uint16_t in_use;
+
+	TAILQ_ENTRY(zcopy_mbuf) next;
+};
+TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf);
+
+/**
+ * Structure contains variables relevant to RX/TX virtqueues.
+ */
+struct vhost_virtqueue {
+	struct vring_desc	*desc;
+	struct vring_avail	*avail;
+	struct vring_used	*used;
+	uint32_t		size;
+
+	uint16_t		last_avail_idx;
+	uint16_t		last_used_idx;
+#define VIRTIO_INVALID_EVENTFD		(-1)
+#define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
+
+	/* Backend value to determine if device should started/stopped */
+	int			backend;
+	/* Used to notify the guest (trigger interrupt) */
+	int			callfd;
+	/* Currently unused as polling mode is enabled */
+	int			kickfd;
+	int			enabled;
+
+	/* Physical address of used ring, for logging */
+	uint64_t		log_guest_addr;
+
+	uint16_t		nr_zmbuf;
+	uint16_t		zmbuf_size;
+	uint16_t		last_zmbuf_idx;
+	struct zcopy_mbuf	*zmbufs;
+	struct zcopy_mbuf_list	zmbuf_list;
+
+	struct vring_used_elem  *shadow_used_ring;
+	uint16_t                shadow_used_idx;
+} __rte_cache_aligned;
+
+/* Old kernels have no such macros defined */
+#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
+ #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
+#endif
+
+#ifndef VIRTIO_NET_F_MQ
+ #define VIRTIO_NET_F_MQ		22
+#endif
+
+#define VHOST_MAX_VRING			0x100
+#define VHOST_MAX_QUEUE_PAIRS		0x80
+
+#ifndef VIRTIO_NET_F_MTU
+ #define VIRTIO_NET_F_MTU 3
+#endif
+
+/*
+ * Define virtio 1.0 for older kernels
+ */
+#ifndef VIRTIO_F_VERSION_1
+ #define VIRTIO_F_VERSION_1 32
+#endif
+
+#define VHOST_USER_F_PROTOCOL_FEATURES	30
+
+/* Features supported by this builtin vhost-user net driver. */
+#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
+				(1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+				(1ULL << VIRTIO_NET_F_CTRL_RX) | \
+				(1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+				(1ULL << VIRTIO_NET_F_MQ)      | \
+				(1ULL << VIRTIO_F_VERSION_1)   | \
+				(1ULL << VHOST_F_LOG_ALL)      | \
+				(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+				(1ULL << VIRTIO_NET_F_HOST_TSO4) | \
+				(1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+				(1ULL << VIRTIO_NET_F_CSUM)    | \
+				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
+				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
+				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
+				(1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
+				(1ULL << VIRTIO_NET_F_MTU))
+
+
+struct guest_page {
+	uint64_t guest_phys_addr;
+	uint64_t host_phys_addr;
+	uint64_t size;
+};
+
+/* struct ether_addr was renamed to struct rte_ether_addr at one point */
+#ifdef RTE_ETHER_ADDR_LEN
+struct ether_addr {
+	uint8_t addr_bytes[RTE_ETHER_ADDR_LEN];
+} __attribute__((__packed__));
+#endif
+
+/**
+ * Device structure contains all configuration information relating
+ * to the device.
+ */
+struct virtio_net {
+	/* Frontend (QEMU) memory and memory region information */
+	struct rte_vhost_memory	*mem;
+	uint64_t		features;
+	uint64_t		negotiated_features;
+	uint64_t		protocol_features;
+	int			vid;
+	uint32_t		is_nvme;
+	uint32_t		flags;
+	uint16_t		vhost_hlen;
+	/* to tell if we need broadcast rarp packet */
+	rte_atomic16_t		broadcast_rarp;
+	uint32_t		nr_vring;
+	int			dequeue_zero_copy;
+	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
+	char			ifname[IF_NAME_SZ];
+	uint64_t		log_size;
+	uint64_t		log_base;
+	uint64_t		log_addr;
+	struct ether_addr	mac;
+	uint16_t		mtu;
+
+	struct vhost_device_ops const *notify_ops;
+
+	uint32_t		nr_guest_pages;
+	uint32_t		max_guest_pages;
+	struct guest_page       *guest_pages;
+	int                     has_new_mem_table;
+	void			*bar_addr;
+	uint64_t		bar_size;
+	struct VhostUserMemory  mem_table;
+	int                     mem_table_fds[VHOST_MEMORY_MAX_NREGIONS];
+} __rte_cache_aligned;
+
+
+#define VHOST_LOG_PAGE	4096
+
+static inline void __attribute__((always_inline))
+vhost_log_page(uint8_t *log_base, uint64_t page)
+{
+	log_base[page / 8] |= 1 << (page % 8);
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
+{
+	uint64_t page;
+
+	if (likely(((dev->negotiated_features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+		   !dev->log_base || !len))
+		return;
+
+	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+		return;
+
+	/* To make sure guest memory updates are committed before logging */
+	rte_smp_wmb();
+
+	page = addr / VHOST_LOG_PAGE;
+	while (page * VHOST_LOG_PAGE < addr + len) {
+		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+		page += 1;
+	}
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		     uint64_t offset, uint64_t len)
+{
+	vhost_log_write(dev, vq->log_guest_addr + offset, len);
+}
+
+/* Macros for printing using RTE_LOG */
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
+#define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+#define VHOST_MAX_PRINT_BUFF 6072
+#define VHOST_LOG_LEVEL RTE_LOG_DEBUG
+#define VHOST_LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
+#define PRINT_PACKET(device, addr, size, header) do { \
+	char *pkt_addr = (char *)(addr); \
+	unsigned int index; \
+	char packet[VHOST_MAX_PRINT_BUFF]; \
+	\
+	if ((header)) \
+		snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
+	else \
+		snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
+	for (index = 0; index < (size); index++) { \
+		snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
+			"%02hhx ", pkt_addr[index]); \
+	} \
+	snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
+	\
+	VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \
+} while (0)
+#else
+#define VHOST_LOG_LEVEL RTE_LOG_INFO
+#define VHOST_LOG_DEBUG(log_type, fmt, args...) do {} while (0)
+#define PRINT_PACKET(device, addr, size, header) do {} while (0)
+#endif
+
+extern uint64_t VHOST_FEATURES;
+#define MAX_VHOST_DEVICE	1024
+extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* Convert guest physical address to host physical address */
+static inline phys_addr_t __attribute__((always_inline))
+gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
+{
+	uint32_t i;
+	struct guest_page *page;
+
+	for (i = 0; i < dev->nr_guest_pages; i++) {
+		page = &dev->guest_pages[i];
+
+		if (gpa >= page->guest_phys_addr &&
+		    gpa + size < page->guest_phys_addr + page->size) {
+			return gpa - page->guest_phys_addr +
+			       page->host_phys_addr;
+		}
+	}
+
+	return 0;
+}
+
+struct virtio_net *get_device(int vid);
+
+int vhost_new_device(uint64_t features, struct vhost_device_ops const *ops);
+void cleanup_device(struct virtio_net *dev, int destroy);
+void reset_device(struct virtio_net *dev);
+void vhost_destroy_device(int);
+
+int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx);
+
+void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
+void vhost_enable_dequeue_zero_copy(int vid);
+
+struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
+
+/*
+ * Backend-specific cleanup.
+ *
+ * TODO: fix it; we have one backend now
+ */
+void vhost_backend_cleanup(struct virtio_net *dev);
+
+#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/src/spdk/lib/rte_vhost/vhost_user.c b/src/spdk/lib/rte_vhost/vhost_user.c
new file mode 100644
index 000000000..a07483fcf
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/vhost_user.c
@@ -0,0 +1,1426 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <asm/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <assert.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_log.h>
+
+#include "vhost.h"
+#include "vhost_user.h"
+
+#define VIRTIO_MIN_MTU 68
+#define VIRTIO_MAX_MTU 65535
+
+static const char *vhost_message_str[VHOST_USER_MAX] = {
+	[VHOST_USER_NONE] = "VHOST_USER_NONE",
+	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
+	[VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
+	[VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
+	[VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
+	[VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
+	[VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
+	[VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
+	[VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
+	[VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
+	[VHOST_USER_NVME_ADMIN] = "VHOST_USER_NVME_ADMIN",
+	[VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL",
+	[VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP",
+	[VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP",
+	[VHOST_USER_NVME_SET_BAR_MR] = "VHOST_USER_NVME_SET_BAR_MR"
+};
+
+static uint64_t
+get_blk_size(int fd)
+{
+	struct stat stat;
+	int ret;
+
+	ret = fstat(fd, &stat);
+	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
+}
+
+static void
+free_mem_region(struct virtio_net *dev)
+{
+	uint32_t i;
+	struct rte_vhost_mem_region *reg;
+
+	if (!dev || !dev->mem)
+		return;
+
+	for (i = 0; i < dev->mem->nregions; i++) {
+		reg = &dev->mem->regions[i];
+		if (reg->host_user_addr) {
+			munmap(reg->mmap_addr, reg->mmap_size);
+			close(reg->fd);
+		}
+	}
+}
+
+void
+vhost_backend_cleanup(struct virtio_net *dev)
+{
+	uint32_t i;
+
+	if (dev->has_new_mem_table) {
+		for (i = 0; i < dev->mem_table.nregions; i++) {
+			close(dev->mem_table_fds[i]);
+		}
+		dev->has_new_mem_table = 0;
+	}
+	if (dev->mem) {
+		free_mem_region(dev);
+		rte_free(dev->mem);
+		dev->mem = NULL;
+	}
+
+	free(dev->guest_pages);
+	dev->guest_pages = NULL;
+
+	if (dev->log_addr) {
+		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+		dev->log_addr = 0;
+	}
+	if (dev->bar_addr) {
+		munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size);
+		dev->bar_addr = NULL;
+		dev->bar_size = 0;
+	}
+}
+
+/*
+ * This function just returns success at the moment unless
+ * the device hasn't been initialised.
+ */
+static int
+vhost_user_set_owner(void)
+{
+	return 0;
+}
+
+static int
+vhost_user_reset_owner(struct virtio_net *dev)
+{
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		dev->notify_ops->destroy_device(dev->vid);
+	}
+
+	cleanup_device(dev, 0);
+	reset_device(dev);
+	return 0;
+}
+
+/*
+ * The features that we support are requested.
+ */
+static uint64_t
+vhost_user_get_features(struct virtio_net *dev)
+{
+	return dev->features;
+}
+
+/*
+ * We receive the negotiated features supported by us and the virtio device.
+ */
+static int
+vhost_user_set_features(struct virtio_net *dev, uint64_t features)
+{
+	uint64_t vhost_features = 0;
+
+	vhost_features = vhost_user_get_features(dev);
+	if (features & ~vhost_features) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"(%d) received invalid negotiated features.\n",
+			dev->vid);
+		return -1;
+	}
+
+	if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->negotiated_features != features) {
+		if (dev->notify_ops->features_changed) {
+			dev->notify_ops->features_changed(dev->vid, features);
+		} else {
+			dev->flags &= ~VIRTIO_DEV_RUNNING;
+			dev->notify_ops->destroy_device(dev->vid);
+		}
+	}
+
+	dev->negotiated_features = features;
+	if (dev->negotiated_features &
+		((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
+		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	} else {
+		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
+	}
+	VHOST_LOG_DEBUG(VHOST_CONFIG,
+		"(%d) mergeable RX buffers %s, virtio 1 %s\n",
+		dev->vid,
+		(dev->negotiated_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
+		(dev->negotiated_features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
+
+	return 0;
+}
+
+/*
+ * The virtio device sends us the size of the descriptor ring.
+ */
+static int
+vhost_user_set_vring_num(struct virtio_net *dev,
+			 VhostUserMsg *msg)
+{
+	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+
+	vq->size = msg->payload.state.num;
+
+	if (dev->dequeue_zero_copy) {
+		vq->nr_zmbuf = 0;
+		vq->last_zmbuf_idx = 0;
+		vq->zmbuf_size = vq->size;
+		vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
+					 sizeof(struct zcopy_mbuf), 0);
+		if (vq->zmbufs == NULL) {
+			RTE_LOG(WARNING, VHOST_CONFIG,
+				"failed to allocate mem for zero copy; "
+				"zero copy is force disabled\n");
+			dev->dequeue_zero_copy = 0;
+		}
+	}
+
+	vq->shadow_used_ring = rte_malloc(NULL,
+				vq->size * sizeof(struct vring_used_elem),
+				RTE_CACHE_LINE_SIZE);
+	if (!vq->shadow_used_ring) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to allocate memory for shadow used ring.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the
+ * same numa node as the memory of vring descriptor.
+ */
+#ifdef RTE_LIBRTE_VHOST_NUMA
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index)
+{
+	int oldnode, newnode;
+	struct virtio_net *old_dev;
+	struct vhost_virtqueue *old_vq, *vq;
+	int ret;
+
+	old_dev = dev;
+	vq = old_vq = dev->virtqueue[index];
+
+	ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
+			    MPOL_F_NODE | MPOL_F_ADDR);
+
+	/* check if we need to reallocate vq */
+	ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
+			     MPOL_F_NODE | MPOL_F_ADDR);
+	if (ret) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Unable to get vq numa information.\n");
+		return dev;
+	}
+	if (oldnode != newnode) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"reallocate vq from %d to %d node\n", oldnode, newnode);
+		vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);
+		if (!vq)
+			return dev;
+
+		memcpy(vq, old_vq, sizeof(*vq));
+		rte_free(old_vq);
+	}
+
+	/* check if we need to reallocate dev */
+	ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
+			    MPOL_F_NODE | MPOL_F_ADDR);
+	if (ret) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Unable to get dev numa information.\n");
+		goto out;
+	}
+	if (oldnode != newnode) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"reallocate dev from %d to %d node\n",
+			oldnode, newnode);
+		dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
+		if (!dev) {
+			dev = old_dev;
+			goto out;
+		}
+
+		memcpy(dev, old_dev, sizeof(*dev));
+		rte_free(old_dev);
+	}
+
+out:
+	dev->virtqueue[index] = vq;
+	vhost_devices[dev->vid] = dev;
+
+	return dev;
+}
+#else
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index __rte_unused)
+{
+	return dev;
+}
+#endif
+
+/*
+ * Converts QEMU virtual address to Vhost virtual address. This function is
+ * used to convert the ring addresses to our address space.
+ */
+static uint64_t
+qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
+{
+	struct rte_vhost_mem_region *reg;
+	uint32_t i;
+
+	/* Find the region where the address lives. */
+	for (i = 0; i < dev->mem->nregions; i++) {
+		reg = &dev->mem->regions[i];
+
+		if (qva >= reg->guest_user_addr &&
+		    qva <  reg->guest_user_addr + reg->size) {
+
+			if (unlikely(*len > reg->guest_user_addr + reg->size - qva))
+				*len = reg->guest_user_addr + reg->size - qva;
+
+			return qva - reg->guest_user_addr +
+			       reg->host_user_addr;
+		}
+	}
+
+	return 0;
+}
+
+static int vhost_setup_mem_table(struct virtio_net *dev);
+
+/*
+ * The virtio device sends us the desc, used and avail ring addresses.
+ * This function then converts these to our address space.
+ */
+static int
+vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg)
+{
+	struct vhost_virtqueue *vq;
+	uint64_t len;
+
+	/* Remove from the data plane. */
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		dev->notify_ops->destroy_device(dev->vid);
+	}
+
+	if (dev->has_new_mem_table) {
+		vhost_setup_mem_table(dev);
+		dev->has_new_mem_table = 0;
+	}
+
+	if (dev->mem == NULL)
+		return -1;
+
+	/* addr->index refers to the queue index. The txq 1, rxq is 0. */
+	vq = dev->virtqueue[msg->payload.addr.index];
+
+	/* The addresses are converted from QEMU virtual to Vhost virtual. */
+	len = sizeof(struct vring_desc) * vq->size;
+	vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
+			msg->payload.addr.desc_user_addr, &len);
+	if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"(%d) failed to map desc ring.\n",
+			dev->vid);
+		return -1;
+	}
+
+	dev = numa_realloc(dev, msg->payload.addr.index);
+	vq = dev->virtqueue[msg->payload.addr.index];
+
+	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
+	vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
+			msg->payload.addr.avail_user_addr, &len);
+	if (vq->avail == 0 ||
+			len != sizeof(struct vring_avail)
+			+ sizeof(uint16_t) * vq->size) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"(%d) failed to find avail ring address.\n",
+			dev->vid);
+		return -1;
+	}
+
+	len = sizeof(struct vring_used) +
+		sizeof(struct vring_used_elem) * vq->size;
+	vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
+			msg->payload.addr.used_user_addr, &len);
+	if (vq->used == 0 || len != sizeof(struct vring_used) +
+			sizeof(struct vring_used_elem) * vq->size) {
+
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"(%d) failed to find used ring address.\n",
+			dev->vid);
+		return -1;
+	}
+
+	if (vq->last_used_idx != vq->used->idx) {
+		RTE_LOG(WARNING, VHOST_CONFIG,
+			"last_used_idx (%u) and vq->used->idx (%u) mismatches; "
+			"some packets maybe resent for Tx and dropped for Rx\n",
+			vq->last_used_idx, vq->used->idx);
+		vq->last_used_idx  = vq->used->idx;
+		vq->last_avail_idx = vq->used->idx;
+	}
+
+	vq->log_guest_addr = msg->payload.addr.log_guest_addr;
+
+	VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
+			dev->vid, vq->desc);
+	VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
+			dev->vid, vq->avail);
+	VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
+			dev->vid, vq->used);
+	VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
+			dev->vid, vq->log_guest_addr);
+
+	return 0;
+}
+
+/*
+ * The virtio device sends us the available ring last used index.
+ */
+static int
+vhost_user_set_vring_base(struct virtio_net *dev,
+			  VhostUserMsg *msg)
+{
+	/* Remove from the data plane. */
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		dev->notify_ops->destroy_device(dev->vid);
+	}
+
+	dev->virtqueue[msg->payload.state.index]->last_used_idx  = msg->payload.state.num;
+	dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num;
+
+	return 0;
+}
+
+static void
+add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
+		   uint64_t host_phys_addr, uint64_t size)
+{
+	struct guest_page *page, *last_page;
+
+	if (dev->nr_guest_pages == dev->max_guest_pages) {
+		dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2);
+		dev->guest_pages = realloc(dev->guest_pages,
+					dev->max_guest_pages * sizeof(*page));
+	}
+
+	if (dev->nr_guest_pages > 0) {
+		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
+		/* merge if the two pages are continuous */
+		if (host_phys_addr == last_page->host_phys_addr +
+				      last_page->size) {
+			last_page->size += size;
+			return;
+		}
+	}
+
+	page = &dev->guest_pages[dev->nr_guest_pages++];
+	page->guest_phys_addr = guest_phys_addr;
+	page->host_phys_addr  = host_phys_addr;
+	page->size = size;
+}
+
+static void
+add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
+		uint64_t page_size)
+{
+	uint64_t reg_size = reg->size;
+	uint64_t host_user_addr  = reg->host_user_addr;
+	uint64_t guest_phys_addr = reg->guest_phys_addr;
+	uint64_t host_phys_addr;
+	uint64_t size;
+
+	host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
+	size = page_size - (guest_phys_addr & (page_size - 1));
+	size = RTE_MIN(size, reg_size);
+
+	add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+	host_user_addr  += size;
+	guest_phys_addr += size;
+	reg_size -= size;
+
+	while (reg_size > 0) {
+		size = RTE_MIN(reg_size, page_size);
+		host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)
+						  host_user_addr);
+		add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+
+		host_user_addr  += size;
+		guest_phys_addr += size;
+		reg_size -= size;
+	}
+}
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+/* TODO: enable it only in debug mode? */
+static void
+dump_guest_pages(struct virtio_net *dev)
+{
+	uint32_t i;
+	struct guest_page *page;
+
+	for (i = 0; i < dev->nr_guest_pages; i++) {
+		page = &dev->guest_pages[i];
+
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"guest physical page region %u\n"
+			"\t guest_phys_addr: %" PRIx64 "\n"
+			"\t host_phys_addr : %" PRIx64 "\n"
+			"\t size           : %" PRIx64 "\n",
+			i,
+			page->guest_phys_addr,
+			page->host_phys_addr,
+			page->size);
+	}
+}
+#else
+#define dump_guest_pages(dev)
+#endif
+
+static int
+vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+	uint32_t i;
+
+	if (dev->has_new_mem_table) {
+		/*
+		 * The previous mem table was not consumed, so close the
+		 *  file descriptors from that mem table before copying
+		 *  the new one.
+		 */
+		for (i = 0; i < dev->mem_table.nregions; i++) {
+			close(dev->mem_table_fds[i]);
+		}
+	}
+
+	memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table));
+	memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds));
+	dev->has_new_mem_table = 1;
+	/* vhost-user-nvme will not send
+	 * set vring addr message, enable
+	 * memory address table now.
+	 */
+	if (dev->has_new_mem_table && dev->is_nvme) {
+		vhost_setup_mem_table(dev);
+		dev->has_new_mem_table = 0;
+	}
+
+	return 0;
+}
+
+ static int
+vhost_setup_mem_table(struct virtio_net *dev)
+{
+	struct VhostUserMemory memory = dev->mem_table;
+	struct rte_vhost_mem_region *reg;
+	struct vhost_virtqueue *vq;
+	void *mmap_addr;
+	uint64_t mmap_size;
+	uint64_t mmap_offset;
+	uint64_t alignment;
+	uint32_t i;
+	int fd;
+
+	if (dev->mem) {
+		free_mem_region(dev);
+		rte_free(dev->mem);
+		dev->mem = NULL;
+	}
+
+	for (i = 0; i < dev->nr_vring; i++) {
+		vq = dev->virtqueue[i];
+		/* Those addresses won't be valid anymore in host address space
+		 * after setting new mem table. Initiator need to resend these
+		 * addresses.
+		 */
+		vq->desc = NULL;
+		vq->avail = NULL;
+		vq->used = NULL;
+	}
+
+	dev->nr_guest_pages = 0;
+	if (!dev->guest_pages) {
+		dev->max_guest_pages = 8;
+		dev->guest_pages = malloc(dev->max_guest_pages *
+						sizeof(struct guest_page));
+	}
+
+	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
+		sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
+	if (dev->mem == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"(%d) failed to allocate memory for dev->mem\n",
+			dev->vid);
+		return -1;
+	}
+	dev->mem->nregions = memory.nregions;
+
+	for (i = 0; i < memory.nregions; i++) {
+		fd  = dev->mem_table_fds[i];
+		reg = &dev->mem->regions[i];
+
+		reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
+		reg->guest_user_addr = memory.regions[i].userspace_addr;
+		reg->size            = memory.regions[i].memory_size;
+		reg->fd              = fd;
+
+		mmap_offset = memory.regions[i].mmap_offset;
+		mmap_size   = reg->size + mmap_offset;
+
+		/* mmap() without flag of MAP_ANONYMOUS, should be called
+		 * with length argument aligned with hugepagesz at older
+		 * longterm version Linux, like 2.6.32 and 3.2.72, or
+		 * mmap() will fail with EINVAL.
+		 *
+		 * to avoid failure, make sure in caller to keep length
+		 * aligned.
+		 */
+		alignment = get_blk_size(fd);
+		if (alignment == (uint64_t)-1) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"couldn't get hugepage size through fstat\n");
+			goto err_mmap;
+		}
+		mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+
+		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+				 MAP_SHARED | MAP_POPULATE, fd, 0);
+
+		if (mmap_addr == MAP_FAILED) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"mmap region %u failed.\n", i);
+			goto err_mmap;
+		}
+
+		if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) {
+			RTE_LOG(INFO, VHOST_CONFIG,
+				"MADV_DONTDUMP advice setting failed.\n");
+		}
+
+		reg->mmap_addr = mmap_addr;
+		reg->mmap_size = mmap_size;
+		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
+				      mmap_offset;
+
+		if (dev->dequeue_zero_copy)
+			add_guest_pages(dev, reg, alignment);
+
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"guest memory region %u, size: 0x%" PRIx64 "\n"
+			"\t guest physical addr: 0x%" PRIx64 "\n"
+			"\t guest virtual  addr: 0x%" PRIx64 "\n"
+			"\t host  virtual  addr: 0x%" PRIx64 "\n"
+			"\t mmap addr : 0x%" PRIx64 "\n"
+			"\t mmap size : 0x%" PRIx64 "\n"
+			"\t mmap align: 0x%" PRIx64 "\n"
+			"\t mmap off  : 0x%" PRIx64 "\n",
+			i, reg->size,
+			reg->guest_phys_addr,
+			reg->guest_user_addr,
+			reg->host_user_addr,
+			(uint64_t)(uintptr_t)mmap_addr,
+			mmap_size,
+			alignment,
+			mmap_offset);
+	}
+
+	dump_guest_pages(dev);
+
+	return 0;
+
+err_mmap:
+	free_mem_region(dev);
+	rte_free(dev->mem);
+	dev->mem = NULL;
+	return -1;
+}
+
+static int
+vq_is_ready(struct vhost_virtqueue *vq)
+{
+	return vq && vq->desc   &&
+	       vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
+	       vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD &&
+	       vq->kickfd != VIRTIO_INVALID_EVENTFD &&
+	       vq->callfd != VIRTIO_INVALID_EVENTFD;
+}
+
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+	struct vhost_virtqueue *vq;
+	uint32_t i;
+
+	if (dev->nr_vring == 0)
+		return 0;
+
+	for (i = 0; i < dev->nr_vring; i++) {
+		vq = dev->virtqueue[i];
+
+		if (vq_is_ready(vq)) {
+			RTE_LOG(INFO, VHOST_CONFIG,
+				"virtio is now ready for processing.\n");
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static void
+vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+	struct vhost_vring_file file;
+	struct vhost_virtqueue *vq;
+
+	/* Remove from the data plane. */
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		dev->notify_ops->destroy_device(dev->vid);
+	}
+
+	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+		file.fd = VIRTIO_INVALID_EVENTFD;
+	else
+		file.fd = pmsg->fds[0];
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring call idx:%d file:%d\n", file.index, file.fd);
+
+	vq = dev->virtqueue[file.index];
+	if (vq->callfd >= 0)
+		close(vq->callfd);
+
+	vq->callfd = file.fd;
+}
+
+static void
+vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+	struct vhost_vring_file file;
+	struct vhost_virtqueue *vq;
+
+	/* Remove from the data plane. */
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		dev->notify_ops->destroy_device(dev->vid);
+	}
+
+	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+		file.fd = VIRTIO_INVALID_EVENTFD;
+	else
+		file.fd = pmsg->fds[0];
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring kick idx:%d file:%d\n", file.index, file.fd);
+
+	vq = dev->virtqueue[file.index];
+	if (vq->kickfd >= 0)
+		close(vq->kickfd);
+	vq->kickfd = file.fd;
+}
+
+static void
+free_zmbufs(struct vhost_virtqueue *vq)
+{
+	struct zcopy_mbuf *zmbuf, *next;
+
+	for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+	     zmbuf != NULL; zmbuf = next) {
+		next = TAILQ_NEXT(zmbuf, next);
+
+		rte_pktmbuf_free(zmbuf->mbuf);
+		TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+	}
+
+	rte_free(vq->zmbufs);
+}
+
+/*
+ * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
+ */
+static int
+vhost_user_get_vring_base(struct virtio_net *dev,
+			  VhostUserMsg *msg)
+{
+	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+
+	/* We have to stop the queue (virtio) if it is running. */
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		dev->notify_ops->destroy_device(dev->vid);
+	}
+
+	dev->flags &= ~VIRTIO_DEV_READY;
+
+	/* Here we are safe to get the last used index */
+	msg->payload.state.num = vq->last_used_idx;
+
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num);
+	/*
+	 * Based on current qemu vhost-user implementation, this message is
+	 * sent and only sent in vhost_vring_stop.
+	 * TODO: cleanup the vring, it isn't usable since here.
+	 */
+	if (vq->kickfd >= 0)
+		close(vq->kickfd);
+
+	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+	if (vq->callfd >= 0)
+		close(vq->callfd);
+
+	vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+	if (dev->dequeue_zero_copy)
+		free_zmbufs(vq);
+	rte_free(vq->shadow_used_ring);
+	vq->shadow_used_ring = NULL;
+
+	return 0;
+}
+
+/*
+ * when virtio queues are ready to work, qemu will send us to
+ * enable the virtio queue pair.
+ */
+static int
+vhost_user_set_vring_enable(struct virtio_net *dev,
+			    VhostUserMsg *msg)
+{
+	int enable = (int)msg->payload.state.num;
+
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"set queue enable: %d to qp idx: %d\n",
+		enable, msg->payload.state.index);
+
+	if (dev->notify_ops->vring_state_changed)
+		dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable);
+
+	dev->virtqueue[msg->payload.state.index]->enabled = enable;
+
+	return 0;
+}
+
+static void
+vhost_user_set_protocol_features(struct virtio_net *dev,
+				 uint64_t protocol_features)
+{
+	if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
+		return;
+
+	/* Remove from the data plane. */
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		dev->notify_ops->destroy_device(dev->vid);
+	}
+
+	dev->protocol_features = protocol_features;
+}
+
+static int
+vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+	int fd = msg->fds[0];
+	uint64_t size, off;
+	void *addr;
+
+	if (fd < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
+		return -1;
+	}
+
+	if (msg->size != sizeof(VhostUserLog)) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"invalid log base msg size: %"PRId32" != %d\n",
+			msg->size, (int)sizeof(VhostUserLog));
+		return -1;
+	}
+
+	/* Remove from the data plane. */
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		dev->notify_ops->destroy_device(dev->vid);
+	}
+
+	size = msg->payload.log.mmap_size;
+	off  = msg->payload.log.mmap_offset;
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"log mmap size: %"PRId64", offset: %"PRId64"\n",
+		size, off);
+
+	/*
+	 * mmap from 0 to workaround a hugepage mmap bug: mmap will
+	 * fail when offset is not page size aligned.
+	 */
+	addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	close(fd);
+	if (addr == MAP_FAILED) {
+		RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
+		return -1;
+	}
+
+	/*
+	 * Free previously mapped log memory on occasionally
+	 * multiple VHOST_USER_SET_LOG_BASE.
+	 */
+	if (dev->log_addr) {
+		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+	}
+	dev->log_addr = (uint64_t)(uintptr_t)addr;
+	dev->log_base = dev->log_addr + off;
+	dev->log_size = size;
+
+	return 0;
+}
+
+/*
+ * An rarp packet is constructed and broadcasted to notify switches about
+ * the new location of the migrated VM, so that packets from outside will
+ * not be lost after migration.
+ *
+ * However, we don't actually "send" a rarp packet here, instead, we set
+ * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
+ */
+static int
+vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+	uint8_t *mac = (uint8_t *)&msg->payload.u64;
+
+	RTE_LOG(DEBUG, VHOST_CONFIG,
+		":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
+		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+	memcpy(dev->mac.addr_bytes, mac, 6);
+
+	/*
+	 * Set the flag to inject a RARP broadcast packet at
+	 * rte_vhost_dequeue_burst().
+	 *
+	 * rte_smp_wmb() is for making sure the mac is copied
+	 * before the flag is set.
+	 */
+	rte_smp_wmb();
+	rte_atomic16_set(&dev->broadcast_rarp, 1);
+
+	return 0;
+}
+
+static int
+vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+	if (msg->payload.u64 < VIRTIO_MIN_MTU ||
+			msg->payload.u64 > VIRTIO_MAX_MTU) {
+		RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
+				msg->payload.u64);
+
+		return -1;
+	}
+
+	dev->mtu = msg->payload.u64;
+
+	return 0;
+}
+
+/* return bytes# of read on success or negative val on failure. */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+	int ret;
+
+	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+	if (ret <= 0)
+		return ret;
+
+	if (msg && msg->size) {
+		if (msg->size > sizeof(msg->payload)) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"invalid msg size: %d\n", msg->size);
+			return -1;
+		}
+		ret = read(sockfd, &msg->payload, msg->size);
+		if (ret <= 0)
+			return ret;
+		if (ret != (int)msg->size) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"read control message failed\n");
+			return -1;
+		}
+	}
+
+	return ret;
+}
+
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+	int ret;
+
+	if (!msg)
+		return 0;
+
+	msg->flags &= ~VHOST_USER_VERSION_MASK;
+	msg->flags &= ~VHOST_USER_NEED_REPLY;
+	msg->flags |= VHOST_USER_VERSION;
+	msg->flags |= VHOST_USER_REPLY_MASK;
+
+	ret = send_fd_message(sockfd, (char *)msg,
+		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+
+	return ret;
+}
+
+/*
+ * Allocate a queue pair if it hasn't been allocated yet
+ */
+static int
+vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
+{
+	uint16_t vring_idx;
+
+	switch (msg->request) {
+	case VHOST_USER_SET_VRING_KICK:
+	case VHOST_USER_SET_VRING_CALL:
+	case VHOST_USER_SET_VRING_ERR:
+		vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+		break;
+	case VHOST_USER_SET_VRING_NUM:
+	case VHOST_USER_SET_VRING_BASE:
+	case VHOST_USER_SET_VRING_ENABLE:
+		vring_idx = msg->payload.state.index;
+		break;
+	case VHOST_USER_SET_VRING_ADDR:
+		vring_idx = msg->payload.addr.index;
+		break;
+	default:
+		return 0;
+	}
+
+	if (vring_idx >= VHOST_MAX_VRING) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"invalid vring index: %u\n", vring_idx);
+		return -1;
+	}
+
+	if (dev->virtqueue[vring_idx])
+		return 0;
+
+	return alloc_vring_queue(dev, vring_idx);
+}
+
+static int
+vhost_user_nvme_admin_passthrough(struct virtio_net *dev,
+				  void *cmd, void *cqe, void *buf)
+{
+	if (dev->notify_ops->vhost_nvme_admin_passthrough) {
+		return dev->notify_ops->vhost_nvme_admin_passthrough(dev->vid, cmd, cqe, buf);
+	}
+
+	return -1;
+}
+
+static int
+vhost_user_nvme_set_cq_call(struct virtio_net *dev, uint16_t qid, int fd)
+{
+	if (dev->notify_ops->vhost_nvme_set_cq_call) {
+		return dev->notify_ops->vhost_nvme_set_cq_call(dev->vid, qid, fd);
+	}
+
+	return -1;
+}
+
+static int
+vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap)
+{
+	if (dev->notify_ops->vhost_nvme_get_cap) {
+		return dev->notify_ops->vhost_nvme_get_cap(dev->vid, cap);
+	}
+
+	return -1;
+}
+
+static int
+vhost_user_nvme_set_bar_mr(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+	struct VhostUserMemory mem_table;
+	int fd = pmsg->fds[0];
+	void *mmap_addr;
+	uint64_t mmap_size;
+	uint64_t mmap_offset;
+	uint64_t alignment;
+	struct rte_vhost_mem_region reg;
+	int ret = 0;
+
+	memcpy(&mem_table, &pmsg->payload.memory, sizeof(mem_table));
+
+	reg.guest_phys_addr = mem_table.regions[0].guest_phys_addr;
+	reg.guest_user_addr = mem_table.regions[0].userspace_addr;
+	reg.size            = mem_table.regions[0].memory_size;
+	reg.fd              = fd;
+	mmap_offset = mem_table.regions[0].mmap_offset;
+	mmap_size   = reg.size + mmap_offset;
+
+	alignment = get_blk_size(fd);
+	if (alignment == (uint64_t)-1) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"couldn't get hugepage size through fstat\n");
+			return -1;
+	}
+	mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+
+	mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+			 MAP_SHARED | MAP_POPULATE, fd, 0);
+
+	if (mmap_addr == MAP_FAILED) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"mmap region failed.\n");
+		return -1;
+	}
+
+	if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"MADV_DONTDUMP advice setting failed.\n");
+	}
+
+	reg.mmap_addr = mmap_addr;
+	reg.mmap_size = mmap_size;
+	reg.host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
+				      mmap_offset;
+
+	RTE_LOG(INFO, VHOST_CONFIG,
+			"BAR memory region %u, size: 0x%" PRIx64 "\n"
+			"\t guest physical addr: 0x%" PRIx64 "\n"
+			"\t guest virtual  addr: 0x%" PRIx64 "\n"
+			"\t host  virtual  addr: 0x%" PRIx64 "\n"
+			"\t mmap addr : 0x%" PRIx64 "\n"
+			"\t mmap size : 0x%" PRIx64 "\n"
+			"\t mmap align: 0x%" PRIx64 "\n"
+			"\t mmap off  : 0x%" PRIx64 "\n",
+			0, reg.size,
+			reg.guest_phys_addr,
+			reg.guest_user_addr,
+			reg.host_user_addr,
+			(uint64_t)(uintptr_t)mmap_addr,
+			mmap_size,
+			alignment,
+			mmap_offset);
+
+	if (dev->bar_addr) {
+		munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size);
+	}
+	dev->bar_addr = (void *)(uintptr_t)reg.host_user_addr;
+	dev->bar_size = reg.mmap_size;
+
+	if (dev->notify_ops->vhost_nvme_set_bar_mr) {
+		ret = dev->notify_ops->vhost_nvme_set_bar_mr(dev->vid, dev->bar_addr, dev->bar_size);
+		if (ret) {
+			munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size);
+			dev->bar_addr = NULL;
+			dev->bar_size = 0;
+		}
+	}
+
+	return ret;
+}
+
+int
+vhost_user_msg_handler(int vid, int fd)
+{
+	struct virtio_net *dev;
+	struct VhostUserMsg msg;
+	struct vhost_vring_file file;
+	int ret;
+	uint64_t cap;
+	uint64_t enable;
+	uint8_t cqe[16];
+	uint8_t cmd[64];
+	uint8_t buf[4096];
+
+	dev = get_device(vid);
+	if (dev == NULL)
+		return -1;
+
+	ret = read_vhost_message(fd, &msg);
+	if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
+		if (ret < 0)
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"vhost read message failed\n");
+		else if (ret == 0)
+			RTE_LOG(INFO, VHOST_CONFIG,
+				"vhost peer closed\n");
+		else
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"vhost read incorrect message\n");
+
+		return -1;
+	}
+
+	RTE_LOG(INFO, VHOST_CONFIG, "%s: read message %s\n",
+		dev->ifname, vhost_message_str[msg.request]);
+
+	ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to alloc queue\n");
+		return -1;
+	}
+
+	switch (msg.request) {
+	case VHOST_USER_GET_CONFIG:
+		if (dev->notify_ops->get_config(dev->vid,
+						msg.payload.config.region,
+						msg.payload.config.size) != 0) {
+			msg.size = sizeof(uint64_t);
+		}
+		send_vhost_message(fd, &msg);
+		break;
+	case VHOST_USER_SET_CONFIG:
+		if ((dev->notify_ops->set_config(dev->vid,
+						msg.payload.config.region,
+						msg.payload.config.offset,
+						msg.payload.config.size,
+						msg.payload.config.flags)) != 0) {
+			ret = 1;
+		} else {
+			ret = 0;
+		}
+		break;
+	case VHOST_USER_NVME_ADMIN:
+		if (!dev->is_nvme) {
+			dev->is_nvme = 1;
+		}
+		memcpy(cmd, msg.payload.nvme.cmd.req, sizeof(cmd));
+		ret = vhost_user_nvme_admin_passthrough(dev, cmd, cqe, buf);
+		memcpy(msg.payload.nvme.cmd.cqe, cqe, sizeof(cqe));
+		msg.size = sizeof(cqe);
+		/* NVMe Identify Command */
+		if (cmd[0] == 0x06) {
+			memcpy(msg.payload.nvme.buf, &buf, 4096);
+			msg.size += 4096;
+		}
+		send_vhost_message(fd, &msg);
+		break;
+	case VHOST_USER_NVME_SET_CQ_CALL:
+		file.index = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
+		file.fd = msg.fds[0];
+		ret = vhost_user_nvme_set_cq_call(dev, file.index, file.fd);
+		break;
+	case VHOST_USER_NVME_GET_CAP:
+		ret = vhost_user_nvme_get_cap(dev, &cap);
+		if (!ret)
+			msg.payload.u64 = cap;
+		else
+			msg.payload.u64 = 0;
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(fd, &msg);
+		break;
+	case VHOST_USER_NVME_START_STOP:
+		enable = msg.payload.u64;
+		/* device must be started before set cq call */
+		if (enable) {
+			if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+				if (dev->notify_ops->new_device(dev->vid) == 0)
+					dev->flags |= VIRTIO_DEV_RUNNING;
+			}
+		} else {
+			if (dev->flags & VIRTIO_DEV_RUNNING) {
+				dev->flags &= ~VIRTIO_DEV_RUNNING;
+				dev->notify_ops->destroy_device(dev->vid);
+			}
+		}
+		break;
+	case VHOST_USER_NVME_SET_BAR_MR:
+		ret = vhost_user_nvme_set_bar_mr(dev, &msg);
+		break;
+	case VHOST_USER_GET_FEATURES:
+		msg.payload.u64 = vhost_user_get_features(dev);
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(fd, &msg);
+		break;
+	case VHOST_USER_SET_FEATURES:
+		vhost_user_set_features(dev, msg.payload.u64);
+		break;
+
+	case VHOST_USER_GET_PROTOCOL_FEATURES:
+		msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(fd, &msg);
+		break;
+	case VHOST_USER_SET_PROTOCOL_FEATURES:
+		vhost_user_set_protocol_features(dev, msg.payload.u64);
+		break;
+
+	case VHOST_USER_SET_OWNER:
+		vhost_user_set_owner();
+		break;
+	case VHOST_USER_RESET_OWNER:
+		vhost_user_reset_owner(dev);
+		break;
+
+	case VHOST_USER_SET_MEM_TABLE:
+		ret = vhost_user_set_mem_table(dev, &msg);
+		break;
+
+	case VHOST_USER_SET_LOG_BASE:
+		vhost_user_set_log_base(dev, &msg);
+
+		/* it needs a reply */
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(fd, &msg);
+		break;
+	case VHOST_USER_SET_LOG_FD:
+		close(msg.fds[0]);
+		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+		break;
+
+	case VHOST_USER_SET_VRING_NUM:
+		vhost_user_set_vring_num(dev, &msg);
+		break;
+	case VHOST_USER_SET_VRING_ADDR:
+		vhost_user_set_vring_addr(dev, &msg);
+		break;
+	case VHOST_USER_SET_VRING_BASE:
+		vhost_user_set_vring_base(dev, &msg);
+		break;
+
+	case VHOST_USER_GET_VRING_BASE:
+		vhost_user_get_vring_base(dev, &msg);
+		msg.size = sizeof(msg.payload.state);
+		send_vhost_message(fd, &msg);
+		break;
+
+	case VHOST_USER_SET_VRING_KICK:
+		vhost_user_set_vring_kick(dev, &msg);
+		break;
+	case VHOST_USER_SET_VRING_CALL:
+		vhost_user_set_vring_call(dev, &msg);
+		break;
+
+	case VHOST_USER_SET_VRING_ERR:
+		if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+			close(msg.fds[0]);
+		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+		break;
+
+	case VHOST_USER_GET_QUEUE_NUM:
+		msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(fd, &msg);
+		break;
+
+	case VHOST_USER_SET_VRING_ENABLE:
+		vhost_user_set_vring_enable(dev, &msg);
+		break;
+	case VHOST_USER_SEND_RARP:
+		vhost_user_send_rarp(dev, &msg);
+		break;
+
+	case VHOST_USER_NET_SET_MTU:
+		ret = vhost_user_net_set_mtu(dev, &msg);
+		break;
+
+	default:
+		ret = -1;
+		break;
+
+	}
+
+	if (msg.flags & VHOST_USER_NEED_REPLY) {
+		msg.payload.u64 = !!ret;
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(fd, &msg);
+	}
+
+	if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
+		dev->flags |= VIRTIO_DEV_READY;
+
+		if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+			if (dev->dequeue_zero_copy) {
+				RTE_LOG(INFO, VHOST_CONFIG,
+						"dequeue zero copy is enabled\n");
+			}
+
+			if (dev->notify_ops->new_device(dev->vid) == 0)
+				dev->flags |= VIRTIO_DEV_RUNNING;
+		}
+	}
+
+	return 0;
+}
diff --git a/src/spdk/lib/rte_vhost/vhost_user.h b/src/spdk/lib/rte_vhost/vhost_user.h
new file mode 100644
index 000000000..d20574b64
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/vhost_user.h
@@ -0,0 +1,171 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "rte_vhost.h"
+
+/* refer to hw/virtio/vhost-user.c */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+/*
+ * Maximum size of virtio device config space
+ */
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+
+#define VHOST_USER_PROTOCOL_F_MQ	0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD	1
+#define VHOST_USER_PROTOCOL_F_RARP	2
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK	3
+#define VHOST_USER_PROTOCOL_F_NET_MTU 4
+#define VHOST_USER_PROTOCOL_F_CONFIG 9
+
+#define VHOST_USER_PROTOCOL_FEATURES	((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+					 (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
+					 (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
+					 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
+					 (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \
+					 (1ULL << VHOST_USER_PROTOCOL_F_CONFIG))
+
+typedef enum VhostUserRequest {
+	VHOST_USER_NONE = 0,
+	VHOST_USER_GET_FEATURES = 1,
+	VHOST_USER_SET_FEATURES = 2,
+	VHOST_USER_SET_OWNER = 3,
+	VHOST_USER_RESET_OWNER = 4,
+	VHOST_USER_SET_MEM_TABLE = 5,
+	VHOST_USER_SET_LOG_BASE = 6,
+	VHOST_USER_SET_LOG_FD = 7,
+	VHOST_USER_SET_VRING_NUM = 8,
+	VHOST_USER_SET_VRING_ADDR = 9,
+	VHOST_USER_SET_VRING_BASE = 10,
+	VHOST_USER_GET_VRING_BASE = 11,
+	VHOST_USER_SET_VRING_KICK = 12,
+	VHOST_USER_SET_VRING_CALL = 13,
+	VHOST_USER_SET_VRING_ERR = 14,
+	VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+	VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+	VHOST_USER_GET_QUEUE_NUM = 17,
+	VHOST_USER_SET_VRING_ENABLE = 18,
+	VHOST_USER_SEND_RARP = 19,
+	VHOST_USER_NET_SET_MTU = 20,
+	VHOST_USER_GET_CONFIG = 24,
+	VHOST_USER_SET_CONFIG = 25,
+	VHOST_USER_NVME_ADMIN = 80,
+	VHOST_USER_NVME_SET_CQ_CALL = 81,
+	VHOST_USER_NVME_GET_CAP = 82,
+	VHOST_USER_NVME_START_STOP = 83,
+	VHOST_USER_NVME_IO_CMD = 84,
+	VHOST_USER_NVME_SET_BAR_MR = 85,
+	VHOST_USER_MAX
+} VhostUserRequest;
+
+typedef enum VhostUserSlaveRequest {
+	VHOST_USER_SLAVE_NONE = 0,
+	VHOST_USER_SLAVE_IOTLB_MSG = 1,
+	VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
+	VHOST_USER_SLAVE_MAX
+} VhostUserSlaveRequest;
+
+typedef struct VhostUserMemoryRegion {
+	uint64_t guest_phys_addr;
+	uint64_t memory_size;
+	uint64_t userspace_addr;
+	uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+	uint32_t nregions;
+	uint32_t padding;
+	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserLog {
+	uint64_t mmap_size;
+	uint64_t mmap_offset;
+} VhostUserLog;
+
+typedef struct VhostUserConfig {
+	uint32_t offset;
+	uint32_t size;
+	uint32_t flags;
+	uint8_t region[VHOST_USER_MAX_CONFIG_SIZE];
+} VhostUserConfig;
+
+typedef struct VhostUserMsg {
+	VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK     0x3
+#define VHOST_USER_REPLY_MASK       (0x1 << 2)
+#define VHOST_USER_NEED_REPLY		(0x1 << 3)
+	uint32_t flags;
+	uint32_t size; /* the following payload size */
+	union {
+#define VHOST_USER_VRING_IDX_MASK   0xff
+#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
+		uint64_t u64;
+		struct vhost_vring_state state;
+		struct vhost_vring_addr addr;
+		VhostUserMemory memory;
+		VhostUserLog    log;
+		VhostUserConfig config;
+		struct nvme {
+			union {
+				uint8_t req[64];
+				uint8_t cqe[16];
+			} cmd;
+			uint8_t buf[4096];
+		} nvme;
+	} payload;
+	int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION    0x1
+
+
+/* vhost_user.c */
+int vhost_user_msg_handler(int vid, int fd);
+
+/* socket.c */
+int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+
+#endif
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
commit	e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree	64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/lib/rte_vhost
parent	Initial commit. (diff)
download	ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip