Diffstat (limited to 'src/spdk/lib')
268 files changed, 152758 insertions, 0 deletions
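The largest functional addition below is the accel framework in src/spdk/lib/accel. As a quick orientation before the per-file diffs, here is a minimal caller sketch built only from the public functions this diff declares and exports (spdk_accel_engine_get_io_channel, spdk_accel_submit_copy, and the symbols in spdk_accel.map). The header name spdk/accel_engine.h, the function names, and the logging are illustrative assumptions and not part of the change; a real caller must also keep the io_channel alive until the completion callback fires.

/* Hypothetical usage sketch (not part of this diff): submit one copy through
 * the accel framework added in lib/accel/accel_engine.c. If a hardware engine
 * (e.g. ioat) has registered, the copy is offloaded; otherwise the framework
 * falls back to its built-in software memcpy path.
 */
#include "spdk/stdinc.h"
#include "spdk/accel_engine.h"   /* assumed public header for these prototypes */
#include "spdk/log.h"

static void
example_copy_done(void *cb_arg, int status)
{
	/* status is 0 on success for the copy operation */
	SPDK_NOTICELOG("accel copy finished, status %d\n", status);
}

/* accel_ch must come from spdk_accel_engine_get_io_channel() on an SPDK thread
 * and must remain held until example_copy_done() runs.
 */
static int
example_submit_copy(struct spdk_io_channel *accel_ch, void *dst, void *src, uint64_t nbytes)
{
	/* Returns -ENOMEM when the internal accel task pool is exhausted. */
	return spdk_accel_submit_copy(accel_ch, dst, src, nbytes, example_copy_done, NULL);
}

The same pattern applies to the fill, compare, dualcast, and crc32c submit calls exported from the same file.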
diff --git a/src/spdk/lib/Makefile b/src/spdk/lib/Makefile new file mode 100644 index 000000000..4c0c383eb --- /dev/null +++ b/src/spdk/lib/Makefile @@ -0,0 +1,65 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.lib_deps.mk + +DIRS-y += bdev blob blobfs conf accel event json jsonrpc \ + log log_rpc lvol net rpc sock thread trace util nvme vmd nvmf scsi \ + ioat ut_mock iscsi notify +ifeq ($(OS),Linux) +DIRS-y += nbd ftl +endif + +DIRS-$(CONFIG_OCF) += env_ocf +DIRS-$(CONFIG_IDXD) += idxd +DIRS-$(CONFIG_VHOST) += vhost +DIRS-$(CONFIG_VIRTIO) += virtio +DIRS-$(CONFIG_REDUCE) += reduce +DIRS-$(CONFIG_VHOST_INTERNAL_LIB) += rte_vhost +DIRS-$(CONFIG_RDMA) += rdma + +# If CONFIG_ENV is pointing at a directory in lib, build it. +# Out-of-tree env implementations must be built separately by the user. +ENV_NAME := $(notdir $(CONFIG_ENV)) +ifeq ($(abspath $(CONFIG_ENV)),$(SPDK_ROOT_DIR)/lib/$(ENV_NAME)) +DIRS-y += $(ENV_NAME) +endif + +.PHONY: all clean $(DIRS-y) + +all: $(DIRS-y) +clean: $(DIRS-y) + +include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk diff --git a/src/spdk/lib/accel/Makefile b/src/spdk/lib/accel/Makefile new file mode 100644 index 000000000..0d41104de --- /dev/null +++ b/src/spdk/lib/accel/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 +SO_SUFFIX := $(SO_VER).$(SO_MINOR) + +LIBNAME = accel +C_SRCS = accel_engine.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_accel.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/accel/accel_engine.c b/src/spdk/lib/accel/accel_engine.c new file mode 100644 index 000000000..03a405439 --- /dev/null +++ b/src/spdk/lib/accel/accel_engine.c @@ -0,0 +1,1044 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/accel_engine.h" + +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/crc32.h" + +/* Accelerator Engine Framework: The following provides a top level + * generic API for the accelerator functions defined here. 
Modules, + * such as the one in /module/accel/ioat, supply the implemention of + * with the exception of the pure software implemention contained + * later in this file. + */ + +#define ALIGN_4K 0x1000 +#define SPDK_ACCEL_NUM_TASKS 0x4000 + +static struct spdk_mempool *g_accel_task_pool; + +/* Largest context size for all accel modules */ +static size_t g_max_accel_module_size = 0; + +static struct spdk_accel_engine *g_hw_accel_engine = NULL; +static struct spdk_accel_engine *g_sw_accel_engine = NULL; +static struct spdk_accel_module_if *g_accel_engine_module = NULL; +static spdk_accel_fini_cb g_fini_cb_fn = NULL; +static void *g_fini_cb_arg = NULL; + +/* Global list of registered accelerator modules */ +static TAILQ_HEAD(, spdk_accel_module_if) spdk_accel_module_list = + TAILQ_HEAD_INITIALIZER(spdk_accel_module_list); + +struct accel_io_channel { + struct spdk_accel_engine *engine; + struct spdk_io_channel *ch; +}; + +/* Forward declarations of software implementations used when an + * engine has not implemented the capability. + */ +static int sw_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, + void *cb_arg); + +/* Registration of hw modules (currently supports only 1 at a time) */ +void +spdk_accel_hw_engine_register(struct spdk_accel_engine *accel_engine) +{ + if (g_hw_accel_engine == NULL) { + g_hw_accel_engine = accel_engine; + } else { + SPDK_NOTICELOG("Hardware offload engine already enabled\n"); + } +} + +/* Registration of sw modules (currently supports only 1) */ +static void +accel_sw_register(struct spdk_accel_engine *accel_engine) +{ + assert(g_sw_accel_engine == NULL); + g_sw_accel_engine = accel_engine; +} + +static void +accel_sw_unregister(void) +{ + g_sw_accel_engine = NULL; +} + +/* Common completion routine, called only by the accel framework */ +static void +_accel_engine_done(void *ref, int status) +{ + struct spdk_accel_task *req = (struct spdk_accel_task *)ref; + + req->cb(req->cb_arg, status); + spdk_mempool_put(g_accel_task_pool, req); +} + +uint64_t +spdk_accel_get_capabilities(struct spdk_io_channel *ch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + /* All engines are required to implement this API. */ + return accel_ch->engine->get_capabilities(); +} + +/* Accel framework public API for copy function */ +int +spdk_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. 
*/ + if (accel_ch->engine->copy) { + return accel_ch->engine->copy(accel_ch->ch, dst, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_copy(accel_ch->ch, dst, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Accel framework public API for dual cast copy function */ +int +spdk_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->dualcast) { + return accel_ch->engine->dualcast(accel_ch->ch, dst1, dst2, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_dualcast(accel_ch->ch, dst1, dst2, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Accel framework public API for batch_create function. All engines are + * required to implement this API. + */ +struct spdk_accel_batch * +spdk_accel_batch_create(struct spdk_io_channel *ch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + return accel_ch->engine->batch_create(accel_ch->ch); +} + +/* Accel framework public API for batch_submit function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_submit(accel_ch->ch, batch, _accel_engine_done, + accel_req->offload_ctx); +} + +/* Accel framework public API for getting max batch. All engines are + * required to implement this API. + */ +uint32_t +spdk_accel_batch_get_max(struct spdk_io_channel *ch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + return accel_ch->engine->batch_get_max(); +} + +/* Accel framework public API for for when an app is unable to complete a batch sequence, + * it cancels with this API. + */ +int +spdk_accel_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + return accel_ch->engine->batch_cancel(accel_ch->ch, batch); +} + +/* Accel framework public API for batch prep_copy function. All engines are + * required to implement this API. 
+ */ +int +spdk_accel_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_copy(accel_ch->ch, batch, dst, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for batch prep_dualcast function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst1, void *dst2, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_dualcast(accel_ch->ch, batch, dst1, dst2, src, + nbytes, _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for batch prep_compare function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *src1, void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, + void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_compare(accel_ch->ch, batch, src1, src2, nbytes, + _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for batch prep_fill function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_fill(accel_ch->ch, batch, dst, fill, nbytes, + _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for batch prep_crc32c function. All engines are + * required to implement this API. 
+ */ +int +spdk_accel_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_crc32c(accel_ch->ch, batch, dst, src, seed, nbytes, + _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for compare function */ +int +spdk_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->compare) { + return accel_ch->engine->compare(accel_ch->ch, src1, src2, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_compare(accel_ch->ch, src1, src2, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Accel framework public API for fill function */ +int +spdk_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->fill) { + return accel_ch->engine->fill(accel_ch->ch, dst, fill, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_fill(accel_ch->ch, dst, fill, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Accel framework public API for CRC-32C function */ +int +spdk_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, uint32_t seed, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->crc32c) { + return accel_ch->engine->crc32c(accel_ch->ch, dst, src, seed, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_crc32c(accel_ch->ch, dst, src, seed, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Helper function when when accel modules register with the framework. 
*/ +void spdk_accel_module_list_add(struct spdk_accel_module_if *accel_module) +{ + TAILQ_INSERT_TAIL(&spdk_accel_module_list, accel_module, tailq); + if (accel_module->get_ctx_size && accel_module->get_ctx_size() > g_max_accel_module_size) { + g_max_accel_module_size = accel_module->get_ctx_size(); + } +} + +/* Framework level channel create callback. */ +static int +accel_engine_create_cb(void *io_device, void *ctx_buf) +{ + struct accel_io_channel *accel_ch = ctx_buf; + + if (g_hw_accel_engine != NULL) { + accel_ch->ch = g_hw_accel_engine->get_io_channel(); + if (accel_ch->ch != NULL) { + accel_ch->engine = g_hw_accel_engine; + return 0; + } + } + + /* No hw engine enabled, use sw. */ + accel_ch->ch = g_sw_accel_engine->get_io_channel(); + assert(accel_ch->ch != NULL); + accel_ch->engine = g_sw_accel_engine; + return 0; +} + +/* Framework level channel destroy callback. */ +static void +accel_engine_destroy_cb(void *io_device, void *ctx_buf) +{ + struct accel_io_channel *accel_ch = ctx_buf; + + spdk_put_io_channel(accel_ch->ch); +} + +struct spdk_io_channel * +spdk_accel_engine_get_io_channel(void) +{ + return spdk_get_io_channel(&spdk_accel_module_list); +} + +static void +accel_engine_module_initialize(void) +{ + struct spdk_accel_module_if *accel_engine_module; + char task_pool_name[30]; + + TAILQ_FOREACH(accel_engine_module, &spdk_accel_module_list, tailq) { + accel_engine_module->module_init(); + } + + snprintf(task_pool_name, sizeof(task_pool_name), "accel_task_pool"); + g_accel_task_pool = spdk_mempool_create(task_pool_name, + SPDK_ACCEL_NUM_TASKS, + g_max_accel_module_size, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + assert(g_accel_task_pool); + +} + +int +spdk_accel_engine_initialize(void) +{ + SPDK_NOTICELOG("Accel engine initialized to use software engine.\n"); + accel_engine_module_initialize(); + + /* + * We need a unique identifier for the accel engine framework, so use the + * spdk_accel_module_list address for this purpose. + */ + spdk_io_device_register(&spdk_accel_module_list, accel_engine_create_cb, accel_engine_destroy_cb, + sizeof(struct accel_io_channel), "accel_module"); + + return 0; +} + +static void +accel_engine_module_finish_cb(void) +{ + spdk_accel_fini_cb cb_fn = g_fini_cb_fn; + + cb_fn(g_fini_cb_arg); + g_fini_cb_fn = NULL; + g_fini_cb_arg = NULL; +} + +void +spdk_accel_write_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_accel_module_if *accel_engine_module; + + /* + * The accel engine has no config, there may be some in + * the modules though. 
+ */ + spdk_json_write_array_begin(w); + TAILQ_FOREACH(accel_engine_module, &spdk_accel_module_list, tailq) { + if (accel_engine_module->write_config_json) { + accel_engine_module->write_config_json(w); + } + } + spdk_json_write_array_end(w); +} + +void +spdk_accel_engine_module_finish(void) +{ + if (!g_accel_engine_module) { + g_accel_engine_module = TAILQ_FIRST(&spdk_accel_module_list); + } else { + g_accel_engine_module = TAILQ_NEXT(g_accel_engine_module, tailq); + } + + if (!g_accel_engine_module) { + accel_engine_module_finish_cb(); + return; + } + + if (g_accel_engine_module->module_fini) { + spdk_thread_send_msg(spdk_get_thread(), g_accel_engine_module->module_fini, NULL); + } else { + spdk_accel_engine_module_finish(); + } +} + +void +spdk_accel_engine_finish(spdk_accel_fini_cb cb_fn, void *cb_arg) +{ + assert(cb_fn != NULL); + + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + spdk_io_device_unregister(&spdk_accel_module_list, NULL); + spdk_accel_engine_module_finish(); + spdk_mempool_free(g_accel_task_pool); +} + +void +spdk_accel_engine_config_text(FILE *fp) +{ + struct spdk_accel_module_if *accel_engine_module; + + TAILQ_FOREACH(accel_engine_module, &spdk_accel_module_list, tailq) { + if (accel_engine_module->config_text) { + accel_engine_module->config_text(fp); + } + } +} + +/* + * The SW Accelerator module is "built in" here (rest of file) + */ + +#define SW_ACCEL_BATCH_SIZE 2048 + +enum sw_accel_opcode { + SW_ACCEL_OPCODE_MEMMOVE = 0, + SW_ACCEL_OPCODE_MEMFILL = 1, + SW_ACCEL_OPCODE_COMPARE = 2, + SW_ACCEL_OPCODE_CRC32C = 3, + SW_ACCEL_OPCODE_DUALCAST = 4, +}; + +struct sw_accel_op { + struct sw_accel_io_channel *sw_ch; + void *cb_arg; + spdk_accel_completion_cb cb_fn; + void *src; + union { + void *dst; + void *src2; + }; + void *dst2; + uint32_t seed; + uint64_t fill_pattern; + enum sw_accel_opcode op_code; + uint64_t nbytes; + TAILQ_ENTRY(sw_accel_op) link; +}; + +/* The sw accel engine only supports one outstanding batch at a time. */ +struct sw_accel_io_channel { + TAILQ_HEAD(, sw_accel_op) op_pool; + TAILQ_HEAD(, sw_accel_op) batch; +}; + +static uint64_t +sw_accel_get_capabilities(void) +{ + return ACCEL_COPY | ACCEL_FILL | ACCEL_CRC32C | ACCEL_COMPARE | + ACCEL_DUALCAST | ACCEL_BATCH; +} + +static uint32_t +sw_accel_batch_get_max(void) +{ + return SW_ACCEL_BATCH_SIZE; +} + +/* The sw engine plug-in does not ahve a public API, it is only callable + * from the accel fw and thus does not need to have its own struct definition + * of a batch, it just simply casts the address of the single supported batch + * as the struct spdk_accel_batch pointer. 
+ */ +static struct spdk_accel_batch * +sw_accel_batch_start(struct spdk_io_channel *ch) +{ + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + if (!TAILQ_EMPTY(&sw_ch->batch)) { + SPDK_ERRLOG("SW accel engine only supports one batch at a time.\n"); + return NULL; + } + + return (struct spdk_accel_batch *)&sw_ch->batch; +} + +static struct sw_accel_op * +_prep_op(struct sw_accel_io_channel *sw_ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + + if ((struct spdk_accel_batch *)&sw_ch->batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return NULL; + } + + if (!TAILQ_EMPTY(&sw_ch->op_pool)) { + op = TAILQ_FIRST(&sw_ch->op_pool); + TAILQ_REMOVE(&sw_ch->op_pool, op, link); + } else { + SPDK_ERRLOG("Ran out of operations for batch\n"); + return NULL; + } + + op->cb_arg = cb_arg; + op->cb_fn = cb_fn; + op->sw_ch = sw_ch; + + return op; +} + +static int +sw_accel_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->src = src; + op->dst = dst; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_MEMMOVE; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst1, + void *dst2, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->src = src; + op->dst = dst1; + op->dst2 = dst2; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_DUALCAST; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *src1, + void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->src = src1; + op->src2 = src2; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_COMPARE; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. 
*/ + op->dst = dst; + op->fill_pattern = fill; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_MEMFILL; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + uint32_t *dst, + void *src, uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->dst = (void *)dst; + op->src = src; + op->seed = seed; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_CRC32C; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + + +static int +sw_accel_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + if ((struct spdk_accel_batch *)&sw_ch->batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return -EINVAL; + } + + /* Cancel the batch items by moving them back to the op_pool. */ + while ((op = TAILQ_FIRST(&sw_ch->batch))) { + TAILQ_REMOVE(&sw_ch->batch, op, link); + TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link); + } + + return 0; +} + +static int +sw_accel_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req; + int batch_status = 0, cmd_status = 0; + + if ((struct spdk_accel_batch *)&sw_ch->batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return -EINVAL; + } + + /* Complete the batch items. */ + while ((op = TAILQ_FIRST(&sw_ch->batch))) { + TAILQ_REMOVE(&sw_ch->batch, op, link); + accel_req = (struct spdk_accel_task *)((uintptr_t)op->cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + + switch (op->op_code) { + case SW_ACCEL_OPCODE_MEMMOVE: + memcpy(op->dst, op->src, op->nbytes); + break; + case SW_ACCEL_OPCODE_DUALCAST: + memcpy(op->dst, op->src, op->nbytes); + memcpy(op->dst2, op->src, op->nbytes); + break; + case SW_ACCEL_OPCODE_COMPARE: + cmd_status = memcmp(op->src, op->src2, op->nbytes); + break; + case SW_ACCEL_OPCODE_MEMFILL: + memset(op->dst, op->fill_pattern, op->nbytes); + break; + case SW_ACCEL_OPCODE_CRC32C: + *(uint32_t *)op->dst = spdk_crc32c_update(op->src, op->nbytes, ~op->seed); + break; + default: + assert(false); + break; + } + + batch_status |= cmd_status; + op->cb_fn(accel_req, cmd_status); + TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link); + } + + /* Now complete the batch request itself. 
*/ + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, batch_status); + + return 0; +} + +static int +sw_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + + memcpy(dst, src, (size_t)nbytes); + + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, 0); + return 0; +} + +static int +sw_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + + memcpy(dst1, src, (size_t)nbytes); + memcpy(dst2, src, (size_t)nbytes); + + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, 0); + return 0; +} + +static int +sw_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + int result; + + result = memcmp(src1, src2, (size_t)nbytes); + + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, result); + + return 0; +} + +static int +sw_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + + memset(dst, fill, nbytes); + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, 0); + + return 0; +} + +static int +sw_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + + *dst = spdk_crc32c_update(src, nbytes, ~seed); + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, 0); + + return 0; +} + +static struct spdk_io_channel *sw_accel_get_io_channel(void); + +static struct spdk_accel_engine sw_accel_engine = { + .get_capabilities = sw_accel_get_capabilities, + .copy = sw_accel_submit_copy, + .dualcast = sw_accel_submit_dualcast, + .batch_get_max = sw_accel_batch_get_max, + .batch_create = sw_accel_batch_start, + .batch_cancel = sw_accel_batch_cancel, + .batch_prep_copy = sw_accel_batch_prep_copy, + .batch_prep_dualcast = sw_accel_batch_prep_dualcast, + .batch_prep_compare = sw_accel_batch_prep_compare, + .batch_prep_fill = sw_accel_batch_prep_fill, + .batch_prep_crc32c = sw_accel_batch_prep_crc32c, + .batch_submit = sw_accel_batch_submit, + .compare = sw_accel_submit_compare, + .fill = sw_accel_submit_fill, + .crc32c = sw_accel_submit_crc32c, + .get_io_channel = sw_accel_get_io_channel, +}; + +static int +sw_accel_create_cb(void *io_device, void *ctx_buf) +{ + struct sw_accel_io_channel *sw_ch = ctx_buf; + struct sw_accel_op *op; + int i; + + TAILQ_INIT(&sw_ch->batch); + + TAILQ_INIT(&sw_ch->op_pool); + for (i = 0 ; i < SW_ACCEL_BATCH_SIZE ; i++) { + op = calloc(1, sizeof(struct sw_accel_op)); + if (op == NULL) { + SPDK_ERRLOG("Failed to allocate operation for batch.\n"); + while ((op = TAILQ_FIRST(&sw_ch->op_pool))) { + TAILQ_REMOVE(&sw_ch->op_pool, op, link); + free(op); + } + return -ENOMEM; + } + 
TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link); + } + + return 0; +} + +static void +sw_accel_destroy_cb(void *io_device, void *ctx_buf) +{ + struct sw_accel_io_channel *sw_ch = ctx_buf; + struct sw_accel_op *op; + + while ((op = TAILQ_FIRST(&sw_ch->op_pool))) { + TAILQ_REMOVE(&sw_ch->op_pool, op, link); + free(op); + } +} + +static struct spdk_io_channel *sw_accel_get_io_channel(void) +{ + return spdk_get_io_channel(&sw_accel_engine); +} + +static size_t +sw_accel_engine_get_ctx_size(void) +{ + return sizeof(struct spdk_accel_task); +} + +static int +sw_accel_engine_init(void) +{ + accel_sw_register(&sw_accel_engine); + spdk_io_device_register(&sw_accel_engine, sw_accel_create_cb, sw_accel_destroy_cb, + sizeof(struct sw_accel_io_channel), "sw_accel_engine"); + + return 0; +} + +static void +sw_accel_engine_fini(void *ctxt) +{ + spdk_io_device_unregister(&sw_accel_engine, NULL); + accel_sw_unregister(); + + spdk_accel_engine_module_finish(); +} + +SPDK_ACCEL_MODULE_REGISTER(sw_accel_engine_init, sw_accel_engine_fini, + NULL, NULL, sw_accel_engine_get_ctx_size) diff --git a/src/spdk/lib/accel/spdk_accel.map b/src/spdk/lib/accel/spdk_accel.map new file mode 100644 index 000000000..bfccf0a90 --- /dev/null +++ b/src/spdk/lib/accel/spdk_accel.map @@ -0,0 +1,33 @@ +{ + global: + + # public functions + spdk_accel_engine_initialize; + spdk_accel_engine_finish; + spdk_accel_engine_config_text; + spdk_accel_engine_module_finish; + spdk_accel_engine_get_io_channel; + spdk_accel_get_capabilities; + spdk_accel_batch_get_max; + spdk_accel_batch_create; + spdk_accel_batch_prep_copy; + spdk_accel_batch_prep_dualcast; + spdk_accel_batch_prep_compare; + spdk_accel_batch_prep_fill; + spdk_accel_batch_prep_crc32c; + spdk_accel_batch_submit; + spdk_accel_batch_cancel; + spdk_accel_submit_copy; + spdk_accel_submit_dualcast; + spdk_accel_submit_compare; + spdk_accel_submit_fill; + spdk_accel_submit_crc32c; + spdk_accel_write_config_json; + + # functions needed by modules + spdk_accel_hw_engine_register; + spdk_accel_module_list_add; + + + local: *; +}; diff --git a/src/spdk/lib/bdev/Makefile b/src/spdk/lib/bdev/Makefile new file mode 100644 index 000000000..ca0bf992a --- /dev/null +++ b/src/spdk/lib/bdev/Makefile @@ -0,0 +1,50 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +ifeq ($(CONFIG_VTUNE),y) +CFLAGS += -I$(CONFIG_VTUNE_DIR)/include -I$(CONFIG_VTUNE_DIR)/sdk/src/ittnotify +endif + +C_SRCS = bdev.c bdev_rpc.c bdev_zone.c part.c scsi_nvme.c +C_SRCS-$(CONFIG_VTUNE) += vtune.c +LIBNAME = bdev + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_bdev.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/bdev.c b/src/spdk/lib/bdev/bdev.c new file mode 100644 index 000000000..af8c05aaa --- /dev/null +++ b/src/spdk/lib/bdev/bdev.c @@ -0,0 +1,6763 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" + +#include "spdk/config.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/nvme_spec.h" +#include "spdk/scsi_spec.h" +#include "spdk/notify.h" +#include "spdk/util.h" +#include "spdk/trace.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" + +#include "bdev_internal.h" + +#ifdef SPDK_CONFIG_VTUNE +#include "ittnotify.h" +#include "ittnotify_types.h" +int __itt_init_ittlib(const char *, __itt_group_id); +#endif + +#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) +#define SPDK_BDEV_IO_CACHE_SIZE 256 +#define SPDK_BDEV_AUTO_EXAMINE true +#define BUF_SMALL_POOL_SIZE 8191 +#define BUF_LARGE_POOL_SIZE 1023 +#define NOMEM_THRESHOLD_COUNT 8 +#define ZERO_BUFFER_SIZE 0x100000 + +#define OWNER_BDEV 0x2 + +#define OBJECT_BDEV_IO 0x2 + +#define TRACE_GROUP_BDEV 0x3 +#define TRACE_BDEV_IO_START SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0) +#define TRACE_BDEV_IO_DONE SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1) + +#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 +#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 +#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 +#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 +#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) +#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX +#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 + +#define SPDK_BDEV_POOL_ALIGNMENT 512 + +static const char *qos_conf_type[] = {"Limit_IOPS", + "Limit_BPS", "Limit_Read_BPS", "Limit_Write_BPS" + }; +static const char *qos_rpc_type[] = {"rw_ios_per_sec", + "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" + }; + +TAILQ_HEAD(spdk_bdev_list, spdk_bdev); + +struct spdk_bdev_mgr { + struct spdk_mempool *bdev_io_pool; + + struct spdk_mempool *buf_small_pool; + struct spdk_mempool *buf_large_pool; + + void *zero_buffer; + + TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; + + struct spdk_bdev_list bdevs; + + bool init_complete; + bool module_init_complete; + + pthread_mutex_t mutex; + +#ifdef SPDK_CONFIG_VTUNE + __itt_domain *domain; +#endif +}; + +static struct spdk_bdev_mgr g_bdev_mgr = { + .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), + .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), + .init_complete = false, + .module_init_complete = false, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +typedef void (*lock_range_cb)(void *ctx, int status); + +struct lba_range { + uint64_t offset; + uint64_t length; + void *locked_ctx; + struct spdk_bdev_channel *owner_ch; + TAILQ_ENTRY(lba_range) tailq; +}; + +static struct spdk_bdev_opts g_bdev_opts = { + .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, + .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, + .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, +}; + +static spdk_bdev_init_cb g_init_cb_fn = NULL; +static void *g_init_cb_arg = NULL; + +static spdk_bdev_fini_cb g_fini_cb_fn = NULL; +static void *g_fini_cb_arg = NULL; +static struct spdk_thread *g_fini_thread = NULL; + +struct spdk_bdev_qos_limit { + /** IOs or bytes allowed per second (i.e., 1s). */ + uint64_t limit; + + /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). + * For remaining bytes, allowed to run negative if an I/O is submitted when + * some bytes are remaining, but the I/O is bigger than that amount. The + * excess will be deducted from the next timeslice. + */ + int64_t remaining_this_timeslice; + + /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
*/ + uint32_t min_per_timeslice; + + /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ + uint32_t max_per_timeslice; + + /** Function to check whether to queue the IO. */ + bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); + + /** Function to update for the submitted IO. */ + void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); +}; + +struct spdk_bdev_qos { + /** Types of structure of rate limits. */ + struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; + + /** The channel that all I/O are funneled through. */ + struct spdk_bdev_channel *ch; + + /** The thread on which the poller is running. */ + struct spdk_thread *thread; + + /** Queue of I/O waiting to be issued. */ + bdev_io_tailq_t queued; + + /** Size of a timeslice in tsc ticks. */ + uint64_t timeslice_size; + + /** Timestamp of start of last timeslice. */ + uint64_t last_timeslice; + + /** Poller that processes queued I/O commands each time slice. */ + struct spdk_poller *poller; +}; + +struct spdk_bdev_mgmt_channel { + bdev_io_stailq_t need_buf_small; + bdev_io_stailq_t need_buf_large; + + /* + * Each thread keeps a cache of bdev_io - this allows + * bdev threads which are *not* DPDK threads to still + * benefit from a per-thread bdev_io cache. Without + * this, non-DPDK threads fetching from the mempool + * incur a cmpxchg on get and put. + */ + bdev_io_stailq_t per_thread_cache; + uint32_t per_thread_cache_count; + uint32_t bdev_io_cache_size; + + TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; + TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; +}; + +/* + * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device + * will queue here their IO that awaits retry. It makes it possible to retry sending + * IO to one bdev after IO from other bdev completes. + */ +struct spdk_bdev_shared_resource { + /* The bdev management channel */ + struct spdk_bdev_mgmt_channel *mgmt_ch; + + /* + * Count of I/O submitted to bdev module and waiting for completion. + * Incremented before submit_request() is called on an spdk_bdev_io. + */ + uint64_t io_outstanding; + + /* + * Queue of IO awaiting retry because of a previous NOMEM status returned + * on this channel. + */ + bdev_io_tailq_t nomem_io; + + /* + * Threshold which io_outstanding must drop to before retrying nomem_io. + */ + uint64_t nomem_threshold; + + /* I/O channel allocated by a bdev module */ + struct spdk_io_channel *shared_ch; + + /* Refcount of bdev channels using this resource */ + uint32_t ref; + + TAILQ_ENTRY(spdk_bdev_shared_resource) link; +}; + +#define BDEV_CH_RESET_IN_PROGRESS (1 << 0) +#define BDEV_CH_QOS_ENABLED (1 << 1) + +struct spdk_bdev_channel { + struct spdk_bdev *bdev; + + /* The channel for the underlying device */ + struct spdk_io_channel *channel; + + /* Per io_device per thread data */ + struct spdk_bdev_shared_resource *shared_resource; + + struct spdk_bdev_io_stat stat; + + /* + * Count of I/O submitted to the underlying dev module through this channel + * and waiting for completion. + */ + uint64_t io_outstanding; + + /* + * List of all submitted I/Os including I/O that are generated via splitting. + */ + bdev_io_tailq_t io_submitted; + + /* + * List of spdk_bdev_io that are currently queued because they write to a locked + * LBA range. 
+ */ + bdev_io_tailq_t io_locked; + + uint32_t flags; + + struct spdk_histogram_data *histogram; + +#ifdef SPDK_CONFIG_VTUNE + uint64_t start_tsc; + uint64_t interval_tsc; + __itt_string_handle *handle; + struct spdk_bdev_io_stat prev_stat; +#endif + + bdev_io_tailq_t queued_resets; + + lba_range_tailq_t locked_ranges; +}; + +struct media_event_entry { + struct spdk_bdev_media_event event; + TAILQ_ENTRY(media_event_entry) tailq; +}; + +#define MEDIA_EVENT_POOL_SIZE 64 + +struct spdk_bdev_desc { + struct spdk_bdev *bdev; + struct spdk_thread *thread; + struct { + bool open_with_ext; + union { + spdk_bdev_remove_cb_t remove_fn; + spdk_bdev_event_cb_t event_fn; + }; + void *ctx; + } callback; + bool closed; + bool write; + pthread_mutex_t mutex; + uint32_t refs; + TAILQ_HEAD(, media_event_entry) pending_media_events; + TAILQ_HEAD(, media_event_entry) free_media_events; + struct media_event_entry *media_events_buffer; + TAILQ_ENTRY(spdk_bdev_desc) link; + + uint64_t timeout_in_sec; + spdk_bdev_io_timeout_cb cb_fn; + void *cb_arg; + struct spdk_poller *io_timeout_poller; +}; + +struct spdk_bdev_iostat_ctx { + struct spdk_bdev_io_stat *stat; + spdk_bdev_get_device_stat_cb cb; + void *cb_arg; +}; + +struct set_qos_limit_ctx { + void (*cb_fn)(void *cb_arg, int status); + void *cb_arg; + struct spdk_bdev *bdev; +}; + +#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) +#define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) + +static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); +static void bdev_write_zero_buffer_next(void *_bdev_io); + +static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); +static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); + +static int +bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, + uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg); +static int +bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg); + +static int +bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, + uint64_t offset, uint64_t length, + lock_range_cb cb_fn, void *cb_arg); + +static int +bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, + uint64_t offset, uint64_t length, + lock_range_cb cb_fn, void *cb_arg); + +static inline void bdev_io_complete(void *ctx); + +static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); +static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort); + +void +spdk_bdev_get_opts(struct spdk_bdev_opts *opts) +{ + *opts = g_bdev_opts; +} + +int +spdk_bdev_set_opts(struct spdk_bdev_opts *opts) +{ + uint32_t min_pool_size; + + /* + * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem + * initialization. A second mgmt_ch will be created on the same thread when the application starts + * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
+ */ + min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); + if (opts->bdev_io_pool_size < min_pool_size) { + SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 + " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, + spdk_thread_get_count()); + SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); + return -1; + } + + g_bdev_opts = *opts; + return 0; +} + +struct spdk_bdev_examine_item { + char *name; + TAILQ_ENTRY(spdk_bdev_examine_item) link; +}; + +TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); + +struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( + g_bdev_examine_allowlist); + +static inline bool +bdev_examine_allowlist_check(const char *name) +{ + struct spdk_bdev_examine_item *item; + TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { + if (strcmp(name, item->name) == 0) { + return true; + } + } + return false; +} + +static inline bool +bdev_in_examine_allowlist(struct spdk_bdev *bdev) +{ + struct spdk_bdev_alias *tmp; + if (bdev_examine_allowlist_check(bdev->name)) { + return true; + } + TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { + if (bdev_examine_allowlist_check(tmp->alias)) { + return true; + } + } + return false; +} + +static inline bool +bdev_ok_to_examine(struct spdk_bdev *bdev) +{ + if (g_bdev_opts.bdev_auto_examine) { + return true; + } else { + return bdev_in_examine_allowlist(bdev); + } +} + +static void +bdev_examine(struct spdk_bdev *bdev) +{ + struct spdk_bdev_module *module; + uint32_t action; + + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (module->examine_config && bdev_ok_to_examine(bdev)) { + action = module->internal.action_in_progress; + module->internal.action_in_progress++; + module->examine_config(bdev); + if (action != module->internal.action_in_progress) { + SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", + module->name); + } + } + } + + if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { + if (bdev->internal.claim_module->examine_disk) { + bdev->internal.claim_module->internal.action_in_progress++; + bdev->internal.claim_module->examine_disk(bdev); + } + return; + } + + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (module->examine_disk && bdev_ok_to_examine(bdev)) { + module->internal.action_in_progress++; + module->examine_disk(bdev); + } + } +} + +struct spdk_bdev * +spdk_bdev_first(void) +{ + struct spdk_bdev *bdev; + + bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_next(struct spdk_bdev *prev) +{ + struct spdk_bdev *bdev; + + bdev = TAILQ_NEXT(prev, internal.link); + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +static struct spdk_bdev * +_bdev_next_leaf(struct spdk_bdev *bdev) +{ + while (bdev != NULL) { + if (bdev->internal.claim_module == NULL) { + return bdev; + } else { + bdev = TAILQ_NEXT(bdev, internal.link); + } + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_first_leaf(void) +{ + struct spdk_bdev *bdev; + + bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); + + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_next_leaf(struct spdk_bdev 
*prev) +{ + struct spdk_bdev *bdev; + + bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); + + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_get_by_name(const char *bdev_name) +{ + struct spdk_bdev_alias *tmp; + struct spdk_bdev *bdev = spdk_bdev_first(); + + while (bdev != NULL) { + if (strcmp(bdev_name, bdev->name) == 0) { + return bdev; + } + + TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { + if (strcmp(bdev_name, tmp->alias) == 0) { + return bdev; + } + } + + bdev = spdk_bdev_next(bdev); + } + + return NULL; +} + +void +spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) +{ + struct iovec *iovs; + + if (bdev_io->u.bdev.iovs == NULL) { + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovcnt = 1; + } + + iovs = bdev_io->u.bdev.iovs; + + assert(iovs != NULL); + assert(bdev_io->u.bdev.iovcnt >= 1); + + iovs[0].iov_base = buf; + iovs[0].iov_len = len; +} + +void +spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) +{ + assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); + bdev_io->u.bdev.md_buf = md_buf; +} + +static bool +_is_buf_allocated(const struct iovec *iovs) +{ + if (iovs == NULL) { + return false; + } + + return iovs[0].iov_base != NULL; +} + +static bool +_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) +{ + int i; + uintptr_t iov_base; + + if (spdk_likely(alignment == 1)) { + return true; + } + + for (i = 0; i < iovcnt; i++) { + iov_base = (uintptr_t)iovs[i].iov_base; + if ((iov_base & (alignment - 1)) != 0) { + return false; + } + } + + return true; +} + +static void +_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) +{ + int i; + size_t len; + + for (i = 0; i < iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_len); + memcpy(buf, iovs[i].iov_base, len); + buf += len; + buf_len -= len; + } +} + +static void +_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) +{ + int i; + size_t len; + + for (i = 0; i < iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_len); + memcpy(iovs[i].iov_base, buf, len); + buf += len; + buf_len -= len; + } +} + +static void +_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) +{ + /* save original iovec */ + bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; + bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; + /* set bounce iov */ + bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; + bdev_io->u.bdev.iovcnt = 1; + /* set bounce buffer for this operation */ + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = len; + /* if this is write path, copy data from original buffer to bounce buffer */ + if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); + } +} + +static void +_bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) +{ + /* save original md_buf */ + bdev_io->internal.orig_md_buf = bdev_io->u.bdev.md_buf; + /* set bounce md_buf */ + bdev_io->u.bdev.md_buf = md_buf; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + memcpy(md_buf, bdev_io->internal.orig_md_buf, len); + } +} + +static void +bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, void *buf, bool status) +{ + struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); + + if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 
+ bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); + bdev_io->internal.get_aux_buf_cb = NULL; + } else { + assert(bdev_io->internal.get_buf_cb != NULL); + bdev_io->internal.buf = buf; + bdev_io->internal.get_buf_cb(ch, bdev_io, status); + bdev_io->internal.get_buf_cb = NULL; + } +} + +static void +_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + bool buf_allocated; + uint64_t md_len, alignment; + void *aligned_buf; + + if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { + bdev_io_get_buf_complete(bdev_io, buf, true); + return; + } + + alignment = spdk_bdev_get_buf_align(bdev); + buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); + aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); + + if (buf_allocated) { + _bdev_io_set_bounce_buf(bdev_io, aligned_buf, len); + } else { + spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); + } + + if (spdk_bdev_is_md_separate(bdev)) { + aligned_buf = (char *)aligned_buf + len; + md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; + + assert(((uintptr_t)aligned_buf & (alignment - 1)) == 0); + + if (bdev_io->u.bdev.md_buf != NULL) { + _bdev_io_set_bounce_md_buf(bdev_io, aligned_buf, md_len); + } else { + spdk_bdev_io_set_md_buf(bdev_io, aligned_buf, md_len); + } + } + bdev_io_get_buf_complete(bdev_io, buf, true); +} + +static void +_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_mempool *pool; + struct spdk_bdev_io *tmp; + bdev_io_stailq_t *stailq; + struct spdk_bdev_mgmt_channel *ch; + uint64_t md_len, alignment; + + md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; + alignment = spdk_bdev_get_buf_align(bdev); + ch = bdev_io->internal.ch->shared_resource->mgmt_ch; + + if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT) { + pool = g_bdev_mgr.buf_small_pool; + stailq = &ch->need_buf_small; + } else { + pool = g_bdev_mgr.buf_large_pool; + stailq = &ch->need_buf_large; + } + + if (STAILQ_EMPTY(stailq)) { + spdk_mempool_put(pool, buf); + } else { + tmp = STAILQ_FIRST(stailq); + STAILQ_REMOVE_HEAD(stailq, internal.buf_link); + _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len); + } +} + +static void +bdev_io_put_buf(struct spdk_bdev_io *bdev_io) +{ + assert(bdev_io->internal.buf != NULL); + _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); + bdev_io->internal.buf = NULL; +} + +void +spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) +{ + uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + + assert(buf != NULL); + _bdev_io_put_buf(bdev_io, buf, len); +} + +static void +_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io) +{ + if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) { + assert(bdev_io->internal.orig_md_buf == NULL); + return; + } + + /* if this is read path, copy data from bounce buffer to original buffer */ + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && + bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + _copy_buf_to_iovs(bdev_io->internal.orig_iovs, + bdev_io->internal.orig_iovcnt, + bdev_io->internal.bounce_iov.iov_base, + bdev_io->internal.bounce_iov.iov_len); + } + /* set original buffer for this io */ + bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; + bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; + /* disable bouncing buffer for this io */ + 
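Aside: the pointer arithmetic in _bdev_io_set_buf() above rounds the pool buffer up to the bdev's required alignment before deciding whether it can be handed out directly or must serve as a bounce buffer. A minimal standalone sketch of that align-up trick, assuming the alignment is a power of two (the align_up() helper and the test values are illustrative, not SPDK code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round addr up to the next multiple of align; align must be a power of two. */
static void *
align_up(void *addr, uintptr_t align)
{
	return (void *)(((uintptr_t)addr + (align - 1)) & ~(align - 1));
}

int
main(void)
{
	char raw[1024];
	void *aligned = align_up(raw + 1, 512);   /* deliberately misaligned input */

	assert(((uintptr_t)aligned & 511) == 0);  /* now 512-byte aligned */
	assert((char *)aligned >= raw + 1);       /* never rounds down */
	printf("raw %p -> aligned %p\n", (void *)(raw + 1), aligned);
	return 0;
}

When the caller already supplied a data buffer of its own, the same function switches to _bdev_io_set_bounce_buf() instead, which is why unaligned application buffers still work, at the cost of the extra copy done in _copy_iovs_to_buf()/_copy_buf_to_iovs().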
bdev_io->internal.orig_iovcnt = 0; + bdev_io->internal.orig_iovs = NULL; + + /* do the same for metadata buffer */ + if (spdk_unlikely(bdev_io->internal.orig_md_buf != NULL)) { + assert(spdk_bdev_is_md_separate(bdev_io->bdev)); + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && + bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + memcpy(bdev_io->internal.orig_md_buf, bdev_io->u.bdev.md_buf, + bdev_io->u.bdev.num_blocks * spdk_bdev_get_md_size(bdev_io->bdev)); + } + + bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_buf; + bdev_io->internal.orig_md_buf = NULL; + } + + /* We want to free the bounce buffer here since we know we're done with it (as opposed + * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). + */ + bdev_io_put_buf(bdev_io); +} + +static void +bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_mempool *pool; + bdev_io_stailq_t *stailq; + struct spdk_bdev_mgmt_channel *mgmt_ch; + uint64_t alignment, md_len; + void *buf; + + alignment = spdk_bdev_get_buf_align(bdev); + md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; + + if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT) { + SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n", + len + alignment); + bdev_io_get_buf_complete(bdev_io, NULL, false); + return; + } + + mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; + + bdev_io->internal.buf_len = len; + + if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT) { + pool = g_bdev_mgr.buf_small_pool; + stailq = &mgmt_ch->need_buf_small; + } else { + pool = g_bdev_mgr.buf_large_pool; + stailq = &mgmt_ch->need_buf_large; + } + + buf = spdk_mempool_get(pool); + if (!buf) { + STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); + } else { + _bdev_io_set_buf(bdev_io, buf, len); + } +} + +void +spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + uint64_t alignment; + + assert(cb != NULL); + bdev_io->internal.get_buf_cb = cb; + + alignment = spdk_bdev_get_buf_align(bdev); + + if (_is_buf_allocated(bdev_io->u.bdev.iovs) && + _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { + /* Buffer already present and aligned */ + cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); + return; + } + + bdev_io_get_buf(bdev_io, len); +} + +void +spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) +{ + uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + + assert(cb != NULL); + assert(bdev_io->internal.get_aux_buf_cb == NULL); + bdev_io->internal.get_aux_buf_cb = cb; + bdev_io_get_buf(bdev_io, len); +} + +static int +bdev_module_get_max_ctx_size(void) +{ + struct spdk_bdev_module *bdev_module; + int max_bdev_module_size = 0; + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { + max_bdev_module_size = bdev_module->get_ctx_size(); + } + } + + return max_bdev_module_size; +} + +void +spdk_bdev_config_text(FILE *fp) +{ + struct spdk_bdev_module *bdev_module; + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (bdev_module->config_text) { + bdev_module->config_text(fp); + } + } +} + +static void 
+bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + int i; + struct spdk_bdev_qos *qos = bdev->internal.qos; + uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; + + if (!qos) { + return; + } + + spdk_bdev_get_qos_rate_limits(bdev, limits); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] > 0) { + spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); + } + } + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +void +spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_bdev_module *bdev_module; + struct spdk_bdev *bdev; + + assert(w != NULL); + + spdk_json_write_array_begin(w); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_set_options"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); + spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); + spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (bdev_module->config_json) { + bdev_module->config_json(w); + } + } + + pthread_mutex_lock(&g_bdev_mgr.mutex); + + TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { + if (bdev->fn_table->write_config_json) { + bdev->fn_table->write_config_json(bdev, w); + } + + bdev_qos_config_json(bdev, w); + } + + pthread_mutex_unlock(&g_bdev_mgr.mutex); + + spdk_json_write_array_end(w); +} + +static int +bdev_mgmt_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_mgmt_channel *ch = ctx_buf; + struct spdk_bdev_io *bdev_io; + uint32_t i; + + STAILQ_INIT(&ch->need_buf_small); + STAILQ_INIT(&ch->need_buf_large); + + STAILQ_INIT(&ch->per_thread_cache); + ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; + + /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
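The cache pre-population described in the comment above follows a common pattern: keep a small per-thread free list in front of a shared pool so the hot allocation path usually avoids contended pool operations. A rough, self-contained illustration of that pattern (not SPDK's implementation; the thread_cache/io_obj names and the malloc/free fallback below are made up for the sketch):

#include <stdlib.h>

#define CACHE_SIZE 4

struct io_obj { int id; };

struct thread_cache {
	struct io_obj *slot[CACHE_SIZE];
	size_t count;
};

/* Allocation prefers the per-thread cache, then falls back to the shared
 * allocator (modeled here with malloc; error handling omitted). */
static struct io_obj *
cache_get(struct thread_cache *c)
{
	if (c->count > 0) {
		return c->slot[--c->count];
	}
	return malloc(sizeof(struct io_obj));
}

/* Frees go back into the cache while it has room, otherwise to the pool. */
static void
cache_put(struct thread_cache *c, struct io_obj *obj)
{
	if (c->count < CACHE_SIZE) {
		c->slot[c->count++] = obj;
		return;
	}
	free(obj);
}

int
main(void)
{
	struct thread_cache c = { .count = 0 };
	struct io_obj *o = cache_get(&c);

	cache_put(&c, o);    /* stays cached for the next cache_get() */
	free(cache_get(&c)); /* same object comes back without touching the pool */
	return 0;
}

bdev_channel_get_io() and spdk_bdev_free_io() later in this file have the same get-from-cache-first, put-back-if-room shape, with the extra twist that freed entries first service any queued waiters.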
*/ + ch->per_thread_cache_count = 0; + for (i = 0; i < ch->bdev_io_cache_size; i++) { + bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); + assert(bdev_io != NULL); + ch->per_thread_cache_count++; + STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); + } + + TAILQ_INIT(&ch->shared_resources); + TAILQ_INIT(&ch->io_wait_queue); + + return 0; +} + +static void +bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_mgmt_channel *ch = ctx_buf; + struct spdk_bdev_io *bdev_io; + + if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { + SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); + } + + if (!TAILQ_EMPTY(&ch->shared_resources)) { + SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); + } + + while (!STAILQ_EMPTY(&ch->per_thread_cache)) { + bdev_io = STAILQ_FIRST(&ch->per_thread_cache); + STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); + ch->per_thread_cache_count--; + spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); + } + + assert(ch->per_thread_cache_count == 0); +} + +static void +bdev_init_complete(int rc) +{ + spdk_bdev_init_cb cb_fn = g_init_cb_fn; + void *cb_arg = g_init_cb_arg; + struct spdk_bdev_module *m; + + g_bdev_mgr.init_complete = true; + g_init_cb_fn = NULL; + g_init_cb_arg = NULL; + + /* + * For modules that need to know when subsystem init is complete, + * inform them now. + */ + if (rc == 0) { + TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (m->init_complete) { + m->init_complete(); + } + } + } + + cb_fn(cb_arg, rc); +} + +static void +bdev_module_action_complete(void) +{ + struct spdk_bdev_module *m; + + /* + * Don't finish bdev subsystem initialization if + * module pre-initialization is still in progress, or + * the subsystem been already initialized. + */ + if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { + return; + } + + /* + * Check all bdev modules for inits/examinations in progress. If any + * exist, return immediately since we cannot finish bdev subsystem + * initialization until all are completed. + */ + TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (m->internal.action_in_progress > 0) { + return; + } + } + + /* + * Modules already finished initialization - now that all + * the bdev modules have finished their asynchronous I/O + * processing, the entire bdev layer can be marked as complete. 
+ */ + bdev_init_complete(0); +} + +static void +bdev_module_action_done(struct spdk_bdev_module *module) +{ + assert(module->internal.action_in_progress > 0); + module->internal.action_in_progress--; + bdev_module_action_complete(); +} + +void +spdk_bdev_module_init_done(struct spdk_bdev_module *module) +{ + bdev_module_action_done(module); +} + +void +spdk_bdev_module_examine_done(struct spdk_bdev_module *module) +{ + bdev_module_action_done(module); +} + +/** The last initialized bdev module */ +static struct spdk_bdev_module *g_resume_bdev_module = NULL; + +static void +bdev_init_failed(void *cb_arg) +{ + struct spdk_bdev_module *module = cb_arg; + + module->internal.action_in_progress--; + bdev_init_complete(-1); +} + +static int +bdev_modules_init(void) +{ + struct spdk_bdev_module *module; + int rc = 0; + + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + g_resume_bdev_module = module; + if (module->async_init) { + module->internal.action_in_progress = 1; + } + rc = module->module_init(); + if (rc != 0) { + /* Bump action_in_progress to prevent other modules from completion of modules_init + * Send message to defer application shutdown until resources are cleaned up */ + module->internal.action_in_progress = 1; + spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); + return rc; + } + } + + g_resume_bdev_module = NULL; + return 0; +} + +void +spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) +{ + struct spdk_conf_section *sp; + struct spdk_bdev_opts bdev_opts; + int32_t bdev_io_pool_size, bdev_io_cache_size; + int cache_size; + int rc = 0; + char mempool_name[32]; + + assert(cb_fn != NULL); + + sp = spdk_conf_find_section(NULL, "Bdev"); + if (sp != NULL) { + spdk_bdev_get_opts(&bdev_opts); + + bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize"); + if (bdev_io_pool_size >= 0) { + bdev_opts.bdev_io_pool_size = bdev_io_pool_size; + } + + bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize"); + if (bdev_io_cache_size >= 0) { + bdev_opts.bdev_io_cache_size = bdev_io_cache_size; + } + + if (spdk_bdev_set_opts(&bdev_opts)) { + bdev_init_complete(-1); + return; + } + + assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0); + } + + g_init_cb_fn = cb_fn; + g_init_cb_arg = cb_arg; + + spdk_notify_type_register("bdev_register"); + spdk_notify_type_register("bdev_unregister"); + + snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); + + g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, + g_bdev_opts.bdev_io_pool_size, + sizeof(struct spdk_bdev_io) + + bdev_module_get_max_ctx_size(), + 0, + SPDK_ENV_SOCKET_ID_ANY); + + if (g_bdev_mgr.bdev_io_pool == NULL) { + SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); + bdev_init_complete(-1); + return; + } + + /** + * Ensure no more than half of the total buffers end up local caches, by + * using spdk_env_get_core_count() to determine how many local caches we need + * to account for. 
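To make the sizing rule in the comment above concrete before it is applied right below, a worked example with made-up numbers (the real totals come from BUF_SMALL_POOL_SIZE and BUF_LARGE_POOL_SIZE): with 8192 buffers and 16 cores, cache_size = 8192 / (2 * 16) = 256, so even if every core's local cache is completely full the caches pin 16 * 256 = 4096 buffers, i.e. no more than half of the pool, and the rest stays available in the shared ring. The same arithmetic as a tiny snippet:

#include <stdio.h>

int
main(void)
{
	/* Illustrative numbers only, not SPDK's actual defaults. */
	unsigned int pool_size = 8192, cores = 16;
	unsigned int cache_size = pool_size / (2 * cores);

	printf("per-core cache %u, worst-case cached %u of %u buffers\n",
	       cache_size, cache_size * cores, pool_size);
	return 0;
}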
+ */ + cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); + snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); + + g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, + BUF_SMALL_POOL_SIZE, + SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT, + cache_size, + SPDK_ENV_SOCKET_ID_ANY); + if (!g_bdev_mgr.buf_small_pool) { + SPDK_ERRLOG("create rbuf small pool failed\n"); + bdev_init_complete(-1); + return; + } + + cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); + snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); + + g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, + BUF_LARGE_POOL_SIZE, + SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT, + cache_size, + SPDK_ENV_SOCKET_ID_ANY); + if (!g_bdev_mgr.buf_large_pool) { + SPDK_ERRLOG("create rbuf large pool failed\n"); + bdev_init_complete(-1); + return; + } + + g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, + NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!g_bdev_mgr.zero_buffer) { + SPDK_ERRLOG("create bdev zero buffer failed\n"); + bdev_init_complete(-1); + return; + } + +#ifdef SPDK_CONFIG_VTUNE + g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); +#endif + + spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, + bdev_mgmt_channel_destroy, + sizeof(struct spdk_bdev_mgmt_channel), + "bdev_mgr"); + + rc = bdev_modules_init(); + g_bdev_mgr.module_init_complete = true; + if (rc != 0) { + SPDK_ERRLOG("bdev modules init failed\n"); + return; + } + + bdev_module_action_complete(); +} + +static void +bdev_mgr_unregister_cb(void *io_device) +{ + spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; + + if (g_bdev_mgr.bdev_io_pool) { + if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { + SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", + spdk_mempool_count(g_bdev_mgr.bdev_io_pool), + g_bdev_opts.bdev_io_pool_size); + } + + spdk_mempool_free(g_bdev_mgr.bdev_io_pool); + } + + if (g_bdev_mgr.buf_small_pool) { + if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) { + SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", + spdk_mempool_count(g_bdev_mgr.buf_small_pool), + BUF_SMALL_POOL_SIZE); + assert(false); + } + + spdk_mempool_free(g_bdev_mgr.buf_small_pool); + } + + if (g_bdev_mgr.buf_large_pool) { + if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) { + SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", + spdk_mempool_count(g_bdev_mgr.buf_large_pool), + BUF_LARGE_POOL_SIZE); + assert(false); + } + + spdk_mempool_free(g_bdev_mgr.buf_large_pool); + } + + spdk_free(g_bdev_mgr.zero_buffer); + + cb_fn(g_fini_cb_arg); + g_fini_cb_fn = NULL; + g_fini_cb_arg = NULL; + g_bdev_mgr.init_complete = false; + g_bdev_mgr.module_init_complete = false; + pthread_mutex_destroy(&g_bdev_mgr.mutex); +} + +static void +bdev_module_finish_iter(void *arg) +{ + struct spdk_bdev_module *bdev_module; + + /* FIXME: Handling initialization failures is broken now, + * so we won't even try cleaning up after successfully + * initialized modules. 
if module_init_complete is false, + * just call spdk_bdev_mgr_unregister_cb + */ + if (!g_bdev_mgr.module_init_complete) { + bdev_mgr_unregister_cb(NULL); + return; + } + + /* Start iterating from the last touched module */ + if (!g_resume_bdev_module) { + bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); + } else { + bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, + internal.tailq); + } + + while (bdev_module) { + if (bdev_module->async_fini) { + /* Save our place so we can resume later. We must + * save the variable here, before calling module_fini() + * below, because in some cases the module may immediately + * call spdk_bdev_module_finish_done() and re-enter + * this function to continue iterating. */ + g_resume_bdev_module = bdev_module; + } + + if (bdev_module->module_fini) { + bdev_module->module_fini(); + } + + if (bdev_module->async_fini) { + return; + } + + bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, + internal.tailq); + } + + g_resume_bdev_module = NULL; + spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); +} + +void +spdk_bdev_module_finish_done(void) +{ + if (spdk_get_thread() != g_fini_thread) { + spdk_thread_send_msg(g_fini_thread, bdev_module_finish_iter, NULL); + } else { + bdev_module_finish_iter(NULL); + } +} + +static void +bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) +{ + struct spdk_bdev *bdev = cb_arg; + + if (bdeverrno && bdev) { + SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", + bdev->name); + + /* + * Since the call to spdk_bdev_unregister() failed, we have no way to free this + * bdev; try to continue by manually removing this bdev from the list and continue + * with the next bdev in the list. + */ + TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); + } + + if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n"); + /* + * Bdev module finish need to be deferred as we might be in the middle of some context + * (like bdev part free) that will use this bdev (or private bdev driver ctx data) + * after returning. + */ + spdk_thread_send_msg(spdk_get_thread(), bdev_module_finish_iter, NULL); + return; + } + + /* + * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem + * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity + * to detect clean shutdown as opposed to run-time hot removal of the underlying + * base bdevs. + * + * Also, walk the list in the reverse order. + */ + for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); + bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { + if (bdev->internal.claim_module != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Skipping claimed bdev '%s'(<-'%s').\n", + bdev->name, bdev->internal.claim_module->name); + continue; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name); + spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); + return; + } + + /* + * If any bdev fails to unclaim underlying bdev properly, we may face the + * case of bdev list consisting of claimed bdevs only (if claims are managed + * correctly, this would mean there's a loop in the claims graph which is + * clearly impossible). Warn and unregister last bdev on the list then. 
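The unregister walk above (and the fallback loop that follows) goes from the tail of the bdev list with TAILQ_LAST()/TAILQ_PREV() so that virtual bdevs stacked on top of claimed base bdevs are torn down first, and only then the claimed bdevs themselves. A reduced, self-contained sketch of that traversal order, assuming a BSD-style <sys/queue.h> and using a plain claimed flag in place of internal.claim_module:

#include <stdio.h>
#include <sys/queue.h>

struct node {
	const char *name;
	int claimed;                 /* stand-in for internal.claim_module != NULL */
	TAILQ_ENTRY(node) link;
};

TAILQ_HEAD(node_list, node);

int
main(void)
{
	struct node_list list;
	struct node nodes[3] = {
		{ .name = "base",  .claimed = 1 },
		{ .name = "part0", .claimed = 0 },
		{ .name = "part1", .claimed = 0 },
	};
	struct node *n;
	int i;

	TAILQ_INIT(&list);
	for (i = 0; i < 3; i++) {
		TAILQ_INSERT_TAIL(&list, &nodes[i], link);
	}

	/* Remove from the tail, skipping claimed nodes while unclaimed ones remain. */
	while (!TAILQ_EMPTY(&list)) {
		for (n = TAILQ_LAST(&list, node_list); n != NULL;
		     n = TAILQ_PREV(n, node_list, link)) {
			if (!n->claimed) {
				break;
			}
		}
		if (n == NULL) {
			n = TAILQ_LAST(&list, node_list);   /* only claimed nodes left */
		}
		printf("unregister %s\n", n->name);
		TAILQ_REMOVE(&list, n, link);
	}
	return 0;
}

In the real code the unregister step is asynchronous, which is why bdev_finish_unregister_bdevs_iter() re-enters itself as the completion callback instead of looping in place.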
+ */ + for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); + bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { + SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); + spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); + return; + } +} + +void +spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev_module *m; + + assert(cb_fn != NULL); + + g_fini_thread = spdk_get_thread(); + + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (m->fini_start) { + m->fini_start(); + } + } + + bdev_finish_unregister_bdevs_iter(NULL, 0); +} + +struct spdk_bdev_io * +bdev_channel_get_io(struct spdk_bdev_channel *channel) +{ + struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; + struct spdk_bdev_io *bdev_io; + + if (ch->per_thread_cache_count > 0) { + bdev_io = STAILQ_FIRST(&ch->per_thread_cache); + STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); + ch->per_thread_cache_count--; + } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { + /* + * Don't try to look for bdev_ios in the global pool if there are + * waiters on bdev_ios - we don't want this caller to jump the line. + */ + bdev_io = NULL; + } else { + bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); + } + + return bdev_io; +} + +void +spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev_mgmt_channel *ch; + + assert(bdev_io != NULL); + assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); + + ch = bdev_io->internal.ch->shared_resource->mgmt_ch; + + if (bdev_io->internal.buf != NULL) { + bdev_io_put_buf(bdev_io); + } + + if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { + ch->per_thread_cache_count++; + STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); + while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { + struct spdk_bdev_io_wait_entry *entry; + + entry = TAILQ_FIRST(&ch->io_wait_queue); + TAILQ_REMOVE(&ch->io_wait_queue, entry, link); + entry->cb_fn(entry->cb_arg); + } + } else { + /* We should never have a full cache with entries on the io wait queue. 
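The waiter hand-off above (a freed bdev_io immediately wakes the first entry on io_wait_queue) is the consumer side of the spdk_bdev_queue_io_wait() contract used elsewhere in this file. A hedged usage sketch from a caller's point of view, compiled against the SPDK headers; the read_ctx struct and the retry_read()/read_done() helpers are illustrative, and spdk_bdev_read_blocks() is assumed here to take its usual desc, channel, buffer, offset_blocks, num_blocks, callback, cb_arg arguments:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include "spdk/bdev.h"

/* Illustrative caller context; not part of SPDK. */
struct read_ctx {
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
	void *buf;
	uint64_t offset_blocks;
	uint64_t num_blocks;
	struct spdk_bdev_io_wait_entry wait;
};

static void
read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);   /* may wake the next queued waiter */
}

void
retry_read(void *arg)
{
	struct read_ctx *ctx = arg;
	int rc;

	rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf,
				   ctx->offset_blocks, ctx->num_blocks,
				   read_done, ctx);
	if (rc == -ENOMEM) {
		/* bdev_io pool exhausted: park ourselves until one is freed. */
		ctx->wait.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
		ctx->wait.cb_fn = retry_read;
		ctx->wait.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->wait.bdev, ctx->ch, &ctx->wait);
	}
}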
*/ + assert(TAILQ_EMPTY(&ch->io_wait_queue)); + spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); + } +} + +static bool +bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) +{ + assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); + + switch (limit) { + case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: + return true; + case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: + case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: + case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: + return false; + case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: + default: + return false; + } +} + +static bool +bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + return true; + case SPDK_BDEV_IO_TYPE_ZCOPY: + if (bdev_io->u.bdev.zcopy.start) { + return true; + } else { + return false; + } + default: + return false; + } +} + +static bool +bdev_is_read_io(struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + /* Bit 1 (0x2) set for read operation */ + if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { + return true; + } else { + return false; + } + case SPDK_BDEV_IO_TYPE_READ: + return true; + case SPDK_BDEV_IO_TYPE_ZCOPY: + /* Populate to read from disk */ + if (bdev_io->u.bdev.zcopy.populate) { + return true; + } else { + return false; + } + default: + return false; + } +} + +static uint64_t +bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + return bdev_io->u.nvme_passthru.nbytes; + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + return bdev_io->u.bdev.num_blocks * bdev->blocklen; + case SPDK_BDEV_IO_TYPE_ZCOPY: + /* Track the data in the start phase only */ + if (bdev_io->u.bdev.zcopy.start) { + return bdev_io->u.bdev.num_blocks * bdev->blocklen; + } else { + return 0; + } + default: + return 0; + } +} + +static bool +bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { + return true; + } else { + return false; + } +} + +static bool +bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (bdev_is_read_io(io) == false) { + return false; + } + + return bdev_qos_rw_queue_io(limit, io); +} + +static bool +bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (bdev_is_read_io(io) == true) { + return false; + } + + return bdev_qos_rw_queue_io(limit, io); +} + +static void +bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + limit->remaining_this_timeslice--; +} + +static void +bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); +} + +static void +bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (bdev_is_read_io(io) == false) { + return; + } + + return bdev_qos_rw_bps_update_quota(limit, io); +} + +static void +bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (bdev_is_read_io(io) == true) { + return; + } + + return bdev_qos_rw_bps_update_quota(limit, io); +} + +static void +bdev_qos_set_ops(struct 
spdk_bdev_qos *qos) +{ + int i; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + qos->rate_limits[i].queue_io = NULL; + qos->rate_limits[i].update_quota = NULL; + continue; + } + + switch (i) { + case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: + qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; + qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; + break; + case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: + qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; + qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; + break; + case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: + qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; + qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; + break; + case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: + qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; + qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; + break; + default: + break; + } + } +} + +static void +_bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, + struct spdk_bdev_io *bdev_io, + enum spdk_bdev_io_status status) +{ + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + + bdev_io->internal.in_submit_request = true; + bdev_ch->io_outstanding++; + shared_resource->io_outstanding++; + spdk_bdev_io_complete(bdev_io, status); + bdev_io->internal.in_submit_request = false; +} + +static inline void +bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_io_channel *ch = bdev_ch->channel; + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + + if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { + struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; + struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; + + if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || + bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || + bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { + _bdev_io_complete_in_submit(bdev_ch, bdev_io, + SPDK_BDEV_IO_STATUS_SUCCESS); + return; + } + } + + if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { + bdev_ch->io_outstanding++; + shared_resource->io_outstanding++; + bdev_io->internal.in_submit_request = true; + bdev->fn_table->submit_request(ch, bdev_io); + bdev_io->internal.in_submit_request = false; + } else { + TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); + } +} + +static int +bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) +{ + struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; + int i, submitted_ios = 0; + + TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { + if (bdev_qos_io_to_limit(bdev_io) == true) { + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (!qos->rate_limits[i].queue_io) { + continue; + } + + if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], + bdev_io) == true) { + return submitted_ios; + } + } + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (!qos->rate_limits[i].update_quota) { + continue; + } + + qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); + } + } + + TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); + bdev_io_do_submit(ch, bdev_io); + submitted_ios++; + } + + return submitted_ios; +} + +static void +bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) +{ + int 
rc; + + bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; + bdev_io->internal.waitq_entry.cb_fn = cb_fn; + bdev_io->internal.waitq_entry.cb_arg = bdev_io; + rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), + &bdev_io->internal.waitq_entry); + if (rc != 0) { + SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } +} + +static bool +bdev_io_type_can_split(uint8_t type) +{ + assert(type != SPDK_BDEV_IO_TYPE_INVALID); + assert(type < SPDK_BDEV_NUM_IO_TYPES); + + /* Only split READ and WRITE I/O. Theoretically other types of I/O like + * UNMAP could be split, but these types of I/O are typically much larger + * in size (sometimes the size of the entire block device), and the bdev + * module can more efficiently split these types of I/O. Plus those types + * of I/O do not have a payload, which makes the splitting process simpler. + */ + if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) { + return true; + } else { + return false; + } +} + +static bool +bdev_io_should_split(struct spdk_bdev_io *bdev_io) +{ + uint64_t start_stripe, end_stripe; + uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; + + if (io_boundary == 0) { + return false; + } + + if (!bdev_io_type_can_split(bdev_io->type)) { + return false; + } + + start_stripe = bdev_io->u.bdev.offset_blocks; + end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; + /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ + if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { + start_stripe >>= spdk_u32log2(io_boundary); + end_stripe >>= spdk_u32log2(io_boundary); + } else { + start_stripe /= io_boundary; + end_stripe /= io_boundary; + } + return (start_stripe != end_stripe); +} + +static uint32_t +_to_next_boundary(uint64_t offset, uint32_t boundary) +{ + return (boundary - (offset % boundary)); +} + +static void +bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); + +static void +_bdev_io_split(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + uint64_t current_offset, remaining; + uint32_t blocklen, to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; + struct iovec *parent_iov, *iov; + uint64_t parent_iov_offset, iov_len; + uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt; + void *md_buf = NULL; + int rc; + + remaining = bdev_io->u.bdev.split_remaining_num_blocks; + current_offset = bdev_io->u.bdev.split_current_offset_blocks; + blocklen = bdev_io->bdev->blocklen; + parent_iov_offset = (current_offset - bdev_io->u.bdev.offset_blocks) * blocklen; + parent_iovcnt = bdev_io->u.bdev.iovcnt; + + for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { + parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; + if (parent_iov_offset < parent_iov->iov_len) { + break; + } + parent_iov_offset -= parent_iov->iov_len; + } + + child_iovcnt = 0; + while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { + to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary); + to_next_boundary = spdk_min(remaining, to_next_boundary); + to_next_boundary_bytes = to_next_boundary * blocklen; + iov = &bdev_io->child_iov[child_iovcnt]; + iovcnt = 0; + + if (bdev_io->u.bdev.md_buf) { + assert((parent_iov_offset % blocklen) > 0); + md_buf = (char *)bdev_io->u.bdev.md_buf + (parent_iov_offset / blocklen) * 
+ spdk_bdev_get_md_size(bdev_io->bdev); + } + + while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && + child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { + parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; + iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset); + to_next_boundary_bytes -= iov_len; + + bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; + bdev_io->child_iov[child_iovcnt].iov_len = iov_len; + + if (iov_len < parent_iov->iov_len - parent_iov_offset) { + parent_iov_offset += iov_len; + } else { + parent_iovpos++; + parent_iov_offset = 0; + } + child_iovcnt++; + iovcnt++; + } + + if (to_next_boundary_bytes > 0) { + /* We had to stop this child I/O early because we ran out of + * child_iov space. Ensure the iovs to be aligned with block + * size and then adjust to_next_boundary before starting the + * child I/O. + */ + assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV); + to_last_block_bytes = to_next_boundary_bytes % blocklen; + if (to_last_block_bytes != 0) { + uint32_t child_iovpos = child_iovcnt - 1; + /* don't decrease child_iovcnt so the loop will naturally end */ + + to_last_block_bytes = blocklen - to_last_block_bytes; + to_next_boundary_bytes += to_last_block_bytes; + while (to_last_block_bytes > 0 && iovcnt > 0) { + iov_len = spdk_min(to_last_block_bytes, + bdev_io->child_iov[child_iovpos].iov_len); + bdev_io->child_iov[child_iovpos].iov_len -= iov_len; + if (bdev_io->child_iov[child_iovpos].iov_len == 0) { + child_iovpos--; + if (--iovcnt == 0) { + return; + } + } + to_last_block_bytes -= iov_len; + } + + assert(to_last_block_bytes == 0); + } + to_next_boundary -= to_next_boundary_bytes / blocklen; + } + + bdev_io->u.bdev.split_outstanding++; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + iov, iovcnt, md_buf, current_offset, + to_next_boundary, + bdev_io_split_done, bdev_io); + } else { + rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + iov, iovcnt, md_buf, current_offset, + to_next_boundary, + bdev_io_split_done, bdev_io); + } + + if (rc == 0) { + current_offset += to_next_boundary; + remaining -= to_next_boundary; + bdev_io->u.bdev.split_current_offset_blocks = current_offset; + bdev_io->u.bdev.split_remaining_num_blocks = remaining; + } else { + bdev_io->u.bdev.split_outstanding--; + if (rc == -ENOMEM) { + if (bdev_io->u.bdev.split_outstanding == 0) { + /* No I/O is outstanding. Hence we should wait here. */ + bdev_queue_io_wait_with_cb(bdev_io, _bdev_io_split); + } + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + if (bdev_io->u.bdev.split_outstanding == 0) { + spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, + (uintptr_t)bdev_io, 0); + TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } + } + + return; + } + } +} + +static void +bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + /* If any child I/O failed, stop further splitting process. 
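For intuition about the arithmetic driving this split loop, a worked example with made-up numbers: with optimal_io_boundary = 128, an 80-block I/O starting at block 100 has start_stripe = 0 and end_stripe = 1, so it must be split; the first child carries _to_next_boundary(100, 128) = 28 blocks and the second the remaining 52. The same calculation as a standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* Blocks from offset to the next multiple of boundary (same formula as above). */
static uint32_t
to_next_boundary(uint64_t offset, uint32_t boundary)
{
	return boundary - (offset % boundary);
}

int
main(void)
{
	/* Illustrative values: an 80-block I/O at block 100, 128-block boundary. */
	uint64_t offset = 100, remaining = 80;
	uint32_t boundary = 128;

	while (remaining > 0) {
		uint64_t child = to_next_boundary(offset, boundary);

		if (child > remaining) {
			child = remaining;
		}
		printf("child I/O: offset %llu, %llu blocks\n",
		       (unsigned long long)offset, (unsigned long long)child);
		offset += child;
		remaining -= child;
	}
	return 0;
}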
*/ + parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; + parent_io->u.bdev.split_remaining_num_blocks = 0; + } + parent_io->u.bdev.split_outstanding--; + if (parent_io->u.bdev.split_outstanding != 0) { + return; + } + + /* + * Parent I/O finishes when all blocks are consumed. + */ + if (parent_io->u.bdev.split_remaining_num_blocks == 0) { + assert(parent_io->internal.cb != bdev_io_split_done); + spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, + (uintptr_t)parent_io, 0); + TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); + parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, + parent_io->internal.caller_ctx); + return; + } + + /* + * Continue with the splitting process. This function will complete the parent I/O if the + * splitting is done. + */ + _bdev_io_split(parent_io); +} + +static void +bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); + +static void +bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + assert(bdev_io_type_can_split(bdev_io->type)); + + bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; + bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; + bdev_io->u.bdev.split_outstanding = 0; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + + if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { + _bdev_io_split(bdev_io); + } else { + assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); + spdk_bdev_io_get_buf(bdev_io, bdev_io_split_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + } +} + +static void +bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + bdev_io_split(ch, bdev_io); +} + +/* Explicitly mark this inline, since it's used as a function pointer and otherwise won't + * be inlined, at least on some compilers. 
+ */ +static inline void +_bdev_io_submit(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + uint64_t tsc; + + tsc = spdk_get_ticks(); + bdev_io->internal.submit_tsc = tsc; + spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type); + + if (spdk_likely(bdev_ch->flags == 0)) { + bdev_io_do_submit(bdev_ch, bdev_io); + return; + } + + if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { + _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); + } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { + if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && + bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { + _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); + bdev_qos_io_submit(bdev_ch, bdev->internal.qos); + } + } else { + SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); + _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +bool +bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); + +bool +bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) +{ + if (range1->length == 0 || range2->length == 0) { + return false; + } + + if (range1->offset + range1->length <= range2->offset) { + return false; + } + + if (range2->offset + range2->length <= range1->offset) { + return false; + } + + return true; +} + +static bool +bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) +{ + struct spdk_bdev_channel *ch = bdev_io->internal.ch; + struct lba_range r; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + /* Don't try to decode the NVMe command - just assume worst-case and that + * it overlaps a locked range. + */ + return true; + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_ZCOPY: + r.offset = bdev_io->u.bdev.offset_blocks; + r.length = bdev_io->u.bdev.num_blocks; + if (!bdev_lba_range_overlapped(range, &r)) { + /* This I/O doesn't overlap the specified LBA range. */ + return false; + } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { + /* This I/O overlaps, but the I/O is on the same channel that locked this + * range, and the caller_ctx is the same as the locked_ctx. This means + * that this I/O is associated with the lock, and is allowed to execute. 
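The locked-range check above ultimately rests on bdev_lba_range_overlapped(): two block ranges conflict unless one of them is empty or ends at or before the other begins. A standalone restatement with a couple of sanity checks (the struct name and the sample offsets are illustrative):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct range {
	uint64_t offset;
	uint64_t length;
};

/* Mirrors the logic of bdev_lba_range_overlapped(): empty ranges never overlap,
 * otherwise the intervals [offset, offset + length) must intersect. */
static bool
ranges_overlap(const struct range *a, const struct range *b)
{
	if (a->length == 0 || b->length == 0) {
		return false;
	}
	if (a->offset + a->length <= b->offset) {
		return false;
	}
	if (b->offset + b->length <= a->offset) {
		return false;
	}
	return true;
}

int
main(void)
{
	struct range locked = { .offset = 100, .length = 50 };   /* blocks 100..149 */
	struct range touching = { .offset = 150, .length = 10 }; /* starts where locked ends */
	struct range inside = { .offset = 120, .length = 4 };

	assert(!ranges_overlap(&locked, &touching));
	assert(ranges_overlap(&locked, &inside));
	return 0;
}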
+ */ + return false; + } else { + return true; + } + default: + return false; + } +} + +void +bdev_io_submit(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); + struct spdk_bdev_channel *ch = bdev_io->internal.ch; + + assert(thread != NULL); + assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); + + if (!TAILQ_EMPTY(&ch->locked_ranges)) { + struct lba_range *range; + + TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { + if (bdev_io_range_is_locked(bdev_io, range)) { + TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); + return; + } + } + } + + TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); + + if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bdev_io)) { + bdev_io->internal.submit_tsc = spdk_get_ticks(); + spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, + (uintptr_t)bdev_io, bdev_io->type); + bdev_io_split(NULL, bdev_io); + return; + } + + if (ch->flags & BDEV_CH_QOS_ENABLED) { + if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { + _bdev_io_submit(bdev_io); + } else { + bdev_io->internal.io_submit_ch = ch; + bdev_io->internal.ch = bdev->internal.qos->ch; + spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); + } + } else { + _bdev_io_submit(bdev_io); + } +} + +static void +bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + struct spdk_io_channel *ch = bdev_ch->channel; + + assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); + + bdev_io->internal.in_submit_request = true; + bdev->fn_table->submit_request(ch, bdev_io); + bdev_io->internal.in_submit_request = false; +} + +void +bdev_io_init(struct spdk_bdev_io *bdev_io, + struct spdk_bdev *bdev, void *cb_arg, + spdk_bdev_io_completion_cb cb) +{ + bdev_io->bdev = bdev; + bdev_io->internal.caller_ctx = cb_arg; + bdev_io->internal.cb = cb; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + bdev_io->internal.in_submit_request = false; + bdev_io->internal.buf = NULL; + bdev_io->internal.io_submit_ch = NULL; + bdev_io->internal.orig_iovs = NULL; + bdev_io->internal.orig_iovcnt = 0; + bdev_io->internal.orig_md_buf = NULL; + bdev_io->internal.error.nvme.cdw0 = 0; + bdev_io->num_retries = 0; + bdev_io->internal.get_buf_cb = NULL; + bdev_io->internal.get_aux_buf_cb = NULL; +} + +static bool +bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) +{ + return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); +} + +bool +spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) +{ + bool supported; + + supported = bdev_io_type_supported(bdev, io_type); + + if (!supported) { + switch (io_type) { + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + /* The bdev layer will emulate write zeroes as long as write is supported. 
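Because spdk_bdev_io_type_supported() folds in the emulation cases handled in this switch, a caller can branch on it without knowing whether support is native. A small hedged sketch using only the function and constants visible in this file (the helper names below are made up, and whether an application needs such wrappers at all is its own choice):

#include <stdbool.h>
#include "spdk/bdev.h"

/* Illustrative helpers, not SPDK API. */
bool
can_write_zeroes(struct spdk_bdev *bdev)
{
	/* True when the module supports it natively or the bdev layer can
	 * emulate it on top of plain writes (see the switch above). */
	return spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES);
}

bool
can_unmap(struct spdk_bdev *bdev)
{
	/* UNMAP has no emulation path here, so this reflects native support only. */
	return spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP);
}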
*/ + supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); + break; + case SPDK_BDEV_IO_TYPE_ZCOPY: + /* Zero copy can be emulated with regular read and write */ + supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ) && + bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); + break; + default: + break; + } + } + + return supported; +} + +int +spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + if (bdev->fn_table->dump_info_json) { + return bdev->fn_table->dump_info_json(bdev->ctxt, w); + } + + return 0; +} + +static void +bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) +{ + uint32_t max_per_timeslice = 0; + int i; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + qos->rate_limits[i].max_per_timeslice = 0; + continue; + } + + max_per_timeslice = qos->rate_limits[i].limit * + SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; + + qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, + qos->rate_limits[i].min_per_timeslice); + + qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; + } + + bdev_qos_set_ops(qos); +} + +static int +bdev_channel_poll_qos(void *arg) +{ + struct spdk_bdev_qos *qos = arg; + uint64_t now = spdk_get_ticks(); + int i; + + if (now < (qos->last_timeslice + qos->timeslice_size)) { + /* We received our callback earlier than expected - return + * immediately and wait to do accounting until at least one + * timeslice has actually expired. This should never happen + * with a well-behaved timer implementation. + */ + return SPDK_POLLER_IDLE; + } + + /* Reset for next round of rate limiting */ + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + /* We may have allowed the IOs or bytes to slightly overrun in the last + * timeslice. remaining_this_timeslice is signed, so if it's negative + * here, we'll account for the overrun so that the next timeslice will + * be appropriately reduced. + */ + if (qos->rate_limits[i].remaining_this_timeslice > 0) { + qos->rate_limits[i].remaining_this_timeslice = 0; + } + } + + while (now >= (qos->last_timeslice + qos->timeslice_size)) { + qos->last_timeslice += qos->timeslice_size; + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + qos->rate_limits[i].remaining_this_timeslice += + qos->rate_limits[i].max_per_timeslice; + } + } + + return bdev_qos_io_submit(qos->ch, qos); +} + +static void +bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) +{ + struct spdk_bdev_shared_resource *shared_resource; + struct lba_range *range; + + while (!TAILQ_EMPTY(&ch->locked_ranges)) { + range = TAILQ_FIRST(&ch->locked_ranges); + TAILQ_REMOVE(&ch->locked_ranges, range, tailq); + free(range); + } + + spdk_put_io_channel(ch->channel); + + shared_resource = ch->shared_resource; + + assert(TAILQ_EMPTY(&ch->io_locked)); + assert(TAILQ_EMPTY(&ch->io_submitted)); + assert(ch->io_outstanding == 0); + assert(shared_resource->ref > 0); + shared_resource->ref--; + if (shared_resource->ref == 0) { + assert(shared_resource->io_outstanding == 0); + TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); + spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); + free(shared_resource); + } +} + +/* Caller must hold bdev->internal.mutex. 
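bdev_channel_poll_qos() above is a timeslice-based rate limiter: each expired timeslice adds max_per_timeslice worth of quota, quota left unused is discarded, and an overrun (remaining going negative) is paid back out of the next refill. A reduced standalone model of that accounting, with illustrative numbers and none of the per-limit-type plumbing:

#include <stdint.h>
#include <stdio.h>

struct limiter {
	int64_t remaining;        /* quota left this timeslice; may go negative */
	int64_t max_per_timeslice;
	uint64_t last_timeslice;  /* in ticks */
	uint64_t timeslice_size;  /* in ticks */
};

/* Refill modeled on bdev_channel_poll_qos(): discard unused quota, then add
 * one timeslice worth of quota per elapsed timeslice. */
static void
limiter_poll(struct limiter *lim, uint64_t now)
{
	if (now < lim->last_timeslice + lim->timeslice_size) {
		return;   /* called early; nothing to account for yet */
	}
	if (lim->remaining > 0) {
		lim->remaining = 0;   /* unused quota does not carry over */
	}
	while (now >= lim->last_timeslice + lim->timeslice_size) {
		lim->last_timeslice += lim->timeslice_size;
		lim->remaining += lim->max_per_timeslice;   /* overruns are repaid here */
	}
}

int
main(void)
{
	struct limiter lim = { .remaining = -300, .max_per_timeslice = 1000,
			       .last_timeslice = 0, .timeslice_size = 10 };

	limiter_poll(&lim, 25);   /* two timeslices have elapsed */
	printf("remaining after refill: %lld\n", (long long)lim.remaining);
	return 0;
}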
*/ +static void +bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) +{ + struct spdk_bdev_qos *qos = bdev->internal.qos; + int i; + + /* Rate limiting on this bdev enabled */ + if (qos) { + if (qos->ch == NULL) { + struct spdk_io_channel *io_ch; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, + bdev->name, spdk_get_thread()); + + /* No qos channel has been selected, so set one up */ + + /* Take another reference to ch */ + io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); + assert(io_ch != NULL); + qos->ch = ch; + + qos->thread = spdk_io_channel_get_thread(io_ch); + + TAILQ_INIT(&qos->queued); + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (bdev_qos_is_iops_rate_limit(i) == true) { + qos->rate_limits[i].min_per_timeslice = + SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; + } else { + qos->rate_limits[i].min_per_timeslice = + SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; + } + + if (qos->rate_limits[i].limit == 0) { + qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; + } + } + bdev_qos_update_max_quota_per_timeslice(qos); + qos->timeslice_size = + SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; + qos->last_timeslice = spdk_get_ticks(); + qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, + qos, + SPDK_BDEV_QOS_TIMESLICE_IN_USEC); + } + + ch->flags |= BDEV_CH_QOS_ENABLED; + } +} + +struct poll_timeout_ctx { + struct spdk_bdev_desc *desc; + uint64_t timeout_in_sec; + spdk_bdev_io_timeout_cb cb_fn; + void *cb_arg; +}; + +static void +bdev_desc_free(struct spdk_bdev_desc *desc) +{ + pthread_mutex_destroy(&desc->mutex); + free(desc->media_events_buffer); + free(desc); +} + +static void +bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) +{ + struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_bdev_desc *desc = ctx->desc; + + free(ctx); + + pthread_mutex_lock(&desc->mutex); + desc->refs--; + if (desc->closed == true && desc->refs == 0) { + pthread_mutex_unlock(&desc->mutex); + bdev_desc_free(desc); + return; + } + pthread_mutex_unlock(&desc->mutex); +} + +static void +bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) +{ + struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); + struct spdk_bdev_desc *desc = ctx->desc; + struct spdk_bdev_io *bdev_io; + uint64_t now; + + pthread_mutex_lock(&desc->mutex); + if (desc->closed == true) { + pthread_mutex_unlock(&desc->mutex); + spdk_for_each_channel_continue(i, -1); + return; + } + pthread_mutex_unlock(&desc->mutex); + + now = spdk_get_ticks(); + TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { + /* Exclude any I/O that are generated via splitting. */ + if (bdev_io->internal.cb == bdev_io_split_done) { + continue; + } + + /* Once we find an I/O that has not timed out, we can immediately + * exit the loop. 
+ */ + if (now < (bdev_io->internal.submit_tsc + + ctx->timeout_in_sec * spdk_get_ticks_hz())) { + goto end; + } + + if (bdev_io->internal.desc == desc) { + ctx->cb_fn(ctx->cb_arg, bdev_io); + } + } + +end: + spdk_for_each_channel_continue(i, 0); +} + +static int +bdev_poll_timeout_io(void *arg) +{ + struct spdk_bdev_desc *desc = arg; + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct poll_timeout_ctx *ctx; + + ctx = calloc(1, sizeof(struct poll_timeout_ctx)); + if (!ctx) { + SPDK_ERRLOG("failed to allocate memory\n"); + return SPDK_POLLER_BUSY; + } + ctx->desc = desc; + ctx->cb_arg = desc->cb_arg; + ctx->cb_fn = desc->cb_fn; + ctx->timeout_in_sec = desc->timeout_in_sec; + + /* Take a ref on the descriptor in case it gets closed while we are checking + * all of the channels. + */ + pthread_mutex_lock(&desc->mutex); + desc->refs++; + pthread_mutex_unlock(&desc->mutex); + + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_channel_poll_timeout_io, + ctx, + bdev_channel_poll_timeout_io_done); + + return SPDK_POLLER_BUSY; +} + +int +spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, + spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) +{ + assert(desc->thread == spdk_get_thread()); + + spdk_poller_unregister(&desc->io_timeout_poller); + + if (timeout_in_sec) { + assert(cb_fn != NULL); + desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, + desc, + SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / + 1000); + if (desc->io_timeout_poller == NULL) { + SPDK_ERRLOG("can not register the desc timeout IO poller\n"); + return -1; + } + } + + desc->cb_fn = cb_fn; + desc->cb_arg = cb_arg; + desc->timeout_in_sec = timeout_in_sec; + + return 0; +} + +static int +bdev_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); + struct spdk_bdev_channel *ch = ctx_buf; + struct spdk_io_channel *mgmt_io_ch; + struct spdk_bdev_mgmt_channel *mgmt_ch; + struct spdk_bdev_shared_resource *shared_resource; + struct lba_range *range; + + ch->bdev = bdev; + ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); + if (!ch->channel) { + return -1; + } + + assert(ch->histogram == NULL); + if (bdev->internal.histogram_enabled) { + ch->histogram = spdk_histogram_data_alloc(); + if (ch->histogram == NULL) { + SPDK_ERRLOG("Could not allocate histogram\n"); + } + } + + mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); + if (!mgmt_io_ch) { + spdk_put_io_channel(ch->channel); + return -1; + } + + mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); + TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { + if (shared_resource->shared_ch == ch->channel) { + spdk_put_io_channel(mgmt_io_ch); + shared_resource->ref++; + break; + } + } + + if (shared_resource == NULL) { + shared_resource = calloc(1, sizeof(*shared_resource)); + if (shared_resource == NULL) { + spdk_put_io_channel(ch->channel); + spdk_put_io_channel(mgmt_io_ch); + return -1; + } + + shared_resource->mgmt_ch = mgmt_ch; + shared_resource->io_outstanding = 0; + TAILQ_INIT(&shared_resource->nomem_io); + shared_resource->nomem_threshold = 0; + shared_resource->shared_ch = ch->channel; + shared_resource->ref = 1; + TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); + } + + memset(&ch->stat, 0, sizeof(ch->stat)); + ch->stat.ticks_rate = spdk_get_ticks_hz(); + ch->io_outstanding = 0; + TAILQ_INIT(&ch->queued_resets); + TAILQ_INIT(&ch->locked_ranges); + ch->flags = 0; + ch->shared_resource = shared_resource; + + TAILQ_INIT(&ch->io_submitted); 
+ TAILQ_INIT(&ch->io_locked); + +#ifdef SPDK_CONFIG_VTUNE + { + char *name; + __itt_init_ittlib(NULL, 0); + name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); + if (!name) { + bdev_channel_destroy_resource(ch); + return -1; + } + ch->handle = __itt_string_handle_create(name); + free(name); + ch->start_tsc = spdk_get_ticks(); + ch->interval_tsc = spdk_get_ticks_hz() / 100; + memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); + } +#endif + + pthread_mutex_lock(&bdev->internal.mutex); + bdev_enable_qos(bdev, ch); + + TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { + struct lba_range *new_range; + + new_range = calloc(1, sizeof(*new_range)); + if (new_range == NULL) { + pthread_mutex_unlock(&bdev->internal.mutex); + bdev_channel_destroy_resource(ch); + return -1; + } + new_range->length = range->length; + new_range->offset = range->offset; + new_range->locked_ctx = range->locked_ctx; + TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); + } + + pthread_mutex_unlock(&bdev->internal.mutex); + + return 0; +} + +/* + * Abort I/O that are waiting on a data buffer. These types of I/O are + * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. + */ +static void +bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) +{ + bdev_io_stailq_t tmp; + struct spdk_bdev_io *bdev_io; + + STAILQ_INIT(&tmp); + + while (!STAILQ_EMPTY(queue)) { + bdev_io = STAILQ_FIRST(queue); + STAILQ_REMOVE_HEAD(queue, internal.buf_link); + if (bdev_io->internal.ch == ch) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); + } else { + STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); + } + } + + STAILQ_SWAP(&tmp, queue, spdk_bdev_io); +} + +/* + * Abort I/O that are queued waiting for submission. These types of I/O are + * linked using the spdk_bdev_io link TAILQ_ENTRY. + */ +static void +bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) +{ + struct spdk_bdev_io *bdev_io, *tmp; + + TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { + if (bdev_io->internal.ch == ch) { + TAILQ_REMOVE(queue, bdev_io, internal.link); + /* + * spdk_bdev_io_complete() assumes that the completed I/O had + * been submitted to the bdev module. Since in this case it + * hadn't, bump io_outstanding to account for the decrement + * that spdk_bdev_io_complete() will do. 
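bdev_abort_all_queued_io() above filters a queue while walking it, which is why it uses the _SAFE iteration variant. The same idea written with only the plain TAILQ macros, saving the next pointer before a possible removal; the io struct and its channel_id field are stand-ins for spdk_bdev_io and internal.ch:

#include <stdio.h>
#include <sys/queue.h>

struct io {
	int channel_id;               /* stand-in for bdev_io->internal.ch */
	TAILQ_ENTRY(io) link;
};

TAILQ_HEAD(io_list, io);

/* Same shape as bdev_abort_all_queued_io(): walk the queue, drop entries
 * belonging to one channel, and keep iterating safely via a saved next. */
static void
abort_for_channel(struct io_list *queue, int channel_id)
{
	struct io *cur = TAILQ_FIRST(queue);

	while (cur != NULL) {
		struct io *next = TAILQ_NEXT(cur, link);

		if (cur->channel_id == channel_id) {
			TAILQ_REMOVE(queue, cur, link);
			printf("aborted I/O on channel %d\n", cur->channel_id);
		}
		cur = next;
	}
}

int
main(void)
{
	struct io_list queue;
	struct io ios[3] = { { .channel_id = 1 }, { .channel_id = 2 }, { .channel_id = 1 } };
	int i;

	TAILQ_INIT(&queue);
	for (i = 0; i < 3; i++) {
		TAILQ_INSERT_TAIL(&queue, &ios[i], link);
	}
	abort_for_channel(&queue, 1);   /* leaves only the channel-2 entry queued */
	return 0;
}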
+ */ + if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { + ch->io_outstanding++; + ch->shared_resource->io_outstanding++; + } + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); + } + } +} + +static bool +bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) +{ + struct spdk_bdev_io *bdev_io; + + TAILQ_FOREACH(bdev_io, queue, internal.link) { + if (bdev_io == bio_to_abort) { + TAILQ_REMOVE(queue, bio_to_abort, internal.link); + spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); + return true; + } + } + + return false; +} + +static bool +bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) +{ + struct spdk_bdev_io *bdev_io; + + STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { + if (bdev_io == bio_to_abort) { + STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); + spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); + return true; + } + } + + return false; +} + +static void +bdev_qos_channel_destroy(void *cb_arg) +{ + struct spdk_bdev_qos *qos = cb_arg; + + spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); + spdk_poller_unregister(&qos->poller); + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); + + free(qos); +} + +static int +bdev_qos_destroy(struct spdk_bdev *bdev) +{ + int i; + + /* + * Cleanly shutting down the QoS poller is tricky, because + * during the asynchronous operation the user could open + * a new descriptor and create a new channel, spawning + * a new QoS poller. + * + * The strategy is to create a new QoS structure here and swap it + * in. The shutdown path then continues to refer to the old one + * until it completes and then releases it. + */ + struct spdk_bdev_qos *new_qos, *old_qos; + + old_qos = bdev->internal.qos; + + new_qos = calloc(1, sizeof(*new_qos)); + if (!new_qos) { + SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); + return -ENOMEM; + } + + /* Copy the old QoS data into the newly allocated structure */ + memcpy(new_qos, old_qos, sizeof(*new_qos)); + + /* Zero out the key parts of the QoS structure */ + new_qos->ch = NULL; + new_qos->thread = NULL; + new_qos->poller = NULL; + TAILQ_INIT(&new_qos->queued); + /* + * The limit member of spdk_bdev_qos_limit structure is not zeroed. + * It will be used later for the new QoS structure. + */ + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + new_qos->rate_limits[i].remaining_this_timeslice = 0; + new_qos->rate_limits[i].min_per_timeslice = 0; + new_qos->rate_limits[i].max_per_timeslice = 0; + } + + bdev->internal.qos = new_qos; + + if (old_qos->thread == NULL) { + free(old_qos); + } else { + spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); + } + + /* It is safe to continue with destroying the bdev even though the QoS channel hasn't + * been destroyed yet. The destruction path will end up waiting for the final + * channel to be put before it releases resources. 
*/ + + return 0; +} + +static void +bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) +{ + total->bytes_read += add->bytes_read; + total->num_read_ops += add->num_read_ops; + total->bytes_written += add->bytes_written; + total->num_write_ops += add->num_write_ops; + total->bytes_unmapped += add->bytes_unmapped; + total->num_unmap_ops += add->num_unmap_ops; + total->read_latency_ticks += add->read_latency_ticks; + total->write_latency_ticks += add->write_latency_ticks; + total->unmap_latency_ticks += add->unmap_latency_ticks; +} + +static void +bdev_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_channel *ch = ctx_buf; + struct spdk_bdev_mgmt_channel *mgmt_ch; + struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, + spdk_get_thread()); + + /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ + pthread_mutex_lock(&ch->bdev->internal.mutex); + bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); + pthread_mutex_unlock(&ch->bdev->internal.mutex); + + mgmt_ch = shared_resource->mgmt_ch; + + bdev_abort_all_queued_io(&ch->queued_resets, ch); + bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); + bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); + bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); + + if (ch->histogram) { + spdk_histogram_data_free(ch->histogram); + } + + bdev_channel_destroy_resource(ch); +} + +int +spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) +{ + struct spdk_bdev_alias *tmp; + + if (alias == NULL) { + SPDK_ERRLOG("Empty alias passed\n"); + return -EINVAL; + } + + if (spdk_bdev_get_by_name(alias)) { + SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); + return -EEXIST; + } + + tmp = calloc(1, sizeof(*tmp)); + if (tmp == NULL) { + SPDK_ERRLOG("Unable to allocate alias\n"); + return -ENOMEM; + } + + tmp->alias = strdup(alias); + if (tmp->alias == NULL) { + free(tmp); + SPDK_ERRLOG("Unable to allocate alias\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); + + return 0; +} + +int +spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) +{ + struct spdk_bdev_alias *tmp; + + TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { + if (strcmp(alias, tmp->alias) == 0) { + TAILQ_REMOVE(&bdev->aliases, tmp, tailq); + free(tmp->alias); + free(tmp); + return 0; + } + } + + SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exists\n", alias); + + return -ENOENT; +} + +void +spdk_bdev_alias_del_all(struct spdk_bdev *bdev) +{ + struct spdk_bdev_alias *p, *tmp; + + TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { + TAILQ_REMOVE(&bdev->aliases, p, tailq); + free(p->alias); + free(p); + } +} + +struct spdk_io_channel * +spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) +{ + return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); +} + +const char * +spdk_bdev_get_name(const struct spdk_bdev *bdev) +{ + return bdev->name; +} + +const char * +spdk_bdev_get_product_name(const struct spdk_bdev *bdev) +{ + return bdev->product_name; +} + +const struct spdk_bdev_aliases_list * +spdk_bdev_get_aliases(const struct spdk_bdev *bdev) +{ + return &bdev->aliases; +} + +uint32_t +spdk_bdev_get_block_size(const struct spdk_bdev *bdev) +{ + return bdev->blocklen; +} + +uint32_t +spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) +{ + return bdev->write_unit_size; +} + +uint64_t 
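spdk_bdev_alias_add()/spdk_bdev_alias_del() above duplicate the string and keep it on the bdev's alias list, rejecting names that already resolve to a bdev. A small usage sketch; the alias string and helper name are illustrative assumptions:

    #include "spdk/bdev.h"

    /* Returns -EEXIST if another bdev or alias already uses the name and
     * -ENOENT when deleting an alias that was never registered. */
    static int
    add_then_remove_alias(struct spdk_bdev *bdev)
    {
            int rc;

            rc = spdk_bdev_alias_add(bdev, "my_secondary_name");
            if (rc != 0) {
                    return rc;
            }

            return spdk_bdev_alias_del(bdev, "my_secondary_name");
    }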
+spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) +{ + return bdev->blockcnt; +} + +const char * +spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) +{ + return qos_rpc_type[type]; +} + +void +spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) +{ + int i; + + memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.qos) { + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (bdev->internal.qos->rate_limits[i].limit != + SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + limits[i] = bdev->internal.qos->rate_limits[i].limit; + if (bdev_qos_is_iops_rate_limit(i) == false) { + /* Change from Byte to Megabyte which is user visible. */ + limits[i] = limits[i] / 1024 / 1024; + } + } + } + } + pthread_mutex_unlock(&bdev->internal.mutex); +} + +size_t +spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) +{ + return 1 << bdev->required_alignment; +} + +uint32_t +spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) +{ + return bdev->optimal_io_boundary; +} + +bool +spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) +{ + return bdev->write_cache; +} + +const struct spdk_uuid * +spdk_bdev_get_uuid(const struct spdk_bdev *bdev) +{ + return &bdev->uuid; +} + +uint16_t +spdk_bdev_get_acwu(const struct spdk_bdev *bdev) +{ + return bdev->acwu; +} + +uint32_t +spdk_bdev_get_md_size(const struct spdk_bdev *bdev) +{ + return bdev->md_len; +} + +bool +spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) +{ + return (bdev->md_len != 0) && bdev->md_interleave; +} + +bool +spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) +{ + return (bdev->md_len != 0) && !bdev->md_interleave; +} + +bool +spdk_bdev_is_zoned(const struct spdk_bdev *bdev) +{ + return bdev->zoned; +} + +uint32_t +spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) +{ + if (spdk_bdev_is_md_interleaved(bdev)) { + return bdev->blocklen - bdev->md_len; + } else { + return bdev->blocklen; + } +} + +static uint32_t +_bdev_get_block_size_with_md(const struct spdk_bdev *bdev) +{ + if (!spdk_bdev_is_md_interleaved(bdev)) { + return bdev->blocklen + bdev->md_len; + } else { + return bdev->blocklen; + } +} + +enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) +{ + if (bdev->md_len != 0) { + return bdev->dif_type; + } else { + return SPDK_DIF_DISABLE; + } +} + +bool +spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) +{ + if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { + return bdev->dif_is_head_of_md; + } else { + return false; + } +} + +bool +spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, + enum spdk_dif_check_type check_type) +{ + if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { + return false; + } + + switch (check_type) { + case SPDK_DIF_CHECK_TYPE_REFTAG: + return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; + case SPDK_DIF_CHECK_TYPE_APPTAG: + return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; + case SPDK_DIF_CHECK_TYPE_GUARD: + return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; + default: + return false; + } +} + +uint64_t +spdk_bdev_get_qd(const struct spdk_bdev *bdev) +{ + return bdev->internal.measured_queue_depth; +} + +uint64_t +spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) +{ + return bdev->internal.period; +} + +uint64_t +spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) +{ + return bdev->internal.weighted_io_time; +} + +uint64_t +spdk_bdev_get_io_time(const 
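The getters above expose the bdev geometry; for metadata-capable bdevs the data block size can differ from the raw block size, and separate-metadata bdevs expect the caller to provide a metadata buffer. A sketch that derives the usable capacity from these accessors (helper names are illustrative):

    #include "spdk/stdinc.h"
    #include "spdk/bdev.h"

    /* Usable data bytes, excluding interleaved metadata if present. */
    static uint64_t
    bdev_data_capacity_bytes(const struct spdk_bdev *bdev)
    {
            return spdk_bdev_get_num_blocks(bdev) *
                   (uint64_t)spdk_bdev_get_data_block_size(bdev);
    }

    /* True when the caller must supply a separate metadata buffer
     * alongside the data buffer for read/write_with_md calls. */
    static bool
    bdev_needs_md_buf(const struct spdk_bdev *bdev)
    {
            return spdk_bdev_is_md_separate(bdev);
    }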
struct spdk_bdev *bdev) +{ + return bdev->internal.io_time; +} + +static void +_calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); + + bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; + + if (bdev->internal.measured_queue_depth) { + bdev->internal.io_time += bdev->internal.period; + bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; + } +} + +static void +_calculate_measured_qd(struct spdk_io_channel_iter *i) +{ + struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); + + bdev->internal.temporary_queue_depth += ch->io_outstanding; + spdk_for_each_channel_continue(i, 0); +} + +static int +bdev_calculate_measured_queue_depth(void *ctx) +{ + struct spdk_bdev *bdev = ctx; + bdev->internal.temporary_queue_depth = 0; + spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, + _calculate_measured_qd_cpl); + return SPDK_POLLER_BUSY; +} + +void +spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) +{ + bdev->internal.period = period; + + if (bdev->internal.qd_poller != NULL) { + spdk_poller_unregister(&bdev->internal.qd_poller); + bdev->internal.measured_queue_depth = UINT64_MAX; + } + + if (period != 0) { + bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, + period); + } +} + +static void +_resize_notify(void *arg) +{ + struct spdk_bdev_desc *desc = arg; + + pthread_mutex_lock(&desc->mutex); + desc->refs--; + if (!desc->closed) { + pthread_mutex_unlock(&desc->mutex); + desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, + desc->bdev, + desc->callback.ctx); + return; + } else if (0 == desc->refs) { + /* This descriptor was closed after this resize_notify message was sent. + * spdk_bdev_close() could not free the descriptor since this message was + * in flight, so we free it now using bdev_desc_free(). + */ + pthread_mutex_unlock(&desc->mutex); + bdev_desc_free(desc); + return; + } + pthread_mutex_unlock(&desc->mutex); +} + +int +spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) +{ + struct spdk_bdev_desc *desc; + int ret; + + pthread_mutex_lock(&bdev->internal.mutex); + + /* bdev has open descriptors */ + if (!TAILQ_EMPTY(&bdev->internal.open_descs) && + bdev->blockcnt > size) { + ret = -EBUSY; + } else { + bdev->blockcnt = size; + TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { + pthread_mutex_lock(&desc->mutex); + if (desc->callback.open_with_ext && !desc->closed) { + desc->refs++; + spdk_thread_send_msg(desc->thread, _resize_notify, desc); + } + pthread_mutex_unlock(&desc->mutex); + } + ret = 0; + } + + pthread_mutex_unlock(&bdev->internal.mutex); + + return ret; +} + +/* + * Convert I/O offset and length from bytes to blocks. + * + * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. + */ +static uint64_t +bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, + uint64_t num_bytes, uint64_t *num_blocks) +{ + uint32_t block_size = bdev->blocklen; + uint8_t shift_cnt; + + /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
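spdk_bdev_set_qd_sampling_period() above hands the period straight to SPDK_POLLER_REGISTER, so it is expressed in microseconds; passing 0 stops the sampling poller. A sketch that samples queue depth once per second and reads the result back (values and helper names are illustrative):

    #include "spdk/bdev.h"

    static void
    enable_qd_sampling(struct spdk_bdev *bdev)
    {
            /* Period in microseconds; 0 would disable sampling again. */
            spdk_bdev_set_qd_sampling_period(bdev, 1000 * 1000);
    }

    static uint64_t
    read_last_queue_depth(const struct spdk_bdev *bdev)
    {
            /* May read as UINT64_MAX until a full sampling interval
             * has elapsed. */
            return spdk_bdev_get_qd(bdev);
    }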
*/ + if (spdk_likely(spdk_u32_is_pow2(block_size))) { + shift_cnt = spdk_u32log2(block_size); + *offset_blocks = offset_bytes >> shift_cnt; + *num_blocks = num_bytes >> shift_cnt; + return (offset_bytes - (*offset_blocks << shift_cnt)) | + (num_bytes - (*num_blocks << shift_cnt)); + } else { + *offset_blocks = offset_bytes / block_size; + *num_blocks = num_bytes / block_size; + return (offset_bytes % block_size) | (num_bytes % block_size); + } +} + +static bool +bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) +{ + /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there + * has been an overflow and hence the offset has been wrapped around */ + if (offset_blocks + num_blocks < offset_blocks) { + return false; + } + + /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ + if (offset_blocks + num_blocks > bdev->blockcnt) { + return false; + } + + return true; +} + +static bool +_bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) +{ + return _is_buf_allocated(iovs) == (md_buf != NULL); +} + +static int +bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, + void *md_buf, int64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_READ; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, int64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct iovec iov = { + .iov_base = buf, + }; + + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(&iov, md_buf)) { + return -EINVAL; + } + + return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + 
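spdk_bdev_read() converts the byte offset/length to blocks (returning -EINVAL for anything not block-aligned) and forwards to spdk_bdev_read_blocks(). A minimal read sketch; the buffer size, offset and callback names are illustrative, and the completion must free the bdev_io:

    #include "spdk/bdev.h"
    #include "spdk/env.h"

    static void
    read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            /* cb_arg is the buffer passed to spdk_bdev_read_blocks(). */
            spdk_dma_free(cb_arg);
            spdk_bdev_free_io(bdev_io);
    }

    static int
    read_first_block(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
    {
            struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
            void *buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev),
                                         spdk_bdev_get_buf_align(bdev), NULL);

            if (buf == NULL) {
                    return -ENOMEM;
            }

            /* -ENOMEM here means no spdk_bdev_io was available; see the
             * spdk_bdev_queue_io_wait() retry pattern further down. */
            return spdk_bdev_read_blocks(desc, ch, buf, 0, 1, read_done, buf);
    }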
uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); +} + +static int +bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, + uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_READ; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, + num_blocks, cb, cb_arg); +} + +int +spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(iov, md_buf)) { + return -EINVAL; + } + + return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, + num_blocks, cb, cb_arg); +} + +static int +bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + 
nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct iovec iov = { + .iov_base = buf, + }; + + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(&iov, md_buf)) { + return -EINVAL; + } + + return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, + cb, cb_arg); +} + +static int +bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset, uint64_t len, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + len, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, + num_blocks, cb, cb_arg); +} + +int +spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(iov, md_buf)) { + return -EINVAL; + } + + return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, + num_blocks, cb, cb_arg); +} + +static void +bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; + int i, rc = 0; + + if (!success) { + 
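The writev variants above take caller-owned iovecs and require a descriptor opened for writing (otherwise -EBADF). A vectored write sketch covering two one-block segments; buffer names and the offset are illustrative:

    #include "spdk/bdev.h"

    static int
    write_two_buffers(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                      void *buf_a, void *buf_b, uint32_t block_size,
                      spdk_bdev_io_completion_cb cb, void *cb_arg)
    {
            /* Two one-block segments written contiguously at block 0. */
            struct iovec iov[2] = {
                    { .iov_base = buf_a, .iov_len = block_size },
                    { .iov_base = buf_b, .iov_len = block_size },
            };

            return spdk_bdev_writev_blocks(desc, ch, iov, 2, 0, 2, cb, cb_arg);
    }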
parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); + spdk_bdev_free_io(bdev_io); + return; + } + + for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { + rc = memcmp(read_buf, + parent_io->u.bdev.iovs[i].iov_base, + parent_io->u.bdev.iovs[i].iov_len); + if (rc) { + break; + } + read_buf += parent_io->u.bdev.iovs[i].iov_len; + } + + spdk_bdev_free_io(bdev_io); + + if (rc == 0) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); + } else { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; + parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); + } +} + +static void +bdev_compare_do_read(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + int rc; + + rc = spdk_bdev_read_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, + bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + bdev_compare_do_read_done, bdev_io); + + if (rc == -ENOMEM) { + bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); + } else if (rc != 0) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } +} + +static int +bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { + bdev_io_submit(bdev_io); + return 0; + } + + bdev_compare_do_read(bdev_io); + + return 0; +} + +int +spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, + num_blocks, cb, cb_arg); +} + +int +spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(iov, md_buf)) { + return -EINVAL; + } + + return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, + num_blocks, cb, cb_arg); +} + +static int +bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io 
*bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { + bdev_io_submit(bdev_io); + return 0; + } + + bdev_compare_do_read(bdev_io); + + return 0; +} + +int +spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct iovec iov = { + .iov_base = buf, + }; + + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(&iov, md_buf)) { + return -EINVAL; + } + + return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, + cb, cb_arg); +} + +static void +bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) +{ + struct spdk_bdev_io *bdev_io = ctx; + + if (unlock_status) { + SPDK_ERRLOG("LBA range unlock failed\n"); + } + + bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : + false, bdev_io->internal.caller_ctx); +} + +static void +bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) +{ + bdev_io->internal.status = status; + + bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), + bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + bdev_comparev_and_writev_blocks_unlocked, bdev_io); +} + +static void +bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + if (!success) { + SPDK_ERRLOG("Compare and write operation failed\n"); + } + + spdk_bdev_free_io(bdev_io); + + bdev_comparev_and_writev_blocks_unlock(parent_io, + success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); +} + +static void +bdev_compare_and_write_do_write(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + int rc; + + rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, + bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + bdev_compare_and_write_do_write_done, bdev_io); + + + if (rc == -ENOMEM) { + bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); + } else if (rc != 0) { + bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); + return; + } + + bdev_compare_and_write_do_write(parent_io); +} + +static void +bdev_compare_and_write_do_compare(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + int rc; + + rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + bdev_compare_and_write_do_compare_done, bdev_io); + + if (rc == -ENOMEM) { + bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); + } else if (rc != 0) { + bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); + } +} + +static void +bdev_comparev_and_writev_blocks_locked(void *ctx, int status) +{ + struct spdk_bdev_io *bdev_io = ctx; + + if (status) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } + + bdev_compare_and_write_do_compare(bdev_io); +} + +int +spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *compare_iov, int compare_iovcnt, + struct iovec *write_iov, int write_iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + if (num_blocks > bdev->acwu) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; + bdev_io->u.bdev.iovs = compare_iov; + bdev_io->u.bdev.iovcnt = compare_iovcnt; + bdev_io->u.bdev.fused_iovs = write_iov; + bdev_io->u.bdev.fused_iovcnt = write_iovcnt; + bdev_io->u.bdev.md_buf = NULL; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { + bdev_io_submit(bdev_io); + return 0; + } + + return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, + bdev_comparev_and_writev_blocks_locked, bdev_io); +} + +static void +bdev_zcopy_get_buf(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + if (!success) { + /* Don't use 
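spdk_bdev_comparev_and_writev_blocks() either hands the fused operation to the module or, when COMPARE_AND_WRITE is not natively supported, emulates it by locking the LBA range, comparing, then writing. A caller-side sketch; num_blocks must not exceed the bdev's atomic compare-and-write unit (spdk_bdev_get_acwu()), and the helper name is illustrative:

    #include "spdk/bdev.h"

    static int
    cas_one_block(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                  struct iovec *expected, struct iovec *new_data,
                  uint64_t lba, spdk_bdev_io_completion_cb cb, void *cb_arg)
    {
            /* Completes with a miscompare status if the on-disk data does
             * not match 'expected'; the write is then not performed. */
            return spdk_bdev_comparev_and_writev_blocks(desc, ch,
                                                        expected, 1,
                                                        new_data, 1,
                                                        lba, 1, cb, cb_arg);
    }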
spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; + bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); + return; + } + + if (bdev_io->u.bdev.zcopy.populate) { + /* Read the real data into the buffer */ + bdev_io->type = SPDK_BDEV_IO_TYPE_READ; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + bdev_io_submit(bdev_io); + return; + } + + /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); +} + +int +spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + bool populate, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { + return -ENOTSUP; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.iovs = NULL; + bdev_io->u.bdev.iovcnt = 0; + bdev_io->u.bdev.md_buf = NULL; + bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; + bdev_io->u.bdev.zcopy.commit = 0; + bdev_io->u.bdev.zcopy.start = 1; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { + bdev_io_submit(bdev_io); + } else { + /* Emulate zcopy by allocating a buffer */ + spdk_bdev_io_get_buf(bdev_io, bdev_zcopy_get_buf, + bdev_io->u.bdev.num_blocks * bdev->blocklen); + } + + return 0; +} + +int +spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + /* This can happen if the zcopy was emulated in start */ + if (bdev_io->u.bdev.zcopy.start != 1) { + return -EINVAL; + } + bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; + } + + if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { + return -EINVAL; + } + + bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0; + bdev_io->u.bdev.zcopy.start = 0; + bdev_io->internal.caller_ctx = cb_arg; + bdev_io->internal.cb = cb; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { + bdev_io_submit(bdev_io); + return 0; + } + + if (!bdev_io->u.bdev.zcopy.commit) { + /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. 
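spdk_bdev_zcopy_start() above either asks the module for a zero-copy buffer or emulates one by allocating a bounce buffer; spdk_bdev_zcopy_end() later commits or abandons it. A sketch of the populate-then-commit sequence for block 0 (callback names are illustrative):

    #include "spdk/bdev.h"

    static void
    zcopy_end_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            spdk_bdev_free_io(bdev_io);
    }

    static void
    zcopy_start_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            if (!success) {
                    spdk_bdev_free_io(bdev_io);
                    return;
            }

            /* bdev_io->u.bdev.iovs now describes the (possibly emulated)
             * buffer; after modifying it, commit the data back. */
            spdk_bdev_zcopy_end(bdev_io, true, zcopy_end_done, NULL);
    }

    static int
    zcopy_rewrite_block(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
    {
            /* populate=true reads block 0 into the zero-copy buffer first. */
            return spdk_bdev_zcopy_start(desc, ch, 0, 1, true,
                                         zcopy_start_done, NULL);
    }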
*/ + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); + return 0; + } + + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; + bdev_io_submit(bdev_io); + + return 0; +} + +int +spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset, uint64_t len, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + len, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && + !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { + return -ENOTSUP; + } + + bdev_io = bdev_channel_get_io(channel); + + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + bdev_io_submit(bdev_io); + return 0; + } + + assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); + assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); + bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; + bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; + bdev_write_zero_buffer_next(bdev_io); + + return 0; +} + +int +spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + if (num_blocks == 0) { + SPDK_ERRLOG("Can't unmap 0 bytes\n"); + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; + + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = NULL; + bdev_io->u.bdev.iovs[0].iov_len = 0; + bdev_io->u.bdev.iovcnt = 1; + + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, 
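spdk_bdev_write_zeroes_blocks() falls back to plain writes of a zero buffer when WRITE_ZEROES is unsupported, and spdk_bdev_unmap_blocks() rejects zero-length requests. A short sketch for both; the 1024-block range and helper names are illustrative:

    #include "spdk/bdev.h"

    /* Deallocate the first 1024 blocks; the completion must free the
     * bdev_io with spdk_bdev_free_io(). */
    static int
    trim_head(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
              spdk_bdev_io_completion_cb cb, void *cb_arg)
    {
            return spdk_bdev_unmap_blocks(desc, ch, 0, 1024, cb, cb_arg);
    }

    /* Zero the same range explicitly; works even when the module only
     * implements WRITE, via the zero-buffer fallback above. */
    static int
    zero_head(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
              spdk_bdev_io_completion_cb cb, void *cb_arg)
    {
            return spdk_bdev_write_zeroes_blocks(desc, ch, 0, 1024, cb, cb_arg);
    }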
cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset, uint64_t length, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + length, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; + bdev_io->u.bdev.iovs = NULL; + bdev_io->u.bdev.iovcnt = 0; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +static void +bdev_reset_dev(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); + struct spdk_bdev_io *bdev_io; + + bdev_io = TAILQ_FIRST(&ch->queued_resets); + TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); + bdev_io_submit_reset(bdev_io); +} + +static void +bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch; + struct spdk_bdev_channel *channel; + struct spdk_bdev_mgmt_channel *mgmt_channel; + struct spdk_bdev_shared_resource *shared_resource; + bdev_io_tailq_t tmp_queued; + + TAILQ_INIT(&tmp_queued); + + ch = spdk_io_channel_iter_get_channel(i); + channel = spdk_io_channel_get_ctx(ch); + shared_resource = channel->shared_resource; + mgmt_channel = shared_resource->mgmt_ch; + + channel->flags |= BDEV_CH_RESET_IN_PROGRESS; + + if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { + /* The QoS object is always valid and readable while + * the channel flag is set, so the lock here should not + * be necessary. We're not in the fast path though, so + * just take it anyway. 
*/ + pthread_mutex_lock(&channel->bdev->internal.mutex); + if (channel->bdev->internal.qos->ch == channel) { + TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); + } + pthread_mutex_unlock(&channel->bdev->internal.mutex); + } + + bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); + bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); + bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); + bdev_abort_all_queued_io(&tmp_queued, channel); + + spdk_for_each_channel_continue(i, 0); +} + +static void +bdev_start_reset(void *ctx) +{ + struct spdk_bdev_channel *ch = ctx; + + spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, + ch, bdev_reset_dev); +} + +static void +bdev_channel_start_reset(struct spdk_bdev_channel *ch) +{ + struct spdk_bdev *bdev = ch->bdev; + + assert(!TAILQ_EMPTY(&ch->queued_resets)); + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.reset_in_progress == NULL) { + bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); + /* + * Take a channel reference for the target bdev for the life of this + * reset. This guards against the channel getting destroyed while + * spdk_for_each_channel() calls related to this reset IO are in + * progress. We will release the reference when this reset is + * completed. + */ + bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); + bdev_start_reset(ch); + } + pthread_mutex_unlock(&bdev->internal.mutex); +} + +int +spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->internal.submit_tsc = spdk_get_ticks(); + bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; + bdev_io->u.reset.ch_ref = NULL; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + pthread_mutex_lock(&bdev->internal.mutex); + TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); + pthread_mutex_unlock(&bdev->internal.mutex); + + TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, + internal.ch_link); + + bdev_channel_start_reset(channel); + + return 0; +} + +void +spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, + struct spdk_bdev_io_stat *stat) +{ + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + *stat = channel->stat; +} + +static void +bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) +{ + void *io_device = spdk_io_channel_iter_get_io_device(i); + struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); + + bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, + bdev_iostat_ctx->cb_arg, 0); + free(bdev_iostat_ctx); +} + +static void +bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) +{ + struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); + spdk_for_each_channel_continue(i, 0); +} + +void +spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, + 
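spdk_bdev_reset() queues the reset on the channel and, once it becomes the active reset, freezes every channel and aborts queued I/O before handing the reset to the module. A caller-side sketch (callback and helper names are illustrative):

    #include "spdk/bdev.h"
    #include "spdk/log.h"

    static void
    reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            if (!success) {
                    SPDK_ERRLOG("bdev reset failed\n");
            }
            spdk_bdev_free_io(bdev_io);
    }

    static int
    reset_bdev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
    {
            /* -ENOMEM means no spdk_bdev_io was available on this channel. */
            return spdk_bdev_reset(desc, ch, reset_done, NULL);
    }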
spdk_bdev_get_device_stat_cb cb, void *cb_arg) +{ + struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; + + assert(bdev != NULL); + assert(stat != NULL); + assert(cb != NULL); + + bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); + if (bdev_iostat_ctx == NULL) { + SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); + cb(bdev, stat, cb_arg, -ENOMEM); + return; + } + + bdev_iostat_ctx->stat = stat; + bdev_iostat_ctx->cb = cb; + bdev_iostat_ctx->cb_arg = cb_arg; + + /* Start with the statistics from previously deleted channels. */ + pthread_mutex_lock(&bdev->internal.mutex); + bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); + pthread_mutex_unlock(&bdev->internal.mutex); + + /* Then iterate and add the statistics from each existing channel. */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_get_each_channel_stat, + bdev_iostat_ctx, + bdev_get_device_stat_done); +} + +int +spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; + bdev_io->u.nvme_passthru.cmd = *cmd; + bdev_io->u.nvme_passthru.buf = buf; + bdev_io->u.nvme_passthru.nbytes = nbytes; + bdev_io->u.nvme_passthru.md_buf = NULL; + bdev_io->u.nvme_passthru.md_len = 0; + + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + /* + * Do not try to parse the NVMe command - we could maybe use bits in the opcode + * to easily determine if the command is a read or write, but for now just + * do not allow io_passthru with a read-only descriptor. 
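spdk_bdev_get_device_stat() sums per-channel statistics asynchronously, starting from the totals of already-destroyed channels, so the caller-provided stat buffer must stay valid until the callback runs. A sketch; the heap-allocated stat and the fields logged are illustrative:

    #include "spdk/stdinc.h"
    #include "spdk/bdev.h"
    #include "spdk/log.h"

    static void
    device_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
                     void *cb_arg, int rc)
    {
            if (rc == 0) {
                    SPDK_NOTICELOG("%s: %" PRIu64 " reads, %" PRIu64 " writes\n",
                                   spdk_bdev_get_name(bdev),
                                   stat->num_read_ops, stat->num_write_ops);
            }
            free(stat);
    }

    static void
    dump_device_stat(struct spdk_bdev *bdev)
    {
            struct spdk_bdev_io_stat *stat = calloc(1, sizeof(*stat));

            if (stat != NULL) {
                    spdk_bdev_get_device_stat(bdev, stat, device_stat_done, NULL);
            }
    }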
+ */ + return -EBADF; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; + bdev_io->u.nvme_passthru.cmd = *cmd; + bdev_io->u.nvme_passthru.buf = buf; + bdev_io->u.nvme_passthru.nbytes = nbytes; + bdev_io->u.nvme_passthru.md_buf = NULL; + bdev_io->u.nvme_passthru.md_len = 0; + + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + /* + * Do not try to parse the NVMe command - we could maybe use bits in the opcode + * to easily determine if the command is a read or write, but for now just + * do not allow io_passthru with a read-only descriptor. + */ + return -EBADF; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; + bdev_io->u.nvme_passthru.cmd = *cmd; + bdev_io->u.nvme_passthru.buf = buf; + bdev_io->u.nvme_passthru.nbytes = nbytes; + bdev_io->u.nvme_passthru.md_buf = md_buf; + bdev_io->u.nvme_passthru.md_len = md_len; + + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +static void bdev_abort_retry(void *ctx); +static void bdev_abort(struct spdk_bdev_io *parent_io); + +static void +bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_channel *channel = bdev_io->internal.ch; + struct spdk_bdev_io *parent_io = cb_arg; + struct spdk_bdev_io *bio_to_abort, *tmp_io; + + bio_to_abort = bdev_io->u.abort.bio_to_abort; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + /* Check if the target I/O completed in the meantime. */ + TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { + if (tmp_io == bio_to_abort) { + break; + } + } + + /* If the target I/O still exists, set the parent to failed. */ + if (tmp_io != NULL) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + } + } + + parent_io->u.bdev.split_outstanding--; + if (parent_io->u.bdev.split_outstanding == 0) { + if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { + bdev_abort_retry(parent_io); + } else { + bdev_io_complete(parent_io); + } + } +} + +static int +bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, + struct spdk_bdev_io *bio_to_abort, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + + if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || + bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { + /* TODO: Abort reset or abort request. 
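The NVMe passthru helpers forward a raw command to the module without inspecting the opcode, which is why a writable descriptor is required. A hedged admin-passthru sketch assuming spdk/nvme_spec.h for the IDENTIFY opcode and CNS constant; the 4 KiB buffer, the callback contract and the helper name are illustrative assumptions, not part of this patch:

    #include "spdk/bdev.h"
    #include "spdk/env.h"
    #include "spdk/nvme_spec.h"

    static int
    identify_controller(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                        spdk_bdev_io_completion_cb cb)
    {
            struct spdk_nvme_cmd cmd = {0};
            void *buf = spdk_dma_zmalloc(4096, 0, NULL);

            if (buf == NULL) {
                    return -ENOMEM;
            }

            cmd.opc = SPDK_NVME_OPC_IDENTIFY;
            cmd.cdw10 = SPDK_NVME_IDENTIFY_CTRLR;

            /* The completion callback receives the bdev_io; it should free
             * 'buf' (passed here as cb_arg) and the bdev_io. */
            return spdk_bdev_nvme_admin_passthru(desc, ch, &cmd, buf, 4096,
                                                 cb, buf);
    }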
*/ + return -ENOTSUP; + } + + bdev_io = bdev_channel_get_io(channel); + if (bdev_io == NULL) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { + bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; + + /* Parent abort request is not submitted directly, but to manage its + * execution add it to the submitted list here. + */ + bdev_io->internal.submit_tsc = spdk_get_ticks(); + TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); + + bdev_abort(bdev_io); + + return 0; + } + + bdev_io->u.abort.bio_to_abort = bio_to_abort; + + /* Submit the abort request to the underlying bdev module. */ + bdev_io_submit(bdev_io); + + return 0; +} + +static uint32_t +_bdev_abort(struct spdk_bdev_io *parent_io) +{ + struct spdk_bdev_desc *desc = parent_io->internal.desc; + struct spdk_bdev_channel *channel = parent_io->internal.ch; + void *bio_cb_arg; + struct spdk_bdev_io *bio_to_abort; + uint32_t matched_ios; + int rc; + + bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; + + /* matched_ios is returned and will be kept by the caller. + * + * This funcion will be used for two cases, 1) the same cb_arg is used for + * multiple I/Os, 2) a single large I/O is split into smaller ones. + * Incrementing split_outstanding directly here may confuse readers especially + * for the 1st case. + * + * Completion of I/O abort is processed after stack unwinding. Hence this trick + * works as expected. + */ + matched_ios = 0; + parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + + TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { + if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { + continue; + } + + if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { + /* Any I/O which was submitted after this abort command should be excluded. */ + continue; + } + + rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); + if (rc != 0) { + if (rc == -ENOMEM) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; + } else { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + } + break; + } + matched_ios++; + } + + return matched_ios; +} + +static void +bdev_abort_retry(void *ctx) +{ + struct spdk_bdev_io *parent_io = ctx; + uint32_t matched_ios; + + matched_ios = _bdev_abort(parent_io); + + if (matched_ios == 0) { + if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { + bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); + } else { + /* For retry, the case that no target I/O was found is success + * because it means target I/Os completed in the meantime. + */ + bdev_io_complete(parent_io); + } + return; + } + + /* Use split_outstanding to manage the progress of aborting I/Os. */ + parent_io->u.bdev.split_outstanding = matched_ios; +} + +static void +bdev_abort(struct spdk_bdev_io *parent_io) +{ + uint32_t matched_ios; + + matched_ios = _bdev_abort(parent_io); + + if (matched_ios == 0) { + if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { + bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); + } else { + /* The case the no target I/O was found is failure. */ + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io_complete(parent_io); + } + return; + } + + /* Use split_outstanding to manage the progress of aborting I/Os. 
*/ + parent_io->u.bdev.split_outstanding = matched_ios; +} + +int +spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *bio_cb_arg, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + struct spdk_bdev_io *bdev_io; + + if (bio_cb_arg == NULL) { + return -EINVAL; + } + + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { + return -ENOTSUP; + } + + bdev_io = bdev_channel_get_io(channel); + if (bdev_io == NULL) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->internal.submit_tsc = spdk_get_ticks(); + bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; + + /* Parent abort request is not submitted directly, but to manage its execution, + * add it to the submitted list here. + */ + TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); + + bdev_abort(bdev_io); + + return 0; +} + +int +spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, + struct spdk_bdev_io_wait_entry *entry) +{ + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; + + if (bdev != entry->bdev) { + SPDK_ERRLOG("bdevs do not match\n"); + return -EINVAL; + } + + if (mgmt_ch->per_thread_cache_count > 0) { + SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); + return -EINVAL; + } + + TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); + return 0; +} + +static void +bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) +{ + struct spdk_bdev *bdev = bdev_ch->bdev; + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + struct spdk_bdev_io *bdev_io; + + if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { + /* + * Allow some more I/O to complete before retrying the nomem_io queue. + * Some drivers (such as nvme) cannot immediately take a new I/O in + * the context of a completion, because the resources for the I/O are + * not released until control returns to the bdev poller. Also, we + * may require several small I/O to complete before a larger I/O + * (that requires splitting) can be submitted. + */ + return; + } + + while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { + bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); + TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); + bdev_io->internal.ch->io_outstanding++; + shared_resource->io_outstanding++; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + bdev_io->internal.error.nvme.cdw0 = 0; + bdev_io->num_retries++; + bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { + break; + } + } +} + +static inline void +bdev_io_complete(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + uint64_t tsc, tsc_diff; + + if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { + /* + * Send the completion to the thread that originally submitted the I/O, + * which may not be the current thread in the case of QoS. 
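spdk_bdev_queue_io_wait() is the standard way to recover from -ENOMEM on submission: the caller parks a wait entry and resubmits once a bdev_io is returned to the pool. A retry sketch; the context struct, buffer handling and helper names are illustrative assumptions:

    #include "spdk/bdev.h"

    struct retry_ctx {
            struct spdk_bdev_desc *desc;
            struct spdk_io_channel *ch;
            void *buf;
            struct spdk_bdev_io_wait_entry wait_entry;
    };

    static void submit_read(struct retry_ctx *ctx);

    /* Called once a bdev_io has been freed back to the pool. */
    static void
    retry_read(void *cb_arg)
    {
            submit_read(cb_arg);
    }

    static void
    read_cpl(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            spdk_bdev_free_io(bdev_io);
    }

    static void
    submit_read(struct retry_ctx *ctx)
    {
            int rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf,
                                           0, 1, read_cpl, ctx);

            if (rc == -ENOMEM) {
                    ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
                    ctx->wait_entry.cb_fn = retry_read;
                    ctx->wait_entry.cb_arg = ctx;
                    spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch,
                                            &ctx->wait_entry);
            }
    }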
+ */ + if (bdev_io->internal.io_submit_ch) { + bdev_io->internal.ch = bdev_io->internal.io_submit_ch; + bdev_io->internal.io_submit_ch = NULL; + } + + /* + * Defer completion to avoid potential infinite recursion if the + * user's completion callback issues a new I/O. + */ + spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), + bdev_io_complete, bdev_io); + return; + } + + tsc = spdk_get_ticks(); + tsc_diff = tsc - bdev_io->internal.submit_tsc; + spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0); + + TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); + + if (bdev_io->internal.ch->histogram) { + spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); + } + + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_read_ops++; + bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; + break; + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_write_ops++; + bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_unmap_ops++; + bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; + break; + case SPDK_BDEV_IO_TYPE_ZCOPY: + /* Track the data in the start phase only */ + if (bdev_io->u.bdev.zcopy.start) { + if (bdev_io->u.bdev.zcopy.populate) { + bdev_io->internal.ch->stat.bytes_read += + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_read_ops++; + bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; + } else { + bdev_io->internal.ch->stat.bytes_written += + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_write_ops++; + bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; + } + } + break; + default: + break; + } + } + +#ifdef SPDK_CONFIG_VTUNE + uint64_t now_tsc = spdk_get_ticks(); + if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { + uint64_t data[5]; + + data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; + data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; + data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; + data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; + data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
+ bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; + + __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, + __itt_metadata_u64, 5, data); + + bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; + bdev_io->internal.ch->start_tsc = now_tsc; + } +#endif + + assert(bdev_io->internal.cb != NULL); + assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); + + bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, + bdev_io->internal.caller_ctx); +} + +static void +bdev_reset_complete(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); + + if (bdev_io->u.reset.ch_ref != NULL) { + spdk_put_io_channel(bdev_io->u.reset.ch_ref); + bdev_io->u.reset.ch_ref = NULL; + } + + bdev_io_complete(bdev_io); +} + +static void +bdev_unfreeze_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct spdk_bdev_io *queued_reset; + + ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; + while (!TAILQ_EMPTY(&ch->queued_resets)) { + queued_reset = TAILQ_FIRST(&ch->queued_resets); + TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); + spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); + } + + spdk_for_each_channel_continue(i, 0); +} + +void +spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + + bdev_io->internal.status = status; + + if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { + bool unlock_channels = false; + + if (status == SPDK_BDEV_IO_STATUS_NOMEM) { + SPDK_ERRLOG("NOMEM returned for reset\n"); + } + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev_io == bdev->internal.reset_in_progress) { + bdev->internal.reset_in_progress = NULL; + unlock_channels = true; + } + pthread_mutex_unlock(&bdev->internal.mutex); + + if (unlock_channels) { + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, + bdev_io, bdev_reset_complete); + return; + } + } else { + _bdev_io_unset_bounce_buf(bdev_io); + + assert(bdev_ch->io_outstanding > 0); + assert(shared_resource->io_outstanding > 0); + bdev_ch->io_outstanding--; + shared_resource->io_outstanding--; + + if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { + TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); + /* + * Wait for some of the outstanding I/O to complete before we + * retry any of the nomem_io. Normally we will wait for + * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue + * depth channels we will instead wait for half to complete. 
+ */ + shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, + (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); + return; + } + + if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { + bdev_ch_retry_io(bdev_ch); + } + } + + bdev_io_complete(bdev_io); +} + +void +spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, + enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) +{ + if (sc == SPDK_SCSI_STATUS_GOOD) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; + bdev_io->internal.error.scsi.sc = sc; + bdev_io->internal.error.scsi.sk = sk; + bdev_io->internal.error.scsi.asc = asc; + bdev_io->internal.error.scsi.ascq = ascq; + } + + spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); +} + +void +spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, + int *sc, int *sk, int *asc, int *ascq) +{ + assert(sc != NULL); + assert(sk != NULL); + assert(asc != NULL); + assert(ascq != NULL); + + switch (bdev_io->internal.status) { + case SPDK_BDEV_IO_STATUS_SUCCESS: + *sc = SPDK_SCSI_STATUS_GOOD; + *sk = SPDK_SCSI_SENSE_NO_SENSE; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_BDEV_IO_STATUS_NVME_ERROR: + spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); + break; + case SPDK_BDEV_IO_STATUS_SCSI_ERROR: + *sc = bdev_io->internal.error.scsi.sc; + *sk = bdev_io->internal.error.scsi.sk; + *asc = bdev_io->internal.error.scsi.asc; + *ascq = bdev_io->internal.error.scsi.ascq; + break; + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } +} + +void +spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) +{ + if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; + } + + bdev_io->internal.error.nvme.cdw0 = cdw0; + bdev_io->internal.error.nvme.sct = sct; + bdev_io->internal.error.nvme.sc = sc; + + spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); +} + +void +spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) +{ + assert(sct != NULL); + assert(sc != NULL); + assert(cdw0 != NULL); + + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { + *sct = bdev_io->internal.error.nvme.sct; + *sc = bdev_io->internal.error.nvme.sc; + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + *sct = SPDK_NVME_SCT_GENERIC; + *sc = SPDK_NVME_SC_SUCCESS; + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { + *sct = SPDK_NVME_SCT_GENERIC; + *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + } else { + *sct = SPDK_NVME_SCT_GENERIC; + *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + + *cdw0 = bdev_io->internal.error.nvme.cdw0; +} + +void +spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, + int *first_sct, int *first_sc, int *second_sct, int *second_sc) +{ + assert(first_sct != NULL); + assert(first_sc != NULL); + assert(second_sct != NULL); + assert(second_sc != NULL); + assert(cdw0 != NULL); + + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { + if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && + 
bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { + *first_sct = bdev_io->internal.error.nvme.sct; + *first_sc = bdev_io->internal.error.nvme.sc; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; + } else { + *first_sct = SPDK_NVME_SCT_GENERIC; + *first_sc = SPDK_NVME_SC_SUCCESS; + *second_sct = bdev_io->internal.error.nvme.sct; + *second_sc = bdev_io->internal.error.nvme.sc; + } + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + *first_sct = SPDK_NVME_SCT_GENERIC; + *first_sc = SPDK_NVME_SC_SUCCESS; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_SUCCESS; + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { + *first_sct = SPDK_NVME_SCT_GENERIC; + *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { + *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; + *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; + } else { + *first_sct = SPDK_NVME_SCT_GENERIC; + *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + + *cdw0 = bdev_io->internal.error.nvme.cdw0; +} + +struct spdk_thread * +spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) +{ + return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); +} + +struct spdk_io_channel * +spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) +{ + return bdev_io->internal.ch->channel; +} + +static void +bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t *limits) +{ + uint64_t min_qos_set; + int i; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + break; + } + } + + if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { + SPDK_ERRLOG("Invalid rate limits set.\n"); + return; + } + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + continue; + } + + if (bdev_qos_is_iops_rate_limit(i) == true) { + min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; + } else { + min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; + } + + if (limits[i] == 0 || limits[i] % min_qos_set) { + SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not multiple of %" PRIu64 "\n", + limits[i], bdev->name, min_qos_set); + SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); + return; + } + } + + if (!bdev->internal.qos) { + bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); + if (!bdev->internal.qos) { + SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); + return; + } + } + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + bdev->internal.qos->rate_limits[i].limit = limits[i]; + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", + bdev->name, i, limits[i]); + } + + return; +} + +static void +bdev_qos_config(struct spdk_bdev *bdev) +{ + struct spdk_conf_section *sp = NULL; + const char *val = NULL; + int i = 0, j = 0; + uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {}; + bool config_qos = false; + + sp = spdk_conf_find_section(NULL, "QoS"); + if (!sp) { + return; + } + + while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { + limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; + + i = 0; + while (true) { + val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0); + 
if (!val) { + break; + } + + if (strcmp(bdev->name, val) != 0) { + i++; + continue; + } + + val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1); + if (val) { + if (bdev_qos_is_iops_rate_limit(j) == true) { + limits[j] = strtoull(val, NULL, 10); + } else { + limits[j] = strtoull(val, NULL, 10) * 1024 * 1024; + } + config_qos = true; + } + + break; + } + + j++; + } + + if (config_qos == true) { + bdev_qos_config_limit(bdev, limits); + } + + return; +} + +static int +bdev_init(struct spdk_bdev *bdev) +{ + char *bdev_name; + + assert(bdev->module != NULL); + + if (!bdev->name) { + SPDK_ERRLOG("Bdev name is NULL\n"); + return -EINVAL; + } + + if (!strlen(bdev->name)) { + SPDK_ERRLOG("Bdev name must not be an empty string\n"); + return -EINVAL; + } + + if (spdk_bdev_get_by_name(bdev->name)) { + SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); + return -EEXIST; + } + + /* Users often register their own I/O devices using the bdev name. In + * order to avoid conflicts, prepend bdev_. */ + bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); + if (!bdev_name) { + SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); + return -ENOMEM; + } + + bdev->internal.status = SPDK_BDEV_STATUS_READY; + bdev->internal.measured_queue_depth = UINT64_MAX; + bdev->internal.claim_module = NULL; + bdev->internal.qd_poller = NULL; + bdev->internal.qos = NULL; + + /* If the user didn't specify a uuid, generate one. */ + if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { + spdk_uuid_generate(&bdev->uuid); + } + + if (spdk_bdev_get_buf_align(bdev) > 1) { + if (bdev->split_on_optimal_io_boundary) { + bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, + SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); + } else { + bdev->split_on_optimal_io_boundary = true; + bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; + } + } + + /* If the user didn't specify a write unit size, set it to one. 
*/ + if (bdev->write_unit_size == 0) { + bdev->write_unit_size = 1; + } + + /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ + if (bdev->acwu == 0) { + bdev->acwu = 1; + } + + TAILQ_INIT(&bdev->internal.open_descs); + TAILQ_INIT(&bdev->internal.locked_ranges); + TAILQ_INIT(&bdev->internal.pending_locked_ranges); + + TAILQ_INIT(&bdev->aliases); + + bdev->internal.reset_in_progress = NULL; + + bdev_qos_config(bdev); + + spdk_io_device_register(__bdev_to_io_dev(bdev), + bdev_channel_create, bdev_channel_destroy, + sizeof(struct spdk_bdev_channel), + bdev_name); + + free(bdev_name); + + pthread_mutex_init(&bdev->internal.mutex, NULL); + return 0; +} + +static void +bdev_destroy_cb(void *io_device) +{ + int rc; + struct spdk_bdev *bdev; + spdk_bdev_unregister_cb cb_fn; + void *cb_arg; + + bdev = __bdev_from_io_dev(io_device); + cb_fn = bdev->internal.unregister_cb; + cb_arg = bdev->internal.unregister_ctx; + + rc = bdev->fn_table->destruct(bdev->ctxt); + if (rc < 0) { + SPDK_ERRLOG("destruct failed\n"); + } + if (rc <= 0 && cb_fn != NULL) { + cb_fn(cb_arg, rc); + } +} + + +static void +bdev_fini(struct spdk_bdev *bdev) +{ + pthread_mutex_destroy(&bdev->internal.mutex); + + free(bdev->internal.qos); + + spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); +} + +static void +bdev_start(struct spdk_bdev *bdev) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); + TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); + + /* Examine configuration before initializing I/O */ + bdev_examine(bdev); +} + +int +spdk_bdev_register(struct spdk_bdev *bdev) +{ + int rc = bdev_init(bdev); + + if (rc == 0) { + bdev_start(bdev); + } + + spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); + return rc; +} + +int +spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) +{ + SPDK_ERRLOG("This function is deprecated. Use spdk_bdev_register() instead.\n"); + return spdk_bdev_register(vbdev); +} + +void +spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) +{ + if (bdev->internal.unregister_cb != NULL) { + bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); + } +} + +static void +_remove_notify(void *arg) +{ + struct spdk_bdev_desc *desc = arg; + + pthread_mutex_lock(&desc->mutex); + desc->refs--; + + if (!desc->closed) { + pthread_mutex_unlock(&desc->mutex); + if (desc->callback.open_with_ext) { + desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); + } else { + desc->callback.remove_fn(desc->callback.ctx); + } + return; + } else if (0 == desc->refs) { + /* This descriptor was closed after this remove_notify message was sent. + * spdk_bdev_close() could not free the descriptor since this message was + * in flight, so we free it now using bdev_desc_free(). + */ + pthread_mutex_unlock(&desc->mutex); + bdev_desc_free(desc); + return; + } + pthread_mutex_unlock(&desc->mutex); +} + +/* Must be called while holding bdev->internal.mutex. + * returns: 0 - bdev removed and ready to be destructed. + * -EBUSY - bdev can't be destructed yet. */ +static int +bdev_unregister_unsafe(struct spdk_bdev *bdev) +{ + struct spdk_bdev_desc *desc, *tmp; + int rc = 0; + + /* Notify each descriptor about hotremoval */ + TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { + rc = -EBUSY; + pthread_mutex_lock(&desc->mutex); + /* + * Defer invocation of the event_cb to a separate message that will + * run later on its thread. 
This ensures this context unwinds and + * we don't recursively unregister this bdev again if the event_cb + * immediately closes its descriptor. + */ + desc->refs++; + spdk_thread_send_msg(desc->thread, _remove_notify, desc); + pthread_mutex_unlock(&desc->mutex); + } + + /* If there are no descriptors, proceed removing the bdev */ + if (rc == 0) { + TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list done\n", bdev->name); + spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); + } + + return rc; +} + +void +spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) +{ + struct spdk_thread *thread; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); + + thread = spdk_get_thread(); + if (!thread) { + /* The user called this from a non-SPDK thread. */ + if (cb_fn != NULL) { + cb_fn(cb_arg, -ENOTSUP); + } + return; + } + + pthread_mutex_lock(&g_bdev_mgr.mutex); + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { + pthread_mutex_unlock(&bdev->internal.mutex); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + if (cb_fn) { + cb_fn(cb_arg, -EBUSY); + } + return; + } + + bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; + bdev->internal.unregister_cb = cb_fn; + bdev->internal.unregister_ctx = cb_arg; + + /* Call under lock. */ + rc = bdev_unregister_unsafe(bdev); + pthread_mutex_unlock(&bdev->internal.mutex); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + + if (rc == 0) { + bdev_fini(bdev); + } +} + +static void +bdev_dummy_event_cb(void *remove_ctx) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev remove event received with no remove callback specified"); +} + +static int +bdev_start_qos(struct spdk_bdev *bdev) +{ + struct set_qos_limit_ctx *ctx; + + /* Enable QoS */ + if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); + return -ENOMEM; + } + ctx->bdev = bdev; + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_enable_qos_msg, ctx, + bdev_enable_qos_done); + } + + return 0; +} + +static int +bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) +{ + struct spdk_thread *thread; + int rc = 0; + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); + return -ENOTSUP; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, + spdk_get_thread()); + + desc->bdev = bdev; + desc->thread = thread; + desc->write = write; + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { + pthread_mutex_unlock(&bdev->internal.mutex); + return -ENODEV; + } + + if (write && bdev->internal.claim_module) { + SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", + bdev->name, bdev->internal.claim_module->name); + pthread_mutex_unlock(&bdev->internal.mutex); + return -EPERM; + } + + rc = bdev_start_qos(bdev); + if (rc != 0) { + SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); + pthread_mutex_unlock(&bdev->internal.mutex); + return rc; + } + + TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); + + pthread_mutex_unlock(&bdev->internal.mutex); + + return 0; +} + +int +spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, + void *remove_ctx, struct spdk_bdev_desc **_desc) +{ + struct 
spdk_bdev_desc *desc; + int rc; + + desc = calloc(1, sizeof(*desc)); + if (desc == NULL) { + SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); + return -ENOMEM; + } + + if (remove_cb == NULL) { + remove_cb = bdev_dummy_event_cb; + } + + TAILQ_INIT(&desc->pending_media_events); + TAILQ_INIT(&desc->free_media_events); + + desc->callback.open_with_ext = false; + desc->callback.remove_fn = remove_cb; + desc->callback.ctx = remove_ctx; + pthread_mutex_init(&desc->mutex, NULL); + + pthread_mutex_lock(&g_bdev_mgr.mutex); + + rc = bdev_open(bdev, write, desc); + if (rc != 0) { + bdev_desc_free(desc); + desc = NULL; + } + + *_desc = desc; + + pthread_mutex_unlock(&g_bdev_mgr.mutex); + + return rc; +} + +int +spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, + void *event_ctx, struct spdk_bdev_desc **_desc) +{ + struct spdk_bdev_desc *desc; + struct spdk_bdev *bdev; + unsigned int event_id; + int rc; + + if (event_cb == NULL) { + SPDK_ERRLOG("Missing event callback function\n"); + return -EINVAL; + } + + pthread_mutex_lock(&g_bdev_mgr.mutex); + + bdev = spdk_bdev_get_by_name(bdev_name); + + if (bdev == NULL) { + SPDK_ERRLOG("Failed to find bdev with name: %s\n", bdev_name); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + return -EINVAL; + } + + desc = calloc(1, sizeof(*desc)); + if (desc == NULL) { + SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + return -ENOMEM; + } + + TAILQ_INIT(&desc->pending_media_events); + TAILQ_INIT(&desc->free_media_events); + + desc->callback.open_with_ext = true; + desc->callback.event_fn = event_cb; + desc->callback.ctx = event_ctx; + pthread_mutex_init(&desc->mutex, NULL); + + if (bdev->media_events) { + desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, + sizeof(*desc->media_events_buffer)); + if (desc->media_events_buffer == NULL) { + SPDK_ERRLOG("Failed to initialize media event pool\n"); + bdev_desc_free(desc); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + return -ENOMEM; + } + + for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { + TAILQ_INSERT_TAIL(&desc->free_media_events, + &desc->media_events_buffer[event_id], tailq); + } + } + + rc = bdev_open(bdev, write, desc); + if (rc != 0) { + bdev_desc_free(desc); + desc = NULL; + } + + *_desc = desc; + + pthread_mutex_unlock(&g_bdev_mgr.mutex); + + return rc; +} + +void +spdk_bdev_close(struct spdk_bdev_desc *desc) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, + spdk_get_thread()); + + assert(desc->thread == spdk_get_thread()); + + spdk_poller_unregister(&desc->io_timeout_poller); + + pthread_mutex_lock(&bdev->internal.mutex); + pthread_mutex_lock(&desc->mutex); + + TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); + + desc->closed = true; + + if (0 == desc->refs) { + pthread_mutex_unlock(&desc->mutex); + bdev_desc_free(desc); + } else { + pthread_mutex_unlock(&desc->mutex); + } + + /* If no more descriptors, kill QoS channel */ + if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", + bdev->name, spdk_get_thread()); + + if (bdev_qos_destroy(bdev)) { + /* There isn't anything we can do to recover here. Just let the + * old QoS poller keep running. The QoS handling won't change + * cores when the user allocates a new channel, but it won't break. 
*/ + SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); + } + } + + spdk_bdev_set_qd_sampling_period(bdev, 0); + + if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { + rc = bdev_unregister_unsafe(bdev); + pthread_mutex_unlock(&bdev->internal.mutex); + + if (rc == 0) { + bdev_fini(bdev); + } + } else { + pthread_mutex_unlock(&bdev->internal.mutex); + } +} + +int +spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_bdev_module *module) +{ + if (bdev->internal.claim_module != NULL) { + SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, + bdev->internal.claim_module->name); + return -EPERM; + } + + if (desc && !desc->write) { + desc->write = true; + } + + bdev->internal.claim_module = module; + return 0; +} + +void +spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) +{ + assert(bdev->internal.claim_module != NULL); + bdev->internal.claim_module = NULL; +} + +struct spdk_bdev * +spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) +{ + assert(desc != NULL); + return desc->bdev; +} + +void +spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) +{ + struct iovec *iovs; + int iovcnt; + + if (bdev_io == NULL) { + return; + } + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_ZCOPY: + iovs = bdev_io->u.bdev.iovs; + iovcnt = bdev_io->u.bdev.iovcnt; + break; + default: + iovs = NULL; + iovcnt = 0; + break; + } + + if (iovp) { + *iovp = iovs; + } + if (iovcntp) { + *iovcntp = iovcnt; + } +} + +void * +spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) +{ + if (bdev_io == NULL) { + return NULL; + } + + if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { + return NULL; + } + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || + bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + return bdev_io->u.bdev.md_buf; + } + + return NULL; +} + +void * +spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) +{ + if (bdev_io == NULL) { + assert(false); + return NULL; + } + + return bdev_io->internal.caller_ctx; +} + +void +spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) +{ + + if (spdk_bdev_module_list_find(bdev_module->name)) { + SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); + assert(false); + } + + /* + * Modules with examine callbacks must be initialized first, so they are + * ready to handle examine callbacks from later modules that will + * register physical bdevs. 
+ */ + if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { + TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); + } else { + TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); + } +} + +struct spdk_bdev_module * +spdk_bdev_module_list_find(const char *name) +{ + struct spdk_bdev_module *bdev_module; + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (strcmp(name, bdev_module->name) == 0) { + break; + } + } + + return bdev_module; +} + +static void +bdev_write_zero_buffer_next(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + uint64_t num_bytes, num_blocks; + void *md_buf = NULL; + int rc; + + num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * + bdev_io->u.bdev.split_remaining_num_blocks, + ZERO_BUFFER_SIZE); + num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); + + if (spdk_bdev_is_md_separate(bdev_io->bdev)) { + md_buf = (char *)g_bdev_mgr.zero_buffer + + spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; + } + + rc = bdev_write_blocks_with_md(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + g_bdev_mgr.zero_buffer, md_buf, + bdev_io->u.bdev.split_current_offset_blocks, num_blocks, + bdev_write_zero_buffer_done, bdev_io); + if (rc == 0) { + bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; + bdev_io->u.bdev.split_current_offset_blocks += num_blocks; + } else if (rc == -ENOMEM) { + bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } +} + +static void +bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); + return; + } + + if (parent_io->u.bdev.split_remaining_num_blocks == 0) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); + return; + } + + bdev_write_zero_buffer_next(parent_io); +} + +static void +bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) +{ + pthread_mutex_lock(&ctx->bdev->internal.mutex); + ctx->bdev->internal.qos_mod_in_progress = false; + pthread_mutex_unlock(&ctx->bdev->internal.mutex); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, status); + } + free(ctx); +} + +static void +bdev_disable_qos_done(void *cb_arg) +{ + struct set_qos_limit_ctx *ctx = cb_arg; + struct spdk_bdev *bdev = ctx->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_qos *qos; + + pthread_mutex_lock(&bdev->internal.mutex); + qos = bdev->internal.qos; + bdev->internal.qos = NULL; + pthread_mutex_unlock(&bdev->internal.mutex); + + while (!TAILQ_EMPTY(&qos->queued)) { + /* Send queued I/O back to their original thread for resubmission. */ + bdev_io = TAILQ_FIRST(&qos->queued); + TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); + + if (bdev_io->internal.io_submit_ch) { + /* + * Channel was changed when sending it to the QoS thread - change it back + * before sending it back to the original thread. 
+ */ + bdev_io->internal.ch = bdev_io->internal.io_submit_ch; + bdev_io->internal.io_submit_ch = NULL; + } + + spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), + _bdev_io_submit, bdev_io); + } + + if (qos->thread != NULL) { + spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); + spdk_poller_unregister(&qos->poller); + } + + free(qos); + + bdev_set_qos_limit_done(ctx, 0); +} + +static void +bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) +{ + void *io_device = spdk_io_channel_iter_get_io_device(i); + struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); + struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_thread *thread; + + pthread_mutex_lock(&bdev->internal.mutex); + thread = bdev->internal.qos->thread; + pthread_mutex_unlock(&bdev->internal.mutex); + + if (thread != NULL) { + spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); + } else { + bdev_disable_qos_done(ctx); + } +} + +static void +bdev_disable_qos_msg(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); + + bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; + + spdk_for_each_channel_continue(i, 0); +} + +static void +bdev_update_qos_rate_limit_msg(void *cb_arg) +{ + struct set_qos_limit_ctx *ctx = cb_arg; + struct spdk_bdev *bdev = ctx->bdev; + + pthread_mutex_lock(&bdev->internal.mutex); + bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); + pthread_mutex_unlock(&bdev->internal.mutex); + + bdev_set_qos_limit_done(ctx, 0); +} + +static void +bdev_enable_qos_msg(struct spdk_io_channel_iter *i) +{ + void *io_device = spdk_io_channel_iter_get_io_device(i); + struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); + + pthread_mutex_lock(&bdev->internal.mutex); + bdev_enable_qos(bdev, bdev_ch); + pthread_mutex_unlock(&bdev->internal.mutex); + spdk_for_each_channel_continue(i, 0); +} + +static void +bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) +{ + struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + bdev_set_qos_limit_done(ctx, status); +} + +static void +bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) +{ + int i; + + assert(bdev->internal.qos != NULL); + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + bdev->internal.qos->rate_limits[i].limit = limits[i]; + + if (limits[i] == 0) { + bdev->internal.qos->rate_limits[i].limit = + SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; + } + } + } +} + +void +spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, + void (*cb_fn)(void *cb_arg, int status), void *cb_arg) +{ + struct set_qos_limit_ctx *ctx; + uint32_t limit_set_complement; + uint64_t min_limit_per_sec; + int i; + bool disable_rate_limit = true; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + continue; + } + + if (limits[i] > 0) { + disable_rate_limit = false; + } + + if (bdev_qos_is_iops_rate_limit(i) == true) { + min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; + } else { + /* Change from megabyte to byte rate limit */ + limits[i] = limits[i] * 1024 * 1024; + min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; + } + + limit_set_complement = limits[i] % min_limit_per_sec; + if (limit_set_complement) { + 
SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", + limits[i], min_limit_per_sec); + limits[i] += min_limit_per_sec - limit_set_complement; + SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); + } + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->bdev = bdev; + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.qos_mod_in_progress) { + pthread_mutex_unlock(&bdev->internal.mutex); + free(ctx); + cb_fn(cb_arg, -EAGAIN); + return; + } + bdev->internal.qos_mod_in_progress = true; + + if (disable_rate_limit == true && bdev->internal.qos) { + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && + (bdev->internal.qos->rate_limits[i].limit > 0 && + bdev->internal.qos->rate_limits[i].limit != + SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { + disable_rate_limit = false; + break; + } + } + } + + if (disable_rate_limit == false) { + if (bdev->internal.qos == NULL) { + bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); + if (!bdev->internal.qos) { + pthread_mutex_unlock(&bdev->internal.mutex); + SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); + bdev_set_qos_limit_done(ctx, -ENOMEM); + return; + } + } + + if (bdev->internal.qos->thread == NULL) { + /* Enabling */ + bdev_set_qos_rate_limits(bdev, limits); + + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_enable_qos_msg, ctx, + bdev_enable_qos_done); + } else { + /* Updating */ + bdev_set_qos_rate_limits(bdev, limits); + + spdk_thread_send_msg(bdev->internal.qos->thread, + bdev_update_qos_rate_limit_msg, ctx); + } + } else { + if (bdev->internal.qos != NULL) { + bdev_set_qos_rate_limits(bdev, limits); + + /* Disabling */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_disable_qos_msg, ctx, + bdev_disable_qos_msg_done); + } else { + pthread_mutex_unlock(&bdev->internal.mutex); + bdev_set_qos_limit_done(ctx, 0); + return; + } + } + + pthread_mutex_unlock(&bdev->internal.mutex); +} + +struct spdk_bdev_histogram_ctx { + spdk_bdev_histogram_status_cb cb_fn; + void *cb_arg; + struct spdk_bdev *bdev; + int status; +}; + +static void +bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + pthread_mutex_lock(&ctx->bdev->internal.mutex); + ctx->bdev->internal.histogram_in_progress = false; + pthread_mutex_unlock(&ctx->bdev->internal.mutex); + ctx->cb_fn(ctx->cb_arg, ctx->status); + free(ctx); +} + +static void +bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + + if (ch->histogram != NULL) { + spdk_histogram_data_free(ch->histogram); + ch->histogram = NULL; + } + spdk_for_each_channel_continue(i, 0); +} + +static void +bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (status != 0) { + ctx->status = status; + ctx->bdev->internal.histogram_enabled = false; + spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, + bdev_histogram_disable_channel_cb); + } else { + pthread_mutex_lock(&ctx->bdev->internal.mutex); + ctx->bdev->internal.histogram_in_progress = false; + pthread_mutex_unlock(&ctx->bdev->internal.mutex); + 
ctx->cb_fn(ctx->cb_arg, ctx->status); + free(ctx); + } +} + +static void +bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + int status = 0; + + if (ch->histogram == NULL) { + ch->histogram = spdk_histogram_data_alloc(); + if (ch->histogram == NULL) { + status = -ENOMEM; + } + } + + spdk_for_each_channel_continue(i, status); +} + +void +spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, + void *cb_arg, bool enable) +{ + struct spdk_bdev_histogram_ctx *ctx; + + ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bdev = bdev; + ctx->status = 0; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.histogram_in_progress) { + pthread_mutex_unlock(&bdev->internal.mutex); + free(ctx); + cb_fn(cb_arg, -EAGAIN); + return; + } + + bdev->internal.histogram_in_progress = true; + pthread_mutex_unlock(&bdev->internal.mutex); + + bdev->internal.histogram_enabled = enable; + + if (enable) { + /* Allocate histogram for each channel */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, + bdev_histogram_enable_channel_cb); + } else { + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, + bdev_histogram_disable_channel_cb); + } +} + +struct spdk_bdev_histogram_data_ctx { + spdk_bdev_histogram_data_cb cb_fn; + void *cb_arg; + struct spdk_bdev *bdev; + /** merged histogram data from all channels */ + struct spdk_histogram_data *histogram; +}; + +static void +bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); + free(ctx); +} + +static void +bdev_histogram_get_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + int status = 0; + + if (ch->histogram == NULL) { + status = -EFAULT; + } else { + spdk_histogram_data_merge(ctx->histogram, ch->histogram); + } + + spdk_for_each_channel_continue(i, status); +} + +void +spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, + spdk_bdev_histogram_data_cb cb_fn, + void *cb_arg) +{ + struct spdk_bdev_histogram_data_ctx *ctx; + + ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM, NULL); + return; + } + + ctx->bdev = bdev; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + ctx->histogram = histogram; + + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, + bdev_histogram_get_channel_cb); +} + +size_t +spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, + size_t max_events) +{ + struct media_event_entry *entry; + size_t num_events = 0; + + for (; num_events < max_events; ++num_events) { + entry = TAILQ_FIRST(&desc->pending_media_events); + if (entry == NULL) { + break; + } + + events[num_events] = entry->event; + TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); + TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); + } + + return num_events; +} + +int 
+spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, + size_t num_events) +{ + struct spdk_bdev_desc *desc; + struct media_event_entry *entry; + size_t event_id; + int rc = 0; + + assert(bdev->media_events); + + pthread_mutex_lock(&bdev->internal.mutex); + TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { + if (desc->write) { + break; + } + } + + if (desc == NULL || desc->media_events_buffer == NULL) { + rc = -ENODEV; + goto out; + } + + for (event_id = 0; event_id < num_events; ++event_id) { + entry = TAILQ_FIRST(&desc->free_media_events); + if (entry == NULL) { + break; + } + + TAILQ_REMOVE(&desc->free_media_events, entry, tailq); + TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); + entry->event = events[event_id]; + } + + rc = event_id; +out: + pthread_mutex_unlock(&bdev->internal.mutex); + return rc; +} + +void +spdk_bdev_notify_media_management(struct spdk_bdev *bdev) +{ + struct spdk_bdev_desc *desc; + + pthread_mutex_lock(&bdev->internal.mutex); + TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { + if (!TAILQ_EMPTY(&desc->pending_media_events)) { + desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, + desc->callback.ctx); + } + } + pthread_mutex_unlock(&bdev->internal.mutex); +} + +struct locked_lba_range_ctx { + struct lba_range range; + struct spdk_bdev *bdev; + struct lba_range *current_range; + struct lba_range *owner_range; + struct spdk_poller *poller; + lock_range_cb cb_fn; + void *cb_arg; +}; + +static void +bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) +{ + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cb_fn(ctx->cb_arg, -ENOMEM); + free(ctx); +} + +static void +bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); + +static void +bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) +{ + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_bdev *bdev = ctx->bdev; + + if (status == -ENOMEM) { + /* One of the channels could not allocate a range object. + * So we have to go back and clean up any ranges that were + * allocated successfully before we return error status to + * the caller. We can reuse the unlock function to do that + * clean up. + */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_unlock_lba_range_get_channel, ctx, + bdev_lock_error_cleanup_cb); + return; + } + + /* All channels have locked this range and no I/O overlapping the range + * are outstanding! Set the owner_ch for the range object for the + * locking channel, so that this channel will know that it is allowed + * to write to this range. + */ + ctx->owner_range->owner_ch = ctx->range.owner_ch; + ctx->cb_fn(ctx->cb_arg, status); + + /* Don't free the ctx here. Its range is in the bdev's global list of + * locked ranges still, and will be removed and freed when this range + * is later unlocked. + */ +} + +static int +bdev_lock_lba_range_check_io(void *_i) +{ + struct spdk_io_channel_iter *i = _i; + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct lba_range *range = ctx->current_range; + struct spdk_bdev_io *bdev_io; + + spdk_poller_unregister(&ctx->poller); + + /* The range is now in the locked_ranges, so no new IO can be submitted to this + * range. But we need to wait until any outstanding IO overlapping with this range + * are completed. 
+ */ + TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { + if (bdev_io_range_is_locked(bdev_io, range)) { + ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); + return SPDK_POLLER_BUSY; + } + } + + spdk_for_each_channel_continue(i, 0); + return SPDK_POLLER_BUSY; +} + +static void +bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct lba_range *range; + + TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { + if (range->length == ctx->range.length && + range->offset == ctx->range.offset && + range->locked_ctx == ctx->range.locked_ctx) { + /* This range already exists on this channel, so don't add + * it again. This can happen when a new channel is created + * while the for_each_channel operation is in progress. + * Do not check for outstanding I/O in that case, since the + * range was locked before any I/O could be submitted to the + * new channel. + */ + spdk_for_each_channel_continue(i, 0); + return; + } + } + + range = calloc(1, sizeof(*range)); + if (range == NULL) { + spdk_for_each_channel_continue(i, -ENOMEM); + return; + } + + range->length = ctx->range.length; + range->offset = ctx->range.offset; + range->locked_ctx = ctx->range.locked_ctx; + ctx->current_range = range; + if (ctx->range.owner_ch == ch) { + /* This is the range object for the channel that will hold + * the lock. Store it in the ctx object so that we can easily + * set its owner_ch after the lock is finally acquired. + */ + ctx->owner_range = range; + } + TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); + bdev_lock_lba_range_check_io(i); +} + +static void +bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) +{ + assert(spdk_get_thread() == ctx->range.owner_ch->channel->thread); + + /* We will add a copy of this range to each channel now. */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, + bdev_lock_lba_range_cb); +} + +static bool +bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) +{ + struct lba_range *r; + + TAILQ_FOREACH(r, tailq, tailq) { + if (bdev_lba_range_overlapped(range, r)) { + return true; + } + } + return false; +} + +static int +bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, + uint64_t offset, uint64_t length, + lock_range_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx; + + if (cb_arg == NULL) { + SPDK_ERRLOG("cb_arg must not be NULL\n"); + return -EINVAL; + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + return -ENOMEM; + } + + ctx->range.offset = offset; + ctx->range.length = length; + ctx->range.owner_ch = ch; + ctx->range.locked_ctx = cb_arg; + ctx->bdev = bdev; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { + /* There is an active lock overlapping with this range. + * Put it on the pending list until this range no + * longer overlaps with another. 
+ */ + TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); + } else { + TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); + bdev_lock_lba_range_ctx(bdev, ctx); + } + pthread_mutex_unlock(&bdev->internal.mutex); + return 0; +} + +static void +bdev_lock_lba_range_ctx_msg(void *_ctx) +{ + struct locked_lba_range_ctx *ctx = _ctx; + + bdev_lock_lba_range_ctx(ctx->bdev, ctx); +} + +static void +bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) +{ + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct locked_lba_range_ctx *pending_ctx; + struct spdk_bdev_channel *ch = ctx->range.owner_ch; + struct spdk_bdev *bdev = ch->bdev; + struct lba_range *range, *tmp; + + pthread_mutex_lock(&bdev->internal.mutex); + /* Check if there are any pending locked ranges that overlap with this range + * that was just unlocked. If there are, check that it doesn't overlap with any + * other locked ranges before calling bdev_lock_lba_range_ctx which will start + * the lock process. + */ + TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { + if (bdev_lba_range_overlapped(range, &ctx->range) && + !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { + TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); + pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); + TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); + spdk_thread_send_msg(pending_ctx->range.owner_ch->channel->thread, + bdev_lock_lba_range_ctx_msg, pending_ctx); + } + } + pthread_mutex_unlock(&bdev->internal.mutex); + + ctx->cb_fn(ctx->cb_arg, status); + free(ctx); +} + +static void +bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + TAILQ_HEAD(, spdk_bdev_io) io_locked; + struct spdk_bdev_io *bdev_io; + struct lba_range *range; + + TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { + if (ctx->range.offset == range->offset && + ctx->range.length == range->length && + ctx->range.locked_ctx == range->locked_ctx) { + TAILQ_REMOVE(&ch->locked_ranges, range, tailq); + free(range); + break; + } + } + + /* Note: we should almost always be able to assert that the range specified + * was found. But there are some very rare corner cases where a new channel + * gets created simultaneously with a range unlock, where this function + * would execute on that new channel and wouldn't have the range. + * We also use this to clean up range allocations when a later allocation + * fails in the locking path. + * So we can't actually assert() here. + */ + + /* Swap the locked IO into a temporary list, and then try to submit them again. + * We could hyper-optimize this to only resubmit locked I/O that overlap + * with the range that was just unlocked, but this isn't a performance path so + * we go for simplicity here. 
+ */ + TAILQ_INIT(&io_locked); + TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); + while (!TAILQ_EMPTY(&io_locked)) { + bdev_io = TAILQ_FIRST(&io_locked); + TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); + bdev_io_submit(bdev_io); + } + + spdk_for_each_channel_continue(i, 0); +} + +static int +bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, + uint64_t offset, uint64_t length, + lock_range_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx; + struct lba_range *range; + bool range_found = false; + + /* Let's make sure the specified channel actually has a lock on + * the specified range. Note that the range must match exactly. + */ + TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { + if (range->offset == offset && range->length == length && + range->owner_ch == ch && range->locked_ctx == cb_arg) { + range_found = true; + break; + } + } + + if (!range_found) { + return -EINVAL; + } + + pthread_mutex_lock(&bdev->internal.mutex); + /* We confirmed that this channel has locked the specified range. To + * start the unlock the process, we find the range in the bdev's locked_ranges + * and remove it. This ensures new channels don't inherit the locked range. + * Then we will send a message to each channel (including the one specified + * here) to remove the range from its per-channel list. + */ + TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { + if (range->offset == offset && range->length == length && + range->locked_ctx == cb_arg) { + break; + } + } + if (range == NULL) { + assert(false); + pthread_mutex_unlock(&bdev->internal.mutex); + return -EINVAL; + } + TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); + ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); + pthread_mutex_unlock(&bdev->internal.mutex); + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx, + bdev_unlock_lba_range_cb); + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) + +SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) +{ + spdk_trace_register_owner(OWNER_BDEV, 'b'); + spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); + spdk_trace_register_description("BDEV_IO_START", TRACE_BDEV_IO_START, OWNER_BDEV, + OBJECT_BDEV_IO, 1, 0, "type: "); + spdk_trace_register_description("BDEV_IO_DONE", TRACE_BDEV_IO_DONE, OWNER_BDEV, + OBJECT_BDEV_IO, 0, 0, ""); +} diff --git a/src/spdk/lib/bdev/bdev_internal.h b/src/spdk/lib/bdev/bdev_internal.h new file mode 100644 index 000000000..d1fa6e65a --- /dev/null +++ b/src/spdk/lib/bdev/bdev_internal.h @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_INTERNAL_H +#define SPDK_BDEV_INTERNAL_H + +#include "spdk/bdev.h" + +struct spdk_bdev; +struct spdk_bdev_io; +struct spdk_bdev_channel; + +struct spdk_bdev_io *bdev_channel_get_io(struct spdk_bdev_channel *channel); + +void bdev_io_init(struct spdk_bdev_io *bdev_io, struct spdk_bdev *bdev, void *cb_arg, + spdk_bdev_io_completion_cb cb); + +void bdev_io_submit(struct spdk_bdev_io *bdev_io); + +#endif /* SPDK_BDEV_INTERNAL_H */ diff --git a/src/spdk/lib/bdev/bdev_rpc.c b/src/spdk/lib/bdev/bdev_rpc.c new file mode 100644 index 000000000..6ce7136c4 --- /dev/null +++ b/src/spdk/lib/bdev/bdev_rpc.c @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/bdev.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +struct spdk_rpc_set_bdev_opts { + uint32_t bdev_io_pool_size; + uint32_t bdev_io_cache_size; + bool bdev_auto_examine; +}; + +static const struct spdk_json_object_decoder rpc_set_bdev_opts_decoders[] = { + {"bdev_io_pool_size", offsetof(struct spdk_rpc_set_bdev_opts, bdev_io_pool_size), spdk_json_decode_uint32, true}, + {"bdev_io_cache_size", offsetof(struct spdk_rpc_set_bdev_opts, bdev_io_cache_size), spdk_json_decode_uint32, true}, + {"bdev_auto_examine", offsetof(struct spdk_rpc_set_bdev_opts, bdev_auto_examine), spdk_json_decode_bool, true}, +}; + +static void +rpc_bdev_set_options(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct spdk_rpc_set_bdev_opts rpc_opts; + struct spdk_bdev_opts bdev_opts; + struct spdk_json_write_ctx *w; + int rc; + + rpc_opts.bdev_io_pool_size = UINT32_MAX; + rpc_opts.bdev_io_cache_size = UINT32_MAX; + rpc_opts.bdev_auto_examine = true; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_set_bdev_opts_decoders, + SPDK_COUNTOF(rpc_set_bdev_opts_decoders), &rpc_opts)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + } + + spdk_bdev_get_opts(&bdev_opts); + if (rpc_opts.bdev_io_pool_size != UINT32_MAX) { + bdev_opts.bdev_io_pool_size = rpc_opts.bdev_io_pool_size; + } + if (rpc_opts.bdev_io_cache_size != UINT32_MAX) { + bdev_opts.bdev_io_cache_size = rpc_opts.bdev_io_cache_size; + } + bdev_opts.bdev_auto_examine = rpc_opts.bdev_auto_examine; + rc = spdk_bdev_set_opts(&bdev_opts); + + if (rc != 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Pool size %" PRIu32 " too small for cache size %" PRIu32, + bdev_opts.bdev_io_pool_size, bdev_opts.bdev_io_cache_size); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("bdev_set_options", rpc_bdev_set_options, SPDK_RPC_STARTUP) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_set_options, set_bdev_options) diff --git a/src/spdk/lib/bdev/bdev_zone.c b/src/spdk/lib/bdev/bdev_zone.c new file mode 100644 index 000000000..3cf2ecb67 --- /dev/null +++ b/src/spdk/lib/bdev/bdev_zone.c @@ -0,0 +1,201 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev_zone.h" +#include "spdk/bdev_module.h" + +#include "bdev_internal.h" + +uint64_t +spdk_bdev_get_zone_size(const struct spdk_bdev *bdev) +{ + return bdev->zone_size; +} + +uint32_t +spdk_bdev_get_max_open_zones(const struct spdk_bdev *bdev) +{ + return bdev->max_open_zones; +} + +uint32_t +spdk_bdev_get_optimal_open_zones(const struct spdk_bdev *bdev) +{ + return bdev->optimal_open_zones; +} + +int +spdk_bdev_get_zone_info(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t zone_id, size_t num_zones, struct spdk_bdev_zone_info *info, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_GET_ZONE_INFO; + bdev_io->u.zone_mgmt.zone_id = zone_id; + bdev_io->u.zone_mgmt.num_zones = num_zones; + bdev_io->u.zone_mgmt.buf = info; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_zone_management(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t zone_id, enum spdk_bdev_zone_action action, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT; + bdev_io->u.zone_mgmt.zone_action = action; + bdev_io->u.zone_mgmt.zone_id = zone_id; + bdev_io->u.zone_mgmt.num_zones = 1; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +static int +zone_bdev_append_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t zone_id, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ZONE_APPEND; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = zone_id; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_zone_append(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, + void *buf, uint64_t start_lba, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return zone_bdev_append_with_md(desc, ch, buf, NULL, start_lba, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_zone_append_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md, uint64_t start_lba, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return zone_bdev_append_with_md(desc, ch, buf, md, start_lba, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_zone_appendv_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, uint64_t zone_id, + uint64_t num_blocks, spdk_bdev_io_completion_cb cb, + void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ZONE_APPEND; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = zone_id; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_zone_appendv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iovs, int iovcnt, uint64_t zone_id, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return spdk_bdev_zone_appendv_with_md(desc, ch, iovs, iovcnt, NULL, zone_id, num_blocks, + cb, cb_arg); +} + +uint64_t +spdk_bdev_io_get_append_location(struct spdk_bdev_io *bdev_io) +{ + return bdev_io->u.bdev.offset_blocks; +} diff --git a/src/spdk/lib/bdev/part.c b/src/spdk/lib/bdev/part.c new file mode 100644 index 000000000..01a395591 --- /dev/null +++ b/src/spdk/lib/bdev/part.c @@ -0,0 +1,524 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Common code for partition-like virtual bdevs. + */ + +#include "spdk/bdev.h" +#include "spdk/likely.h" +#include "spdk/log.h" +#include "spdk/string.h" +#include "spdk/thread.h" + +#include "spdk/bdev_module.h" + +struct spdk_bdev_part_base { + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + uint32_t ref; + uint32_t channel_size; + spdk_bdev_part_base_free_fn base_free_fn; + void *ctx; + bool claimed; + struct spdk_bdev_module *module; + struct spdk_bdev_fn_table *fn_table; + struct bdev_part_tailq *tailq; + spdk_io_channel_create_cb ch_create_cb; + spdk_io_channel_destroy_cb ch_destroy_cb; + struct spdk_thread *thread; +}; + +struct spdk_bdev * +spdk_bdev_part_base_get_bdev(struct spdk_bdev_part_base *part_base) +{ + return part_base->bdev; +} + +struct spdk_bdev_desc * +spdk_bdev_part_base_get_desc(struct spdk_bdev_part_base *part_base) +{ + return part_base->desc; +} + +struct bdev_part_tailq * +spdk_bdev_part_base_get_tailq(struct spdk_bdev_part_base *part_base) +{ + return part_base->tailq; +} + +void * +spdk_bdev_part_base_get_ctx(struct spdk_bdev_part_base *part_base) +{ + return part_base->ctx; +} + +const char * +spdk_bdev_part_base_get_bdev_name(struct spdk_bdev_part_base *part_base) +{ + return part_base->bdev->name; +} + +static void +bdev_part_base_free(void *ctx) +{ + struct spdk_bdev_desc *desc = ctx; + + spdk_bdev_close(desc); +} + +void +spdk_bdev_part_base_free(struct spdk_bdev_part_base *base) +{ + if (base->desc) { + /* Close the underlying bdev on its same opened thread. 
*/ + if (base->thread && base->thread != spdk_get_thread()) { + spdk_thread_send_msg(base->thread, bdev_part_base_free, base->desc); + } else { + spdk_bdev_close(base->desc); + } + } + + if (base->base_free_fn != NULL) { + base->base_free_fn(base->ctx); + } + + free(base); +} + +static void +bdev_part_free_cb(void *io_device) +{ + struct spdk_bdev_part *part = io_device; + struct spdk_bdev_part_base *base; + + assert(part); + assert(part->internal.base); + + base = part->internal.base; + + TAILQ_REMOVE(base->tailq, part, tailq); + + if (--base->ref == 0) { + spdk_bdev_module_release_bdev(base->bdev); + spdk_bdev_part_base_free(base); + } + + spdk_bdev_destruct_done(&part->internal.bdev, 0); + free(part->internal.bdev.name); + free(part->internal.bdev.product_name); + free(part); +} + +int +spdk_bdev_part_free(struct spdk_bdev_part *part) +{ + spdk_io_device_unregister(part, bdev_part_free_cb); + + /* Return 1 to indicate that this is an asynchronous operation that isn't complete + * until spdk_bdev_destruct_done is called */ + return 1; +} + +void +spdk_bdev_part_base_hotremove(struct spdk_bdev_part_base *part_base, struct bdev_part_tailq *tailq) +{ + struct spdk_bdev_part *part, *tmp; + + TAILQ_FOREACH_SAFE(part, tailq, tailq, tmp) { + if (part->internal.base == part_base) { + spdk_bdev_unregister(&part->internal.bdev, NULL, NULL); + } + } +} + +static bool +bdev_part_io_type_supported(void *_part, enum spdk_bdev_io_type io_type) +{ + struct spdk_bdev_part *part = _part; + + /* We can't decode/modify passthrough NVMe commands, so don't report + * that a partition supports these io types, even if the underlying + * bdev does. + */ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_NVME_ADMIN: + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + return false; + default: + break; + } + + return part->internal.base->bdev->fn_table->io_type_supported(part->internal.base->bdev->ctxt, + io_type); +} + +static struct spdk_io_channel * +bdev_part_get_io_channel(void *_part) +{ + struct spdk_bdev_part *part = _part; + + return spdk_get_io_channel(part); +} + +struct spdk_bdev * +spdk_bdev_part_get_bdev(struct spdk_bdev_part *part) +{ + return &part->internal.bdev; +} + +struct spdk_bdev_part_base * +spdk_bdev_part_get_base(struct spdk_bdev_part *part) +{ + return part->internal.base; +} + +struct spdk_bdev * +spdk_bdev_part_get_base_bdev(struct spdk_bdev_part *part) +{ + return part->internal.base->bdev; +} + +uint64_t +spdk_bdev_part_get_offset_blocks(struct spdk_bdev_part *part) +{ + return part->internal.offset_blocks; +} + +static int +bdev_part_remap_dif(struct spdk_bdev_io *bdev_io, uint32_t offset, + uint32_t remapped_offset) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_dif_ctx dif_ctx; + struct spdk_dif_error err_blk = {}; + int rc; + + if (spdk_likely(!(bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK))) { + return 0; + } + + rc = spdk_dif_ctx_init(&dif_ctx, + bdev->blocklen, bdev->md_len, bdev->md_interleave, + bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, + offset, 0, 0, 0, 0); + if (rc != 0) { + SPDK_ERRLOG("Initialization of DIF context failed\n"); + return rc; + } + + spdk_dif_ctx_set_remapped_init_ref_tag(&dif_ctx, remapped_offset); + + if (bdev->md_interleave) { + rc = spdk_dif_remap_ref_tag(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); + } else { + struct iovec md_iov = { + .iov_base = bdev_io->u.bdev.md_buf, + .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, + }; + + rc = 
spdk_dix_remap_ref_tag(&md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); + } + + if (rc != 0) { + SPDK_ERRLOG("Remapping reference tag failed. type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + } + + return rc; +} + +static void +bdev_part_complete_read_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *part_io = cb_arg; + uint32_t offset, remapped_offset; + int rc, status; + + offset = bdev_io->u.bdev.offset_blocks; + remapped_offset = part_io->u.bdev.offset_blocks; + + if (success) { + rc = bdev_part_remap_dif(bdev_io, offset, remapped_offset); + if (rc != 0) { + success = false; + } + } + + status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_bdev_io_complete(part_io, status); + spdk_bdev_free_io(bdev_io); +} + +static void +bdev_part_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *part_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_bdev_io_complete(part_io, status); + spdk_bdev_free_io(bdev_io); +} + +static void +bdev_part_complete_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *part_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_bdev_io_set_buf(part_io, bdev_io->u.bdev.iovs[0].iov_base, bdev_io->u.bdev.iovs[0].iov_len); + spdk_bdev_io_complete(part_io, status); + spdk_bdev_free_io(bdev_io); +} + +int +spdk_bdev_part_submit_request(struct spdk_bdev_part_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev_part *part = ch->part; + struct spdk_io_channel *base_ch = ch->base_ch; + struct spdk_bdev_desc *base_desc = part->internal.base->desc; + uint64_t offset, remapped_offset; + int rc = 0; + + offset = bdev_io->u.bdev.offset_blocks; + remapped_offset = offset + part->internal.offset_blocks; + + /* Modify the I/O to adjust for the offset within the base bdev. 
*/ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + if (bdev_io->u.bdev.md_buf == NULL) { + rc = spdk_bdev_readv_blocks(base_desc, base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, remapped_offset, + bdev_io->u.bdev.num_blocks, + bdev_part_complete_read_io, bdev_io); + } else { + rc = spdk_bdev_readv_blocks_with_md(base_desc, base_ch, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, remapped_offset, + bdev_io->u.bdev.num_blocks, + bdev_part_complete_read_io, bdev_io); + } + break; + case SPDK_BDEV_IO_TYPE_WRITE: + rc = bdev_part_remap_dif(bdev_io, offset, remapped_offset); + if (rc != 0) { + return SPDK_BDEV_IO_STATUS_FAILED; + } + + if (bdev_io->u.bdev.md_buf == NULL) { + rc = spdk_bdev_writev_blocks(base_desc, base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, remapped_offset, + bdev_io->u.bdev.num_blocks, + bdev_part_complete_io, bdev_io); + } else { + rc = spdk_bdev_writev_blocks_with_md(base_desc, base_ch, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, remapped_offset, + bdev_io->u.bdev.num_blocks, + bdev_part_complete_io, bdev_io); + } + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + rc = spdk_bdev_write_zeroes_blocks(base_desc, base_ch, remapped_offset, + bdev_io->u.bdev.num_blocks, bdev_part_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + rc = spdk_bdev_unmap_blocks(base_desc, base_ch, remapped_offset, + bdev_io->u.bdev.num_blocks, bdev_part_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + rc = spdk_bdev_flush_blocks(base_desc, base_ch, remapped_offset, + bdev_io->u.bdev.num_blocks, bdev_part_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_RESET: + rc = spdk_bdev_reset(base_desc, base_ch, + bdev_part_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_ZCOPY: + rc = spdk_bdev_zcopy_start(base_desc, base_ch, remapped_offset, + bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.zcopy.populate, + bdev_part_complete_zcopy_io, bdev_io); + break; + default: + SPDK_ERRLOG("unknown I/O type %d\n", bdev_io->type); + return SPDK_BDEV_IO_STATUS_FAILED; + } + + return rc; +} + +static int +bdev_part_channel_create_cb(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_part *part = (struct spdk_bdev_part *)io_device; + struct spdk_bdev_part_channel *ch = ctx_buf; + + ch->part = part; + ch->base_ch = spdk_bdev_get_io_channel(part->internal.base->desc); + if (ch->base_ch == NULL) { + return -1; + } + + if (part->internal.base->ch_create_cb) { + return part->internal.base->ch_create_cb(io_device, ctx_buf); + } else { + return 0; + } +} + +static void +bdev_part_channel_destroy_cb(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_part *part = (struct spdk_bdev_part *)io_device; + struct spdk_bdev_part_channel *ch = ctx_buf; + + if (part->internal.base->ch_destroy_cb) { + part->internal.base->ch_destroy_cb(io_device, ctx_buf); + } + spdk_put_io_channel(ch->base_ch); +} + +struct spdk_bdev_part_base * + spdk_bdev_part_base_construct(struct spdk_bdev *bdev, + spdk_bdev_remove_cb_t remove_cb, struct spdk_bdev_module *module, + struct spdk_bdev_fn_table *fn_table, struct bdev_part_tailq *tailq, + spdk_bdev_part_base_free_fn free_fn, void *ctx, + uint32_t channel_size, spdk_io_channel_create_cb ch_create_cb, + spdk_io_channel_destroy_cb ch_destroy_cb) +{ + int rc; + struct spdk_bdev_part_base *base; + + base = calloc(1, sizeof(*base)); + if (!base) { + SPDK_ERRLOG("Memory allocation failure\n"); + return NULL; + } + fn_table->get_io_channel = bdev_part_get_io_channel; + 
fn_table->io_type_supported = bdev_part_io_type_supported; + + base->bdev = bdev; + base->desc = NULL; + base->ref = 0; + base->module = module; + base->fn_table = fn_table; + base->tailq = tailq; + base->base_free_fn = free_fn; + base->ctx = ctx; + base->claimed = false; + base->channel_size = channel_size; + base->ch_create_cb = ch_create_cb; + base->ch_destroy_cb = ch_destroy_cb; + + rc = spdk_bdev_open(bdev, false, remove_cb, base, &base->desc); + if (rc) { + spdk_bdev_part_base_free(base); + SPDK_ERRLOG("could not open bdev %s: %s\n", spdk_bdev_get_name(bdev), + spdk_strerror(-rc)); + return NULL; + } + + /* Save the thread where the base device is opened */ + base->thread = spdk_get_thread(); + + return base; +} + +int +spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_base *base, + char *name, uint64_t offset_blocks, uint64_t num_blocks, + char *product_name) +{ + part->internal.bdev.blocklen = base->bdev->blocklen; + part->internal.bdev.blockcnt = num_blocks; + part->internal.offset_blocks = offset_blocks; + + part->internal.bdev.write_cache = base->bdev->write_cache; + part->internal.bdev.required_alignment = base->bdev->required_alignment; + part->internal.bdev.ctxt = part; + part->internal.bdev.module = base->module; + part->internal.bdev.fn_table = base->fn_table; + + part->internal.bdev.md_interleave = base->bdev->md_interleave; + part->internal.bdev.md_len = base->bdev->md_len; + part->internal.bdev.dif_type = base->bdev->dif_type; + part->internal.bdev.dif_is_head_of_md = base->bdev->dif_is_head_of_md; + part->internal.bdev.dif_check_flags = base->bdev->dif_check_flags; + + part->internal.bdev.name = strdup(name); + part->internal.bdev.product_name = strdup(product_name); + + if (part->internal.bdev.name == NULL) { + SPDK_ERRLOG("Failed to allocate name for new part of bdev %s\n", spdk_bdev_get_name(base->bdev)); + return -1; + } else if (part->internal.bdev.product_name == NULL) { + free(part->internal.bdev.name); + SPDK_ERRLOG("Failed to allocate product name for new part of bdev %s\n", + spdk_bdev_get_name(base->bdev)); + return -1; + } + + base->ref++; + part->internal.base = base; + + if (!base->claimed) { + int rc; + + rc = spdk_bdev_module_claim_bdev(base->bdev, base->desc, base->module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(base->bdev)); + free(part->internal.bdev.name); + free(part->internal.bdev.product_name); + return -1; + } + base->claimed = true; + } + + spdk_io_device_register(part, bdev_part_channel_create_cb, + bdev_part_channel_destroy_cb, + base->channel_size, + name); + + spdk_bdev_register(&part->internal.bdev); + TAILQ_INSERT_TAIL(base->tailq, part, tailq); + + return 0; +} diff --git a/src/spdk/lib/bdev/scsi_nvme.c b/src/spdk/lib/bdev/scsi_nvme.c new file mode 100644 index 000000000..f9fe319bd --- /dev/null +++ b/src/spdk/lib/bdev/scsi_nvme.c @@ -0,0 +1,261 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2016 FUJITSU LIMITED, All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/bdev_module.h" + +#include "spdk/nvme_spec.h" + +void +spdk_scsi_nvme_translate(const struct spdk_bdev_io *bdev_io, int *sc, int *sk, + int *asc, int *ascq) +{ + int nvme_sct = bdev_io->internal.error.nvme.sct; + int nvme_sc = bdev_io->internal.error.nvme.sc; + + switch (nvme_sct) { + case SPDK_NVME_SCT_GENERIC: + switch (nvme_sc) { + case SPDK_NVME_SC_SUCCESS: + *sc = SPDK_SCSI_STATUS_GOOD; + *sk = SPDK_SCSI_SENSE_NO_SENSE; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_OPCODE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_FIELD: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_DATA_TRANSFER_ERROR: + case SPDK_NVME_SC_CAPACITY_EXCEEDED: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ABORTED_POWER_LOSS: + *sc = SPDK_SCSI_STATUS_TASK_ABORTED; + *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; + *asc = SPDK_SCSI_ASC_WARNING; + *ascq = SPDK_SCSI_ASCQ_POWER_LOSS_EXPECTED; + break; + case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_HARDWARE_ERROR; + *asc = SPDK_SCSI_ASC_INTERNAL_TARGET_FAILURE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ABORTED_BY_REQUEST: + case SPDK_NVME_SC_ABORTED_SQ_DELETION: + case SPDK_NVME_SC_ABORTED_FAILED_FUSED: + case SPDK_NVME_SC_ABORTED_MISSING_FUSED: + *sc = SPDK_SCSI_STATUS_TASK_ABORTED; + *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_ACCESS_DENIED; + *ascq = SPDK_SCSI_ASCQ_INVALID_LU_IDENTIFIER; + break; + case SPDK_NVME_SC_LBA_OUT_OF_RANGE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_NAMESPACE_NOT_READY: + 
*sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_NOT_READY; + *asc = SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_READY; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_RESERVATION_CONFLICT: + *sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT; + *sk = SPDK_SCSI_SENSE_NO_SENSE; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_COMMAND_ID_CONFLICT: + case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR: + case SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR: + case SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS: + case SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID: + case SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID: + case SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID: + case SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF: + case SPDK_NVME_SC_INVALID_PRP_OFFSET: + case SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED: + case SPDK_NVME_SC_INVALID_SGL_OFFSET: + case SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT: + case SPDK_NVME_SC_KEEP_ALIVE_EXPIRED: + case SPDK_NVME_SC_KEEP_ALIVE_INVALID: + case SPDK_NVME_SC_FORMAT_IN_PROGRESS: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + break; + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + switch (nvme_sc) { + case SPDK_NVME_SC_COMPLETION_QUEUE_INVALID: + case SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_FORMAT: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_FORMAT_COMMAND_FAILED; + *ascq = SPDK_SCSI_ASCQ_FORMAT_COMMAND_FAILED; + break; + case SPDK_NVME_SC_CONFLICTING_ATTRIBUTES: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_RANGE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_DATA_PROTECT; + *asc = SPDK_SCSI_ASC_WRITE_PROTECTED; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER: + case SPDK_NVME_SC_INVALID_QUEUE_SIZE: + case SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED: + case SPDK_NVME_SC_INVALID_FIRMWARE_SLOT: + case SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE: + case SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR: + case SPDK_NVME_SC_INVALID_LOG_PAGE: + case SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET: + case SPDK_NVME_SC_INVALID_QUEUE_DELETION: + case SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE: + case SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE: + case SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC: + case SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET: + case SPDK_NVME_SC_FIRMWARE_REQ_RESET: + case SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION: + case SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED: + case SPDK_NVME_SC_OVERLAPPING_RANGE: + case SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY: + case SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE: + case SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED: + case SPDK_NVME_SC_NAMESPACE_IS_PRIVATE: + case SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED: + case SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED: + case SPDK_NVME_SC_CONTROLLER_LIST_INVALID: + case SPDK_NVME_SC_INVALID_PROTECTION_INFO: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = 
SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + break; + case SPDK_NVME_SCT_MEDIA_ERROR: + switch (nvme_sc) { + case SPDK_NVME_SC_WRITE_FAULTS: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_PERIPHERAL_DEVICE_WRITE_FAULT; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_UNRECOVERED_READ_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_UNRECOVERED_READ_ERROR; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_GUARD_CHECK_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + case SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case SPDK_NVME_SC_COMPARE_FAILURE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MISCOMPARE; + *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ACCESS_DENIED: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_DATA_PROTECT; + *asc = SPDK_SCSI_ASC_ACCESS_DENIED; + *ascq = SPDK_SCSI_ASCQ_NO_ACCESS_RIGHTS; + break; + case SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + break; + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } +} diff --git a/src/spdk/lib/bdev/spdk_bdev.map b/src/spdk/lib/bdev/spdk_bdev.map new file mode 100644 index 000000000..9f9c3c7e5 --- /dev/null +++ b/src/spdk/lib/bdev/spdk_bdev.map @@ -0,0 +1,154 @@ +{ + global: + + # Public functions in bdev.h + spdk_bdev_get_opts; + spdk_bdev_set_opts; + spdk_bdev_initialize; + spdk_bdev_finish; + spdk_bdev_config_text; + spdk_bdev_subsystem_config_json; + spdk_bdev_get_by_name; + spdk_bdev_first; + spdk_bdev_next; + spdk_bdev_first_leaf; + spdk_bdev_next_leaf; + spdk_bdev_open; + spdk_bdev_open_ext; + spdk_bdev_close; + spdk_bdev_desc_get_bdev; + spdk_bdev_set_timeout; + spdk_bdev_io_type_supported; + spdk_bdev_dump_info_json; + spdk_bdev_get_name; + spdk_bdev_get_product_name; + spdk_bdev_get_block_size; + spdk_bdev_get_write_unit_size; + spdk_bdev_get_num_blocks; + spdk_bdev_get_qos_rpc_type; + spdk_bdev_get_qos_rate_limits; + spdk_bdev_set_qos_rate_limits; + spdk_bdev_get_buf_align; + spdk_bdev_get_optimal_io_boundary; + spdk_bdev_has_write_cache; + spdk_bdev_get_uuid; + spdk_bdev_get_acwu; + spdk_bdev_get_md_size; + spdk_bdev_is_md_interleaved; + spdk_bdev_is_md_separate; + spdk_bdev_is_zoned; + spdk_bdev_get_data_block_size; + spdk_bdev_get_dif_type; + spdk_bdev_is_dif_head_of_md; + 
spdk_bdev_is_dif_check_enabled; + spdk_bdev_get_qd; + spdk_bdev_get_qd_sampling_period; + spdk_bdev_set_qd_sampling_period; + spdk_bdev_get_io_time; + spdk_bdev_get_weighted_io_time; + spdk_bdev_get_io_channel; + spdk_bdev_read; + spdk_bdev_read_blocks; + spdk_bdev_read_blocks_with_md; + spdk_bdev_readv; + spdk_bdev_readv_blocks; + spdk_bdev_readv_blocks_with_md; + spdk_bdev_write; + spdk_bdev_write_blocks; + spdk_bdev_write_blocks_with_md; + spdk_bdev_writev; + spdk_bdev_writev_blocks; + spdk_bdev_writev_blocks_with_md; + spdk_bdev_compare_blocks; + spdk_bdev_compare_blocks_with_md; + spdk_bdev_comparev_blocks; + spdk_bdev_comparev_blocks_with_md; + spdk_bdev_comparev_and_writev_blocks; + spdk_bdev_zcopy_start; + spdk_bdev_zcopy_end; + spdk_bdev_write_zeroes; + spdk_bdev_write_zeroes_blocks; + spdk_bdev_unmap; + spdk_bdev_unmap_blocks; + spdk_bdev_flush; + spdk_bdev_flush_blocks; + spdk_bdev_reset; + spdk_bdev_abort; + spdk_bdev_nvme_admin_passthru; + spdk_bdev_nvme_io_passthru; + spdk_bdev_nvme_io_passthru_md; + spdk_bdev_free_io; + spdk_bdev_queue_io_wait; + spdk_bdev_get_io_stat; + spdk_bdev_get_device_stat; + spdk_bdev_io_get_nvme_status; + spdk_bdev_io_get_nvme_fused_status; + spdk_bdev_io_get_scsi_status; + spdk_bdev_io_get_iovec; + spdk_bdev_io_get_md_buf; + spdk_bdev_io_get_cb_arg; + spdk_bdev_histogram_enable; + spdk_bdev_histogram_get; + spdk_bdev_get_media_events; + + # Public functions in bdev_module.h + spdk_bdev_register; + spdk_bdev_unregister; + spdk_bdev_destruct_done; + spdk_vbdev_register; + spdk_bdev_module_examine_done; + spdk_bdev_module_init_done; + spdk_bdev_module_finish_done; + spdk_bdev_module_claim_bdev; + spdk_bdev_module_release_bdev; + spdk_bdev_alias_add; + spdk_bdev_alias_del; + spdk_bdev_alias_del_all; + spdk_bdev_get_aliases; + spdk_bdev_io_get_buf; + spdk_bdev_io_get_aux_buf; + spdk_bdev_io_put_aux_buf; + spdk_bdev_io_set_buf; + spdk_bdev_io_set_md_buf; + spdk_bdev_io_complete; + spdk_bdev_io_complete_nvme_status; + spdk_bdev_io_complete_scsi_status; + spdk_bdev_io_get_thread; + spdk_bdev_io_get_io_channel; + spdk_bdev_notify_blockcnt_change; + spdk_scsi_nvme_translate; + spdk_bdev_module_list_add; + spdk_bdev_module_list_find; + spdk_bdev_part_base_get_bdev; + spdk_bdev_part_base_get_bdev_name; + spdk_bdev_part_base_get_desc; + spdk_bdev_part_base_get_tailq; + spdk_bdev_part_base_get_ctx; + spdk_bdev_part_base_free; + spdk_bdev_part_free; + spdk_bdev_part_base_hotremove; + spdk_bdev_part_base_construct; + spdk_bdev_part_construct; + spdk_bdev_part_submit_request; + spdk_bdev_part_get_bdev; + spdk_bdev_part_get_base; + spdk_bdev_part_get_base_bdev; + spdk_bdev_part_get_offset_blocks; + spdk_bdev_push_media_events; + spdk_bdev_notify_media_management; + + # Public functions in bdev_zone.h + spdk_bdev_get_zone_size; + spdk_bdev_get_max_open_zones; + spdk_bdev_get_optimal_open_zones; + spdk_bdev_get_zone_info; + spdk_bdev_zone_management; + spdk_bdev_zone_append; + spdk_bdev_zone_appendv; + spdk_bdev_zone_append_with_md; + spdk_bdev_zone_appendv_with_md; + spdk_bdev_io_get_append_location; + + # Everything else + local: *; +}; diff --git a/src/spdk/lib/bdev/vtune.c b/src/spdk/lib/bdev/vtune.c new file mode 100644 index 000000000..2cb48826e --- /dev/null +++ b/src/spdk/lib/bdev/vtune.c @@ -0,0 +1,49 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/config.h" +#if SPDK_CONFIG_VTUNE + +/* Disable warnings triggered by the VTune code */ +#if defined(__GNUC__) && \ + __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic ignored "-Wsign-compare" +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#endif +#endif + +#include "ittnotify_static.c" + +#endif diff --git a/src/spdk/lib/blob/Makefile b/src/spdk/lib/blob/Makefile new file mode 100644 index 000000000..53ae6800b --- /dev/null +++ b/src/spdk/lib/blob/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = blobstore.c request.c zeroes.c blob_bs_dev.c +LIBNAME = blob + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blob.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/blob/blob_bs_dev.c b/src/spdk/lib/blob/blob_bs_dev.c new file mode 100644 index 000000000..8705a1c16 --- /dev/null +++ b/src/spdk/lib/blob/blob_bs_dev.c @@ -0,0 +1,150 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/blob.h" +#include "spdk/log.h" +#include "blobstore.h" + +static void +blob_bs_dev_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_read_cpl(void *cb_arg, int bserrno) +{ + struct spdk_bs_dev_cb_args *cb_args = (struct spdk_bs_dev_cb_args *)cb_arg; + + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno); +} + +static inline void +blob_bs_dev_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev; + + spdk_blob_io_read(b->blob, channel, payload, lba, lba_count, + blob_bs_dev_read_cpl, cb_args); +} + +static inline void +blob_bs_dev_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev; + + spdk_blob_io_readv(b->blob, channel, iov, iovcnt, lba, lba_count, + blob_bs_dev_read_cpl, cb_args); +} + +static void +blob_bs_dev_destroy_cpl(void *cb_arg, int bserrno) +{ + if (bserrno != 0) { + SPDK_ERRLOG("Error on blob_bs_dev destroy: %d", bserrno); + } + + /* Free blob_bs_dev */ + free(cb_arg); +} + +static void +blob_bs_dev_destroy(struct spdk_bs_dev *bs_dev) +{ + struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)bs_dev; + + spdk_blob_close(b->blob, blob_bs_dev_destroy_cpl, b); +} + + +struct spdk_bs_dev * +bs_create_blob_bs_dev(struct spdk_blob *blob) +{ + struct spdk_blob_bs_dev *b; + + b = calloc(1, sizeof(*b)); + if (b == NULL) { + return NULL; + } + /* snapshot blob */ + b->bs_dev.blockcnt = blob->active.num_clusters * + blob->bs->pages_per_cluster * bs_io_unit_per_page(blob->bs); + b->bs_dev.blocklen = spdk_bs_get_io_unit_size(blob->bs); + b->bs_dev.create_channel = NULL; + b->bs_dev.destroy_channel = NULL; + b->bs_dev.destroy = blob_bs_dev_destroy; + b->bs_dev.write = blob_bs_dev_write; + b->bs_dev.writev = blob_bs_dev_writev; + b->bs_dev.read = blob_bs_dev_read; + b->bs_dev.readv = blob_bs_dev_readv; + b->bs_dev.write_zeroes = blob_bs_dev_write_zeroes; + b->bs_dev.unmap = blob_bs_dev_unmap; + b->blob = blob; + + return &b->bs_dev; +} diff --git a/src/spdk/lib/blob/blobstore.c b/src/spdk/lib/blob/blobstore.c new file mode 100644 index 000000000..768fc5b45 --- /dev/null +++ b/src/spdk/lib/blob/blobstore.c @@ -0,0 +1,7461 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/blob.h" +#include "spdk/crc32.h" +#include "spdk/env.h" +#include "spdk/queue.h" +#include "spdk/thread.h" +#include "spdk/bit_array.h" +#include "spdk/likely.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" + +#include "blobstore.h" + +#define BLOB_CRC32C_INITIAL 0xffffffffUL + +static int bs_register_md_thread(struct spdk_blob_store *bs); +static int bs_unregister_md_thread(struct spdk_blob_store *bs); +static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno); +static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, + uint64_t cluster, uint32_t extent, spdk_blob_op_complete cb_fn, void *cb_arg); + +static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, + uint16_t value_len, bool internal); +static int blob_get_xattr_value(struct spdk_blob *blob, const char *name, + const void **value, size_t *value_len, bool internal); +static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal); + +static void blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, + spdk_blob_op_complete cb_fn, void *cb_arg); + +static void +blob_verify_md_op(struct spdk_blob *blob) +{ + assert(blob != NULL); + assert(spdk_get_thread() == blob->bs->md_thread); + assert(blob->state != SPDK_BLOB_STATE_LOADING); +} + +static struct spdk_blob_list * +bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid) +{ + struct spdk_blob_list *snapshot_entry = NULL; + + TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { + if (snapshot_entry->id == blobid) { + break; + } + } + + return snapshot_entry; +} + +static void +bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page) +{ + assert(page < spdk_bit_array_capacity(bs->used_md_pages)); + assert(spdk_bit_array_get(bs->used_md_pages, page) == false); + + 
spdk_bit_array_set(bs->used_md_pages, page); +} + +static void +bs_release_md_page(struct spdk_blob_store *bs, uint32_t page) +{ + assert(page < spdk_bit_array_capacity(bs->used_md_pages)); + assert(spdk_bit_array_get(bs->used_md_pages, page) == true); + + spdk_bit_array_clear(bs->used_md_pages, page); +} + +static void +bs_claim_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) +{ + assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters)); + assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == false); + assert(bs->num_free_clusters > 0); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %u\n", cluster_num); + + spdk_bit_array_set(bs->used_clusters, cluster_num); + bs->num_free_clusters--; +} + +static int +blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster) +{ + uint64_t *cluster_lba = &blob->active.clusters[cluster_num]; + + blob_verify_md_op(blob); + + if (*cluster_lba != 0) { + return -EEXIST; + } + + *cluster_lba = bs_cluster_to_lba(blob->bs, cluster); + return 0; +} + +static int +bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num, + uint64_t *lowest_free_cluster, uint32_t *lowest_free_md_page, bool update_map) +{ + uint32_t *extent_page = 0; + + pthread_mutex_lock(&blob->bs->used_clusters_mutex); + *lowest_free_cluster = spdk_bit_array_find_first_clear(blob->bs->used_clusters, + *lowest_free_cluster); + if (*lowest_free_cluster == UINT32_MAX) { + /* No more free clusters. Cannot satisfy the request */ + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + return -ENOSPC; + } + + if (blob->use_extent_table) { + extent_page = bs_cluster_to_extent_page(blob, cluster_num); + if (*extent_page == 0) { + /* No extent_page is allocated for the cluster */ + *lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, + *lowest_free_md_page); + if (*lowest_free_md_page == UINT32_MAX) { + /* No more free md pages. 
Cannot satisfy the request */ + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + return -ENOSPC; + } + bs_claim_md_page(blob->bs, *lowest_free_md_page); + } + } + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %lu for blob %lu\n", *lowest_free_cluster, blob->id); + bs_claim_cluster(blob->bs, *lowest_free_cluster); + + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + + if (update_map) { + blob_insert_cluster(blob, cluster_num, *lowest_free_cluster); + if (blob->use_extent_table && *extent_page == 0) { + *extent_page = *lowest_free_md_page; + } + } + + return 0; +} + +static void +bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) +{ + assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters)); + assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == true); + assert(bs->num_free_clusters < bs->total_clusters); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Releasing cluster %u\n", cluster_num); + + pthread_mutex_lock(&bs->used_clusters_mutex); + spdk_bit_array_clear(bs->used_clusters, cluster_num); + bs->num_free_clusters++; + pthread_mutex_unlock(&bs->used_clusters_mutex); +} + +static void +blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs) +{ + xattrs->count = 0; + xattrs->names = NULL; + xattrs->ctx = NULL; + xattrs->get_value = NULL; +} + +void +spdk_blob_opts_init(struct spdk_blob_opts *opts) +{ + opts->num_clusters = 0; + opts->thin_provision = false; + opts->clear_method = BLOB_CLEAR_WITH_DEFAULT; + blob_xattrs_init(&opts->xattrs); + opts->use_extent_table = true; +} + +void +spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts) +{ + opts->clear_method = BLOB_CLEAR_WITH_DEFAULT; +} + +static struct spdk_blob * +blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id) +{ + struct spdk_blob *blob; + + blob = calloc(1, sizeof(*blob)); + if (!blob) { + return NULL; + } + + blob->id = id; + blob->bs = bs; + + blob->parent_id = SPDK_BLOBID_INVALID; + + blob->state = SPDK_BLOB_STATE_DIRTY; + blob->extent_rle_found = false; + blob->extent_table_found = false; + blob->active.num_pages = 1; + blob->active.pages = calloc(1, sizeof(*blob->active.pages)); + if (!blob->active.pages) { + free(blob); + return NULL; + } + + blob->active.pages[0] = bs_blobid_to_page(id); + + TAILQ_INIT(&blob->xattrs); + TAILQ_INIT(&blob->xattrs_internal); + TAILQ_INIT(&blob->pending_persists); + + return blob; +} + +static void +xattrs_free(struct spdk_xattr_tailq *xattrs) +{ + struct spdk_xattr *xattr, *xattr_tmp; + + TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) { + TAILQ_REMOVE(xattrs, xattr, link); + free(xattr->name); + free(xattr->value); + free(xattr); + } +} + +static void +blob_free(struct spdk_blob *blob) +{ + assert(blob != NULL); + assert(TAILQ_EMPTY(&blob->pending_persists)); + + free(blob->active.extent_pages); + free(blob->clean.extent_pages); + free(blob->active.clusters); + free(blob->clean.clusters); + free(blob->active.pages); + free(blob->clean.pages); + + xattrs_free(&blob->xattrs); + xattrs_free(&blob->xattrs_internal); + + if (blob->back_bs_dev) { + blob->back_bs_dev->destroy(blob->back_bs_dev); + } + + free(blob); +} + +struct freeze_io_ctx { + struct spdk_bs_cpl cpl; + struct spdk_blob *blob; +}; + +static void +blob_io_sync(struct spdk_io_channel_iter *i) +{ + spdk_for_each_channel_continue(i, 0); +} + +static void +blob_execute_queued_io(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch); + struct freeze_io_ctx *ctx = 
spdk_io_channel_iter_get_ctx(i); + struct spdk_bs_request_set *set; + struct spdk_bs_user_op_args *args; + spdk_bs_user_op_t *op, *tmp; + + TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) { + set = (struct spdk_bs_request_set *)op; + args = &set->u.user_op; + + if (args->blob == ctx->blob) { + TAILQ_REMOVE(&ch->queued_io, op, link); + bs_user_op_execute(op); + } + } + + spdk_for_each_channel_continue(i, 0); +} + +static void +blob_io_cpl(struct spdk_io_channel_iter *i, int status) +{ + struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0); + + free(ctx); +} + +static void +blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct freeze_io_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + ctx->cpl.u.blob_basic.cb_fn = cb_fn; + ctx->cpl.u.blob_basic.cb_arg = cb_arg; + ctx->blob = blob; + + /* Freeze I/O on blob */ + blob->frozen_refcnt++; + + if (blob->frozen_refcnt == 1) { + spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl); + } else { + cb_fn(cb_arg, 0); + free(ctx); + } +} + +static void +blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct freeze_io_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + ctx->cpl.u.blob_basic.cb_fn = cb_fn; + ctx->cpl.u.blob_basic.cb_arg = cb_arg; + ctx->blob = blob; + + assert(blob->frozen_refcnt > 0); + + blob->frozen_refcnt--; + + if (blob->frozen_refcnt == 0) { + spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl); + } else { + cb_fn(cb_arg, 0); + free(ctx); + } +} + +static int +blob_mark_clean(struct spdk_blob *blob) +{ + uint32_t *extent_pages = NULL; + uint64_t *clusters = NULL; + uint32_t *pages = NULL; + + assert(blob != NULL); + + if (blob->active.num_extent_pages) { + assert(blob->active.extent_pages); + extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages)); + if (!extent_pages) { + return -ENOMEM; + } + memcpy(extent_pages, blob->active.extent_pages, + blob->active.num_extent_pages * sizeof(*extent_pages)); + } + + if (blob->active.num_clusters) { + assert(blob->active.clusters); + clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters)); + if (!clusters) { + free(extent_pages); + return -ENOMEM; + } + memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); + } + + if (blob->active.num_pages) { + assert(blob->active.pages); + pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages)); + if (!pages) { + free(extent_pages); + free(clusters); + return -ENOMEM; + } + memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); + } + + free(blob->clean.extent_pages); + free(blob->clean.clusters); + free(blob->clean.pages); + + blob->clean.num_extent_pages = blob->active.num_extent_pages; + blob->clean.extent_pages = blob->active.extent_pages; + blob->clean.num_clusters = blob->active.num_clusters; + blob->clean.clusters = blob->active.clusters; + blob->clean.num_pages = blob->active.num_pages; + blob->clean.pages = blob->active.pages; + + blob->active.extent_pages = extent_pages; + blob->active.clusters = clusters; + blob->active.pages = pages; + + /* If the metadata was dirtied again while the metadata was being written to disk, + * we do 
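blob_freeze_io() and blob_unfreeze_io() rely on a reference count so that only the first freeze and the last unfreeze pay for the per-channel iteration; anything submitted while frozen is parked on queued_io and replayed by blob_execute_queued_io(). A minimal single-threaded sketch of that pattern follows (plain C, no SPDK channels; the fixed-size queue and the op_fn type are illustrative only).

/* Minimal sketch (not SPDK): refcount-based freeze with a queue that is
 * drained when the last unfreeze drops the count to zero. */
#include <stdio.h>

#define MAX_QUEUED 16

typedef void (*op_fn)(int arg);

static int frozen_refcnt;
static struct { op_fn fn; int arg; } queued[MAX_QUEUED];
static int num_queued;

static void submit(op_fn fn, int arg)
{
    if (frozen_refcnt > 0) {
        /* Park the operation, as blob I/O is parked on ch->queued_io. */
        queued[num_queued].fn = fn;
        queued[num_queued].arg = arg;
        num_queued++;
        return;
    }
    fn(arg);
}

static void freeze(void)
{
    frozen_refcnt++;
}

static void unfreeze(void)
{
    if (--frozen_refcnt == 0) {
        /* Replay everything that arrived while frozen. */
        for (int i = 0; i < num_queued; i++) {
            queued[i].fn(queued[i].arg);
        }
        num_queued = 0;
    }
}

static void print_op(int arg)
{
    printf("op %d executed\n", arg);
}

int main(void)
{
    freeze();
    submit(print_op, 1);     /* queued */
    freeze();
    unfreeze();              /* still frozen, nothing runs */
    submit(print_op, 2);     /* queued */
    unfreeze();              /* drains: prints op 1 then op 2 */
    submit(print_op, 3);     /* runs immediately */
    return 0;
}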
not want to revert the DIRTY state back to CLEAN here. + */ + if (blob->state == SPDK_BLOB_STATE_LOADING) { + blob->state = SPDK_BLOB_STATE_CLEAN; + } + + return 0; +} + +static int +blob_deserialize_xattr(struct spdk_blob *blob, + struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal) +{ + struct spdk_xattr *xattr; + + if (desc_xattr->length != sizeof(desc_xattr->name_length) + + sizeof(desc_xattr->value_length) + + desc_xattr->name_length + desc_xattr->value_length) { + return -EINVAL; + } + + xattr = calloc(1, sizeof(*xattr)); + if (xattr == NULL) { + return -ENOMEM; + } + + xattr->name = malloc(desc_xattr->name_length + 1); + if (xattr->name == NULL) { + free(xattr); + return -ENOMEM; + } + memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length); + xattr->name[desc_xattr->name_length] = '\0'; + + xattr->value = malloc(desc_xattr->value_length); + if (xattr->value == NULL) { + free(xattr->name); + free(xattr); + return -ENOMEM; + } + xattr->value_len = desc_xattr->value_length; + memcpy(xattr->value, + (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), + desc_xattr->value_length); + + TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link); + + return 0; +} + + +static int +blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob) +{ + struct spdk_blob_md_descriptor *desc; + size_t cur_desc = 0; + void *tmp; + + desc = (struct spdk_blob_md_descriptor *)page->descriptors; + while (cur_desc < sizeof(page->descriptors)) { + if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { + if (desc->length == 0) { + /* If padding and length are 0, this terminates the page */ + break; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { + struct spdk_blob_md_descriptor_flags *desc_flags; + + desc_flags = (struct spdk_blob_md_descriptor_flags *)desc; + + if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) { + return -EINVAL; + } + + if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) != + SPDK_BLOB_INVALID_FLAGS_MASK) { + return -EINVAL; + } + + if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) != + SPDK_BLOB_DATA_RO_FLAGS_MASK) { + blob->data_ro = true; + blob->md_ro = true; + } + + if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) != + SPDK_BLOB_MD_RO_FLAGS_MASK) { + blob->md_ro = true; + } + + if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) { + blob->data_ro = true; + blob->md_ro = true; + } + + blob->invalid_flags = desc_flags->invalid_flags; + blob->data_ro_flags = desc_flags->data_ro_flags; + blob->md_ro_flags = desc_flags->md_ro_flags; + + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { + struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; + unsigned int i, j; + unsigned int cluster_count = blob->active.num_clusters; + + if (blob->extent_table_found) { + /* Extent Table already present in the md, + * both descriptors should never be at the same time. 
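blob_deserialize_xattr() above reads a descriptor laid out as two length fields followed by the name bytes and then the value bytes, rejecting anything whose total does not add up. The standalone sketch below mirrors that validation and copy; the xattr_desc struct is an illustrative stand-in, not the real spdk_blob_md_descriptor_xattr layout.

/* Minimal sketch (not SPDK): an xattr descriptor laid out as
 * [name_length][value_length][name bytes][value bytes], with the same
 * length validation blob_deserialize_xattr() performs. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct xattr_desc {
    uint16_t name_length;
    uint16_t value_length;
    char     data[];         /* name bytes followed by value bytes */
};

static int parse_xattr(const struct xattr_desc *d, size_t desc_length,
                       char **name, void **value, size_t *value_len)
{
    /* The descriptor length must cover exactly both headers + payloads. */
    if (desc_length != sizeof(d->name_length) + sizeof(d->value_length) +
                       d->name_length + d->value_length) {
        return -1;           /* -EINVAL in the real code */
    }

    *name = malloc(d->name_length + 1);
    if (*name == NULL) {
        return -1;
    }
    memcpy(*name, d->data, d->name_length);
    (*name)[d->name_length] = '\0';

    *value = malloc(d->value_length);
    if (*value == NULL) {
        free(*name);
        return -1;
    }
    memcpy(*value, d->data + d->name_length, d->value_length);
    *value_len = d->value_length;
    return 0;
}

int main(void)
{
    const char *n = "snapshot", *v = "1234";
    size_t len = sizeof(struct xattr_desc) + strlen(n) + strlen(v);
    struct xattr_desc *d = malloc(len);

    d->name_length = strlen(n);
    d->value_length = strlen(v);
    memcpy(d->data, n, d->name_length);
    memcpy(d->data + d->name_length, v, d->value_length);

    char *name;
    void *value;
    size_t value_len;

    if (parse_xattr(d, len, &name, &value, &value_len) == 0) {
        printf("%s = %.*s\n", name, (int)value_len, (char *)value);
        free(name);
        free(value);
    }
    free(d);
    return 0;
}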
*/ + return -EINVAL; + } + blob->extent_rle_found = true; + + desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; + + if (desc_extent_rle->length == 0 || + (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { + for (j = 0; j < desc_extent_rle->extents[i].length; j++) { + if (desc_extent_rle->extents[i].cluster_idx != 0) { + if (!spdk_bit_array_get(blob->bs->used_clusters, + desc_extent_rle->extents[i].cluster_idx + j)) { + return -EINVAL; + } + } + cluster_count++; + } + } + + if (cluster_count == 0) { + return -EINVAL; + } + tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters)); + if (tmp == NULL) { + return -ENOMEM; + } + blob->active.clusters = tmp; + blob->active.cluster_array_size = cluster_count; + + for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { + for (j = 0; j < desc_extent_rle->extents[i].length; j++) { + if (desc_extent_rle->extents[i].cluster_idx != 0) { + blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, + desc_extent_rle->extents[i].cluster_idx + j); + } else if (spdk_blob_is_thin_provisioned(blob)) { + blob->active.clusters[blob->active.num_clusters++] = 0; + } else { + return -EINVAL; + } + } + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { + struct spdk_blob_md_descriptor_extent_table *desc_extent_table; + uint32_t num_extent_pages = blob->active.num_extent_pages; + uint32_t i, j; + size_t extent_pages_length; + + desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; + extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); + + if (blob->extent_rle_found) { + /* This means that Extent RLE is present in MD, + * both should never be at the same time. */ + return -EINVAL; + } else if (blob->extent_table_found && + desc_extent_table->num_clusters != blob->remaining_clusters_in_et) { + /* Number of clusters in this ET does not match number + * from previously read EXTENT_TABLE. */ + return -EINVAL; + } + + blob->extent_table_found = true; + + if (desc_extent_table->length == 0 || + (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { + num_extent_pages += desc_extent_table->extent_page[i].num_pages; + } + + tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t)); + if (tmp == NULL) { + return -ENOMEM; + } + blob->active.extent_pages = tmp; + blob->active.extent_pages_array_size = num_extent_pages; + + blob->remaining_clusters_in_et = desc_extent_table->num_clusters; + + /* Extent table entries contain md page numbers for extent pages. + * Zeroes represent unallocated extent pages, those are run-length-encoded. 
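Each EXTENT_RLE entry is a (cluster_idx, length) pair: length consecutive clusters starting at cluster_idx, or, when cluster_idx is 0, a run of length unallocated clusters (legal only for thin-provisioned blobs). A standalone decoder sketch over plain arrays, not the blobstore structures:

/* Minimal sketch (not SPDK): expand (cluster_idx, length) runs into a flat
 * per-cluster table. cluster_idx == 0 encodes a run of unallocated clusters. */
#include <stdint.h>
#include <stdio.h>

struct extent_rle {
    uint32_t cluster_idx;
    uint32_t length;
};

/* Returns the number of clusters written to 'out', or -1 on overflow. */
static int decode_extent_rle(const struct extent_rle *ext, int n_ext,
                             uint64_t *out, int out_cap)
{
    int count = 0;

    for (int i = 0; i < n_ext; i++) {
        for (uint32_t j = 0; j < ext[i].length; j++) {
            if (count == out_cap) {
                return -1;
            }
            out[count++] = ext[i].cluster_idx ? ext[i].cluster_idx + j : 0;
        }
    }
    return count;
}

int main(void)
{
    /* 3 clusters at 10..12, then 2 unallocated, then 1 cluster at 40 */
    struct extent_rle ext[] = { {10, 3}, {0, 2}, {40, 1} };
    uint64_t clusters[16];
    int n = decode_extent_rle(ext, 3, clusters, 16);

    for (int i = 0; i < n; i++) {
        printf("cluster %d -> %llu\n", i, (unsigned long long)clusters[i]);
    }
    return 0;
}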
+ */ + for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { + if (desc_extent_table->extent_page[i].page_idx != 0) { + assert(desc_extent_table->extent_page[i].num_pages == 1); + blob->active.extent_pages[blob->active.num_extent_pages++] = + desc_extent_table->extent_page[i].page_idx; + } else if (spdk_blob_is_thin_provisioned(blob)) { + for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) { + blob->active.extent_pages[blob->active.num_extent_pages++] = 0; + } + } else { + return -EINVAL; + } + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { + struct spdk_blob_md_descriptor_extent_page *desc_extent; + unsigned int i; + unsigned int cluster_count = 0; + size_t cluster_idx_length; + + if (blob->extent_rle_found) { + /* This means that Extent RLE is present in MD, + * both should never be at the same time. */ + return -EINVAL; + } + + desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; + cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); + + if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || + (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { + if (desc_extent->cluster_idx[i] != 0) { + if (!spdk_bit_array_get(blob->bs->used_clusters, desc_extent->cluster_idx[i])) { + return -EINVAL; + } + } + cluster_count++; + } + + if (cluster_count == 0) { + return -EINVAL; + } + + /* When reading extent pages sequentially starting cluster idx should match + * current size of a blob. + * If changed to batch reading, this check shall be removed. */ + if (desc_extent->start_cluster_idx != blob->active.num_clusters) { + return -EINVAL; + } + + tmp = realloc(blob->active.clusters, + (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters)); + if (tmp == NULL) { + return -ENOMEM; + } + blob->active.clusters = tmp; + blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters); + + for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { + if (desc_extent->cluster_idx[i] != 0) { + blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, + desc_extent->cluster_idx[i]); + } else if (spdk_blob_is_thin_provisioned(blob)) { + blob->active.clusters[blob->active.num_clusters++] = 0; + } else { + return -EINVAL; + } + } + assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters); + assert(blob->remaining_clusters_in_et >= cluster_count); + blob->remaining_clusters_in_et -= cluster_count; + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { + int rc; + + rc = blob_deserialize_xattr(blob, + (struct spdk_blob_md_descriptor_xattr *) desc, false); + if (rc != 0) { + return rc; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { + int rc; + + rc = blob_deserialize_xattr(blob, + (struct spdk_blob_md_descriptor_xattr *) desc, true); + if (rc != 0) { + return rc; + } + } else { + /* Unrecognized descriptor type. Do not fail - just continue to the + * next descriptor. If this descriptor is associated with some feature + * defined in a newer version of blobstore, that version of blobstore + * should create and set an associated feature flag to specify if this + * blob can be loaded or not. 
+ */ + } + + /* Advance to the next descriptor */ + cur_desc += sizeof(*desc) + desc->length; + if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { + break; + } + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); + } + + return 0; +} + +static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page); + +static int +blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob) +{ + assert(blob != NULL); + assert(blob->state == SPDK_BLOB_STATE_LOADING); + + if (bs_load_cur_extent_page_valid(extent_page) == false) { + return -ENOENT; + } + + return blob_parse_page(extent_page, blob); +} + +static int +blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count, + struct spdk_blob *blob) +{ + const struct spdk_blob_md_page *page; + uint32_t i; + int rc; + + assert(page_count > 0); + assert(pages[0].sequence_num == 0); + assert(blob != NULL); + assert(blob->state == SPDK_BLOB_STATE_LOADING); + assert(blob->active.clusters == NULL); + + /* The blobid provided doesn't match what's in the MD, this can + * happen for example if a bogus blobid is passed in through open. + */ + if (blob->id != pages[0].id) { + SPDK_ERRLOG("Blobid (%lu) doesn't match what's in metadata (%lu)\n", + blob->id, pages[0].id); + return -ENOENT; + } + + for (i = 0; i < page_count; i++) { + page = &pages[i]; + + assert(page->id == blob->id); + assert(page->sequence_num == i); + + rc = blob_parse_page(page, blob); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static int +blob_serialize_add_page(const struct spdk_blob *blob, + struct spdk_blob_md_page **pages, + uint32_t *page_count, + struct spdk_blob_md_page **last_page) +{ + struct spdk_blob_md_page *page; + + assert(pages != NULL); + assert(page_count != NULL); + + if (*page_count == 0) { + assert(*pages == NULL); + *page_count = 1; + *pages = spdk_malloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + } else { + assert(*pages != NULL); + (*page_count)++; + *pages = spdk_realloc(*pages, + SPDK_BS_PAGE_SIZE * (*page_count), + SPDK_BS_PAGE_SIZE); + } + + if (*pages == NULL) { + *page_count = 0; + *last_page = NULL; + return -ENOMEM; + } + + page = &(*pages)[*page_count - 1]; + memset(page, 0, sizeof(*page)); + page->id = blob->id; + page->sequence_num = *page_count - 1; + page->next = SPDK_INVALID_MD_PAGE; + *last_page = page; + + return 0; +} + +/* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor. + * Update required_sz on both success and failure. + * + */ +static int +blob_serialize_xattr(const struct spdk_xattr *xattr, + uint8_t *buf, size_t buf_sz, + size_t *required_sz, bool internal) +{ + struct spdk_blob_md_descriptor_xattr *desc; + + *required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) + + strlen(xattr->name) + + xattr->value_len; + + if (buf_sz < *required_sz) { + return -1; + } + + desc = (struct spdk_blob_md_descriptor_xattr *)buf; + + desc->type = internal ? 
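The parse loop that ends above walks the page's descriptor area as a type/length stream: a zero-length padding descriptor terminates the page, and each iteration advances by the header size plus the descriptor's own length. A standalone sketch of that walk; the 5-byte packed header and the type constants are illustrative, not the SPDK definitions.

/* Minimal sketch (not SPDK): walk a type/length-prefixed descriptor region
 * the way blob_parse_page() does: stop at a zero-length padding descriptor,
 * and otherwise advance by header + length. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TYPE_PADDING 0
#define TYPE_FLAGS   1
#define TYPE_XATTR   2

struct desc_hdr {
    uint8_t  type;
    uint32_t length;
} __attribute__((packed));

static void walk_descriptors(const uint8_t *region, size_t region_sz)
{
    size_t cur = 0;

    while (cur + sizeof(struct desc_hdr) <= region_sz) {
        struct desc_hdr hdr;

        memcpy(&hdr, region + cur, sizeof(hdr));
        if (hdr.type == TYPE_PADDING && hdr.length == 0) {
            break;               /* zero padding terminates the page */
        }
        printf("descriptor type %u, payload %u bytes\n",
               (unsigned)hdr.type, (unsigned)hdr.length);
        cur += sizeof(hdr) + hdr.length;   /* advance to the next header */
    }
}

int main(void)
{
    uint8_t region[64] = {0};
    struct desc_hdr h;

    /* One FLAGS descriptor with a 12-byte payload, then an XATTR one,
     * then implicit zero padding. */
    h.type = TYPE_FLAGS;
    h.length = 12;
    memcpy(region, &h, sizeof(h));

    h.type = TYPE_XATTR;
    h.length = 20;
    memcpy(region + sizeof(h) + 12, &h, sizeof(h));

    walk_descriptors(region, sizeof(region));
    return 0;
}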
SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR; + desc->length = sizeof(desc->name_length) + + sizeof(desc->value_length) + + strlen(xattr->name) + + xattr->value_len; + desc->name_length = strlen(xattr->name); + desc->value_length = xattr->value_len; + + memcpy(desc->name, xattr->name, desc->name_length); + memcpy((void *)((uintptr_t)desc->name + desc->name_length), + xattr->value, + desc->value_length); + + return 0; +} + +static void +blob_serialize_extent_table_entry(const struct spdk_blob *blob, + uint64_t start_ep, uint64_t *next_ep, + uint8_t **buf, size_t *remaining_sz) +{ + struct spdk_blob_md_descriptor_extent_table *desc; + size_t cur_sz; + uint64_t i, et_idx; + uint32_t extent_page, ep_len; + + /* The buffer must have room for at least num_clusters entry */ + cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters); + if (*remaining_sz < cur_sz) { + *next_ep = start_ep; + return; + } + + desc = (struct spdk_blob_md_descriptor_extent_table *)*buf; + desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE; + + desc->num_clusters = blob->active.num_clusters; + + ep_len = 1; + et_idx = 0; + for (i = start_ep; i < blob->active.num_extent_pages; i++) { + if (*remaining_sz < cur_sz + sizeof(desc->extent_page[0])) { + /* If we ran out of buffer space, return */ + break; + } + + extent_page = blob->active.extent_pages[i]; + /* Verify that next extent_page is unallocated */ + if (extent_page == 0 && + (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) { + ep_len++; + continue; + } + desc->extent_page[et_idx].page_idx = extent_page; + desc->extent_page[et_idx].num_pages = ep_len; + et_idx++; + + ep_len = 1; + cur_sz += sizeof(desc->extent_page[et_idx]); + } + *next_ep = i; + + desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx; + *remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length; + *buf += sizeof(struct spdk_blob_md_descriptor) + desc->length; +} + +static int +blob_serialize_extent_table(const struct spdk_blob *blob, + struct spdk_blob_md_page **pages, + struct spdk_blob_md_page *cur_page, + uint32_t *page_count, uint8_t **buf, + size_t *remaining_sz) +{ + uint64_t last_extent_page; + int rc; + + last_extent_page = 0; + /* At least single extent table entry has to be always persisted. + * Such case occurs with num_extent_pages == 0. 
*/ + while (last_extent_page <= blob->active.num_extent_pages) { + blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf, + remaining_sz); + + if (last_extent_page == blob->active.num_extent_pages) { + break; + } + + rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); + if (rc < 0) { + return rc; + } + + *buf = (uint8_t *)cur_page->descriptors; + *remaining_sz = sizeof(cur_page->descriptors); + } + + return 0; +} + +static void +blob_serialize_extent_rle(const struct spdk_blob *blob, + uint64_t start_cluster, uint64_t *next_cluster, + uint8_t **buf, size_t *buf_sz) +{ + struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; + size_t cur_sz; + uint64_t i, extent_idx; + uint64_t lba, lba_per_cluster, lba_count; + + /* The buffer must have room for at least one extent */ + cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]); + if (*buf_sz < cur_sz) { + *next_cluster = start_cluster; + return; + } + + desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf; + desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE; + + lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); + + lba = blob->active.clusters[start_cluster]; + lba_count = lba_per_cluster; + extent_idx = 0; + for (i = start_cluster + 1; i < blob->active.num_clusters; i++) { + if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) { + /* Run-length encode sequential non-zero LBA */ + lba_count += lba_per_cluster; + continue; + } else if (lba == 0 && blob->active.clusters[i] == 0) { + /* Run-length encode unallocated clusters */ + lba_count += lba_per_cluster; + continue; + } + desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; + desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; + extent_idx++; + + cur_sz += sizeof(desc_extent_rle->extents[extent_idx]); + + if (*buf_sz < cur_sz) { + /* If we ran out of buffer space, return */ + *next_cluster = i; + break; + } + + lba = blob->active.clusters[i]; + lba_count = lba_per_cluster; + } + + if (*buf_sz >= cur_sz) { + desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; + desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; + extent_idx++; + + *next_cluster = blob->active.num_clusters; + } + + desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx; + *buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; + *buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; +} + +static int +blob_serialize_extents_rle(const struct spdk_blob *blob, + struct spdk_blob_md_page **pages, + struct spdk_blob_md_page *cur_page, + uint32_t *page_count, uint8_t **buf, + size_t *remaining_sz) +{ + uint64_t last_cluster; + int rc; + + last_cluster = 0; + while (last_cluster < blob->active.num_clusters) { + blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz); + + if (last_cluster == blob->active.num_clusters) { + break; + } + + rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); + if (rc < 0) { + return rc; + } + + *buf = (uint8_t *)cur_page->descriptors; + *remaining_sz = sizeof(cur_page->descriptors); + } + + return 0; +} + +static void +blob_serialize_extent_page(const struct spdk_blob *blob, + uint64_t cluster, struct spdk_blob_md_page *page) +{ + struct spdk_blob_md_descriptor_extent_page *desc_extent; + uint64_t i, extent_idx; + uint64_t lba, lba_per_cluster; + uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * 
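blob_serialize_extent_rle() is the inverse of the EXTENT_RLE parse earlier: consecutive allocated clusters whose LBAs are contiguous collapse into a single (cluster_idx, length) pair, and runs of unallocated (zero) clusters collapse the same way. A standalone encoder sketch, working on cluster indices rather than LBAs:

/* Minimal sketch (not SPDK): run-length encode a flat cluster table into
 * (cluster_idx, length) pairs. Consecutive indices extend a run; zeroes
 * (unallocated clusters) form their own runs with cluster_idx == 0. */
#include <stdint.h>
#include <stdio.h>

struct extent_rle {
    uint32_t cluster_idx;
    uint32_t length;
};

static int encode_extent_rle(const uint64_t *clusters, int n,
                             struct extent_rle *out, int out_cap)
{
    int n_out = 0;

    if (n == 0) {
        return 0;
    }

    uint64_t start = clusters[0];
    uint32_t run = 1;

    for (int i = 1; i <= n; i++) {
        /* Extend the run while the next cluster continues it. */
        if (i < n &&
            ((start != 0 && clusters[i] == start + run) ||
             (start == 0 && clusters[i] == 0))) {
            run++;
            continue;
        }
        if (n_out == out_cap) {
            return -1;
        }
        out[n_out].cluster_idx = (uint32_t)start;
        out[n_out].length = run;
        n_out++;
        if (i < n) {
            start = clusters[i];
            run = 1;
        }
    }
    return n_out;
}

int main(void)
{
    uint64_t clusters[] = { 10, 11, 12, 0, 0, 40 };
    struct extent_rle out[8];
    int n = encode_extent_rle(clusters, 6, out, 8);

    for (int i = 0; i < n; i++) {
        printf("{ cluster_idx %u, length %u }\n",
               out[i].cluster_idx, out[i].length);
    }
    return 0;
}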
SPDK_EXTENTS_PER_EP; + + desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors; + desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE; + + lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); + + desc_extent->start_cluster_idx = start_cluster_idx; + extent_idx = 0; + for (i = start_cluster_idx; i < blob->active.num_clusters; i++) { + lba = blob->active.clusters[i]; + desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster; + if (extent_idx >= SPDK_EXTENTS_PER_EP) { + break; + } + } + desc_extent->length = sizeof(desc_extent->start_cluster_idx) + + sizeof(desc_extent->cluster_idx[0]) * extent_idx; +} + +static void +blob_serialize_flags(const struct spdk_blob *blob, + uint8_t *buf, size_t *buf_sz) +{ + struct spdk_blob_md_descriptor_flags *desc; + + /* + * Flags get serialized first, so we should always have room for the flags + * descriptor. + */ + assert(*buf_sz >= sizeof(*desc)); + + desc = (struct spdk_blob_md_descriptor_flags *)buf; + desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS; + desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor); + desc->invalid_flags = blob->invalid_flags; + desc->data_ro_flags = blob->data_ro_flags; + desc->md_ro_flags = blob->md_ro_flags; + + *buf_sz -= sizeof(*desc); +} + +static int +blob_serialize_xattrs(const struct spdk_blob *blob, + const struct spdk_xattr_tailq *xattrs, bool internal, + struct spdk_blob_md_page **pages, + struct spdk_blob_md_page *cur_page, + uint32_t *page_count, uint8_t **buf, + size_t *remaining_sz) +{ + const struct spdk_xattr *xattr; + int rc; + + TAILQ_FOREACH(xattr, xattrs, link) { + size_t required_sz = 0; + + rc = blob_serialize_xattr(xattr, + *buf, *remaining_sz, + &required_sz, internal); + if (rc < 0) { + /* Need to add a new page to the chain */ + rc = blob_serialize_add_page(blob, pages, page_count, + &cur_page); + if (rc < 0) { + spdk_free(*pages); + *pages = NULL; + *page_count = 0; + return rc; + } + + *buf = (uint8_t *)cur_page->descriptors; + *remaining_sz = sizeof(cur_page->descriptors); + + /* Try again */ + required_sz = 0; + rc = blob_serialize_xattr(xattr, + *buf, *remaining_sz, + &required_sz, internal); + + if (rc < 0) { + spdk_free(*pages); + *pages = NULL; + *page_count = 0; + return rc; + } + } + + *remaining_sz -= required_sz; + *buf += required_sz; + } + + return 0; +} + +static int +blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages, + uint32_t *page_count) +{ + struct spdk_blob_md_page *cur_page; + int rc; + uint8_t *buf; + size_t remaining_sz; + + assert(pages != NULL); + assert(page_count != NULL); + assert(blob != NULL); + assert(blob->state == SPDK_BLOB_STATE_DIRTY); + + *pages = NULL; + *page_count = 0; + + /* A blob always has at least 1 page, even if it has no descriptors */ + rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); + if (rc < 0) { + return rc; + } + + buf = (uint8_t *)cur_page->descriptors; + remaining_sz = sizeof(cur_page->descriptors); + + /* Serialize flags */ + blob_serialize_flags(blob, buf, &remaining_sz); + buf += sizeof(struct spdk_blob_md_descriptor_flags); + + /* Serialize xattrs */ + rc = blob_serialize_xattrs(blob, &blob->xattrs, false, + pages, cur_page, page_count, &buf, &remaining_sz); + if (rc < 0) { + return rc; + } + + /* Serialize internal xattrs */ + rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true, + pages, cur_page, page_count, &buf, &remaining_sz); + if (rc < 0) { + return rc; + } + + if (blob->use_extent_table) { + /* Serialize extent table */ + rc = 
blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz); + } else { + /* Serialize extents */ + rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz); + } + + return rc; +} + +struct spdk_blob_load_ctx { + struct spdk_blob *blob; + + struct spdk_blob_md_page *pages; + uint32_t num_pages; + uint32_t next_extent_page; + spdk_bs_sequence_t *seq; + + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; +}; + +static uint32_t +blob_md_page_calc_crc(void *page) +{ + uint32_t crc; + + crc = BLOB_CRC32C_INITIAL; + crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc); + crc ^= BLOB_CRC32C_INITIAL; + + return crc; + +} + +static void +blob_load_final(void *cb_arg, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + + if (bserrno == 0) { + blob_mark_clean(blob); + } + + ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno); + + /* Free the memory */ + spdk_free(ctx->pages); + free(ctx); +} + +static void +blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + + if (bserrno == 0) { + blob->back_bs_dev = bs_create_blob_bs_dev(snapshot); + if (blob->back_bs_dev == NULL) { + bserrno = -ENOMEM; + } + } + if (bserrno != 0) { + SPDK_ERRLOG("Snapshot fail\n"); + } + + blob_load_final(ctx, bserrno); +} + +static void blob_update_clear_method(struct spdk_blob *blob); + +static void +blob_load_backing_dev(void *cb_arg) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + const void *value; + size_t len; + int rc; + + if (spdk_blob_is_thin_provisioned(blob)) { + rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true); + if (rc == 0) { + if (len != sizeof(spdk_blob_id)) { + blob_load_final(ctx, -EINVAL); + return; + } + /* open snapshot blob and continue in the callback function */ + blob->parent_id = *(spdk_blob_id *)value; + spdk_bs_open_blob(blob->bs, blob->parent_id, + blob_load_snapshot_cpl, ctx); + return; + } else { + /* add zeroes_dev for thin provisioned blob */ + blob->back_bs_dev = bs_create_zeroes_dev(); + } + } else { + /* standard blob */ + blob->back_bs_dev = NULL; + } + blob_load_final(ctx, 0); +} + +static void +blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_md_page *page; + uint64_t i; + uint32_t crc; + uint64_t lba; + void *tmp; + uint64_t sz; + + if (bserrno) { + SPDK_ERRLOG("Extent page read failed: %d\n", bserrno); + blob_load_final(ctx, bserrno); + return; + } + + if (ctx->pages == NULL) { + /* First iteration of this function, allocate buffer for single EXTENT_PAGE */ + ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (!ctx->pages) { + blob_load_final(ctx, -ENOMEM); + return; + } + ctx->num_pages = 1; + ctx->next_extent_page = 0; + } else { + page = &ctx->pages[0]; + crc = blob_md_page_calc_crc(page); + if (crc != page->crc) { + blob_load_final(ctx, -EINVAL); + return; + } + + if (page->next != SPDK_INVALID_MD_PAGE) { + blob_load_final(ctx, -EINVAL); + return; + } + + bserrno = blob_parse_extent_page(page, blob); + if (bserrno) { + blob_load_final(ctx, bserrno); + return; + } + } + + for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) { + if (blob->active.extent_pages[i] != 0) { + /* Extent page was allocated, read and parse 
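blob_md_page_calc_crc() covers the entire metadata page except its last four bytes, which hold the CRC itself, and XORs the initial value back out at the end. The sketch below reproduces that shape with a bit-at-a-time CRC-32C in place of spdk_crc32c_update(); the 0xFFFFFFFF seed is an assumption standing in for BLOB_CRC32C_INITIAL.

/* Minimal sketch (not SPDK): compute a CRC over a "page" excluding its
 * trailing 4 bytes, store it there, then verify on read-back. Uses a
 * bit-at-a-time CRC-32C (Castagnoli, reflected polynomial). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ     4096
#define CRC_INITIAL 0xFFFFFFFFu   /* stand-in for BLOB_CRC32C_INITIAL */

static uint32_t crc32c_update(const void *buf, size_t len, uint32_t crc)
{
    const uint8_t *p = buf;

    while (len--) {
        crc ^= *p++;
        for (int k = 0; k < 8; k++) {
            crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
        }
    }
    return crc;
}

static uint32_t page_calc_crc(const uint8_t *page)
{
    uint32_t crc = CRC_INITIAL;

    crc = crc32c_update(page, PAGE_SZ - 4, crc);   /* skip the crc field */
    return crc ^ CRC_INITIAL;
}

int main(void)
{
    static uint8_t page[PAGE_SZ];

    memset(page, 0xAB, sizeof(page));

    uint32_t crc = page_calc_crc(page);
    memcpy(page + PAGE_SZ - 4, &crc, 4);           /* store in-page */

    printf("crc ok: %s\n", page_calc_crc(page) == crc ? "yes" : "no");
    return 0;
}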
it. */ + lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]); + ctx->next_extent_page = i + 1; + + bs_sequence_read_dev(seq, &ctx->pages[0], lba, + bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), + blob_load_cpl_extents_cpl, ctx); + return; + } else { + /* Thin provisioned blobs can point to unallocated extent pages. + * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */ + + sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP); + blob->active.num_clusters += sz; + blob->remaining_clusters_in_et -= sz; + + assert(spdk_blob_is_thin_provisioned(blob)); + assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0); + + tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); + if (tmp == NULL) { + blob_load_final(ctx, -ENOMEM); + return; + } + memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0, + sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size)); + blob->active.clusters = tmp; + blob->active.cluster_array_size = blob->active.num_clusters; + } + } + + blob_load_backing_dev(ctx); +} + +static void +blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_md_page *page; + int rc; + uint32_t crc; + uint32_t current_page; + + if (ctx->num_pages == 1) { + current_page = bs_blobid_to_page(blob->id); + } else { + assert(ctx->num_pages != 0); + page = &ctx->pages[ctx->num_pages - 2]; + current_page = page->next; + } + + if (bserrno) { + SPDK_ERRLOG("Metadata page %d read failed for blobid %lu: %d\n", + current_page, blob->id, bserrno); + blob_load_final(ctx, bserrno); + return; + } + + page = &ctx->pages[ctx->num_pages - 1]; + crc = blob_md_page_calc_crc(page); + if (crc != page->crc) { + SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %lu\n", + current_page, blob->id); + blob_load_final(ctx, -EINVAL); + return; + } + + if (page->next != SPDK_INVALID_MD_PAGE) { + uint32_t next_page = page->next; + uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page); + + /* Read the next page */ + ctx->num_pages++; + ctx->pages = spdk_realloc(ctx->pages, (sizeof(*page) * ctx->num_pages), + sizeof(*page)); + if (ctx->pages == NULL) { + blob_load_final(ctx, -ENOMEM); + return; + } + + bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1], + next_lba, + bs_byte_to_lba(blob->bs, sizeof(*page)), + blob_load_cpl, ctx); + return; + } + + /* Parse the pages */ + rc = blob_parse(ctx->pages, ctx->num_pages, blob); + if (rc) { + blob_load_final(ctx, rc); + return; + } + + if (blob->extent_table_found == true) { + /* If EXTENT_TABLE was found, that means support for it should be enabled. */ + assert(blob->extent_rle_found == false); + blob->use_extent_table = true; + } else { + /* If EXTENT_RLE or no extent_* descriptor was found disable support + * for extent table. No extent_* descriptors means that blob has length of 0 + * and no extent_rle descriptors were persisted for it. + * EXTENT_TABLE if used, is always present in metadata regardless of length. */ + blob->use_extent_table = false; + } + + /* Check the clear_method stored in metadata vs what may have been passed + * via spdk_bs_open_blob_ext() and update accordingly. 
+ */ + blob_update_clear_method(blob); + + spdk_free(ctx->pages); + ctx->pages = NULL; + + if (blob->extent_table_found) { + blob_load_cpl_extents_cpl(seq, ctx, 0); + } else { + blob_load_backing_dev(ctx); + } +} + +/* Load a blob from disk given a blobid */ +static void +blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_blob_load_ctx *ctx; + struct spdk_blob_store *bs; + uint32_t page_num; + uint64_t lba; + + blob_verify_md_op(blob); + + bs = blob->bs; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(seq, cb_arg, -ENOMEM); + return; + } + + ctx->blob = blob; + ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE); + if (!ctx->pages) { + free(ctx); + cb_fn(seq, cb_arg, -ENOMEM); + return; + } + ctx->num_pages = 1; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->seq = seq; + + page_num = bs_blobid_to_page(blob->id); + lba = bs_md_page_to_lba(blob->bs, page_num); + + blob->state = SPDK_BLOB_STATE_LOADING; + + bs_sequence_read_dev(seq, &ctx->pages[0], lba, + bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE), + blob_load_cpl, ctx); +} + +struct spdk_blob_persist_ctx { + struct spdk_blob *blob; + + struct spdk_bs_super_block *super; + + struct spdk_blob_md_page *pages; + uint32_t next_extent_page; + struct spdk_blob_md_page *extent_page; + + spdk_bs_sequence_t *seq; + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; + TAILQ_ENTRY(spdk_blob_persist_ctx) link; +}; + +static void +bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba, + uint32_t lba_count) +{ + switch (ctx->blob->clear_method) { + case BLOB_CLEAR_WITH_DEFAULT: + case BLOB_CLEAR_WITH_UNMAP: + bs_batch_unmap_dev(batch, lba, lba_count); + break; + case BLOB_CLEAR_WITH_WRITE_ZEROES: + bs_batch_write_zeroes_dev(batch, lba, lba_count); + break; + case BLOB_CLEAR_WITH_NONE: + default: + break; + } +} + +static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx); + +static void +blob_persist_complete(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob_persist_ctx *next_persist; + struct spdk_blob *blob = ctx->blob; + + if (bserrno == 0) { + blob_mark_clean(blob); + } + + assert(ctx == TAILQ_FIRST(&blob->pending_persists)); + TAILQ_REMOVE(&blob->pending_persists, ctx, link); + + next_persist = TAILQ_FIRST(&blob->pending_persists); + + /* Call user callback */ + ctx->cb_fn(seq, ctx->cb_arg, bserrno); + + /* Free the memory */ + spdk_free(ctx->pages); + free(ctx); + + if (next_persist != NULL) { + blob_persist_check_dirty(next_persist); + } +} + +static void +blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + size_t i; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* Release all clusters that were truncated */ + for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { + uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]); + + /* Nothing to release if it was not allocated */ + if (blob->active.clusters[i] != 0) { + bs_release_cluster(bs, cluster_num); + } + } + + if (blob->active.num_clusters == 0) { + free(blob->active.clusters); + blob->active.clusters = NULL; + blob->active.cluster_array_size = 0; + } else if (blob->active.num_clusters != blob->active.cluster_array_size) { +#ifndef 
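blob_persist_complete() above pops the finished request off pending_persists and kicks the next queued one, so persists of the same blob never run concurrently (the enqueue side appears further down in blob_persist()). A standalone sketch of that single-consumer queue, using sys/queue.h TAILQ as the original does:

/* Minimal sketch (not SPDK): serialize "persist" requests through a queue so
 * only one runs at a time; completion pops the head and starts the next. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct persist_req {
    int id;
    TAILQ_ENTRY(persist_req) link;
};

static TAILQ_HEAD(, persist_req) pending = TAILQ_HEAD_INITIALIZER(pending);

static void persist_start(struct persist_req *req)
{
    printf("persist %d started\n", req->id);
}

static void persist_submit(struct persist_req *req)
{
    if (!TAILQ_EMPTY(&pending)) {
        /* Another persist is in flight; just queue behind it. */
        TAILQ_INSERT_TAIL(&pending, req, link);
        return;
    }
    TAILQ_INSERT_HEAD(&pending, req, link);
    persist_start(req);
}

static void persist_complete(void)
{
    struct persist_req *done = TAILQ_FIRST(&pending);
    struct persist_req *next;

    TAILQ_REMOVE(&pending, done, link);
    printf("persist %d completed\n", done->id);
    free(done);

    next = TAILQ_FIRST(&pending);
    if (next != NULL) {
        persist_start(next);     /* kick the next queued request */
    }
}

int main(void)
{
    for (int i = 1; i <= 3; i++) {
        struct persist_req *r = calloc(1, sizeof(*r));
        r->id = i;
        persist_submit(r);       /* 1 starts; 2 and 3 queue */
    }
    persist_complete();          /* 1 done, 2 starts */
    persist_complete();          /* 2 done, 3 starts */
    persist_complete();          /* 3 done */
    return 0;
}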
__clang_analyzer__ + void *tmp; + + /* scan-build really can't figure reallocs, workaround it */ + tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters); + assert(tmp != NULL); + blob->active.clusters = tmp; + + tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages); + assert(tmp != NULL); + blob->active.extent_pages = tmp; +#endif + blob->active.extent_pages_array_size = blob->active.num_extent_pages; + blob->active.cluster_array_size = blob->active.num_clusters; + } + + /* TODO: Add path to persist clear extent pages. */ + blob_persist_complete(seq, ctx, bserrno); +} + +static void +blob_persist_clear_clusters(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + spdk_bs_batch_t *batch; + size_t i; + uint64_t lba; + uint32_t lba_count; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* Clusters don't move around in blobs. The list shrinks or grows + * at the end, but no changes ever occur in the middle of the list. + */ + + batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx); + + /* Clear all clusters that were truncated */ + lba = 0; + lba_count = 0; + for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { + uint64_t next_lba = blob->active.clusters[i]; + uint32_t next_lba_count = bs_cluster_to_lba(bs, 1); + + if (next_lba > 0 && (lba + lba_count) == next_lba) { + /* This cluster is contiguous with the previous one. */ + lba_count += next_lba_count; + continue; + } + + /* This cluster is not contiguous with the previous one. */ + + /* If a run of LBAs previously existing, clear them now */ + if (lba_count > 0) { + bs_batch_clear_dev(ctx, batch, lba, lba_count); + } + + /* Start building the next batch */ + lba = next_lba; + if (next_lba > 0) { + lba_count = next_lba_count; + } else { + lba_count = 0; + } + } + + /* If we ended with a contiguous set of LBAs, clear them now */ + if (lba_count > 0) { + bs_batch_clear_dev(ctx, batch, lba, lba_count); + } + + bs_batch_close(batch); +} + +static void +blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + size_t i; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* This loop starts at 1 because the first page is special and handled + * below. The pages (except the first) are never written in place, + * so any pages in the clean list must be zeroed. 
+ */ + for (i = 1; i < blob->clean.num_pages; i++) { + bs_release_md_page(bs, blob->clean.pages[i]); + } + + if (blob->active.num_pages == 0) { + uint32_t page_num; + + page_num = bs_blobid_to_page(blob->id); + bs_release_md_page(bs, page_num); + } + + /* Move on to clearing clusters */ + blob_persist_clear_clusters(seq, ctx, 0); +} + +static void +blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t lba; + uint32_t lba_count; + spdk_bs_batch_t *batch; + size_t i; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx); + + lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); + + /* This loop starts at 1 because the first page is special and handled + * below. The pages (except the first) are never written in place, + * so any pages in the clean list must be zeroed. + */ + for (i = 1; i < blob->clean.num_pages; i++) { + lba = bs_md_page_to_lba(bs, blob->clean.pages[i]); + + bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + + /* The first page will only be zeroed if this is a delete. */ + if (blob->active.num_pages == 0) { + uint32_t page_num; + + /* The first page in the metadata goes where the blobid indicates */ + page_num = bs_blobid_to_page(blob->id); + lba = bs_md_page_to_lba(bs, page_num); + + bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + + bs_batch_close(batch); +} + +static void +blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t lba; + uint32_t lba_count; + struct spdk_blob_md_page *page; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + if (blob->active.num_pages == 0) { + /* Move on to the next step */ + blob_persist_zero_pages(seq, ctx, 0); + return; + } + + lba_count = bs_byte_to_lba(bs, sizeof(*page)); + + page = &ctx->pages[0]; + /* The first page in the metadata goes where the blobid indicates */ + lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id)); + + bs_sequence_write_dev(seq, page, lba, lba_count, + blob_persist_zero_pages, ctx); +} + +static void +blob_persist_write_page_chain(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t lba; + uint32_t lba_count; + struct spdk_blob_md_page *page; + spdk_bs_batch_t *batch; + size_t i; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* Clusters don't move around in blobs. The list shrinks or grows + * at the end, but no changes ever occur in the middle of the list. + */ + + lba_count = bs_byte_to_lba(bs, sizeof(*page)); + + batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx); + + /* This starts at 1. 
The root page is not written until + * all of the others are finished + */ + for (i = 1; i < blob->active.num_pages; i++) { + page = &ctx->pages[i]; + assert(page->sequence_num == i); + + lba = bs_md_page_to_lba(bs, blob->active.pages[i]); + + bs_batch_write_dev(batch, page, lba, lba_count); + } + + bs_batch_close(batch); +} + +static int +blob_resize(struct spdk_blob *blob, uint64_t sz) +{ + uint64_t i; + uint64_t *tmp; + uint64_t lfc; /* lowest free cluster */ + uint32_t lfmd; /* lowest free md page */ + uint64_t num_clusters; + uint32_t *ep_tmp; + uint64_t new_num_ep = 0, current_num_ep = 0; + struct spdk_blob_store *bs; + + bs = blob->bs; + + blob_verify_md_op(blob); + + if (blob->active.num_clusters == sz) { + return 0; + } + + if (blob->active.num_clusters < blob->active.cluster_array_size) { + /* If this blob was resized to be larger, then smaller, then + * larger without syncing, then the cluster array already + * contains spare assigned clusters we can use. + */ + num_clusters = spdk_min(blob->active.cluster_array_size, + sz); + } else { + num_clusters = blob->active.num_clusters; + } + + if (blob->use_extent_table) { + /* Round up since every cluster beyond current Extent Table size, + * requires new extent page. */ + new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP); + current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP); + } + + /* Do two passes - one to verify that we can obtain enough clusters + * and md pages, another to actually claim them. + */ + + if (spdk_blob_is_thin_provisioned(blob) == false) { + lfc = 0; + for (i = num_clusters; i < sz; i++) { + lfc = spdk_bit_array_find_first_clear(bs->used_clusters, lfc); + if (lfc == UINT32_MAX) { + /* No more free clusters. Cannot satisfy the request */ + return -ENOSPC; + } + lfc++; + } + lfmd = 0; + for (i = current_num_ep; i < new_num_ep ; i++) { + lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd); + if (lfmd == UINT32_MAX) { + /* No more free md pages. Cannot satisfy the request */ + return -ENOSPC; + } + } + } + + if (sz > num_clusters) { + /* Expand the cluster array if necessary. + * We only shrink the array when persisting. 
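blob_resize() deliberately makes two passes over the allocation maps: the first only verifies that enough free clusters and metadata pages exist, the second claims them (through bs_allocate_cluster(), under the mutex), so state is only dirtied once the request is known to be satisfiable. A standalone sketch of that check-then-claim shape, without the locking:

/* Minimal sketch (not SPDK): two-pass resize, mirroring blob_resize():
 * verify first without taking anything, then claim the same slots. */
#include <stdbool.h>
#include <stdio.h>

#define NUM_CLUSTERS 32

static bool used[NUM_CLUSTERS];

static int find_first_clear(int from)
{
    for (int i = from; i < NUM_CLUSTERS; i++) {
        if (!used[i]) {
            return i;
        }
    }
    return -1;
}

static int resize(int needed)
{
    int lfc;

    /* Pass 1: verify only. Walk the free slots we would use. */
    lfc = 0;
    for (int i = 0; i < needed; i++) {
        lfc = find_first_clear(lfc);
        if (lfc < 0) {
            return -1;           /* -ENOSPC; nothing was claimed */
        }
        lfc++;
    }

    /* Pass 2: actually claim the same slots. */
    lfc = 0;
    for (int i = 0; i < needed; i++) {
        lfc = find_first_clear(lfc);
        used[lfc] = true;
        lfc++;
    }
    return 0;
}

int main(void)
{
    printf("resize by 8: %s\n", resize(8) == 0 ? "ok" : "ENOSPC");
    printf("resize by 30: %s\n", resize(30) == 0 ? "ok" : "ENOSPC");
    return 0;
}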
+ */ + tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz); + if (sz > 0 && tmp == NULL) { + return -ENOMEM; + } + memset(tmp + blob->active.cluster_array_size, 0, + sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size)); + blob->active.clusters = tmp; + blob->active.cluster_array_size = sz; + + /* Expand the extents table, only if enough clusters were added */ + if (new_num_ep > current_num_ep && blob->use_extent_table) { + ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep); + if (new_num_ep > 0 && ep_tmp == NULL) { + return -ENOMEM; + } + memset(ep_tmp + blob->active.extent_pages_array_size, 0, + sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size)); + blob->active.extent_pages = ep_tmp; + blob->active.extent_pages_array_size = new_num_ep; + } + } + + blob->state = SPDK_BLOB_STATE_DIRTY; + + if (spdk_blob_is_thin_provisioned(blob) == false) { + lfc = 0; + lfmd = 0; + for (i = num_clusters; i < sz; i++) { + bs_allocate_cluster(blob, i, &lfc, &lfmd, true); + lfc++; + lfmd++; + } + } + + blob->active.num_clusters = sz; + blob->active.num_extent_pages = new_num_ep; + + return 0; +} + +static void +blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx) +{ + spdk_bs_sequence_t *seq = ctx->seq; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t i; + uint32_t page_num; + void *tmp; + int rc; + + /* Generate the new metadata */ + rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages); + if (rc < 0) { + blob_persist_complete(seq, ctx, rc); + return; + } + + assert(blob->active.num_pages >= 1); + + /* Resize the cache of page indices */ + tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); + if (!tmp) { + blob_persist_complete(seq, ctx, -ENOMEM); + return; + } + blob->active.pages = tmp; + + /* Assign this metadata to pages. This requires two passes - + * one to verify that there are enough pages and a second + * to actually claim them. */ + page_num = 0; + /* Note that this loop starts at one. The first page location is fixed by the blobid. */ + for (i = 1; i < blob->active.num_pages; i++) { + page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); + if (page_num == UINT32_MAX) { + blob_persist_complete(seq, ctx, -ENOMEM); + return; + } + page_num++; + } + + page_num = 0; + blob->active.pages[0] = bs_blobid_to_page(blob->id); + for (i = 1; i < blob->active.num_pages; i++) { + page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); + ctx->pages[i - 1].next = page_num; + /* Now that previous metadata page is complete, calculate the crc for it. 
*/ + ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); + blob->active.pages[i] = page_num; + bs_claim_md_page(bs, page_num); + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming page %u for blob %lu\n", page_num, blob->id); + page_num++; + } + ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); + /* Start writing the metadata from last page to first */ + blob->state = SPDK_BLOB_STATE_CLEAN; + blob_persist_write_page_chain(seq, ctx, 0); +} + +static void +blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + size_t i; + uint32_t extent_page_id; + uint32_t page_count = 0; + int rc; + + if (ctx->extent_page != NULL) { + spdk_free(ctx->extent_page); + ctx->extent_page = NULL; + } + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* Only write out changed extent pages */ + for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) { + extent_page_id = blob->active.extent_pages[i]; + if (extent_page_id == 0) { + /* No Extent Page to persist */ + assert(spdk_blob_is_thin_provisioned(blob)); + continue; + } + /* Writing out new extent page for the first time. Either active extent pages is larger + * than clean extent pages or there was no extent page assigned due to thin provisioning. */ + if (i >= blob->clean.extent_pages_array_size || blob->clean.extent_pages[i] == 0) { + blob->state = SPDK_BLOB_STATE_DIRTY; + assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id)); + ctx->next_extent_page = i + 1; + rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page); + if (rc < 0) { + blob_persist_complete(seq, ctx, rc); + return; + } + + blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page); + + ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page); + + bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id), + bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), + blob_persist_write_extent_pages, ctx); + return; + } + assert(blob->clean.extent_pages[i] != 0); + } + + blob_persist_generate_new_md(ctx); +} + +static void +blob_persist_start(struct spdk_blob_persist_ctx *ctx) +{ + spdk_bs_sequence_t *seq = ctx->seq; + struct spdk_blob *blob = ctx->blob; + + if (blob->active.num_pages == 0) { + /* This is the signal that the blob should be deleted. + * Immediately jump to the clean up routine. 
*/ + assert(blob->clean.num_pages > 0); + blob->state = SPDK_BLOB_STATE_CLEAN; + blob_persist_zero_pages(seq, ctx, 0); + return; + + } + + blob_persist_write_extent_pages(seq, ctx, 0); +} + +static void +blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + + spdk_free(ctx->super); + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + ctx->blob->bs->clean = 0; + + blob_persist_start(ctx); +} + +static void +bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, + struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg); + + +static void +blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + + if (bserrno != 0) { + spdk_free(ctx->super); + blob_persist_complete(seq, ctx, bserrno); + return; + } + + ctx->super->clean = 0; + if (ctx->super->size == 0) { + ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen; + } + + bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx); +} + +static void +blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx) +{ + if (ctx->blob->bs->clean) { + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + blob_persist_complete(ctx->seq, ctx, -ENOMEM); + return; + } + + bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0), + bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)), + blob_persist_dirty, ctx); + } else { + blob_persist_start(ctx); + } +} + +/* Write a blob to disk */ +static void +blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_blob_persist_ctx *ctx; + + blob_verify_md_op(blob); + + if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->pending_persists)) { + cb_fn(seq, cb_arg, 0); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(seq, cb_arg, -ENOMEM); + return; + } + ctx->blob = blob; + ctx->seq = seq; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->next_extent_page = 0; + + /* Multiple blob persists can affect one another, via blob->state or + * blob mutable data changes. To prevent it, queue up the persists. 
*/ + if (!TAILQ_EMPTY(&blob->pending_persists)) { + TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link); + return; + } + TAILQ_INSERT_HEAD(&blob->pending_persists, ctx, link); + + blob_persist_check_dirty(ctx); +} + +struct spdk_blob_copy_cluster_ctx { + struct spdk_blob *blob; + uint8_t *buf; + uint64_t page; + uint64_t new_cluster; + uint32_t new_extent_page; + spdk_bs_sequence_t *seq; +}; + +static void +blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq; + TAILQ_HEAD(, spdk_bs_request_set) requests; + spdk_bs_user_op_t *op; + + TAILQ_INIT(&requests); + TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link); + + while (!TAILQ_EMPTY(&requests)) { + op = TAILQ_FIRST(&requests); + TAILQ_REMOVE(&requests, op, link); + if (bserrno == 0) { + bs_user_op_execute(op); + } else { + bs_user_op_abort(op); + } + } + + spdk_free(ctx->buf); + free(ctx); +} + +static void +blob_insert_cluster_cpl(void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + + if (bserrno) { + if (bserrno == -EEXIST) { + /* The metadata insert failed because another thread + * allocated the cluster first. Free our cluster + * but continue without error. */ + bserrno = 0; + } + bs_release_cluster(ctx->blob->bs, ctx->new_cluster); + if (ctx->new_extent_page != 0) { + bs_release_md_page(ctx->blob->bs, ctx->new_extent_page); + } + } + + bs_sequence_finish(ctx->seq, bserrno); +} + +static void +blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + uint32_t cluster_number; + + if (bserrno) { + /* The write failed, so jump to the final completion handler */ + bs_sequence_finish(seq, bserrno); + return; + } + + cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page); + + blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, + ctx->new_extent_page, blob_insert_cluster_cpl, ctx); +} + +static void +blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + + if (bserrno != 0) { + /* The read failed, so jump to the final completion handler */ + bs_sequence_finish(seq, bserrno); + return; + } + + /* Write whole cluster */ + bs_sequence_write_dev(seq, ctx->buf, + bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster), + bs_cluster_to_lba(ctx->blob->bs, 1), + blob_write_copy_cpl, ctx); +} + +static void +bs_allocate_and_copy_cluster(struct spdk_blob *blob, + struct spdk_io_channel *_ch, + uint64_t io_unit, spdk_bs_user_op_t *op) +{ + struct spdk_bs_cpl cpl; + struct spdk_bs_channel *ch; + struct spdk_blob_copy_cluster_ctx *ctx; + uint32_t cluster_start_page; + uint32_t cluster_number; + int rc; + + ch = spdk_io_channel_get_ctx(_ch); + + if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) { + /* There are already operations pending. Queue this user op + * and return because it will be re-executed when the outstanding + * cluster allocation completes. */ + TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); + return; + } + + /* Round the io_unit offset down to the first page in the cluster */ + cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit); + + /* Calculate which index in the metadata cluster array the corresponding + * cluster is supposed to be at. 
*/ + cluster_number = bs_io_unit_to_cluster_number(blob, io_unit); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + bs_user_op_abort(op); + return; + } + + assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0); + + ctx->blob = blob; + ctx->page = cluster_start_page; + + if (blob->parent_id != SPDK_BLOBID_INVALID) { + ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->buf) { + SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n", + blob->bs->cluster_sz); + free(ctx); + bs_user_op_abort(op); + return; + } + } + + rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page, + false); + if (rc != 0) { + spdk_free(ctx->buf); + free(ctx); + bs_user_op_abort(op); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl; + cpl.u.blob_basic.cb_arg = ctx; + + ctx->seq = bs_sequence_start(_ch, &cpl); + if (!ctx->seq) { + bs_release_cluster(blob->bs, ctx->new_cluster); + spdk_free(ctx->buf); + free(ctx); + bs_user_op_abort(op); + return; + } + + /* Queue the user op to block other incoming operations */ + TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); + + if (blob->parent_id != SPDK_BLOBID_INVALID) { + /* Read cluster from backing device */ + bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, + bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), + bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), + blob_write_copy, ctx); + } else { + blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, + ctx->new_extent_page, blob_insert_cluster_cpl, ctx); + } +} + +static inline void +blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, + uint64_t *lba, uint32_t *lba_count) +{ + *lba_count = length; + + if (!bs_io_unit_is_allocated(blob, io_unit)) { + assert(blob->back_bs_dev != NULL); + *lba = bs_io_unit_to_back_dev_lba(blob, io_unit); + *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count); + } else { + *lba = bs_blob_io_unit_to_lba(blob, io_unit); + } +} + +struct op_split_ctx { + struct spdk_blob *blob; + struct spdk_io_channel *channel; + uint64_t io_unit_offset; + uint64_t io_units_remaining; + void *curr_payload; + enum spdk_blob_op_type op_type; + spdk_bs_sequence_t *seq; +}; + +static void +blob_request_submit_op_split_next(void *cb_arg, int bserrno) +{ + struct op_split_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_io_channel *ch = ctx->channel; + enum spdk_blob_op_type op_type = ctx->op_type; + uint8_t *buf = ctx->curr_payload; + uint64_t offset = ctx->io_unit_offset; + uint64_t length = ctx->io_units_remaining; + uint64_t op_length; + + if (bserrno != 0 || ctx->io_units_remaining == 0) { + bs_sequence_finish(ctx->seq, bserrno); + free(ctx); + return; + } + + op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob, + offset)); + + /* Update length and payload for next operation */ + ctx->io_units_remaining -= op_length; + ctx->io_unit_offset += op_length; + if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) { + ctx->curr_payload += op_length * blob->bs->io_unit_size; + } + + switch (op_type) { + case SPDK_BLOB_READ: + spdk_blob_io_read(blob, ch, buf, offset, op_length, + blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_WRITE: + spdk_blob_io_write(blob, ch, buf, offset, op_length, + blob_request_submit_op_split_next, ctx); + break; + 
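bs_allocate_and_copy_cluster() handles a write into a cluster that is still backed by the snapshot: allocate a fresh cluster, read the whole backing cluster, write the copy out, and only then insert the new cluster into the blob's metadata (queued writers are replayed afterwards). The sketch below shrinks that copy-on-write idea to an in-memory toy; the buffers and the allocated[] map are purely illustrative.

/* Minimal sketch (not SPDK): copy-on-write at cluster granularity. A write
 * into a cluster still backed by the parent first copies the whole parent
 * cluster into newly allocated space, then applies the write. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CLUSTER_SZ   16
#define NUM_CLUSTERS 4

static uint8_t parent[NUM_CLUSTERS][CLUSTER_SZ];   /* read-only backing data */
static uint8_t child[NUM_CLUSTERS][CLUSTER_SZ];    /* clone's own clusters */
static int allocated[NUM_CLUSTERS];                /* 1 once copied/owned */

static void clone_write(int cluster, int offset, const void *buf, int len)
{
    if (!allocated[cluster]) {
        /* First touch: copy the whole backing cluster, then mark it owned
         * ("insert into the metadata" in blobstore terms). */
        memcpy(child[cluster], parent[cluster], CLUSTER_SZ);
        allocated[cluster] = 1;
    }
    memcpy(&child[cluster][offset], buf, len);
}

static const uint8_t *clone_read(int cluster)
{
    /* Unallocated clusters are still served from the parent. */
    return allocated[cluster] ? child[cluster] : parent[cluster];
}

int main(void)
{
    memset(parent, 'p', sizeof(parent));

    clone_write(1, 4, "XY", 2);
    printf("cluster 0: %.4s (from parent)\n", (const char *)clone_read(0));
    printf("cluster 1: %.8s (copied, then modified)\n",
           (const char *)clone_read(1));
    return 0;
}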
case SPDK_BLOB_UNMAP: + spdk_blob_io_unmap(blob, ch, offset, op_length, + blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_WRITE_ZEROES: + spdk_blob_io_write_zeroes(blob, ch, offset, op_length, + blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_READV: + case SPDK_BLOB_WRITEV: + SPDK_ERRLOG("readv/write not valid\n"); + bs_sequence_finish(ctx->seq, -EINVAL); + free(ctx); + break; + } +} + +static void +blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) +{ + struct op_split_ctx *ctx; + spdk_bs_sequence_t *seq; + struct spdk_bs_cpl cpl; + + assert(blob != NULL); + + ctx = calloc(1, sizeof(struct op_split_ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(ch, &cpl); + if (!seq) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->blob = blob; + ctx->channel = ch; + ctx->curr_payload = payload; + ctx->io_unit_offset = offset; + ctx->io_units_remaining = length; + ctx->op_type = op_type; + ctx->seq = seq; + + blob_request_submit_op_split_next(ctx, 0); +} + +static void +blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) +{ + struct spdk_bs_cpl cpl; + uint64_t lba; + uint32_t lba_count; + + assert(blob != NULL); + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); + + if (blob->frozen_refcnt) { + /* This blob I/O is frozen */ + spdk_bs_user_op_t *op; + struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch); + + op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); + + return; + } + + switch (op_type) { + case SPDK_BLOB_READ: { + spdk_bs_batch_t *batch; + + batch = bs_batch_open(_ch, &cpl); + if (!batch) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (bs_io_unit_is_allocated(blob, offset)) { + /* Read from the blob */ + bs_batch_read_dev(batch, payload, lba, lba_count); + } else { + /* Read from the backing block device */ + bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count); + } + + bs_batch_close(batch); + break; + } + case SPDK_BLOB_WRITE: + case SPDK_BLOB_WRITE_ZEROES: { + if (bs_io_unit_is_allocated(blob, offset)) { + /* Write to the blob */ + spdk_bs_batch_t *batch; + + if (lba_count == 0) { + cb_fn(cb_arg, 0); + return; + } + + batch = bs_batch_open(_ch, &cpl); + if (!batch) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (op_type == SPDK_BLOB_WRITE) { + bs_batch_write_dev(batch, payload, lba, lba_count); + } else { + bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + + bs_batch_close(batch); + } else { + /* Queue this operation and allocate the cluster */ + spdk_bs_user_op_t *op; + + op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + bs_allocate_and_copy_cluster(blob, _ch, offset, op); + } + break; + } + case SPDK_BLOB_UNMAP: { + spdk_bs_batch_t *batch; + + batch = bs_batch_open(_ch, 
&cpl); + if (!batch) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (bs_io_unit_is_allocated(blob, offset)) { + bs_batch_unmap_dev(batch, lba, lba_count); + } + + bs_batch_close(batch); + break; + } + case SPDK_BLOB_READV: + case SPDK_BLOB_WRITEV: + SPDK_ERRLOG("readv/write not valid\n"); + cb_fn(cb_arg, -EINVAL); + break; + } +} + +static void +blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) +{ + assert(blob != NULL); + + if (blob->data_ro && op_type != SPDK_BLOB_READ) { + cb_fn(cb_arg, -EPERM); + return; + } + + if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { + cb_fn(cb_arg, -EINVAL); + return; + } + if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) { + blob_request_submit_op_single(_channel, blob, payload, offset, length, + cb_fn, cb_arg, op_type); + } else { + blob_request_submit_op_split(_channel, blob, payload, offset, length, + cb_fn, cb_arg, op_type); + } +} + +struct rw_iov_ctx { + struct spdk_blob *blob; + struct spdk_io_channel *channel; + spdk_blob_op_complete cb_fn; + void *cb_arg; + bool read; + int iovcnt; + struct iovec *orig_iov; + uint64_t io_unit_offset; + uint64_t io_units_remaining; + uint64_t io_units_done; + struct iovec iov[0]; +}; + +static void +rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + assert(cb_arg == NULL); + bs_sequence_finish(seq, bserrno); +} + +static void +rw_iov_split_next(void *cb_arg, int bserrno) +{ + struct rw_iov_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct iovec *iov, *orig_iov; + int iovcnt; + size_t orig_iovoff; + uint64_t io_units_count, io_units_to_boundary, io_unit_offset; + uint64_t byte_count; + + if (bserrno != 0 || ctx->io_units_remaining == 0) { + ctx->cb_fn(ctx->cb_arg, bserrno); + free(ctx); + return; + } + + io_unit_offset = ctx->io_unit_offset; + io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset); + io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary); + /* + * Get index and offset into the original iov array for our current position in the I/O sequence. + * byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will + * point to the current position in the I/O sequence. + */ + byte_count = ctx->io_units_done * blob->bs->io_unit_size; + orig_iov = &ctx->orig_iov[0]; + orig_iovoff = 0; + while (byte_count > 0) { + if (byte_count >= orig_iov->iov_len) { + byte_count -= orig_iov->iov_len; + orig_iov++; + } else { + orig_iovoff = byte_count; + byte_count = 0; + } + } + + /* + * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many + * bytes of this next I/O remain to be accounted for in the new iov array. 
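+	 * Each sub-I/O covers a contiguous byte range of the original iovs and never
+	 * crosses a cluster boundary, so at most ctx->iovcnt entries are needed here.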
+ */ + byte_count = io_units_count * blob->bs->io_unit_size; + iov = &ctx->iov[0]; + iovcnt = 0; + while (byte_count > 0) { + assert(iovcnt < ctx->iovcnt); + iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff); + iov->iov_base = orig_iov->iov_base + orig_iovoff; + byte_count -= iov->iov_len; + orig_iovoff = 0; + orig_iov++; + iov++; + iovcnt++; + } + + ctx->io_unit_offset += io_units_count; + ctx->io_units_remaining -= io_units_count; + ctx->io_units_done += io_units_count; + iov = &ctx->iov[0]; + + if (ctx->read) { + spdk_blob_io_readv(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, + io_units_count, rw_iov_split_next, ctx); + } else { + spdk_blob_io_writev(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, + io_units_count, rw_iov_split_next, ctx); + } +} + +static void +blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, bool read) +{ + struct spdk_bs_cpl cpl; + + assert(blob != NULL); + + if (!read && blob->data_ro) { + cb_fn(cb_arg, -EPERM); + return; + } + + if (length == 0) { + cb_fn(cb_arg, 0); + return; + } + + if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + /* + * For now, we implement readv/writev using a sequence (instead of a batch) to account for having + * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary, + * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster + * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need + * to allocate a separate iov array and split the I/O such that none of the resulting + * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel) + * but since this case happens very infrequently, any performance impact will be negligible. + * + * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs + * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them + * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called + * when the batch was completed, to allow for freeing the memory for the iov arrays. + */ + if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) { + uint32_t lba_count; + uint64_t lba; + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + if (blob->frozen_refcnt) { + /* This blob I/O is frozen */ + enum spdk_blob_op_type op_type; + spdk_bs_user_op_t *op; + struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel); + + op_type = read ? 
SPDK_BLOB_READV : SPDK_BLOB_WRITEV; + op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); + + return; + } + + blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); + + if (read) { + spdk_bs_sequence_t *seq; + + seq = bs_sequence_start(_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (bs_io_unit_is_allocated(blob, offset)) { + bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); + } else { + bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, + rw_iov_done, NULL); + } + } else { + if (bs_io_unit_is_allocated(blob, offset)) { + spdk_bs_sequence_t *seq; + + seq = bs_sequence_start(_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); + } else { + /* Queue this operation and allocate the cluster */ + spdk_bs_user_op_t *op; + + op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset, + length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + bs_allocate_and_copy_cluster(blob, _channel, offset, op); + } + } + } else { + struct rw_iov_ctx *ctx; + + ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->blob = blob; + ctx->channel = _channel; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->read = read; + ctx->orig_iov = iov; + ctx->iovcnt = iovcnt; + ctx->io_unit_offset = offset; + ctx->io_units_remaining = length; + ctx->io_units_done = 0; + + rw_iov_split_next(ctx, 0); + } +} + +static struct spdk_blob * +blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) +{ + struct spdk_blob *blob; + + if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) { + return NULL; + } + + TAILQ_FOREACH(blob, &bs->blobs, link) { + if (blob->id == blobid) { + return blob; + } + } + + return NULL; +} + +static void +blob_get_snapshot_and_clone_entries(struct spdk_blob *blob, + struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry) +{ + assert(blob != NULL); + *snapshot_entry = NULL; + *clone_entry = NULL; + + if (blob->parent_id == SPDK_BLOBID_INVALID) { + return; + } + + TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) { + if ((*snapshot_entry)->id == blob->parent_id) { + break; + } + } + + if (*snapshot_entry != NULL) { + TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) { + if ((*clone_entry)->id == blob->id) { + break; + } + } + + assert(clone_entry != NULL); + } +} + +static int +bs_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_blob_store *bs = io_device; + struct spdk_bs_channel *channel = ctx_buf; + struct spdk_bs_dev *dev; + uint32_t max_ops = bs->max_channel_ops; + uint32_t i; + + dev = bs->dev; + + channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set)); + if (!channel->req_mem) { + return -1; + } + + TAILQ_INIT(&channel->reqs); + + for (i = 0; i < max_ops; i++) { + TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); + } + + channel->bs = bs; + channel->dev = dev; + channel->dev_channel = dev->create_channel(dev); + + if (!channel->dev_channel) { + SPDK_ERRLOG("Failed to create device channel.\n"); + free(channel->req_mem); + return -1; + } + + TAILQ_INIT(&channel->need_cluster_alloc); + TAILQ_INIT(&channel->queued_io); + + return 0; +} + +static void 
+bs_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_bs_channel *channel = ctx_buf; + spdk_bs_user_op_t *op; + + while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) { + op = TAILQ_FIRST(&channel->need_cluster_alloc); + TAILQ_REMOVE(&channel->need_cluster_alloc, op, link); + bs_user_op_abort(op); + } + + while (!TAILQ_EMPTY(&channel->queued_io)) { + op = TAILQ_FIRST(&channel->queued_io); + TAILQ_REMOVE(&channel->queued_io, op, link); + bs_user_op_abort(op); + } + + free(channel->req_mem); + channel->dev->destroy_channel(channel->dev, channel->dev_channel); +} + +static void +bs_dev_destroy(void *io_device) +{ + struct spdk_blob_store *bs = io_device; + struct spdk_blob *blob, *blob_tmp; + + bs->dev->destroy(bs->dev); + + TAILQ_FOREACH_SAFE(blob, &bs->blobs, link, blob_tmp) { + TAILQ_REMOVE(&bs->blobs, blob, link); + spdk_bit_array_clear(bs->open_blobids, blob->id); + blob_free(blob); + } + + pthread_mutex_destroy(&bs->used_clusters_mutex); + + spdk_bit_array_free(&bs->open_blobids); + spdk_bit_array_free(&bs->used_blobids); + spdk_bit_array_free(&bs->used_md_pages); + spdk_bit_array_free(&bs->used_clusters); + /* + * If this function is called for any reason except a successful unload, + * the unload_cpl type will be NONE and this will be a nop. + */ + bs_call_cpl(&bs->unload_cpl, bs->unload_err); + + free(bs); +} + +static int +bs_blob_list_add(struct spdk_blob *blob) +{ + spdk_blob_id snapshot_id; + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + + assert(blob != NULL); + + snapshot_id = blob->parent_id; + if (snapshot_id == SPDK_BLOBID_INVALID) { + return 0; + } + + snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id); + if (snapshot_entry == NULL) { + /* Snapshot not found */ + snapshot_entry = calloc(1, sizeof(struct spdk_blob_list)); + if (snapshot_entry == NULL) { + return -ENOMEM; + } + snapshot_entry->id = snapshot_id; + TAILQ_INIT(&snapshot_entry->clones); + TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link); + } else { + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + if (clone_entry->id == blob->id) { + break; + } + } + } + + if (clone_entry == NULL) { + /* Clone not found */ + clone_entry = calloc(1, sizeof(struct spdk_blob_list)); + if (clone_entry == NULL) { + return -ENOMEM; + } + clone_entry->id = blob->id; + TAILQ_INIT(&clone_entry->clones); + TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link); + snapshot_entry->clone_count++; + } + + return 0; +} + +static void +bs_blob_list_remove(struct spdk_blob *blob) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + + blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry); + + if (snapshot_entry == NULL) { + return; + } + + blob->parent_id = SPDK_BLOBID_INVALID; + TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); + free(clone_entry); + + snapshot_entry->clone_count--; +} + +static int +bs_blob_list_free(struct spdk_blob_store *bs) +{ + struct spdk_blob_list *snapshot_entry; + struct spdk_blob_list *snapshot_entry_tmp; + struct spdk_blob_list *clone_entry; + struct spdk_blob_list *clone_entry_tmp; + + TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) { + TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) { + TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); + free(clone_entry); + } + TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link); + free(snapshot_entry); + } + + return 0; +} + +static void 
+bs_free(struct spdk_blob_store *bs) +{ + bs_blob_list_free(bs); + + bs_unregister_md_thread(bs); + spdk_io_device_unregister(bs, bs_dev_destroy); +} + +void +spdk_bs_opts_init(struct spdk_bs_opts *opts) +{ + opts->cluster_sz = SPDK_BLOB_OPTS_CLUSTER_SZ; + opts->num_md_pages = SPDK_BLOB_OPTS_NUM_MD_PAGES; + opts->max_md_ops = SPDK_BLOB_OPTS_MAX_MD_OPS; + opts->max_channel_ops = SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS; + opts->clear_method = BS_CLEAR_WITH_UNMAP; + memset(&opts->bstype, 0, sizeof(opts->bstype)); + opts->iter_cb_fn = NULL; + opts->iter_cb_arg = NULL; +} + +static int +bs_opts_verify(struct spdk_bs_opts *opts) +{ + if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 || + opts->max_channel_ops == 0) { + SPDK_ERRLOG("Blobstore options cannot be set to 0\n"); + return -1; + } + + return 0; +} + +static int +bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs) +{ + struct spdk_blob_store *bs; + uint64_t dev_size; + int rc; + + dev_size = dev->blocklen * dev->blockcnt; + if (dev_size < opts->cluster_sz) { + /* Device size cannot be smaller than cluster size of blobstore */ + SPDK_INFOLOG(SPDK_LOG_BLOB, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n", + dev_size, opts->cluster_sz); + return -ENOSPC; + } + if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) { + /* Cluster size cannot be smaller than page size */ + SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n", + opts->cluster_sz, SPDK_BS_PAGE_SIZE); + return -EINVAL; + } + bs = calloc(1, sizeof(struct spdk_blob_store)); + if (!bs) { + return -ENOMEM; + } + + TAILQ_INIT(&bs->blobs); + TAILQ_INIT(&bs->snapshots); + bs->dev = dev; + bs->md_thread = spdk_get_thread(); + assert(bs->md_thread != NULL); + + /* + * Do not use bs_lba_to_cluster() here since blockcnt may not be an + * even multiple of the cluster size. + */ + bs->cluster_sz = opts->cluster_sz; + bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); + bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; + if (spdk_u32_is_pow2(bs->pages_per_cluster)) { + bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster); + } + bs->num_free_clusters = bs->total_clusters; + bs->used_clusters = spdk_bit_array_create(bs->total_clusters); + bs->io_unit_size = dev->blocklen; + if (bs->used_clusters == NULL) { + free(bs); + return -ENOMEM; + } + + bs->max_channel_ops = opts->max_channel_ops; + bs->super_blob = SPDK_BLOBID_INVALID; + memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype)); + + /* The metadata is assumed to be at least 1 page */ + bs->used_md_pages = spdk_bit_array_create(1); + bs->used_blobids = spdk_bit_array_create(0); + bs->open_blobids = spdk_bit_array_create(0); + + pthread_mutex_init(&bs->used_clusters_mutex, NULL); + + spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy, + sizeof(struct spdk_bs_channel), "blobstore"); + rc = bs_register_md_thread(bs); + if (rc == -1) { + spdk_io_device_unregister(bs, NULL); + pthread_mutex_destroy(&bs->used_clusters_mutex); + spdk_bit_array_free(&bs->open_blobids); + spdk_bit_array_free(&bs->used_blobids); + spdk_bit_array_free(&bs->used_md_pages); + spdk_bit_array_free(&bs->used_clusters); + free(bs); + /* FIXME: this is a lie but don't know how to get a proper error code here */ + return -ENOMEM; + } + + *_bs = bs; + return 0; +} + +/* START spdk_bs_load, spdk_bs_load_ctx will used for both load and unload. 
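+ * The load path reads the super block first; a cleanly unloaded blobstore with a
+ * v3+ super block then has its used_pages/used_clusters/used_blobids masks read
+ * back, while anything else is recovered by replaying every metadata page.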
*/ + +struct spdk_bs_load_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; + + struct spdk_bs_md_mask *mask; + bool in_page_chain; + uint32_t page_index; + uint32_t cur_page; + struct spdk_blob_md_page *page; + + uint64_t num_extent_pages; + uint32_t *extent_page_num; + struct spdk_blob_md_page *extent_pages; + + spdk_bs_sequence_t *seq; + spdk_blob_op_with_handle_complete iter_cb_fn; + void *iter_cb_arg; + struct spdk_blob *blob; + spdk_blob_id blobid; +}; + +static void +bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) +{ + assert(bserrno != 0); + + spdk_free(ctx->super); + bs_sequence_finish(ctx->seq, bserrno); + bs_free(ctx->bs); + free(ctx); +} + +static void +bs_set_mask(struct spdk_bit_array *array, struct spdk_bs_md_mask *mask) +{ + uint32_t i = 0; + + while (true) { + i = spdk_bit_array_find_first_set(array, i); + if (i >= mask->length) { + break; + } + mask->mask[i / 8] |= 1U << (i % 8); + i++; + } +} + +static int +bs_load_mask(struct spdk_bit_array **array_ptr, struct spdk_bs_md_mask *mask) +{ + struct spdk_bit_array *array; + uint32_t i; + + if (spdk_bit_array_resize(array_ptr, mask->length) < 0) { + return -ENOMEM; + } + + array = *array_ptr; + for (i = 0; i < mask->length; i++) { + if (mask->mask[i / 8] & (1U << (i % 8))) { + spdk_bit_array_set(array, i); + } + } + + return 0; +} + +static void +bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, + struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + /* Update the values in the super block */ + super->super_blob = bs->super_blob; + memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype)); + super->crc = blob_md_page_calc_crc(super); + bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*super)), + cb_fn, cb_arg); +} + +static void +bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) +{ + struct spdk_bs_load_ctx *ctx = arg; + uint64_t mask_size, lba, lba_count; + + /* Write out the used clusters mask */ + mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; + ctx->mask->length = ctx->bs->total_clusters; + assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_clusters)); + + bs_set_mask(ctx->bs->used_clusters, ctx->mask); + lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); + bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); +} + +static void +bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) +{ + struct spdk_bs_load_ctx *ctx = arg; + uint64_t mask_size, lba, lba_count; + + mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES; + ctx->mask->length = ctx->super->md_len; + assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); + + bs_set_mask(ctx->bs->used_md_pages, ctx->mask); + lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); + bs_sequence_write_dev(seq, ctx->mask, lba, 
lba_count, cb_fn, arg);
+}
+
+static void
+bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
+{
+	struct spdk_bs_load_ctx *ctx = arg;
+	uint64_t mask_size, lba, lba_count;
+
+	if (ctx->super->used_blobid_mask_len == 0) {
+		/*
+		 * This is a pre-v3 on-disk format where the blobid mask does not get
+		 * written to disk.
+		 */
+		cb_fn(seq, arg, 0);
+		return;
+	}
+
+	mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
+	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
+				 SPDK_MALLOC_DMA);
+	if (!ctx->mask) {
+		bs_load_ctx_fail(ctx, -ENOMEM);
+		return;
+	}
+
+	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS;
+	ctx->mask->length = ctx->super->md_len;
+	assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids));
+
+	bs_set_mask(ctx->bs->used_blobids, ctx->mask);
+	lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
+	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
+	bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
+}
+
+static void
+blob_set_thin_provision(struct spdk_blob *blob)
+{
+	blob_verify_md_op(blob);
+	blob->invalid_flags |= SPDK_BLOB_THIN_PROV;
+	blob->state = SPDK_BLOB_STATE_DIRTY;
+}
+
+static void
+blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method)
+{
+	blob_verify_md_op(blob);
+	blob->clear_method = clear_method;
+	blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT);
+	blob->state = SPDK_BLOB_STATE_DIRTY;
+}
+
+static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno);
+
+static void
+bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+	spdk_blob_id id;
+	int64_t page_num;
+
+	/* Iterate to the next blob (we can't use the spdk_bs_iter_next function as our
+	 * last blob has been removed) */
+	page_num = bs_blobid_to_page(ctx->blobid);
+	page_num++;
+	page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num);
+	if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) {
+		bs_load_iter(ctx, NULL, -ENOENT);
+		return;
+	}
+
+	id = bs_page_to_blobid(page_num);
+
+	spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx);
+}
+
+static void
+bs_delete_corrupted_close_cb(void *cb_arg, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+
+	if (bserrno != 0) {
+		SPDK_ERRLOG("Failed to close corrupted blob\n");
+		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+		return;
+	}
+
+	spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx);
+}
+
+static void
+bs_delete_corrupted_blob(void *cb_arg, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+	uint64_t i;
+
+	if (bserrno != 0) {
+		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
+		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+		return;
+	}
+
+	/* Snapshot and clone have the same copy of cluster map and extent pages
+	 * at this point. Let's clear both for the snapshot now,
+	 * so that they won't be cleared for the clone later when we remove the snapshot.
+	 * Also set thin provision to pass the data corruption check */
+	for (i = 0; i < ctx->blob->active.num_clusters; i++) {
+		ctx->blob->active.clusters[i] = 0;
+	}
+	for (i = 0; i < ctx->blob->active.num_extent_pages; i++) {
+		ctx->blob->active.extent_pages[i] = 0;
+	}
+
+	ctx->blob->md_ro = false;
+
+	blob_set_thin_provision(ctx->blob);
+
+	ctx->blobid = ctx->blob->id;
+
+	spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx);
+}
+
+static void
+bs_update_corrupted_blob(void *cb_arg, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+
+	if (bserrno != 0) {
+		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
+		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+		return;
+	}
+
+	ctx->blob->md_ro = false;
+	blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true);
+	blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true);
+	spdk_blob_set_read_only(ctx->blob);
+
+	if (ctx->iter_cb_fn) {
+		ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0);
+	}
+	bs_blob_list_add(ctx->blob);
+
+	spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+}
+
+static void
+bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+
+	if (bserrno != 0) {
+		SPDK_ERRLOG("Failed to open clone of a corrupted blob\n");
+		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+		return;
+	}
+
+	if (blob->parent_id == ctx->blob->id) {
+		/* Power failure occurred before updating clone (snapshot delete case)
+		 * or after updating clone (creating snapshot case) - keep snapshot */
+		spdk_blob_close(blob, bs_update_corrupted_blob, ctx);
+	} else {
+		/* Power failure occurred after updating clone (snapshot delete case)
+		 * or before updating clone (creating snapshot case) - remove snapshot */
+		spdk_blob_close(blob, bs_delete_corrupted_blob, ctx);
+	}
+}
+
+static void
+bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = arg;
+	const void *value;
+	size_t len;
+	int rc = 0;
+
+	if (bserrno == 0) {
+		/* Examine the blob in case it was corrupted by a power failure. Fix
+		 * the ones that can be fixed and remove any other corrupted
+		 * ones. If it is not corrupted, just process it */
+		rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true);
+		if (rc != 0) {
+			rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true);
+			if (rc != 0) {
+				/* Not corrupted - process it and continue with iterating through blobs */
+				if (ctx->iter_cb_fn) {
+					ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0);
+				}
+				bs_blob_list_add(blob);
+				spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx);
+				return;
+			}
+
+		}
+
+		assert(len == sizeof(spdk_blob_id));
+
+		ctx->blob = blob;
+
+		/* Open the clone to check whether we are able to fix this blob or should remove it */
+		spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx);
+		return;
+	} else if (bserrno == -ENOENT) {
+		bserrno = 0;
+	} else {
+		/*
+		 * This case needs to be looked at further. The same problem
+		 * exists with applications that rely on explicit blob
+		 * iteration. We should just skip the blob that failed
+		 * to load and continue on to the next one.
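+		 * For now the whole load is simply finished with this error below.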
+ */ + SPDK_ERRLOG("Error in iterating blobs\n"); + } + + ctx->iter_cb_fn = NULL; + + spdk_free(ctx->super); + spdk_free(ctx->mask); + bs_sequence_finish(ctx->seq, bserrno); + free(ctx); +} + +static void +bs_load_complete(struct spdk_bs_load_ctx *ctx) +{ + spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx); +} + +static void +bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + int rc; + + /* The type must be correct */ + assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS); + + /* The length of the mask (in bits) must not be greater than + * the length of the buffer (converted to bits) */ + assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8)); + + /* The length of the mask must be exactly equal to the size + * (in pages) of the metadata region */ + assert(ctx->mask->length == ctx->super->md_len); + + rc = bs_load_mask(&ctx->bs->used_blobids, ctx->mask); + if (rc < 0) { + spdk_free(ctx->mask); + bs_load_ctx_fail(ctx, rc); + return; + } + + bs_load_complete(ctx); +} + +static void +bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint64_t lba, lba_count, mask_size; + int rc; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + /* The type must be correct */ + assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); + /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ + assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( + struct spdk_blob_md_page) * 8)); + /* The length of the mask must be exactly equal to the total number of clusters */ + assert(ctx->mask->length == ctx->bs->total_clusters); + + rc = bs_load_mask(&ctx->bs->used_clusters, ctx->mask); + if (rc < 0) { + spdk_free(ctx->mask); + bs_load_ctx_fail(ctx, rc); + return; + } + + ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->bs->used_clusters); + assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); + + spdk_free(ctx->mask); + + /* Read the used blobids mask */ + mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); + bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, + bs_load_used_blobids_cpl, ctx); +} + +static void +bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint64_t lba, lba_count, mask_size; + int rc; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + /* The type must be correct */ + assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES); + /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ + assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE * + 8)); + /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ + assert(ctx->mask->length == ctx->super->md_len); + + rc = bs_load_mask(&ctx->bs->used_md_pages, ctx->mask); + if (rc < 0) { + spdk_free(ctx->mask); + bs_load_ctx_fail(ctx, rc); + return; + } + + spdk_free(ctx->mask); + + /* Read the used clusters mask */ + mask_size = 
ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); + bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, + bs_load_used_clusters_cpl, ctx); +} + +static void +bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx) +{ + uint64_t lba, lba_count, mask_size; + + /* Read the used pages mask */ + mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); + bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count, + bs_load_used_pages_cpl, ctx); +} + +static int +bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page) +{ + struct spdk_blob_store *bs = ctx->bs; + struct spdk_blob_md_descriptor *desc; + size_t cur_desc = 0; + + desc = (struct spdk_blob_md_descriptor *)page->descriptors; + while (cur_desc < sizeof(page->descriptors)) { + if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { + if (desc->length == 0) { + /* If padding and length are 0, this terminates the page */ + break; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { + struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; + unsigned int i, j; + unsigned int cluster_count = 0; + uint32_t cluster_idx; + + desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; + + for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { + for (j = 0; j < desc_extent_rle->extents[i].length; j++) { + cluster_idx = desc_extent_rle->extents[i].cluster_idx; + /* + * cluster_idx = 0 means an unallocated cluster - don't mark that + * in the used cluster map. + */ + if (cluster_idx != 0) { + spdk_bit_array_set(bs->used_clusters, cluster_idx + j); + if (bs->num_free_clusters == 0) { + return -ENOSPC; + } + bs->num_free_clusters--; + } + cluster_count++; + } + } + if (cluster_count == 0) { + return -EINVAL; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { + struct spdk_blob_md_descriptor_extent_page *desc_extent; + uint32_t i; + uint32_t cluster_count = 0; + uint32_t cluster_idx; + size_t cluster_idx_length; + + desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; + cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); + + if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || + (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { + cluster_idx = desc_extent->cluster_idx[i]; + /* + * cluster_idx = 0 means an unallocated cluster - don't mark that + * in the used cluster map. 
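+				 * Unlike the RLE descriptor above, an extent page stores one absolute
+				 * cluster index per entry rather than a run length.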
+ */ + if (cluster_idx != 0) { + if (cluster_idx < desc_extent->start_cluster_idx && + cluster_idx >= desc_extent->start_cluster_idx + cluster_count) { + return -EINVAL; + } + spdk_bit_array_set(bs->used_clusters, cluster_idx); + if (bs->num_free_clusters == 0) { + return -ENOSPC; + } + bs->num_free_clusters--; + } + cluster_count++; + } + + if (cluster_count == 0) { + return -EINVAL; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { + /* Skip this item */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { + /* Skip this item */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { + /* Skip this item */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { + struct spdk_blob_md_descriptor_extent_table *desc_extent_table; + uint32_t num_extent_pages = ctx->num_extent_pages; + uint32_t i; + size_t extent_pages_length; + void *tmp; + + desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; + extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); + + if (desc_extent_table->length == 0 || + (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { + if (desc_extent_table->extent_page[i].page_idx != 0) { + if (desc_extent_table->extent_page[i].num_pages != 1) { + return -EINVAL; + } + num_extent_pages += 1; + } + } + + if (num_extent_pages > 0) { + tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t)); + if (tmp == NULL) { + return -ENOMEM; + } + ctx->extent_page_num = tmp; + + /* Extent table entries contain md page numbers for extent pages. + * Zeroes represent unallocated extent pages, those are run-length-encoded. + */ + for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { + if (desc_extent_table->extent_page[i].page_idx != 0) { + ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx; + ctx->num_extent_pages += 1; + } + } + } + } else { + /* Error */ + return -EINVAL; + } + /* Advance to the next descriptor */ + cur_desc += sizeof(*desc) + desc->length; + if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { + break; + } + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); + } + return 0; +} + +static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page) +{ + uint32_t crc; + struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors; + size_t desc_len; + + crc = blob_md_page_calc_crc(page); + if (crc != page->crc) { + return false; + } + + /* Extent page should always be of sequence num 0. */ + if (page->sequence_num != 0) { + return false; + } + + /* Descriptor type must be EXTENT_PAGE. */ + if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { + return false; + } + + /* Descriptor length cannot exceed the page. */ + desc_len = sizeof(*desc) + desc->length; + if (desc_len > sizeof(page->descriptors)) { + return false; + } + + /* It has to be the only descriptor in the page. 
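+	 * Anything that follows it must be a zero-length padding terminator, which is
+	 * what the check below enforces.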
*/ + if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) { + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len); + if (desc->length != 0) { + return false; + } + } + + return true; +} + +static bool bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx) +{ + uint32_t crc; + struct spdk_blob_md_page *page = ctx->page; + + crc = blob_md_page_calc_crc(page); + if (crc != page->crc) { + return false; + } + + /* First page of a sequence should match the blobid. */ + if (page->sequence_num == 0 && + bs_page_to_blobid(ctx->cur_page) != page->id) { + return false; + } + assert(bs_load_cur_extent_page_valid(page) == false); + + return true; +} + +static void +bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx); + +static void +bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + bs_load_complete(ctx); +} + +static void +bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + ctx->mask = NULL; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl); +} + +static void +bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + ctx->mask = NULL; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl); +} + +static void +bs_load_write_used_md(struct spdk_bs_load_ctx *ctx) +{ + bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl); +} + +static void +bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx) +{ + uint64_t num_md_clusters; + uint64_t i; + + ctx->in_page_chain = false; + + do { + ctx->page_index++; + } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true); + + if (ctx->page_index < ctx->super->md_len) { + ctx->cur_page = ctx->page_index; + bs_load_replay_cur_md_page(ctx); + } else { + /* Claim all of the clusters used by the metadata */ + num_md_clusters = spdk_divide_round_up(ctx->super->md_len, ctx->bs->pages_per_cluster); + for (i = 0; i < num_md_clusters; i++) { + bs_claim_cluster(ctx->bs, i); + } + spdk_free(ctx->page); + bs_load_write_used_md(ctx); + } +} + +static void +bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint32_t page_num; + uint64_t i; + + if (bserrno != 0) { + spdk_free(ctx->extent_pages); + bs_load_ctx_fail(ctx, bserrno); + return; + } + + for (i = 0; i < ctx->num_extent_pages; i++) { + /* Extent pages are only read when present within in chain md. + * Integrity of md is not right if that page was not a valid extent page. 
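+		 * An invalid extent page therefore fails the whole load with -EILSEQ.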
*/ + if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) { + spdk_free(ctx->extent_pages); + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + page_num = ctx->extent_page_num[i]; + spdk_bit_array_set(ctx->bs->used_md_pages, page_num); + if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) { + spdk_free(ctx->extent_pages); + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + } + + spdk_free(ctx->extent_pages); + free(ctx->extent_page_num); + ctx->extent_page_num = NULL; + ctx->num_extent_pages = 0; + + bs_load_replay_md_chain_cpl(ctx); +} + +static void +bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx) +{ + spdk_bs_batch_t *batch; + uint32_t page; + uint64_t lba; + uint64_t i; + + ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, SPDK_BS_PAGE_SIZE, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->extent_pages) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx); + + for (i = 0; i < ctx->num_extent_pages; i++) { + page = ctx->extent_page_num[i]; + assert(page < ctx->super->md_len); + lba = bs_md_page_to_lba(ctx->bs, page); + bs_batch_read_dev(batch, &ctx->extent_pages[i], lba, + bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE)); + } + + bs_batch_close(batch); +} + +static void +bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint32_t page_num; + struct spdk_blob_md_page *page; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + page_num = ctx->cur_page; + page = ctx->page; + if (bs_load_cur_md_page_valid(ctx) == true) { + if (page->sequence_num == 0 || ctx->in_page_chain == true) { + bs_claim_md_page(ctx->bs, page_num); + if (page->sequence_num == 0) { + spdk_bit_array_set(ctx->bs->used_blobids, page_num); + } + if (bs_load_replay_md_parse_page(ctx, page)) { + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + if (page->next != SPDK_INVALID_MD_PAGE) { + ctx->in_page_chain = true; + ctx->cur_page = page->next; + bs_load_replay_cur_md_page(ctx); + return; + } + if (ctx->num_extent_pages != 0) { + bs_load_replay_extent_pages(ctx); + return; + } + } + } + bs_load_replay_md_chain_cpl(ctx); +} + +static void +bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx) +{ + uint64_t lba; + + assert(ctx->cur_page < ctx->super->md_len); + lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page); + bs_sequence_read_dev(ctx->seq, ctx->page, lba, + bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), + bs_load_replay_md_cpl, ctx); +} + +static void +bs_load_replay_md(struct spdk_bs_load_ctx *ctx) +{ + ctx->page_index = 0; + ctx->cur_page = 0; + ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->page) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + bs_load_replay_cur_md_page(ctx); +} + +static void +bs_recover(struct spdk_bs_load_ctx *ctx) +{ + int rc; + + rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + 
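+	/* None of the on-disk masks are trusted during recovery; md page, blobid and
+	 * cluster usage is rebuilt below by replaying the whole metadata region.
+	 */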
ctx->bs->num_free_clusters = ctx->bs->total_clusters; + bs_load_replay_md(ctx); +} + +static void +bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint32_t crc; + int rc; + static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; + + if (ctx->super->version > SPDK_BS_VERSION || + ctx->super->version < SPDK_BS_INITIAL_VERSION) { + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, + sizeof(ctx->super->signature)) != 0) { + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + crc = blob_md_page_calc_crc(ctx->super); + if (crc != ctx->super->crc) { + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype matched - loading blobstore\n"); + } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype wildcard used - loading blobstore regardless bstype\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Unexpected bstype\n"); + SPDK_LOGDUMP(SPDK_LOG_BLOB, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); + SPDK_LOGDUMP(SPDK_LOG_BLOB, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); + bs_load_ctx_fail(ctx, -ENXIO); + return; + } + + if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) { + SPDK_NOTICELOG("Size mismatch, dev size: %lu, blobstore size: %lu\n", + ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + if (ctx->super->size == 0) { + ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; + } + + if (ctx->super->io_unit_size == 0) { + ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; + } + + /* Parse the super block */ + ctx->bs->clean = 1; + ctx->bs->cluster_sz = ctx->super->cluster_size; + ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; + ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; + if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { + ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); + } + ctx->bs->io_unit_size = ctx->super->io_unit_size; + rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + ctx->bs->md_start = ctx->super->md_start; + ctx->bs->md_len = ctx->super->md_len; + ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( + ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); + ctx->bs->super_blob = ctx->super->super_blob; + memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); + + if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) { + bs_recover(ctx); + } else { + bs_load_read_used_pages(ctx); + } +} + +void +spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, + spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_blob_store *bs; + struct spdk_bs_cpl cpl; + struct spdk_bs_load_ctx *ctx; + struct spdk_bs_opts opts = {}; + int err; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Loading blobstore from dev %p\n", dev); + + if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "unsupported dev block length of %d\n", dev->blocklen); + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + if (o) { + opts = *o; + } else { + spdk_bs_opts_init(&opts); + } + + if (opts.max_md_ops == 0 
|| opts.max_channel_ops == 0) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + err = bs_alloc(dev, &opts, &bs); + if (err) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, err); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->bs = bs; + ctx->iter_cb_fn = opts.iter_cb_fn; + ctx->iter_cb_arg = opts.iter_cb_arg; + + /* Allocate memory for the super block */ + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; + cpl.u.bs_handle.cb_fn = cb_fn; + cpl.u.bs_handle.cb_arg = cb_arg; + cpl.u.bs_handle.bs = bs; + + ctx->seq = bs_sequence_start(bs->md_channel, &cpl); + if (!ctx->seq) { + spdk_free(ctx->super); + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + /* Read the super block */ + bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*ctx->super)), + bs_load_super_cpl, ctx); +} + +/* END spdk_bs_load */ + +/* START spdk_bs_dump */ + +struct spdk_bs_dump_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; + uint32_t cur_page; + struct spdk_blob_md_page *page; + spdk_bs_sequence_t *seq; + FILE *fp; + spdk_bs_dump_print_xattr print_xattr_fn; + char xattr_name[4096]; +}; + +static void +bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_dump_ctx *ctx, int bserrno) +{ + spdk_free(ctx->super); + + /* + * We need to defer calling bs_call_cpl() until after + * dev destruction, so tuck these away for later use. + */ + ctx->bs->unload_err = bserrno; + memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); + seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; + + bs_sequence_finish(seq, 0); + bs_free(ctx->bs); + free(ctx); +} + +static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); + +static void +bs_dump_print_md_page(struct spdk_bs_dump_ctx *ctx) +{ + uint32_t page_idx = ctx->cur_page; + struct spdk_blob_md_page *page = ctx->page; + struct spdk_blob_md_descriptor *desc; + size_t cur_desc = 0; + uint32_t crc; + + fprintf(ctx->fp, "=========\n"); + fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx); + fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id); + + crc = blob_md_page_calc_crc(page); + fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? 
"OK" : "Mismatch"); + + desc = (struct spdk_blob_md_descriptor *)page->descriptors; + while (cur_desc < sizeof(page->descriptors)) { + if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { + if (desc->length == 0) { + /* If padding and length are 0, this terminates the page */ + break; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { + struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; + unsigned int i; + + desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; + + for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { + if (desc_extent_rle->extents[i].cluster_idx != 0) { + fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, + desc_extent_rle->extents[i].cluster_idx); + } else { + fprintf(ctx->fp, "Unallocated Extent - "); + } + fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length); + fprintf(ctx->fp, "\n"); + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { + struct spdk_blob_md_descriptor_extent_page *desc_extent; + unsigned int i; + + desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; + + for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) { + if (desc_extent->cluster_idx[i] != 0) { + fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, + desc_extent->cluster_idx[i]); + } else { + fprintf(ctx->fp, "Unallocated Extent"); + } + fprintf(ctx->fp, "\n"); + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { + struct spdk_blob_md_descriptor_xattr *desc_xattr; + uint32_t i; + + desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc; + + if (desc_xattr->length != + sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) + + desc_xattr->name_length + desc_xattr->value_length) { + } + + memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length); + ctx->xattr_name[desc_xattr->name_length] = '\0'; + fprintf(ctx->fp, "XATTR: name = \"%s\"\n", ctx->xattr_name); + fprintf(ctx->fp, " value = \""); + ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name, + (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), + desc_xattr->value_length); + fprintf(ctx->fp, "\"\n"); + for (i = 0; i < desc_xattr->value_length; i++) { + if (i % 16 == 0) { + fprintf(ctx->fp, " "); + } + fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i)); + if ((i + 1) % 16 == 0) { + fprintf(ctx->fp, "\n"); + } + } + if (i % 16 != 0) { + fprintf(ctx->fp, "\n"); + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { + /* TODO */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { + /* TODO */ + } else { + /* Error */ + } + /* Advance to the next descriptor */ + cur_desc += sizeof(*desc) + desc->length; + if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { + break; + } + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); + } +} + +static void +bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_dump_ctx *ctx = cb_arg; + + if (bserrno != 0) { + bs_dump_finish(seq, ctx, bserrno); + return; + } + + if (ctx->page->id != 0) { + bs_dump_print_md_page(ctx); + } + + ctx->cur_page++; + + if (ctx->cur_page < ctx->super->md_len) { + bs_dump_read_md_page(seq, ctx); + } else { + spdk_free(ctx->page); + bs_dump_finish(seq, ctx, 0); + } +} + +static void +bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) +{ + struct spdk_bs_dump_ctx *ctx = cb_arg; + uint64_t lba; + + 
assert(ctx->cur_page < ctx->super->md_len); + lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); + bs_sequence_read_dev(seq, ctx->page, lba, + bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), + bs_dump_read_md_page_cpl, ctx); +} + +static void +bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_dump_ctx *ctx = cb_arg; + + fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature); + if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, + sizeof(ctx->super->signature)) != 0) { + fprintf(ctx->fp, "(Mismatch)\n"); + bs_dump_finish(seq, ctx, bserrno); + return; + } else { + fprintf(ctx->fp, "(OK)\n"); + } + fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version); + fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc, + (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch"); + fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype); + fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size); + fprintf(ctx->fp, "Super Blob ID: "); + if (ctx->super->super_blob == SPDK_BLOBID_INVALID) { + fprintf(ctx->fp, "(None)\n"); + } else { + fprintf(ctx->fp, "%" PRIu64 "\n", ctx->super->super_blob); + } + fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean); + fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start); + fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len); + fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start); + fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len); + fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start); + fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len); + fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start); + fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); + + ctx->cur_page = 0; + ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->page) { + bs_dump_finish(seq, ctx, -ENOMEM); + return; + } + bs_dump_read_md_page(seq, ctx); +} + +void +spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn, + spdk_bs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_blob_store *bs; + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_dump_ctx *ctx; + struct spdk_bs_opts opts = {}; + int err; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Dumping blobstore from dev %p\n", dev); + + spdk_bs_opts_init(&opts); + + err = bs_alloc(dev, &opts, &bs); + if (err) { + dev->destroy(dev); + cb_fn(cb_arg, err); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + bs_free(bs); + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + ctx->fp = fp; + ctx->print_xattr_fn = print_xattr_fn; + + /* Allocate memory for the super block */ + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + bs_free(bs); + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_free(ctx->super); + free(ctx); + bs_free(bs); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Read the super block */ + 
bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*ctx->super)), + bs_dump_super_cpl, ctx); +} + +/* END spdk_bs_dump */ + +/* START spdk_bs_init */ + +struct spdk_bs_init_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; +}; + +static void +bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_init_ctx *ctx = cb_arg; + + spdk_free(ctx->super); + free(ctx); + + bs_sequence_finish(seq, bserrno); +} + +static void +bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_init_ctx *ctx = cb_arg; + + /* Write super block */ + bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0), + bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), + bs_init_persist_super_cpl, ctx); +} + +void +spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, + spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_init_ctx *ctx; + struct spdk_blob_store *bs; + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + spdk_bs_batch_t *batch; + uint64_t num_md_lba; + uint64_t num_md_pages; + uint64_t num_md_clusters; + uint32_t i; + struct spdk_bs_opts opts = {}; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Initializing blobstore on dev %p\n", dev); + + if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { + SPDK_ERRLOG("unsupported dev block length of %d\n", + dev->blocklen); + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + if (o) { + opts = *o; + } else { + spdk_bs_opts_init(&opts); + } + + if (bs_opts_verify(&opts) != 0) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = bs_alloc(dev, &opts, &bs); + if (rc) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, rc); + return; + } + + if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { + /* By default, allocate 1 page per cluster. + * Technically, this over-allocates metadata + * because more metadata will reduce the number + * of usable clusters. This can be addressed with + * more complex math in the future. + */ + bs->md_len = bs->total_clusters; + } else { + bs->md_len = opts.num_md_pages; + } + rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); + if (rc < 0) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); + if (rc < 0) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len); + if (rc < 0) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->bs = bs; + + /* Allocate memory for the super block */ + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, + sizeof(ctx->super->signature)); + ctx->super->version = SPDK_BS_VERSION; + ctx->super->length = sizeof(*ctx->super); + ctx->super->super_blob = bs->super_blob; + ctx->super->clean = 0; + ctx->super->cluster_size = bs->cluster_sz; + ctx->super->io_unit_size = bs->io_unit_size; + memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype)); + + /* Calculate how many pages the metadata consumes at the front + * of the disk. 
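+	 * This covers the super block, the used page/cluster/blobid masks, and the metadata region itself.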
+ */ + + /* The super block uses 1 page */ + num_md_pages = 1; + + /* The used_md_pages mask requires 1 bit per metadata page, rounded + * up to the nearest page, plus a header. + */ + ctx->super->used_page_mask_start = num_md_pages; + ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + + spdk_divide_round_up(bs->md_len, 8), + SPDK_BS_PAGE_SIZE); + num_md_pages += ctx->super->used_page_mask_len; + + /* The used_clusters mask requires 1 bit per cluster, rounded + * up to the nearest page, plus a header. + */ + ctx->super->used_cluster_mask_start = num_md_pages; + ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + + spdk_divide_round_up(bs->total_clusters, 8), + SPDK_BS_PAGE_SIZE); + num_md_pages += ctx->super->used_cluster_mask_len; + + /* The used_blobids mask requires 1 bit per metadata page, rounded + * up to the nearest page, plus a header. + */ + ctx->super->used_blobid_mask_start = num_md_pages; + ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + + spdk_divide_round_up(bs->md_len, 8), + SPDK_BS_PAGE_SIZE); + num_md_pages += ctx->super->used_blobid_mask_len; + + /* The metadata region size was chosen above */ + ctx->super->md_start = bs->md_start = num_md_pages; + ctx->super->md_len = bs->md_len; + num_md_pages += bs->md_len; + + num_md_lba = bs_page_to_lba(bs, num_md_pages); + + ctx->super->size = dev->blockcnt * dev->blocklen; + + ctx->super->crc = blob_md_page_calc_crc(ctx->super); + + num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster); + if (num_md_clusters > bs->total_clusters) { + SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, " + "please decrease number of pages reserved for metadata " + "or increase cluster size.\n"); + spdk_free(ctx->super); + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + /* Claim all of the clusters used by the metadata */ + for (i = 0; i < num_md_clusters; i++) { + bs_claim_cluster(bs, i); + } + + bs->total_data_clusters = bs->num_free_clusters; + + cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; + cpl.u.bs_handle.cb_fn = cb_fn; + cpl.u.bs_handle.cb_arg = cb_arg; + cpl.u.bs_handle.bs = bs; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_free(ctx->super); + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx); + + /* Clear metadata space */ + bs_batch_write_zeroes_dev(batch, 0, num_md_lba); + + switch (opts.clear_method) { + case BS_CLEAR_WITH_UNMAP: + /* Trim data clusters */ + bs_batch_unmap_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba); + break; + case BS_CLEAR_WITH_WRITE_ZEROES: + /* Write_zeroes to data clusters */ + bs_batch_write_zeroes_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba); + break; + case BS_CLEAR_WITH_NONE: + default: + break; + } + + bs_batch_close(batch); +} + +/* END spdk_bs_init */ + +/* START spdk_bs_destroy */ + +static void +bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_init_ctx *ctx = cb_arg; + struct spdk_blob_store *bs = ctx->bs; + + /* + * We need to defer calling bs_call_cpl() until after + * dev destruction, so tuck these away for later use. 
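+	 * bs_free() below tears down the dev before the saved completion runs.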
+ */ + bs->unload_err = bserrno; + memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); + seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; + + bs_sequence_finish(seq, bserrno); + + bs_free(bs); + free(ctx); +} + +void +spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, + void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_init_ctx *ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Destroying blobstore\n"); + + if (!TAILQ_EMPTY(&bs->blobs)) { + SPDK_ERRLOG("Blobstore still has open blobs\n"); + cb_fn(cb_arg, -EBUSY); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Write zeroes to the super block */ + bs_sequence_write_zeroes_dev(seq, + bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)), + bs_destroy_trim_cpl, ctx); +} + +/* END spdk_bs_destroy */ + +/* START spdk_bs_unload */ + +static void +bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno) +{ + spdk_bs_sequence_t *seq = ctx->seq; + + spdk_free(ctx->super); + + /* + * We need to defer calling bs_call_cpl() until after + * dev destruction, so tuck these away for later use. + */ + ctx->bs->unload_err = bserrno; + memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); + seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; + + bs_sequence_finish(seq, bserrno); + + bs_free(ctx->bs); + free(ctx); +} + +static void +bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + bs_unload_finish(ctx, bserrno); +} + +static void +bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + + if (bserrno != 0) { + bs_unload_finish(ctx, bserrno); + return; + } + + ctx->super->clean = 1; + + bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx); +} + +static void +bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + ctx->mask = NULL; + + if (bserrno != 0) { + bs_unload_finish(ctx, bserrno); + return; + } + + bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl); +} + +static void +bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + ctx->mask = NULL; + + if (bserrno != 0) { + bs_unload_finish(ctx, bserrno); + return; + } + + bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl); +} + +static void +bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + if (bserrno != 0) { + bs_unload_finish(ctx, bserrno); + return; + } + + bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl); +} + +void +spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + struct spdk_bs_load_ctx *ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blobstore\n"); + + if (!TAILQ_EMPTY(&bs->blobs)) { + SPDK_ERRLOG("Blobstore still has open blobs\n"); + cb_fn(cb_arg, -EBUSY); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + 
return; + } + + ctx->bs = bs; + + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + ctx->seq = bs_sequence_start(bs->md_channel, &cpl); + if (!ctx->seq) { + spdk_free(ctx->super); + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Read super block */ + bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*ctx->super)), + bs_unload_read_super_cpl, ctx); +} + +/* END spdk_bs_unload */ + +/* START spdk_bs_set_super */ + +struct spdk_bs_set_super_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; +}; + +static void +bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_set_super_ctx *ctx = cb_arg; + + if (bserrno != 0) { + SPDK_ERRLOG("Unable to write to super block of blobstore\n"); + } + + spdk_free(ctx->super); + + bs_sequence_finish(seq, bserrno); + + free(ctx); +} + +static void +bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_set_super_ctx *ctx = cb_arg; + + if (bserrno != 0) { + SPDK_ERRLOG("Unable to read super block of blobstore\n"); + spdk_free(ctx->super); + bs_sequence_finish(seq, bserrno); + free(ctx); + return; + } + + bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx); +} + +void +spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid, + spdk_bs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_set_super_ctx *ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Setting super blob id on blobstore\n"); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_free(ctx->super); + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + bs->super_blob = blobid; + + /* Read super block */ + bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*ctx->super)), + bs_set_super_read_cpl, ctx); +} + +/* END spdk_bs_set_super */ + +void +spdk_bs_get_super(struct spdk_blob_store *bs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + if (bs->super_blob == SPDK_BLOBID_INVALID) { + cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT); + } else { + cb_fn(cb_arg, bs->super_blob, 0); + } +} + +uint64_t +spdk_bs_get_cluster_size(struct spdk_blob_store *bs) +{ + return bs->cluster_sz; +} + +uint64_t +spdk_bs_get_page_size(struct spdk_blob_store *bs) +{ + return SPDK_BS_PAGE_SIZE; +} + +uint64_t +spdk_bs_get_io_unit_size(struct spdk_blob_store *bs) +{ + return bs->io_unit_size; +} + +uint64_t +spdk_bs_free_cluster_count(struct spdk_blob_store *bs) +{ + return bs->num_free_clusters; +} + +uint64_t +spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs) +{ + return bs->total_data_clusters; +} + +static int +bs_register_md_thread(struct spdk_blob_store *bs) +{ + bs->md_channel = spdk_get_io_channel(bs); + if (!bs->md_channel) { + SPDK_ERRLOG("Failed to get IO channel.\n"); + return -1; + } + + 
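+	/* The channel is released again in bs_unregister_md_thread() via spdk_put_io_channel(). */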
return 0; +} + +static int +bs_unregister_md_thread(struct spdk_blob_store *bs) +{ + spdk_put_io_channel(bs->md_channel); + + return 0; +} + +spdk_blob_id spdk_blob_get_id(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return blob->id; +} + +uint64_t spdk_blob_get_num_pages(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return bs_cluster_to_page(blob->bs, blob->active.num_clusters); +} + +uint64_t spdk_blob_get_num_io_units(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs); +} + +uint64_t spdk_blob_get_num_clusters(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return blob->active.num_clusters; +} + +/* START spdk_bs_create_blob */ + +static void +bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + uint32_t page_idx = bs_blobid_to_page(blob->id); + + if (bserrno != 0) { + spdk_bit_array_clear(blob->bs->used_blobids, page_idx); + bs_release_md_page(blob->bs, page_idx); + } + + blob_free(blob); + + bs_sequence_finish(seq, bserrno); +} + +static int +blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs, + bool internal) +{ + uint64_t i; + size_t value_len = 0; + int rc; + const void *value = NULL; + if (xattrs->count > 0 && xattrs->get_value == NULL) { + return -EINVAL; + } + for (i = 0; i < xattrs->count; i++) { + xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len); + if (value == NULL || value_len == 0) { + return -EINVAL; + } + rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal); + if (rc < 0) { + return rc; + } + } + return 0; +} + +static void +bs_create_blob(struct spdk_blob_store *bs, + const struct spdk_blob_opts *opts, + const struct spdk_blob_xattr_opts *internal_xattrs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + struct spdk_blob *blob; + uint32_t page_idx; + struct spdk_bs_cpl cpl; + struct spdk_blob_opts opts_default; + struct spdk_blob_xattr_opts internal_xattrs_default; + spdk_bs_sequence_t *seq; + spdk_blob_id id; + int rc; + + assert(spdk_get_thread() == bs->md_thread); + + page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0); + if (page_idx == UINT32_MAX) { + cb_fn(cb_arg, 0, -ENOMEM); + return; + } + spdk_bit_array_set(bs->used_blobids, page_idx); + bs_claim_md_page(bs, page_idx); + + id = bs_page_to_blobid(page_idx); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Creating blob with id %lu at page %u\n", id, page_idx); + + blob = blob_alloc(bs, id); + if (!blob) { + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, -ENOMEM); + return; + } + + if (!opts) { + spdk_blob_opts_init(&opts_default); + opts = &opts_default; + } + + blob->use_extent_table = opts->use_extent_table; + if (blob->use_extent_table) { + blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE; + } + + if (!internal_xattrs) { + blob_xattrs_init(&internal_xattrs_default); + internal_xattrs = &internal_xattrs_default; + } + + rc = blob_set_xattrs(blob, &opts->xattrs, false); + if (rc < 0) { + blob_free(blob); + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, rc); + return; + } + + rc = blob_set_xattrs(blob, internal_xattrs, true); + if (rc < 0) { + blob_free(blob); + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, rc); + return; + } + + if (opts->thin_provision) { + blob_set_thin_provision(blob); + } + + 
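+	/* Apply the requested clear method and initial size, then persist the new blob's metadata. */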
blob_set_clear_method(blob, opts->clear_method); + + rc = blob_resize(blob, opts->num_clusters); + if (rc < 0) { + blob_free(blob); + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, rc); + return; + } + cpl.type = SPDK_BS_CPL_TYPE_BLOBID; + cpl.u.blobid.cb_fn = cb_fn; + cpl.u.blobid.cb_arg = cb_arg; + cpl.u.blobid.blobid = blob->id; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + blob_free(blob); + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, -ENOMEM); + return; + } + + blob_persist(seq, blob, bs_create_blob_cpl, blob); +} + +void spdk_bs_create_blob(struct spdk_blob_store *bs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg); +} + +void spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + bs_create_blob(bs, opts, NULL, cb_fn, cb_arg); +} + +/* END spdk_bs_create_blob */ + +/* START blob_cleanup */ + +struct spdk_clone_snapshot_ctx { + struct spdk_bs_cpl cpl; + int bserrno; + bool frozen; + + struct spdk_io_channel *channel; + + /* Current cluster for inflate operation */ + uint64_t cluster; + + /* For inflation force allocation of all unallocated clusters and remove + * thin-provisioning. Otherwise only decouple parent and keep clone thin. */ + bool allocate_all; + + struct { + spdk_blob_id id; + struct spdk_blob *blob; + } original; + struct { + spdk_blob_id id; + struct spdk_blob *blob; + } new; + + /* xattrs specified for snapshot/clones only. They have no impact on + * the original blobs xattrs. */ + const struct spdk_blob_xattr_opts *xattrs; +}; + +static void +bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = cb_arg; + struct spdk_bs_cpl *cpl = &ctx->cpl; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Cleanup error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + switch (cpl->type) { + case SPDK_BS_CPL_TYPE_BLOBID: + cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOB_BASIC: + cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno); + break; + default: + SPDK_UNREACHABLE(); + break; + } + + free(ctx); +} + +static void +bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Unfreeze error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + ctx->original.id = origblob->id; + origblob->locked_operation_in_progress = false; + + spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx); +} + +static void +bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Cleanup error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + if (ctx->frozen) { + /* Unfreeze any outstanding I/O */ + blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx); + } else { + bs_snapshot_unfreeze_cpl(ctx, 0); + } + +} + +static void +bs_clone_snapshot_newblob_cleanup(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = 
(struct spdk_clone_snapshot_ctx *)cb_arg;
+	struct spdk_blob *newblob = ctx->new.blob;
+
+	if (bserrno != 0) {
+		if (ctx->bserrno != 0) {
+			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
+		} else {
+			ctx->bserrno = bserrno;
+		}
+	}
+
+	ctx->new.id = newblob->id;
+	spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
+}
+
+/* END blob_cleanup */
+
+/* START spdk_bs_create_snapshot */
+
+static void
+bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2)
+{
+	uint64_t *cluster_temp;
+	uint32_t *extent_page_temp;
+
+	cluster_temp = blob1->active.clusters;
+	blob1->active.clusters = blob2->active.clusters;
+	blob2->active.clusters = cluster_temp;
+
+	extent_page_temp = blob1->active.extent_pages;
+	blob1->active.extent_pages = blob2->active.extent_pages;
+	blob2->active.extent_pages = extent_page_temp;
+}
+
+static void
+bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno)
+{
+	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+	struct spdk_blob *origblob = ctx->original.blob;
+	struct spdk_blob *newblob = ctx->new.blob;
+
+	if (bserrno != 0) {
+		bs_snapshot_swap_cluster_maps(newblob, origblob);
+		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+		return;
+	}
+
+	/* Remove metadata descriptor SNAPSHOT_IN_PROGRESS */
+	bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true);
+	if (bserrno != 0) {
+		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+		return;
+	}
+
+	bs_blob_list_add(ctx->original.blob);
+
+	spdk_blob_set_read_only(newblob);
+
+	/* sync snapshot metadata */
+	spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
+}
+
+static void
+bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno)
+{
+	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+	struct spdk_blob *origblob = ctx->original.blob;
+	struct spdk_blob *newblob = ctx->new.blob;
+
+	if (bserrno != 0) {
+		/* return cluster map back to original */
+		bs_snapshot_swap_cluster_maps(newblob, origblob);
+
+		/* Newblob md sync failed. Valid clusters are only present in origblob.
+		 * Since I/O is frozen on origblob, no changes to the zeroed out cluster map should have occurred.
+		 * Newblob needs to be reverted to its thin_provisioned state at creation to properly close.
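+		 * The asserts below double-check that the reverted cluster and extent page maps are all zeroes.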
*/ + blob_set_thin_provision(newblob); + assert(spdk_mem_all_zero(newblob->active.clusters, + newblob->active.num_clusters * sizeof(*newblob->active.clusters))); + assert(spdk_mem_all_zero(newblob->active.extent_pages, + newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); + + bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + + /* Set internal xattr for snapshot id */ + bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true); + if (bserrno != 0) { + /* return cluster map back to original */ + bs_snapshot_swap_cluster_maps(newblob, origblob); + bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + + bs_blob_list_remove(origblob); + origblob->parent_id = newblob->id; + + /* Create new back_bs_dev for snapshot */ + origblob->back_bs_dev = bs_create_blob_bs_dev(newblob); + if (origblob->back_bs_dev == NULL) { + /* return cluster map back to original */ + bs_snapshot_swap_cluster_maps(newblob, origblob); + bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL); + return; + } + + /* set clone blob as thin provisioned */ + blob_set_thin_provision(origblob); + + bs_blob_list_add(newblob); + + /* sync clone metadata */ + spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx); +} + +static void +bs_snapshot_freeze_cpl(void *cb_arg, int rc) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + struct spdk_blob *newblob = ctx->new.blob; + int bserrno; + + if (rc != 0) { + bs_clone_snapshot_newblob_cleanup(ctx, rc); + return; + } + + ctx->frozen = true; + + /* set new back_bs_dev for snapshot */ + newblob->back_bs_dev = origblob->back_bs_dev; + /* Set invalid flags from origblob */ + newblob->invalid_flags = origblob->invalid_flags; + + /* inherit parent from original blob if set */ + newblob->parent_id = origblob->parent_id; + if (origblob->parent_id != SPDK_BLOBID_INVALID) { + /* Set internal xattr for snapshot id */ + bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT, + &origblob->parent_id, sizeof(spdk_blob_id), true); + if (bserrno != 0) { + bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + } + + /* swap cluster maps */ + bs_snapshot_swap_cluster_maps(newblob, origblob); + + /* Set the clear method on the new blob to match the original. 
*/ + blob_set_clear_method(newblob, origblob->clear_method); + + /* sync snapshot metadata */ + spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx); +} + +static void +bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + struct spdk_blob *newblob = _blob; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + ctx->new.blob = newblob; + assert(spdk_blob_is_thin_provisioned(newblob)); + assert(spdk_mem_all_zero(newblob->active.clusters, + newblob->active.num_clusters * sizeof(*newblob->active.clusters))); + assert(spdk_mem_all_zero(newblob->active.extent_pages, + newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); + + blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx); +} + +static void +bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + ctx->new.id = blobid; + ctx->cpl.u.blobid.blobid = blobid; + + spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx); +} + + +static void +bs_xattr_snapshot(void *arg, const char *name, + const void **value, size_t *value_len) +{ + assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0); + + struct spdk_blob *blob = (struct spdk_blob *)arg; + *value = &blob->id; + *value_len = sizeof(blob->id); +} + +static void +bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob_opts opts; + struct spdk_blob_xattr_opts internal_xattrs; + char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS }; + + if (bserrno != 0) { + bs_clone_snapshot_cleanup_finish(ctx, bserrno); + return; + } + + ctx->original.blob = _blob; + + if (_blob->data_ro || _blob->md_ro) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create snapshot from read only blob with id %lu\n", + _blob->id); + ctx->bserrno = -EINVAL; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + if (_blob->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create snapshot - another operation in progress\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + _blob->locked_operation_in_progress = true; + + spdk_blob_opts_init(&opts); + blob_xattrs_init(&internal_xattrs); + + /* Change the size of new blob to the same as in original blob, + * but do not allocate clusters */ + opts.thin_provision = true; + opts.num_clusters = spdk_blob_get_num_clusters(_blob); + opts.use_extent_table = _blob->use_extent_table; + + /* If there are any xattrs specified for snapshot, set them now */ + if (ctx->xattrs) { + memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); + } + /* Set internal xattr SNAPSHOT_IN_PROGRESS */ + internal_xattrs.count = 1; + internal_xattrs.ctx = _blob; + internal_xattrs.names = xattrs_names; + internal_xattrs.get_value = bs_xattr_snapshot; + + bs_create_blob(_blob->bs, &opts, &internal_xattrs, + bs_snapshot_newblob_create_cpl, ctx); +} + +void spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid, + const struct spdk_blob_xattr_opts 
*snapshot_xattrs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) { + cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); + return; + } + ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; + ctx->cpl.u.blobid.cb_fn = cb_fn; + ctx->cpl.u.blobid.cb_arg = cb_arg; + ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; + ctx->bserrno = 0; + ctx->frozen = false; + ctx->original.id = blobid; + ctx->xattrs = snapshot_xattrs; + + spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx); +} +/* END spdk_bs_create_snapshot */ + +/* START spdk_bs_create_clone */ + +static void +bs_xattr_clone(void *arg, const char *name, + const void **value, size_t *value_len) +{ + assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0); + + struct spdk_blob *blob = (struct spdk_blob *)arg; + *value = &blob->id; + *value_len = sizeof(blob->id); +} + +static void +bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *clone = _blob; + + ctx->new.blob = clone; + bs_blob_list_add(clone); + + spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx); +} + +static void +bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + + ctx->cpl.u.blobid.blobid = blobid; + spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx); +} + +static void +bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob_opts opts; + struct spdk_blob_xattr_opts internal_xattrs; + char *xattr_names[] = { BLOB_SNAPSHOT }; + + if (bserrno != 0) { + bs_clone_snapshot_cleanup_finish(ctx, bserrno); + return; + } + + ctx->original.blob = _blob; + + if (!_blob->data_ro || !_blob->md_ro) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Clone not from read-only blob\n"); + ctx->bserrno = -EINVAL; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + if (_blob->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create clone - another operation in progress\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + _blob->locked_operation_in_progress = true; + + spdk_blob_opts_init(&opts); + blob_xattrs_init(&internal_xattrs); + + opts.thin_provision = true; + opts.num_clusters = spdk_blob_get_num_clusters(_blob); + opts.use_extent_table = _blob->use_extent_table; + if (ctx->xattrs) { + memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); + } + + /* Set internal xattr BLOB_SNAPSHOT */ + internal_xattrs.count = 1; + internal_xattrs.ctx = _blob; + internal_xattrs.names = xattr_names; + internal_xattrs.get_value = bs_xattr_clone; + + bs_create_blob(_blob->bs, &opts, &internal_xattrs, + bs_clone_newblob_create_cpl, ctx); +} + +void spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid, + const struct spdk_blob_xattr_opts *clone_xattrs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) { + cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); + return; + } + + ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; + ctx->cpl.u.blobid.cb_fn = cb_fn; + ctx->cpl.u.blobid.cb_arg = cb_arg; + ctx->cpl.u.blobid.blobid = 
SPDK_BLOBID_INVALID; + ctx->bserrno = 0; + ctx->xattrs = clone_xattrs; + ctx->original.id = blobid; + + spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx); +} + +/* END spdk_bs_create_clone */ + +/* START spdk_bs_inflate_blob */ + +static void +bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *_blob = ctx->original.blob; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + assert(_parent != NULL); + + bs_blob_list_remove(_blob); + _blob->parent_id = _parent->id; + blob_set_xattr(_blob, BLOB_SNAPSHOT, &_blob->parent_id, + sizeof(spdk_blob_id), true); + + _blob->back_bs_dev->destroy(_blob->back_bs_dev); + _blob->back_bs_dev = bs_create_blob_bs_dev(_parent); + bs_blob_list_add(_blob); + + spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); +} + +static void +bs_inflate_blob_done(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *_blob = ctx->original.blob; + struct spdk_blob *_parent; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + if (ctx->allocate_all) { + /* remove thin provisioning */ + bs_blob_list_remove(_blob); + blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); + _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV; + _blob->back_bs_dev->destroy(_blob->back_bs_dev); + _blob->back_bs_dev = NULL; + _blob->parent_id = SPDK_BLOBID_INVALID; + } else { + _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob; + if (_parent->parent_id != SPDK_BLOBID_INVALID) { + /* We must change the parent of the inflated blob */ + spdk_bs_open_blob(_blob->bs, _parent->parent_id, + bs_inflate_blob_set_parent_cpl, ctx); + return; + } + + bs_blob_list_remove(_blob); + blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); + _blob->parent_id = SPDK_BLOBID_INVALID; + _blob->back_bs_dev->destroy(_blob->back_bs_dev); + _blob->back_bs_dev = bs_create_zeroes_dev(); + } + + _blob->state = SPDK_BLOB_STATE_DIRTY; + spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); +} + +/* Check if cluster needs allocation */ +static inline bool +bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all) +{ + struct spdk_blob_bs_dev *b; + + assert(blob != NULL); + + if (blob->active.clusters[cluster] != 0) { + /* Cluster is already allocated */ + return false; + } + + if (blob->parent_id == SPDK_BLOBID_INVALID) { + /* Blob have no parent blob */ + return allocate_all; + } + + b = (struct spdk_blob_bs_dev *)blob->back_bs_dev; + return (allocate_all || b->blob->active.clusters[cluster] != 0); +} + +static void +bs_inflate_blob_touch_next(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *_blob = ctx->original.blob; + uint64_t offset; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) { + if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) { + break; + } + } + + if (ctx->cluster < _blob->active.num_clusters) { + offset = bs_cluster_to_lba(_blob->bs, ctx->cluster); + + /* We may safely increment a cluster before write */ + ctx->cluster++; + + /* Use zero length write to touch a cluster */ + spdk_blob_io_write(_blob, ctx->channel, 
NULL, offset, 0, + bs_inflate_blob_touch_next, ctx); + } else { + bs_inflate_blob_done(cb_arg, bserrno); + } +} + +static void +bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + uint64_t lfc; /* lowest free cluster */ + uint64_t i; + + if (bserrno != 0) { + bs_clone_snapshot_cleanup_finish(ctx, bserrno); + return; + } + + ctx->original.blob = _blob; + + if (_blob->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot inflate blob - another operation in progress\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + _blob->locked_operation_in_progress = true; + + if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) { + /* This blob have no parent, so we cannot decouple it. */ + SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n"); + bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); + return; + } + + if (spdk_blob_is_thin_provisioned(_blob) == false) { + /* This is not thin provisioned blob. No need to inflate. */ + bs_clone_snapshot_origblob_cleanup(ctx, 0); + return; + } + + /* Do two passes - one to verify that we can obtain enough clusters + * and another to actually claim them. + */ + lfc = 0; + for (i = 0; i < _blob->active.num_clusters; i++) { + if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { + lfc = spdk_bit_array_find_first_clear(_blob->bs->used_clusters, lfc); + if (lfc == UINT32_MAX) { + /* No more free clusters. Cannot satisfy the request */ + bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); + return; + } + lfc++; + } + } + + ctx->cluster = 0; + bs_inflate_blob_touch_next(ctx, 0); +} + +static void +bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, + spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + ctx->cpl.u.bs_basic.cb_fn = cb_fn; + ctx->cpl.u.bs_basic.cb_arg = cb_arg; + ctx->bserrno = 0; + ctx->original.id = blobid; + ctx->channel = channel; + ctx->allocate_all = allocate_all; + + spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx); +} + +void +spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, + spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg); +} + +void +spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel, + spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg); +} +/* END spdk_bs_inflate_blob */ + +/* START spdk_blob_resize */ +struct spdk_bs_resize_ctx { + spdk_blob_op_complete cb_fn; + void *cb_arg; + struct spdk_blob *blob; + uint64_t sz; + int rc; +}; + +static void +bs_resize_unfreeze_cpl(void *cb_arg, int rc) +{ + struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; + + if (rc != 0) { + SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc); + } + + if (ctx->rc != 0) { + SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc); + rc = ctx->rc; + } + + ctx->blob->locked_operation_in_progress = false; + + ctx->cb_fn(ctx->cb_arg, rc); + free(ctx); +} + +static void +bs_resize_freeze_cpl(void *cb_arg, int rc) +{ + struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx 
*)cb_arg; + + if (rc != 0) { + ctx->blob->locked_operation_in_progress = false; + ctx->cb_fn(ctx->cb_arg, rc); + free(ctx); + return; + } + + ctx->rc = blob_resize(ctx->blob, ctx->sz); + + blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx); +} + +void +spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_resize_ctx *ctx; + + blob_verify_md_op(blob); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Resizing blob %lu to %lu clusters\n", blob->id, sz); + + if (blob->md_ro) { + cb_fn(cb_arg, -EPERM); + return; + } + + if (sz == blob->active.num_clusters) { + cb_fn(cb_arg, 0); + return; + } + + if (blob->locked_operation_in_progress) { + cb_fn(cb_arg, -EBUSY); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + blob->locked_operation_in_progress = true; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->blob = blob; + ctx->sz = sz; + blob_freeze_io(blob, bs_resize_freeze_cpl, ctx); +} + +/* END spdk_blob_resize */ + + +/* START spdk_bs_delete_blob */ + +static void +bs_delete_close_cpl(void *cb_arg, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + + bs_sequence_finish(seq, bserrno); +} + +static void +bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno != 0) { + /* + * We already removed this blob from the blobstore tailq, so + * we need to free it here since this is the last reference + * to it. + */ + blob_free(blob); + bs_delete_close_cpl(seq, bserrno); + return; + } + + /* + * This will immediately decrement the ref_count and call + * the completion routine since the metadata state is clean. + * By calling spdk_blob_close, we reduce the number of call + * points into code that touches the blob->open_ref count + * and the blobstore's blob list. 
+ */ + spdk_blob_close(blob, bs_delete_close_cpl, seq); +} + +struct delete_snapshot_ctx { + struct spdk_blob_list *parent_snapshot_entry; + struct spdk_blob *snapshot; + bool snapshot_md_ro; + struct spdk_blob *clone; + bool clone_md_ro; + spdk_blob_op_with_handle_complete cb_fn; + void *cb_arg; + int bserrno; +}; + +static void +delete_blob_cleanup_finish(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno != 0) { + SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno); + } + + assert(ctx != NULL); + + if (bserrno != 0 && ctx->bserrno == 0) { + ctx->bserrno = bserrno; + } + + ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno); + free(ctx); +} + +static void +delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno != 0) { + ctx->bserrno = bserrno; + SPDK_ERRLOG("Clone cleanup error %d\n", bserrno); + } + + if (ctx->bserrno != 0) { + assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL); + TAILQ_INSERT_HEAD(&ctx->snapshot->bs->blobs, ctx->snapshot, link); + spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id); + } + + ctx->snapshot->locked_operation_in_progress = false; + ctx->snapshot->md_ro = ctx->snapshot_md_ro; + + spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx); +} + +static void +delete_snapshot_cleanup_clone(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + ctx->clone->locked_operation_in_progress = false; + ctx->clone->md_ro = ctx->clone_md_ro; + + spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); +} + +static void +delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno) { + ctx->bserrno = bserrno; + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + ctx->clone->locked_operation_in_progress = false; + spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx); +} + +static void +delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + struct spdk_blob_list *parent_snapshot_entry = NULL; + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + struct spdk_blob_list *snapshot_clone_entry = NULL; + + if (bserrno) { + SPDK_ERRLOG("Failed to sync MD on blob\n"); + ctx->bserrno = bserrno; + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + /* Get snapshot entry for the snapshot we want to remove */ + snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id); + + assert(snapshot_entry != NULL); + + /* Remove clone entry in this snapshot (at this point there can be only one clone) */ + clone_entry = TAILQ_FIRST(&snapshot_entry->clones); + assert(clone_entry != NULL); + TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); + snapshot_entry->clone_count--; + assert(TAILQ_EMPTY(&snapshot_entry->clones)); + + if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) { + /* This snapshot is at the same time a clone of another snapshot - we need to + * update parent snapshot (remove current clone, add new one inherited from + * the snapshot that is being removed) */ + + /* Get snapshot entry for parent snapshot and clone entry within that snapshot for + * snapshot that we are removing */ + blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry, + &snapshot_clone_entry); + + /* Switch clone entry in parent snapshot */ + TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link); + 
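+		/* The clone entry now hangs off the parent snapshot; drop the deleted snapshot's own entry from that list. */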
TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link); + free(snapshot_clone_entry); + } else { + /* No parent snapshot - just remove clone entry */ + free(clone_entry); + } + + /* Restore md_ro flags */ + ctx->clone->md_ro = ctx->clone_md_ro; + ctx->snapshot->md_ro = ctx->snapshot_md_ro; + + blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx); +} + +static void +delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + uint64_t i; + + ctx->snapshot->md_ro = false; + + if (bserrno) { + SPDK_ERRLOG("Failed to sync MD on clone\n"); + ctx->bserrno = bserrno; + + /* Restore snapshot to previous state */ + bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true); + if (bserrno != 0) { + delete_snapshot_cleanup_clone(ctx, bserrno); + return; + } + + spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx); + return; + } + + /* Clear cluster map entries for snapshot */ + for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { + if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) { + ctx->snapshot->active.clusters[i] = 0; + } + } + for (i = 0; i < ctx->snapshot->active.num_extent_pages && + i < ctx->clone->active.num_extent_pages; i++) { + if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) { + ctx->snapshot->active.extent_pages[i] = 0; + } + } + + blob_set_thin_provision(ctx->snapshot); + ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY; + + if (ctx->parent_snapshot_entry != NULL) { + ctx->snapshot->back_bs_dev = NULL; + } + + spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx); +} + +static void +delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + uint64_t i; + + /* Temporarily override md_ro flag for clone for MD modification */ + ctx->clone_md_ro = ctx->clone->md_ro; + ctx->clone->md_ro = false; + + if (bserrno) { + SPDK_ERRLOG("Failed to sync MD with xattr on blob\n"); + ctx->bserrno = bserrno; + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + /* Copy snapshot map to clone map (only unallocated clusters in clone) */ + for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { + if (ctx->clone->active.clusters[i] == 0) { + ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i]; + } + } + for (i = 0; i < ctx->snapshot->active.num_extent_pages && + i < ctx->clone->active.num_extent_pages; i++) { + if (ctx->clone->active.extent_pages[i] == 0) { + ctx->clone->active.extent_pages[i] = ctx->snapshot->active.extent_pages[i]; + } + } + + /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */ + ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev); + + /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... 
*/ + if (ctx->parent_snapshot_entry != NULL) { + /* ...to parent snapshot */ + ctx->clone->parent_id = ctx->parent_snapshot_entry->id; + ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev; + blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id, + sizeof(spdk_blob_id), + true); + } else { + /* ...to blobid invalid and zeroes dev */ + ctx->clone->parent_id = SPDK_BLOBID_INVALID; + ctx->clone->back_bs_dev = bs_create_zeroes_dev(); + blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true); + } + + spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx); +} + +static void +delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno) { + SPDK_ERRLOG("Failed to freeze I/O on clone\n"); + ctx->bserrno = bserrno; + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + /* Temporarily override md_ro flag for snapshot for MD modification */ + ctx->snapshot_md_ro = ctx->snapshot->md_ro; + ctx->snapshot->md_ro = false; + + /* Mark blob as pending for removal for power failure safety, use clone id for recovery */ + ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id, + sizeof(spdk_blob_id), true); + if (ctx->bserrno != 0) { + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx); +} + +static void +delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno) { + SPDK_ERRLOG("Failed to open clone\n"); + ctx->bserrno = bserrno; + delete_snapshot_cleanup_snapshot(ctx, 0); + return; + } + + ctx->clone = clone; + + if (clone->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot remove blob - another operation in progress on its clone\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); + return; + } + + clone->locked_operation_in_progress = true; + + blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx); +} + +static void +update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + struct spdk_blob_list *snapshot_clone_entry = NULL; + + /* Get snapshot entry for the snapshot we want to remove */ + snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id); + + assert(snapshot_entry != NULL); + + /* Get clone of the snapshot (at this point there can be only one clone) */ + clone_entry = TAILQ_FIRST(&snapshot_entry->clones); + assert(snapshot_entry->clone_count == 1); + assert(clone_entry != NULL); + + /* Get snapshot entry for parent snapshot and clone entry within that snapshot for + * snapshot that we are removing */ + blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry, + &snapshot_clone_entry); + + spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx); +} + +static void +bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + struct spdk_blob_list *snapshot_entry = NULL; + uint32_t page_num; + + if (bserrno) { + SPDK_ERRLOG("Failed to remove blob\n"); + bs_sequence_finish(seq, bserrno); + return; + } + + /* Remove snapshot from the list */ + snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); + if (snapshot_entry != NULL) { + TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link); + 
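+		/* The snapshot entry is no longer reachable from the blobstore; release it before the blob metadata is wiped below. */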
free(snapshot_entry); + } + + page_num = bs_blobid_to_page(blob->id); + spdk_bit_array_clear(blob->bs->used_blobids, page_num); + blob->state = SPDK_BLOB_STATE_DIRTY; + blob->active.num_pages = 0; + blob_resize(blob, 0); + + blob_persist(seq, blob, bs_delete_persist_cpl, blob); +} + +static int +bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + struct spdk_blob *clone = NULL; + bool has_one_clone = false; + + /* Check if this is a snapshot with clones */ + snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); + if (snapshot_entry != NULL) { + if (snapshot_entry->clone_count > 1) { + SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n"); + return -EBUSY; + } else if (snapshot_entry->clone_count == 1) { + has_one_clone = true; + } + } + + /* Check if someone has this blob open (besides this delete context): + * - open_ref = 1 - only this context opened blob, so it is ok to remove it + * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot + * and that is ok, because we will update it accordingly */ + if (blob->open_ref <= 2 && has_one_clone) { + clone_entry = TAILQ_FIRST(&snapshot_entry->clones); + assert(clone_entry != NULL); + clone = blob_lookup(blob->bs, clone_entry->id); + + if (blob->open_ref == 2 && clone == NULL) { + /* Clone is closed and someone else opened this blob */ + SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); + return -EBUSY; + } + + *update_clone = true; + return 0; + } + + if (blob->open_ref > 1) { + SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); + return -EBUSY; + } + + assert(has_one_clone == false); + *update_clone = false; + return 0; +} + +static void +bs_delete_enomem_close_cpl(void *cb_arg, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + + bs_sequence_finish(seq, -ENOMEM); +} + +static void +bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + struct delete_snapshot_ctx *ctx; + bool update_clone = false; + + if (bserrno != 0) { + bs_sequence_finish(seq, bserrno); + return; + } + + blob_verify_md_op(blob); + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq); + return; + } + + ctx->snapshot = blob; + ctx->cb_fn = bs_delete_blob_finish; + ctx->cb_arg = seq; + + /* Check if blob can be removed and if it is a snapshot with clone on top of it */ + ctx->bserrno = bs_is_blob_deletable(blob, &update_clone); + if (ctx->bserrno) { + spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); + return; + } + + if (blob->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot remove blob - another operation in progress\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); + return; + } + + blob->locked_operation_in_progress = true; + + /* + * Remove the blob from the blob_store list now, to ensure it does not + * get returned after this point by blob_lookup(). 
+ */ + spdk_bit_array_clear(blob->bs->open_blobids, blob->id); + TAILQ_REMOVE(&blob->bs->blobs, blob, link); + + if (update_clone) { + /* This blob is a snapshot with active clone - update clone first */ + update_clone_on_snapshot_deletion(blob, ctx); + } else { + /* This blob does not have any clones - just remove it */ + bs_blob_list_remove(blob); + bs_delete_blob_finish(seq, blob, 0); + free(ctx); + } +} + +void +spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Deleting blob %lu\n", blobid); + + assert(spdk_get_thread() == bs->md_thread); + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq); +} + +/* END spdk_bs_delete_blob */ + +/* START spdk_bs_open_blob */ + +static void +bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno != 0) { + blob_free(blob); + seq->cpl.u.blob_handle.blob = NULL; + bs_sequence_finish(seq, bserrno); + return; + } + + blob->open_ref++; + + spdk_bit_array_set(blob->bs->open_blobids, blob->id); + TAILQ_INSERT_HEAD(&blob->bs->blobs, blob, link); + + bs_sequence_finish(seq, bserrno); +} + +static void +bs_open_blob(struct spdk_blob_store *bs, + spdk_blob_id blobid, + struct spdk_blob_open_opts *opts, + spdk_blob_op_with_handle_complete cb_fn, + void *cb_arg) +{ + struct spdk_blob *blob; + struct spdk_bs_cpl cpl; + struct spdk_blob_open_opts opts_default; + spdk_bs_sequence_t *seq; + uint32_t page_num; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Opening blob %lu\n", blobid); + assert(spdk_get_thread() == bs->md_thread); + + page_num = bs_blobid_to_page(blobid); + if (spdk_bit_array_get(bs->used_blobids, page_num) == false) { + /* Invalid blobid */ + cb_fn(cb_arg, NULL, -ENOENT); + return; + } + + blob = blob_lookup(bs, blobid); + if (blob) { + blob->open_ref++; + cb_fn(cb_arg, blob, 0); + return; + } + + blob = blob_alloc(bs, blobid); + if (!blob) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + if (!opts) { + spdk_blob_open_opts_init(&opts_default); + opts = &opts_default; + } + + blob->clear_method = opts->clear_method; + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE; + cpl.u.blob_handle.cb_fn = cb_fn; + cpl.u.blob_handle.cb_arg = cb_arg; + cpl.u.blob_handle.blob = blob; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + blob_free(blob); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + blob_load(seq, blob, bs_open_blob_cpl, blob); +} + +void spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, + spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg); +} + +void spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid, + struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + bs_open_blob(bs, blobid, opts, cb_fn, cb_arg); +} + +/* END spdk_bs_open_blob */ + +/* START spdk_blob_set_read_only */ +int spdk_blob_set_read_only(struct spdk_blob *blob) +{ + blob_verify_md_op(blob); + + blob->data_ro_flags |= SPDK_BLOB_READ_ONLY; + + blob->state = SPDK_BLOB_STATE_DIRTY; + return 0; +} +/* END spdk_blob_set_read_only */ + +/* START spdk_blob_sync_md */ + +static void +blob_sync_md_cpl(spdk_bs_sequence_t *seq, 
void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) { + blob->data_ro = true; + blob->md_ro = true; + } + + bs_sequence_finish(seq, bserrno); +} + +static void +blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(blob->bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + blob_persist(seq, blob, blob_sync_md_cpl, blob); +} + +void +spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_verify_md_op(blob); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blob %lu\n", blob->id); + + if (blob->md_ro) { + assert(blob->state == SPDK_BLOB_STATE_CLEAN); + cb_fn(cb_arg, 0); + return; + } + + blob_sync_md(blob, cb_fn, cb_arg); +} + +/* END spdk_blob_sync_md */ + +struct spdk_blob_insert_cluster_ctx { + struct spdk_thread *thread; + struct spdk_blob *blob; + uint32_t cluster_num; /* cluster index in blob */ + uint32_t cluster; /* cluster on disk */ + uint32_t extent_page; /* extent page on disk */ + int rc; + spdk_blob_op_complete cb_fn; + void *cb_arg; +}; + +static void +blob_insert_cluster_msg_cpl(void *arg) +{ + struct spdk_blob_insert_cluster_ctx *ctx = arg; + + ctx->cb_fn(ctx->cb_arg, ctx->rc); + free(ctx); +} + +static void +blob_insert_cluster_msg_cb(void *arg, int bserrno) +{ + struct spdk_blob_insert_cluster_ctx *ctx = arg; + + ctx->rc = bserrno; + spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); +} + +static void +blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_md_page *page = cb_arg; + + bs_sequence_finish(seq, bserrno); + spdk_free(page); +} + +static void +blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + spdk_bs_sequence_t *seq; + struct spdk_bs_cpl cpl; + struct spdk_blob_md_page *page = NULL; + uint32_t page_count = 0; + int rc; + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(blob->bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + rc = blob_serialize_add_page(blob, &page, &page_count, &page); + if (rc < 0) { + bs_sequence_finish(seq, rc); + return; + } + + blob_serialize_extent_page(blob, cluster_num, page); + + page->crc = blob_md_page_calc_crc(page); + + assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true); + + bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent), + bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), + blob_persist_extent_page_cpl, page); +} + +static void +blob_insert_cluster_msg(void *arg) +{ + struct spdk_blob_insert_cluster_ctx *ctx = arg; + uint32_t *extent_page; + + ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster); + if (ctx->rc != 0) { + spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); + return; + } + + if (ctx->blob->use_extent_table == false) { + /* Extent table is not used, proceed with sync of md that will only use extents_rle. 
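[Editor's note] A minimal usage sketch of the sync path implemented above (names are illustrative): metadata changes such as spdk_blob_set_read_only() only mark the blob dirty in memory, and spdk_blob_sync_md() is what actually persists them.

#include "spdk/blob.h"

/* Illustrative sketch; assumes the caller runs on the blobstore md thread. */
static void
example_sync_done(void *cb_arg, int bserrno)
{
	/* On success, the read-only flag set below is on disk and the blob
	 * has transitioned to data_ro/md_ro. */
}

static void
example_make_read_only(struct spdk_blob *blob)
{
	spdk_blob_set_read_only(blob);                    /* marks the blob dirty */
	spdk_blob_sync_md(blob, example_sync_done, NULL); /* persists the metadata */
}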
*/ + ctx->blob->state = SPDK_BLOB_STATE_DIRTY; + blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); + return; + } + + extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); + if (*extent_page == 0) { + /* Extent page requires allocation. + * It was already claimed in the used_md_pages map and placed in ctx. + * Blob persist will take care of writing out new extent page on disk. */ + assert(ctx->extent_page != 0); + assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); + *extent_page = ctx->extent_page; + ctx->blob->state = SPDK_BLOB_STATE_DIRTY; + blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); + } else { + /* It is possible for original thread to allocate extent page for + * different cluster in the same extent page. In such case proceed with + * updating the existing extent page, but release the additional one. */ + if (ctx->extent_page != 0) { + assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); + bs_release_md_page(ctx->blob->bs, ctx->extent_page); + ctx->extent_page = 0; + } + /* Extent page already allocated. + * Every cluster allocation, requires just an update of single extent page. */ + blob_insert_extent(ctx->blob, *extent_page, ctx->cluster_num, + blob_insert_cluster_msg_cb, ctx); + } +} + +static void +blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, + uint64_t cluster, uint32_t extent_page, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_blob_insert_cluster_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->thread = spdk_get_thread(); + ctx->blob = blob; + ctx->cluster_num = cluster_num; + ctx->cluster = cluster; + ctx->extent_page = extent_page; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx); +} + +/* START spdk_blob_close */ + +static void +blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno == 0) { + blob->open_ref--; + if (blob->open_ref == 0) { + /* + * Blobs with active.num_pages == 0 are deleted blobs. + * these blobs are removed from the blob_store list + * when the deletion process starts - so don't try to + * remove them again. 
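[Editor's note] The insert-cluster path above is one instance of SPDK's usual thread-hop idiom. A generic, hedged sketch of that idiom (all names here are illustrative): capture the calling thread, hop to the metadata thread to do the update, then hop back to complete on the original thread.

#include "spdk/stdinc.h"
#include "spdk/blob.h"
#include "spdk/thread.h"

struct example_md_op_ctx {
	struct spdk_thread	*orig_thread;
	spdk_blob_op_complete	cb_fn;
	void			*cb_arg;
	int			rc;
};

static void
example_md_op_done(void *arg)
{
	struct example_md_op_ctx *ctx = arg;

	/* Back on the thread that started the operation. */
	ctx->cb_fn(ctx->cb_arg, ctx->rc);
	free(ctx);
}

static void
example_md_op(void *arg)
{
	struct example_md_op_ctx *ctx = arg;

	ctx->rc = 0; /* ... perform the metadata update on the md thread ... */
	spdk_thread_send_msg(ctx->orig_thread, example_md_op_done, ctx);
}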
+ */ + if (blob->active.num_pages > 0) { + spdk_bit_array_clear(blob->bs->open_blobids, blob->id); + TAILQ_REMOVE(&blob->bs->blobs, blob, link); + } + blob_free(blob); + } + } + + bs_sequence_finish(seq, bserrno); +} + +void spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + + blob_verify_md_op(blob); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Closing blob %lu\n", blob->id); + + if (blob->open_ref == 0) { + cb_fn(cb_arg, -EBADF); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(blob->bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Sync metadata */ + blob_persist(seq, blob, blob_close_cpl, blob); +} + +/* END spdk_blob_close */ + +struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs) +{ + return spdk_get_io_channel(bs); +} + +void spdk_bs_free_io_channel(struct spdk_io_channel *channel) +{ + spdk_put_io_channel(channel); +} + +void spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel, + uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, + SPDK_BLOB_UNMAP); +} + +void spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel, + uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, + SPDK_BLOB_WRITE_ZEROES); +} + +void spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, + SPDK_BLOB_WRITE); +} + +void spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, + SPDK_BLOB_READ); +} + +void spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false); +} + +void spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true); +} + +struct spdk_bs_iter_ctx { + int64_t page_num; + struct spdk_blob_store *bs; + + spdk_blob_op_with_handle_complete cb_fn; + void *cb_arg; +}; + +static void +bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_bs_iter_ctx *ctx = cb_arg; + struct spdk_blob_store *bs = ctx->bs; + spdk_blob_id id; + + if (bserrno == 0) { + ctx->cb_fn(ctx->cb_arg, _blob, bserrno); + free(ctx); + return; + } + + ctx->page_num++; + ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num); + if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) { + ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT); + free(ctx); + return; + } + + id = bs_page_to_blobid(ctx->page_num); + + spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx); +} + 
+void +spdk_bs_iter_first(struct spdk_blob_store *bs, + spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_iter_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->page_num = -1; + ctx->bs = bs; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + bs_iter_cpl(ctx, NULL, -1); +} + +static void +bs_iter_close_cpl(void *cb_arg, int bserrno) +{ + struct spdk_bs_iter_ctx *ctx = cb_arg; + + bs_iter_cpl(ctx, NULL, -1); +} + +void +spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob, + spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_iter_ctx *ctx; + + assert(blob != NULL); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->page_num = bs_blobid_to_page(blob->id); + ctx->bs = bs; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + /* Close the existing blob */ + spdk_blob_close(blob, bs_iter_close_cpl, ctx); +} + +static int +blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, + uint16_t value_len, bool internal) +{ + struct spdk_xattr_tailq *xattrs; + struct spdk_xattr *xattr; + size_t desc_size; + void *tmp; + + blob_verify_md_op(blob); + + if (blob->md_ro) { + return -EPERM; + } + + desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len; + if (desc_size > SPDK_BS_MAX_DESC_SIZE) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Xattr '%s' of size %ld does not fix into single page %ld\n", name, + desc_size, SPDK_BS_MAX_DESC_SIZE); + return -ENOMEM; + } + + if (internal) { + xattrs = &blob->xattrs_internal; + blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR; + } else { + xattrs = &blob->xattrs; + } + + TAILQ_FOREACH(xattr, xattrs, link) { + if (!strcmp(name, xattr->name)) { + tmp = malloc(value_len); + if (!tmp) { + return -ENOMEM; + } + + free(xattr->value); + xattr->value_len = value_len; + xattr->value = tmp; + memcpy(xattr->value, value, value_len); + + blob->state = SPDK_BLOB_STATE_DIRTY; + + return 0; + } + } + + xattr = calloc(1, sizeof(*xattr)); + if (!xattr) { + return -ENOMEM; + } + + xattr->name = strdup(name); + if (!xattr->name) { + free(xattr); + return -ENOMEM; + } + + xattr->value_len = value_len; + xattr->value = malloc(value_len); + if (!xattr->value) { + free(xattr->name); + free(xattr); + return -ENOMEM; + } + memcpy(xattr->value, value, value_len); + TAILQ_INSERT_TAIL(xattrs, xattr, link); + + blob->state = SPDK_BLOB_STATE_DIRTY; + + return 0; +} + +int +spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, + uint16_t value_len) +{ + return blob_set_xattr(blob, name, value, value_len, false); +} + +static int +blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal) +{ + struct spdk_xattr_tailq *xattrs; + struct spdk_xattr *xattr; + + blob_verify_md_op(blob); + + if (blob->md_ro) { + return -EPERM; + } + xattrs = internal ? 
&blob->xattrs_internal : &blob->xattrs; + + TAILQ_FOREACH(xattr, xattrs, link) { + if (!strcmp(name, xattr->name)) { + TAILQ_REMOVE(xattrs, xattr, link); + free(xattr->value); + free(xattr->name); + free(xattr); + + if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) { + blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR; + } + blob->state = SPDK_BLOB_STATE_DIRTY; + + return 0; + } + } + + return -ENOENT; +} + +int +spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name) +{ + return blob_remove_xattr(blob, name, false); +} + +static int +blob_get_xattr_value(struct spdk_blob *blob, const char *name, + const void **value, size_t *value_len, bool internal) +{ + struct spdk_xattr *xattr; + struct spdk_xattr_tailq *xattrs; + + xattrs = internal ? &blob->xattrs_internal : &blob->xattrs; + + TAILQ_FOREACH(xattr, xattrs, link) { + if (!strcmp(name, xattr->name)) { + *value = xattr->value; + *value_len = xattr->value_len; + return 0; + } + } + return -ENOENT; +} + +int +spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name, + const void **value, size_t *value_len) +{ + blob_verify_md_op(blob); + + return blob_get_xattr_value(blob, name, value, value_len, false); +} + +struct spdk_xattr_names { + uint32_t count; + const char *names[0]; +}; + +static int +blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names) +{ + struct spdk_xattr *xattr; + int count = 0; + + TAILQ_FOREACH(xattr, xattrs, link) { + count++; + } + + *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *)); + if (*names == NULL) { + return -ENOMEM; + } + + TAILQ_FOREACH(xattr, xattrs, link) { + (*names)->names[(*names)->count++] = xattr->name; + } + + return 0; +} + +int +spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names) +{ + blob_verify_md_op(blob); + + return blob_get_xattr_names(&blob->xattrs, names); +} + +uint32_t +spdk_xattr_names_get_count(struct spdk_xattr_names *names) +{ + assert(names != NULL); + + return names->count; +} + +const char * +spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index) +{ + if (index >= names->count) { + return NULL; + } + + return names->names[index]; +} + +void +spdk_xattr_names_free(struct spdk_xattr_names *names) +{ + free(names); +} + +struct spdk_bs_type +spdk_bs_get_bstype(struct spdk_blob_store *bs) +{ + return bs->bstype; +} + +void +spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype) +{ + memcpy(&bs->bstype, &bstype, sizeof(bstype)); +} + +bool +spdk_blob_is_read_only(struct spdk_blob *blob) +{ + assert(blob != NULL); + return (blob->data_ro || blob->md_ro); +} + +bool +spdk_blob_is_snapshot(struct spdk_blob *blob) +{ + struct spdk_blob_list *snapshot_entry; + + assert(blob != NULL); + + snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); + if (snapshot_entry == NULL) { + return false; + } + + return true; +} + +bool +spdk_blob_is_clone(struct spdk_blob *blob) +{ + assert(blob != NULL); + + if (blob->parent_id != SPDK_BLOBID_INVALID) { + assert(spdk_blob_is_thin_provisioned(blob)); + return true; + } + + return false; +} + +bool +spdk_blob_is_thin_provisioned(struct spdk_blob *blob) +{ + assert(blob != NULL); + return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV); +} + +static void +blob_update_clear_method(struct spdk_blob *blob) +{ + enum blob_clear_method stored_cm; + + assert(blob != NULL); + + /* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored + * in metadata previously. 
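[Editor's note] The xattr setters and getters above operate purely on in-memory state; a typical round trip (key, value, and names are illustrative) therefore pairs them with spdk_blob_sync_md() to make the change persistent.

#include "spdk/blob.h"

static void
example_xattr_synced(void *cb_arg, int bserrno)
{
	/* The xattr change has been persisted, or bserrno reports why not. */
}

static void
example_tag_blob(struct spdk_blob *blob)
{
	const void *val;
	size_t len;

	spdk_blob_set_xattr(blob, "name", "vol0", sizeof("vol0"));

	if (spdk_blob_get_xattr_value(blob, "name", &val, &len) == 0) {
		/* val points at the blob's stored copy, len == sizeof("vol0") */
	}

	spdk_blob_sync_md(blob, example_xattr_synced, NULL);
}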
If something other than the default was + * specified, ignore stored value and used what was passed in. + */ + stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT); + + if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) { + blob->clear_method = stored_cm; + } else if (blob->clear_method != stored_cm) { + SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n", + blob->clear_method, stored_cm); + } +} + +spdk_blob_id +spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + + TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + if (clone_entry->id == blob_id) { + return snapshot_entry->id; + } + } + } + + return SPDK_BLOBID_INVALID; +} + +int +spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids, + size_t *count) +{ + struct spdk_blob_list *snapshot_entry, *clone_entry; + size_t n; + + snapshot_entry = bs_get_snapshot_entry(bs, blobid); + if (snapshot_entry == NULL) { + *count = 0; + return 0; + } + + if (ids == NULL || *count < snapshot_entry->clone_count) { + *count = snapshot_entry->clone_count; + return -ENOMEM; + } + *count = snapshot_entry->clone_count; + + n = 0; + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + ids[n++] = clone_entry->id; + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("blob", SPDK_LOG_BLOB) diff --git a/src/spdk/lib/blob/blobstore.h b/src/spdk/lib/blob/blobstore.h new file mode 100644 index 000000000..5e93bd6ad --- /dev/null +++ b/src/spdk/lib/blob/blobstore.h @@ -0,0 +1,702 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
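[Editor's note] spdk_blob_get_clones() above uses a query-then-fill pattern. A hedged caller-side sketch (allocation strategy and names are illustrative): calling it with ids == NULL fails with -ENOMEM and reports the required count, after which a correctly sized array can be passed in.

#include "spdk/stdinc.h"
#include "spdk/blob.h"

static void
example_list_clones(struct spdk_blob_store *bs, spdk_blob_id snap_id)
{
	size_t count = 0;
	spdk_blob_id *ids;

	if (spdk_blob_get_clones(bs, snap_id, NULL, &count) == 0) {
		/* No snapshot entry for snap_id, hence no clones to list. */
		return;
	}

	ids = calloc(count, sizeof(*ids));
	if (ids == NULL) {
		return;
	}

	if (spdk_blob_get_clones(bs, snap_id, ids, &count) == 0) {
		/* ids[0..count-1] now hold the clone blob ids */
	}

	free(ids);
}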
+ */ + +#ifndef SPDK_BLOBSTORE_H +#define SPDK_BLOBSTORE_H + +#include "spdk/assert.h" +#include "spdk/blob.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +#include "request.h" + +/* In Memory Data Structures + * + * The following data structures exist only in memory. + */ + +#define SPDK_BLOB_OPTS_CLUSTER_SZ (1024 * 1024) +#define SPDK_BLOB_OPTS_NUM_MD_PAGES UINT32_MAX +#define SPDK_BLOB_OPTS_MAX_MD_OPS 32 +#define SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS 512 +#define SPDK_BLOB_BLOBID_HIGH_BIT (1ULL << 32) + +struct spdk_xattr { + uint32_t index; + uint16_t value_len; + char *name; + void *value; + TAILQ_ENTRY(spdk_xattr) link; +}; + +/* The mutable part of the blob data that is sync'd to + * disk. The data in here is both mutable and persistent. + */ +struct spdk_blob_mut_data { + /* Number of data clusters in the blob */ + uint64_t num_clusters; + + /* Array LBAs that are the beginning of a cluster, in + * the order they appear in the blob. + */ + uint64_t *clusters; + + /* The size of the clusters array. This is greater than or + * equal to 'num_clusters'. + */ + size_t cluster_array_size; + + /* Number of extent pages */ + uint64_t num_extent_pages; + + /* Array of page offsets into the metadata region, + * containing extents. Can contain entries for not yet + * allocated pages. */ + uint32_t *extent_pages; + + /* The size of the extent page array. This is greater than or + * equal to 'num_extent_pages'. */ + size_t extent_pages_array_size; + + /* Number of metadata pages */ + uint32_t num_pages; + + /* Array of page offsets into the metadata region, in + * the order of the metadata page sequence. + */ + uint32_t *pages; +}; + +enum spdk_blob_state { + /* The blob in-memory version does not match the on-disk + * version. + */ + SPDK_BLOB_STATE_DIRTY, + + /* The blob in memory version of the blob matches the on disk + * version. + */ + SPDK_BLOB_STATE_CLEAN, + + /* The in-memory state being synchronized with the on-disk + * blob state. */ + SPDK_BLOB_STATE_LOADING, +}; + +TAILQ_HEAD(spdk_xattr_tailq, spdk_xattr); + +struct spdk_blob_list { + spdk_blob_id id; + size_t clone_count; + TAILQ_HEAD(, spdk_blob_list) clones; + TAILQ_ENTRY(spdk_blob_list) link; +}; + +struct spdk_blob { + struct spdk_blob_store *bs; + + uint32_t open_ref; + + spdk_blob_id id; + spdk_blob_id parent_id; + + enum spdk_blob_state state; + + /* Two copies of the mutable data. One is a version + * that matches the last known data on disk (clean). + * The other (active) is the current data. Syncing + * a blob makes the clean match the active. + */ + struct spdk_blob_mut_data clean; + struct spdk_blob_mut_data active; + + bool invalid; + bool data_ro; + bool md_ro; + + uint64_t invalid_flags; + uint64_t data_ro_flags; + uint64_t md_ro_flags; + + struct spdk_bs_dev *back_bs_dev; + + /* TODO: The xattrs are mutable, but we don't want to be + * copying them unnecessarily. Figure this out. + */ + struct spdk_xattr_tailq xattrs; + struct spdk_xattr_tailq xattrs_internal; + + TAILQ_ENTRY(spdk_blob) link; + + uint32_t frozen_refcnt; + bool locked_operation_in_progress; + enum blob_clear_method clear_method; + bool extent_rle_found; + bool extent_table_found; + bool use_extent_table; + + /* A list of pending metadata pending_persists */ + TAILQ_HEAD(, spdk_blob_persist_ctx) pending_persists; + + /* Number of data clusters retrived from extent table, + * that many have to be read from extent pages. 
*/ + uint64_t remaining_clusters_in_et; +}; + +struct spdk_blob_store { + uint64_t md_start; /* Offset from beginning of disk, in pages */ + uint32_t md_len; /* Count, in pages */ + + struct spdk_io_channel *md_channel; + uint32_t max_channel_ops; + + struct spdk_thread *md_thread; + + struct spdk_bs_dev *dev; + + struct spdk_bit_array *used_md_pages; + struct spdk_bit_array *used_clusters; + struct spdk_bit_array *used_blobids; + struct spdk_bit_array *open_blobids; + + pthread_mutex_t used_clusters_mutex; + + uint32_t cluster_sz; + uint64_t total_clusters; + uint64_t total_data_clusters; + uint64_t num_free_clusters; + uint64_t pages_per_cluster; + uint8_t pages_per_cluster_shift; + uint32_t io_unit_size; + + spdk_blob_id super_blob; + struct spdk_bs_type bstype; + + struct spdk_bs_cpl unload_cpl; + int unload_err; + + TAILQ_HEAD(, spdk_blob) blobs; + TAILQ_HEAD(, spdk_blob_list) snapshots; + + bool clean; +}; + +struct spdk_bs_channel { + struct spdk_bs_request_set *req_mem; + TAILQ_HEAD(, spdk_bs_request_set) reqs; + + struct spdk_blob_store *bs; + + struct spdk_bs_dev *dev; + struct spdk_io_channel *dev_channel; + + TAILQ_HEAD(, spdk_bs_request_set) need_cluster_alloc; + TAILQ_HEAD(, spdk_bs_request_set) queued_io; +}; + +/** operation type */ +enum spdk_blob_op_type { + SPDK_BLOB_WRITE, + SPDK_BLOB_READ, + SPDK_BLOB_UNMAP, + SPDK_BLOB_WRITE_ZEROES, + SPDK_BLOB_WRITEV, + SPDK_BLOB_READV, +}; + +/* back bs_dev */ + +#define BLOB_SNAPSHOT "SNAP" +#define SNAPSHOT_IN_PROGRESS "SNAPTMP" +#define SNAPSHOT_PENDING_REMOVAL "SNAPRM" + +struct spdk_blob_bs_dev { + struct spdk_bs_dev bs_dev; + struct spdk_blob *blob; +}; + +/* On-Disk Data Structures + * + * The following data structures exist on disk. + */ +#define SPDK_BS_INITIAL_VERSION 1 +#define SPDK_BS_VERSION 3 /* current version */ + +#pragma pack(push, 1) + +#define SPDK_MD_MASK_TYPE_USED_PAGES 0 +#define SPDK_MD_MASK_TYPE_USED_CLUSTERS 1 +#define SPDK_MD_MASK_TYPE_USED_BLOBIDS 2 + +struct spdk_bs_md_mask { + uint8_t type; + uint32_t length; /* In bits */ + uint8_t mask[0]; +}; + +#define SPDK_MD_DESCRIPTOR_TYPE_PADDING 0 +#define SPDK_MD_DESCRIPTOR_TYPE_XATTR 2 +#define SPDK_MD_DESCRIPTOR_TYPE_FLAGS 3 +#define SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL 4 + +/* Following descriptors define cluster layout in a blob. + * EXTENT_RLE cannot be present in blobs metadata, + * at the same time as EXTENT_TABLE and EXTENT_PAGE descriptors. */ + +/* EXTENT_RLE descriptor holds an array of LBA that points to + * beginning of allocated clusters. The array is run-length encoded, + * with 0's being unallocated clusters. It is part of serialized + * metadata chain for a blob. */ +#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE 1 +/* EXTENT_TABLE descriptor holds array of md page offsets that + * point to pages with EXTENT_PAGE descriptor. The 0's in the array + * are run-length encoded, non-zero values are unallocated pages. + * It is part of serialized metadata chain for a blob. */ +#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE 5 +/* EXTENT_PAGE descriptor holds an array of LBAs that point to + * beginning of allocated clusters. The array is run-length encoded, + * with 0's being unallocated clusters. It is NOT part of + * serialized metadata chain for a blob. */ +#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE 6 + +struct spdk_blob_md_descriptor_xattr { + uint8_t type; + uint32_t length; + + uint16_t name_length; + uint16_t value_length; + + char name[0]; + /* String name immediately followed by string value. 
*/ +}; + +struct spdk_blob_md_descriptor_extent_rle { + uint8_t type; + uint32_t length; + + struct { + uint32_t cluster_idx; + uint32_t length; /* In units of clusters */ + } extents[0]; +}; + +struct spdk_blob_md_descriptor_extent_table { + uint8_t type; + uint32_t length; + + /* Number of data clusters in the blob */ + uint64_t num_clusters; + + struct { + uint32_t page_idx; + uint32_t num_pages; /* In units of pages */ + } extent_page[0]; +}; + +struct spdk_blob_md_descriptor_extent_page { + uint8_t type; + uint32_t length; + + /* First cluster index in this extent page */ + uint32_t start_cluster_idx; + + uint32_t cluster_idx[0]; +}; + +#define SPDK_BLOB_THIN_PROV (1ULL << 0) +#define SPDK_BLOB_INTERNAL_XATTR (1ULL << 1) +#define SPDK_BLOB_EXTENT_TABLE (1ULL << 2) +#define SPDK_BLOB_INVALID_FLAGS_MASK (SPDK_BLOB_THIN_PROV | SPDK_BLOB_INTERNAL_XATTR | SPDK_BLOB_EXTENT_TABLE) + +#define SPDK_BLOB_READ_ONLY (1ULL << 0) +#define SPDK_BLOB_DATA_RO_FLAGS_MASK SPDK_BLOB_READ_ONLY + +#define SPDK_BLOB_CLEAR_METHOD_SHIFT 0 +#define SPDK_BLOB_CLEAR_METHOD (3ULL << SPDK_BLOB_CLEAR_METHOD_SHIFT) +#define SPDK_BLOB_MD_RO_FLAGS_MASK SPDK_BLOB_CLEAR_METHOD + +struct spdk_blob_md_descriptor_flags { + uint8_t type; + uint32_t length; + + /* + * If a flag in invalid_flags is set that the application is not aware of, + * it will not allow the blob to be opened. + */ + uint64_t invalid_flags; + + /* + * If a flag in data_ro_flags is set that the application is not aware of, + * allow the blob to be opened in data_read_only and md_read_only mode. + */ + uint64_t data_ro_flags; + + /* + * If a flag in md_ro_flags is set the the application is not aware of, + * allow the blob to be opened in md_read_only mode. + */ + uint64_t md_ro_flags; +}; + +struct spdk_blob_md_descriptor { + uint8_t type; + uint32_t length; +}; + +#define SPDK_INVALID_MD_PAGE UINT32_MAX + +struct spdk_blob_md_page { + spdk_blob_id id; + + uint32_t sequence_num; + uint32_t reserved0; + + /* Descriptors here */ + uint8_t descriptors[4072]; + + uint32_t next; + uint32_t crc; +}; +#define SPDK_BS_PAGE_SIZE 0x1000 +SPDK_STATIC_ASSERT(SPDK_BS_PAGE_SIZE == sizeof(struct spdk_blob_md_page), "Invalid md page size"); + +#define SPDK_BS_MAX_DESC_SIZE sizeof(((struct spdk_blob_md_page*)0)->descriptors) + +/* Maximum number of extents a single Extent Page can fit. + * For an SPDK_BS_PAGE_SIZE of 4K SPDK_EXTENTS_PER_EP would be 512. */ +#define SPDK_EXTENTS_PER_EP_MAX ((SPDK_BS_MAX_DESC_SIZE - sizeof(struct spdk_blob_md_descriptor_extent_page)) / sizeof(uint32_t)) +#define SPDK_EXTENTS_PER_EP (spdk_align64pow2(SPDK_EXTENTS_PER_EP_MAX + 1) >> 1u) + +#define SPDK_BS_SUPER_BLOCK_SIG "SPDKBLOB" + +struct spdk_bs_super_block { + uint8_t signature[8]; + uint32_t version; + uint32_t length; + uint32_t clean; /* If there was a clean shutdown, this is 1. 
*/ + spdk_blob_id super_blob; + + uint32_t cluster_size; /* In bytes */ + + uint32_t used_page_mask_start; /* Offset from beginning of disk, in pages */ + uint32_t used_page_mask_len; /* Count, in pages */ + + uint32_t used_cluster_mask_start; /* Offset from beginning of disk, in pages */ + uint32_t used_cluster_mask_len; /* Count, in pages */ + + uint32_t md_start; /* Offset from beginning of disk, in pages */ + uint32_t md_len; /* Count, in pages */ + + struct spdk_bs_type bstype; /* blobstore type */ + + uint32_t used_blobid_mask_start; /* Offset from beginning of disk, in pages */ + uint32_t used_blobid_mask_len; /* Count, in pages */ + + uint64_t size; /* size of blobstore in bytes */ + uint32_t io_unit_size; /* Size of io unit in bytes */ + + uint8_t reserved[4000]; + uint32_t crc; +}; +SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_super_block) == 0x1000, "Invalid super block size"); + +#pragma pack(pop) + +struct spdk_bs_dev *bs_create_zeroes_dev(void); +struct spdk_bs_dev *bs_create_blob_bs_dev(struct spdk_blob *blob); + +/* Unit Conversions + * + * The blobstore works with several different units: + * - Byte: Self explanatory + * - LBA: The logical blocks on the backing storage device. + * - Page: The read/write units of blobs and metadata. This is + * an offset into a blob in units of 4KiB. + * - Cluster Index: The disk is broken into a sequential list of + * clusters. This is the offset from the beginning. + * + * NOTE: These conversions all act on simple magnitudes, not with any sort + * of knowledge about the blobs themselves. For instance, converting + * a page to an lba with the conversion function below simply converts + * a number of pages to an equivalent number of lbas, but that + * lba certainly isn't the right lba that corresponds to a page offset + * for a particular blob. 
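[Editor's note] To make the note above concrete, here is a worked example under one assumed geometry (1 MiB clusters, 4 KiB metadata/blob pages, a 512-byte block device); the numbers are illustrative only and simply apply the conversion helpers defined just below:

    pages_per_cluster         = 1048576 / 4096          = 256
    bs_byte_to_lba(bs, 8192)  = 8192 / 512               = 16 LBAs
    bs_page_to_lba(bs, 10)    = 10 * 4096 / 512          = 80
    bs_cluster_to_page(bs, 3) = 3 * 256                  = 768
    bs_cluster_to_lba(bs, 3)  = 3 * (1048576 / 512)      = 6144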
+ */ +static inline uint64_t +bs_byte_to_lba(struct spdk_blob_store *bs, uint64_t length) +{ + assert(length % bs->dev->blocklen == 0); + + return length / bs->dev->blocklen; +} + +static inline uint64_t +bs_dev_byte_to_lba(struct spdk_bs_dev *bs_dev, uint64_t length) +{ + assert(length % bs_dev->blocklen == 0); + + return length / bs_dev->blocklen; +} + +static inline uint64_t +bs_page_to_lba(struct spdk_blob_store *bs, uint64_t page) +{ + return page * SPDK_BS_PAGE_SIZE / bs->dev->blocklen; +} + +static inline uint64_t +bs_md_page_to_lba(struct spdk_blob_store *bs, uint32_t page) +{ + assert(page < bs->md_len); + return bs_page_to_lba(bs, page + bs->md_start); +} + +static inline uint64_t +bs_dev_page_to_lba(struct spdk_bs_dev *bs_dev, uint64_t page) +{ + return page * SPDK_BS_PAGE_SIZE / bs_dev->blocklen; +} + +static inline uint64_t +bs_io_unit_per_page(struct spdk_blob_store *bs) +{ + return SPDK_BS_PAGE_SIZE / bs->io_unit_size; +} + +static inline uint64_t +bs_io_unit_to_page(struct spdk_blob_store *bs, uint64_t io_unit) +{ + return io_unit / bs_io_unit_per_page(bs); +} + +static inline uint64_t +bs_cluster_to_page(struct spdk_blob_store *bs, uint32_t cluster) +{ + return (uint64_t)cluster * bs->pages_per_cluster; +} + +static inline uint32_t +bs_page_to_cluster(struct spdk_blob_store *bs, uint64_t page) +{ + assert(page % bs->pages_per_cluster == 0); + + return page / bs->pages_per_cluster; +} + +static inline uint64_t +bs_cluster_to_lba(struct spdk_blob_store *bs, uint32_t cluster) +{ + return (uint64_t)cluster * (bs->cluster_sz / bs->dev->blocklen); +} + +static inline uint32_t +bs_lba_to_cluster(struct spdk_blob_store *bs, uint64_t lba) +{ + assert(lba % (bs->cluster_sz / bs->dev->blocklen) == 0); + + return lba / (bs->cluster_sz / bs->dev->blocklen); +} + +static inline uint64_t +bs_io_unit_to_back_dev_lba(struct spdk_blob *blob, uint64_t io_unit) +{ + return io_unit * (blob->bs->io_unit_size / blob->back_bs_dev->blocklen); +} + +static inline uint64_t +bs_back_dev_lba_to_io_unit(struct spdk_blob *blob, uint64_t lba) +{ + return lba * (blob->back_bs_dev->blocklen / blob->bs->io_unit_size); +} + +static inline uint64_t +bs_cluster_to_extent_table_id(uint64_t cluster_num) +{ + return cluster_num / SPDK_EXTENTS_PER_EP; +} + +static inline uint32_t * +bs_cluster_to_extent_page(struct spdk_blob *blob, uint64_t cluster_num) +{ + uint64_t extent_table_id = bs_cluster_to_extent_table_id(cluster_num); + + assert(blob->use_extent_table); + assert(extent_table_id < blob->active.extent_pages_array_size); + + return &blob->active.extent_pages[extent_table_id]; +} + +/* End basic conversions */ + +static inline uint64_t +bs_blobid_to_page(spdk_blob_id id) +{ + return id & 0xFFFFFFFF; +} + +/* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper + * 32 bits are not currently used. Stick a 1 there just to catch bugs where the + * code assumes blob id == page_idx. + */ +static inline spdk_blob_id +bs_page_to_blobid(uint64_t page_idx) +{ + if (page_idx > UINT32_MAX) { + return SPDK_BLOBID_INVALID; + } + return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx; +} + +/* Given an io unit offset into a blob, look up the LBA for the + * start of that io unit. 
+ */ +static inline uint64_t +bs_blob_io_unit_to_lba(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t lba; + uint64_t pages_per_cluster; + uint8_t shift; + uint64_t io_units_per_cluster; + uint64_t io_units_per_page; + uint64_t page; + + page = bs_io_unit_to_page(blob->bs, io_unit); + + pages_per_cluster = blob->bs->pages_per_cluster; + shift = blob->bs->pages_per_cluster_shift; + io_units_per_page = bs_io_unit_per_page(blob->bs); + + assert(page < blob->active.num_clusters * pages_per_cluster); + + if (shift != 0) { + io_units_per_cluster = io_units_per_page << shift; + lba = blob->active.clusters[page >> shift]; + } else { + io_units_per_cluster = io_units_per_page * pages_per_cluster; + lba = blob->active.clusters[page / pages_per_cluster]; + } + lba += io_unit % io_units_per_cluster; + return lba; +} + +/* Given an io_unit offset into a blob, look up the number of io_units until the + * next cluster boundary. + */ +static inline uint32_t +bs_num_io_units_to_cluster_boundary(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t io_units_per_cluster; + uint8_t shift = blob->bs->pages_per_cluster_shift; + + if (shift != 0) { + io_units_per_cluster = bs_io_unit_per_page(blob->bs) << shift; + } else { + io_units_per_cluster = bs_io_unit_per_page(blob->bs) * blob->bs->pages_per_cluster; + } + + return io_units_per_cluster - (io_unit % io_units_per_cluster); +} + +/* Given a page offset into a blob, look up the number of pages until the + * next cluster boundary. + */ +static inline uint32_t +bs_num_pages_to_cluster_boundary(struct spdk_blob *blob, uint64_t page) +{ + uint64_t pages_per_cluster; + + pages_per_cluster = blob->bs->pages_per_cluster; + + return pages_per_cluster - (page % pages_per_cluster); +} + +/* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */ +static inline uint32_t +bs_io_unit_to_cluster_start(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t pages_per_cluster; + uint64_t page; + + pages_per_cluster = blob->bs->pages_per_cluster; + page = bs_io_unit_to_page(blob->bs, io_unit); + + return page - (page % pages_per_cluster); +} + +/* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */ +static inline uint32_t +bs_io_unit_to_cluster_number(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t pages_per_cluster = blob->bs->pages_per_cluster; + uint8_t shift = blob->bs->pages_per_cluster_shift; + uint32_t page_offset; + + page_offset = io_unit / bs_io_unit_per_page(blob->bs); + if (shift != 0) { + return page_offset >> shift; + } else { + return page_offset / pages_per_cluster; + } +} + +/* Given an io unit offset into a blob, look up if it is from allocated cluster. 
*/ +static inline bool +bs_io_unit_is_allocated(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t lba; + uint64_t page; + uint64_t pages_per_cluster; + uint8_t shift; + + shift = blob->bs->pages_per_cluster_shift; + pages_per_cluster = blob->bs->pages_per_cluster; + page = bs_io_unit_to_page(blob->bs, io_unit); + + assert(page < blob->active.num_clusters * pages_per_cluster); + + if (shift != 0) { + lba = blob->active.clusters[page >> shift]; + } else { + lba = blob->active.clusters[page / pages_per_cluster]; + } + + if (lba == 0) { + assert(spdk_blob_is_thin_provisioned(blob)); + return false; + } else { + return true; + } +} + +#endif diff --git a/src/spdk/lib/blob/request.c b/src/spdk/lib/blob/request.c new file mode 100644 index 000000000..0975bcf24 --- /dev/null +++ b/src/spdk/lib/blob/request.c @@ -0,0 +1,521 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "blobstore.h" +#include "request.h" + +#include "spdk/thread.h" +#include "spdk/queue.h" + +#include "spdk_internal/log.h" + +void +bs_call_cpl(struct spdk_bs_cpl *cpl, int bserrno) +{ + switch (cpl->type) { + case SPDK_BS_CPL_TYPE_BS_BASIC: + cpl->u.bs_basic.cb_fn(cpl->u.bs_basic.cb_arg, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BS_HANDLE: + cpl->u.bs_handle.cb_fn(cpl->u.bs_handle.cb_arg, + bserrno == 0 ? cpl->u.bs_handle.bs : NULL, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOB_BASIC: + cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOBID: + cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, + bserrno == 0 ? cpl->u.blobid.blobid : SPDK_BLOBID_INVALID, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOB_HANDLE: + cpl->u.blob_handle.cb_fn(cpl->u.blob_handle.cb_arg, + bserrno == 0 ? 
cpl->u.blob_handle.blob : NULL, + bserrno); + break; + case SPDK_BS_CPL_TYPE_NESTED_SEQUENCE: + cpl->u.nested_seq.cb_fn(cpl->u.nested_seq.cb_arg, + cpl->u.nested_seq.parent, + bserrno); + break; + case SPDK_BS_CPL_TYPE_NONE: + /* this completion's callback is handled elsewhere */ + break; + } +} + +static void +bs_request_set_complete(struct spdk_bs_request_set *set) +{ + struct spdk_bs_cpl cpl = set->cpl; + int bserrno = set->bserrno; + + TAILQ_INSERT_TAIL(&set->channel->reqs, set, link); + + bs_call_cpl(&cpl, bserrno); +} + +static void +bs_sequence_completion(struct spdk_io_channel *channel, void *cb_arg, int bserrno) +{ + struct spdk_bs_request_set *set = cb_arg; + + set->bserrno = bserrno; + set->u.sequence.cb_fn((spdk_bs_sequence_t *)set, set->u.sequence.cb_arg, bserrno); +} + +spdk_bs_sequence_t * +bs_sequence_start(struct spdk_io_channel *_channel, + struct spdk_bs_cpl *cpl) +{ + struct spdk_bs_channel *channel; + struct spdk_bs_request_set *set; + + channel = spdk_io_channel_get_ctx(_channel); + assert(channel != NULL); + set = TAILQ_FIRST(&channel->reqs); + if (!set) { + return NULL; + } + TAILQ_REMOVE(&channel->reqs, set, link); + + set->cpl = *cpl; + set->bserrno = 0; + set->channel = channel; + + set->cb_args.cb_fn = bs_sequence_completion; + set->cb_args.cb_arg = set; + set->cb_args.channel = channel->dev_channel; + + return (spdk_bs_sequence_t *)set; +} + +void +bs_sequence_read_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + bs_dev->read(bs_dev, spdk_io_channel_from_ctx(channel), payload, lba, lba_count, &set->cb_args); +} + +void +bs_sequence_read_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->read(channel->dev, channel->dev_channel, payload, lba, lba_count, &set->cb_args); +} + +void +bs_sequence_write_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->write(channel->dev, channel->dev_channel, payload, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_readv_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev, + struct iovec *iov, int iovcnt, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = 
cb_fn; + set->u.sequence.cb_arg = cb_arg; + + bs_dev->readv(bs_dev, spdk_io_channel_from_ctx(channel), iov, iovcnt, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_readv_dev(spdk_bs_sequence_t *seq, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + channel->dev->readv(channel->dev, channel->dev_channel, iov, iovcnt, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_writev_dev(spdk_bs_sequence_t *seq, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->writev(channel->dev, channel->dev_channel, iov, iovcnt, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "writing zeroes to %" PRIu32 " blocks at LBA %" PRIu64 "\n", + lba_count, lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->write_zeroes(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno) +{ + if (bserrno != 0) { + seq->bserrno = bserrno; + } + bs_request_set_complete((struct spdk_bs_request_set *)seq); +} + +void +bs_user_op_sequence_finish(void *cb_arg, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + + bs_sequence_finish(seq, bserrno); +} + +static void +bs_batch_completion(struct spdk_io_channel *_channel, + void *cb_arg, int bserrno) +{ + struct spdk_bs_request_set *set = cb_arg; + + set->u.batch.outstanding_ops--; + if (bserrno != 0) { + set->bserrno = bserrno; + } + + if (set->u.batch.outstanding_ops == 0 && set->u.batch.batch_closed) { + if (set->u.batch.cb_fn) { + set->cb_args.cb_fn = bs_sequence_completion; + set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, bserrno); + } else { + bs_request_set_complete(set); + } + } +} + +spdk_bs_batch_t * +bs_batch_open(struct spdk_io_channel *_channel, + struct spdk_bs_cpl *cpl) +{ + struct spdk_bs_channel *channel; + struct spdk_bs_request_set *set; + + channel = spdk_io_channel_get_ctx(_channel); + assert(channel != NULL); + set = TAILQ_FIRST(&channel->reqs); + if (!set) { + return NULL; + } + TAILQ_REMOVE(&channel->reqs, set, link); + + set->cpl = *cpl; + set->bserrno = 0; + set->channel = channel; + + set->u.batch.cb_fn = NULL; + set->u.batch.cb_arg = NULL; + set->u.batch.outstanding_ops = 0; + set->u.batch.batch_closed = 0; + + set->cb_args.cb_fn = bs_batch_completion; + set->cb_args.cb_arg = set; + set->cb_args.channel = channel->dev_channel; + + return (spdk_bs_batch_t *)set; +} + +void +bs_batch_read_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t 
lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.batch.outstanding_ops++; + bs_dev->read(bs_dev, spdk_io_channel_from_ctx(channel), payload, lba, lba_count, &set->cb_args); +} + +void +bs_batch_read_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.batch.outstanding_ops++; + channel->dev->read(channel->dev, channel->dev_channel, payload, lba, lba_count, &set->cb_args); +} + +void +bs_batch_write_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks to LBA %" PRIu64 "\n", lba_count, lba); + + set->u.batch.outstanding_ops++; + channel->dev->write(channel->dev, channel->dev_channel, payload, lba, lba_count, + &set->cb_args); +} + +void +bs_batch_unmap_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Unmapping %" PRIu32 " blocks at LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.batch.outstanding_ops++; + channel->dev->unmap(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +bs_batch_write_zeroes_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Zeroing %" PRIu32 " blocks at LBA %" PRIu64 "\n", lba_count, lba); + + set->u.batch.outstanding_ops++; + channel->dev->write_zeroes(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +bs_batch_close(spdk_bs_batch_t *batch) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + + set->u.batch.batch_closed = 1; + + if (set->u.batch.outstanding_ops == 0) { + if (set->u.batch.cb_fn) { + set->cb_args.cb_fn = bs_sequence_completion; + set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, set->bserrno); + } else { + bs_request_set_complete(set); + } + } +} + +spdk_bs_batch_t * +bs_sequence_to_batch(spdk_bs_sequence_t *seq, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + + set->u.batch.cb_fn = cb_fn; + set->u.batch.cb_arg = cb_arg; + set->u.batch.outstanding_ops = 0; + set->u.batch.batch_closed = 0; + + set->cb_args.cb_fn = bs_batch_completion; + + return set; +} + +spdk_bs_user_op_t * +bs_user_op_alloc(struct spdk_io_channel *_channel, struct spdk_bs_cpl *cpl, + enum spdk_blob_op_type op_type, struct spdk_blob *blob, + void *payload, int iovcnt, uint64_t offset, uint64_t length) +{ + struct spdk_bs_channel *channel; + struct spdk_bs_request_set *set; + struct spdk_bs_user_op_args *args; + + channel = spdk_io_channel_get_ctx(_channel); + assert(channel != NULL); + set = TAILQ_FIRST(&channel->reqs); + if (!set) { + return NULL; + } + TAILQ_REMOVE(&channel->reqs, 
set, link); + + set->cpl = *cpl; + set->channel = channel; + + args = &set->u.user_op; + + args->type = op_type; + args->iovcnt = iovcnt; + args->blob = blob; + args->offset = offset; + args->length = length; + args->payload = payload; + + return (spdk_bs_user_op_t *)set; +} + +void +bs_user_op_execute(spdk_bs_user_op_t *op) +{ + struct spdk_bs_request_set *set; + struct spdk_bs_user_op_args *args; + struct spdk_io_channel *ch; + + set = (struct spdk_bs_request_set *)op; + args = &set->u.user_op; + ch = spdk_io_channel_from_ctx(set->channel); + + switch (args->type) { + case SPDK_BLOB_READ: + spdk_blob_io_read(args->blob, ch, args->payload, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_WRITE: + spdk_blob_io_write(args->blob, ch, args->payload, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_UNMAP: + spdk_blob_io_unmap(args->blob, ch, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_WRITE_ZEROES: + spdk_blob_io_write_zeroes(args->blob, ch, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_READV: + spdk_blob_io_readv(args->blob, ch, args->payload, args->iovcnt, + args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_WRITEV: + spdk_blob_io_writev(args->blob, ch, args->payload, args->iovcnt, + args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + } + TAILQ_INSERT_TAIL(&set->channel->reqs, set, link); +} + +void +bs_user_op_abort(spdk_bs_user_op_t *op) +{ + struct spdk_bs_request_set *set; + + set = (struct spdk_bs_request_set *)op; + + set->cpl.u.blob_basic.cb_fn(set->cpl.u.blob_basic.cb_arg, -EIO); + TAILQ_INSERT_TAIL(&set->channel->reqs, set, link); +} + +void +bs_sequence_to_batch_completion(void *cb_arg, int bserrno) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)cb_arg; + + set->u.batch.outstanding_ops--; + + if (set->u.batch.outstanding_ops == 0 && set->u.batch.batch_closed) { + if (set->cb_args.cb_fn) { + set->cb_args.cb_fn(set->cb_args.channel, set->cb_args.cb_arg, bserrno); + } + } +} + +SPDK_LOG_REGISTER_COMPONENT("blob_rw", SPDK_LOG_BLOB_RW) diff --git a/src/spdk/lib/blob/request.h b/src/spdk/lib/blob/request.h new file mode 100644 index 000000000..81dc161db --- /dev/null +++ b/src/spdk/lib/blob/request.h @@ -0,0 +1,217 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BS_REQUEST_H +#define SPDK_BS_REQUEST_H + +#include "spdk/stdinc.h" + +#include "spdk/blob.h" + +enum spdk_bs_cpl_type { + SPDK_BS_CPL_TYPE_NONE, + SPDK_BS_CPL_TYPE_BS_BASIC, + SPDK_BS_CPL_TYPE_BS_HANDLE, + SPDK_BS_CPL_TYPE_BLOB_BASIC, + SPDK_BS_CPL_TYPE_BLOBID, + SPDK_BS_CPL_TYPE_BLOB_HANDLE, + SPDK_BS_CPL_TYPE_NESTED_SEQUENCE, +}; + +enum spdk_blob_op_type; + +struct spdk_bs_request_set; + +/* Use a sequence to submit a set of requests serially */ +typedef struct spdk_bs_request_set spdk_bs_sequence_t; + +/* Use a batch to submit a set of requests in parallel */ +typedef struct spdk_bs_request_set spdk_bs_batch_t; + +/* Use a user_op to queue a user operation for later execution */ +typedef struct spdk_bs_request_set spdk_bs_user_op_t; + +typedef void (*spdk_bs_nested_seq_complete)(void *cb_arg, spdk_bs_sequence_t *parent, int bserrno); + +struct spdk_bs_cpl { + enum spdk_bs_cpl_type type; + union { + struct { + spdk_bs_op_complete cb_fn; + void *cb_arg; + } bs_basic; + + struct { + spdk_bs_op_with_handle_complete cb_fn; + void *cb_arg; + struct spdk_blob_store *bs; + } bs_handle; + + struct { + spdk_blob_op_complete cb_fn; + void *cb_arg; + } blob_basic; + + struct { + spdk_blob_op_with_id_complete cb_fn; + void *cb_arg; + spdk_blob_id blobid; + } blobid; + + struct { + spdk_blob_op_with_handle_complete cb_fn; + void *cb_arg; + struct spdk_blob *blob; + } blob_handle; + + struct { + spdk_bs_nested_seq_complete cb_fn; + void *cb_arg; + spdk_bs_sequence_t *parent; + } nested_seq; + } u; +}; + +typedef void (*spdk_bs_sequence_cpl)(spdk_bs_sequence_t *sequence, + void *cb_arg, int bserrno); + +/* A generic request set. Can be a sequence, batch or a user_op. 
*/ +struct spdk_bs_request_set { + struct spdk_bs_cpl cpl; + + int bserrno; + + struct spdk_bs_channel *channel; + + struct spdk_bs_dev_cb_args cb_args; + + union { + struct { + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; + } sequence; + + struct { + uint32_t outstanding_ops; + uint32_t batch_closed; + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; + } batch; + + struct spdk_bs_user_op_args { + int type; + int iovcnt; + struct spdk_blob *blob; + uint64_t offset; + uint64_t length; + spdk_blob_op_complete cb_fn; + void *cb_arg; + void *payload; /* cast to iov for readv/writev */ + } user_op; + } u; + + TAILQ_ENTRY(spdk_bs_request_set) link; +}; + +void bs_call_cpl(struct spdk_bs_cpl *cpl, int bserrno); + +spdk_bs_sequence_t *bs_sequence_start(struct spdk_io_channel *channel, + struct spdk_bs_cpl *cpl); + +void bs_sequence_read_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_read_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_write_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_readv_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev, + struct iovec *iov, int iovcnt, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_readv_dev(spdk_bs_batch_t *batch, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_writev_dev(spdk_bs_batch_t *batch, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno); + +void bs_user_op_sequence_finish(void *cb_arg, int bserrno); + +spdk_bs_batch_t *bs_batch_open(struct spdk_io_channel *channel, + struct spdk_bs_cpl *cpl); + +void bs_batch_read_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count); + +void bs_batch_read_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count); + +void bs_batch_write_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count); + +void bs_batch_unmap_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count); + +void bs_batch_write_zeroes_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count); + +void bs_batch_close(spdk_bs_batch_t *batch); + +spdk_bs_batch_t *bs_sequence_to_batch(spdk_bs_sequence_t *seq, + spdk_bs_sequence_cpl cb_fn, + void *cb_arg); + +spdk_bs_user_op_t *bs_user_op_alloc(struct spdk_io_channel *channel, struct spdk_bs_cpl *cpl, + enum spdk_blob_op_type op_type, struct spdk_blob *blob, + void *payload, int iovcnt, uint64_t offset, uint64_t length); + +void bs_user_op_execute(spdk_bs_user_op_t *op); + +void bs_user_op_abort(spdk_bs_user_op_t *op); + +void bs_sequence_to_batch_completion(void *cb_arg, int bserrno); + +#endif diff --git a/src/spdk/lib/blob/spdk_blob.map b/src/spdk/lib/blob/spdk_blob.map new file mode 100644 index 000000000..7c1bc473f --- /dev/null +++ b/src/spdk/lib/blob/spdk_blob.map @@ -0,0 +1,64 @@ +{ + global: + + # Public functions + spdk_bs_opts_init; + spdk_bs_load; + spdk_bs_init; + 
spdk_bs_dump; + spdk_bs_destroy; + spdk_bs_unload; + spdk_bs_set_super; + spdk_bs_get_super; + spdk_bs_get_cluster_size; + spdk_bs_get_page_size; + spdk_bs_get_io_unit_size; + spdk_bs_free_cluster_count; + spdk_bs_total_data_cluster_count; + spdk_blob_get_id; + spdk_blob_get_num_pages; + spdk_blob_get_num_io_units; + spdk_blob_get_num_clusters; + spdk_blob_opts_init; + spdk_bs_create_blob_ext; + spdk_bs_create_blob; + spdk_bs_create_snapshot; + spdk_bs_create_clone; + spdk_blob_get_clones; + spdk_blob_get_parent_snapshot; + spdk_blob_is_read_only; + spdk_blob_is_snapshot; + spdk_blob_is_clone; + spdk_blob_is_thin_provisioned; + spdk_bs_delete_blob; + spdk_bs_inflate_blob; + spdk_bs_blob_decouple_parent; + spdk_blob_open_opts_init; + spdk_bs_open_blob; + spdk_bs_open_blob_ext; + spdk_blob_resize; + spdk_blob_set_read_only; + spdk_blob_sync_md; + spdk_blob_close; + spdk_bs_alloc_io_channel; + spdk_bs_free_io_channel; + spdk_blob_io_write; + spdk_blob_io_read; + spdk_blob_io_writev; + spdk_blob_io_readv; + spdk_blob_io_unmap; + spdk_blob_io_write_zeroes; + spdk_bs_iter_first; + spdk_bs_iter_next; + spdk_blob_set_xattr; + spdk_blob_remove_xattr; + spdk_blob_get_xattr_value; + spdk_blob_get_xattr_names; + spdk_xattr_names_get_count; + spdk_xattr_names_get_name; + spdk_xattr_names_free; + spdk_bs_get_bstype; + spdk_bs_set_bstype; + + local: *; +}; diff --git a/src/spdk/lib/blob/zeroes.c b/src/spdk/lib/blob/zeroes.c new file mode 100644 index 000000000..5e8d70545 --- /dev/null +++ b/src/spdk/lib/blob/zeroes.c @@ -0,0 +1,122 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/blob.h" + +#include "blobstore.h" + +static void +zeroes_destroy(struct spdk_bs_dev *bs_dev) +{ + return; +} + +static void +zeroes_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + memset(payload, 0, dev->blocklen * lba_count); + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); +} + +static void +zeroes_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +zeroes_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + int i; + + for (i = 0; i < iovcnt; i++) { + memset(iov[i].iov_base, 0, iov[i].iov_len); + } + + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); +} + +static void +zeroes_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +zeroes_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +zeroes_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static struct spdk_bs_dev g_zeroes_bs_dev = { + .blockcnt = UINT64_MAX, + .blocklen = 512, + .create_channel = NULL, + .destroy_channel = NULL, + .destroy = zeroes_destroy, + .read = zeroes_read, + .write = zeroes_write, + .readv = zeroes_readv, + .writev = zeroes_writev, + .write_zeroes = zeroes_write_zeroes, + .unmap = zeroes_unmap, +}; + +struct spdk_bs_dev * +bs_create_zeroes_dev(void) +{ + return &g_zeroes_bs_dev; +} diff --git a/src/spdk/lib/blobfs/Makefile b/src/spdk/lib/blobfs/Makefile new file mode 100644 index 000000000..d0c46de02 --- /dev/null +++ b/src/spdk/lib/blobfs/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = blobfs.c tree.c +LIBNAME = blobfs + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blobfs.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/blobfs/blobfs.c b/src/spdk/lib/blobfs/blobfs.c new file mode 100644 index 000000000..3af6b0639 --- /dev/null +++ b/src/spdk/lib/blobfs/blobfs.c @@ -0,0 +1,2980 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/blobfs.h" +#include "spdk/conf.h" +#include "tree.h" + +#include "spdk/queue.h" +#include "spdk/thread.h" +#include "spdk/assert.h" +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" +#include "spdk/trace.h" + +#define BLOBFS_TRACE(file, str, args...) \ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s " str, file->name, ##args) + +#define BLOBFS_TRACE_RW(file, str, args...) 
\ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS_RW, "file=%s " str, file->name, ##args) + +#define BLOBFS_DEFAULT_CACHE_SIZE (4ULL * 1024 * 1024 * 1024) +#define SPDK_BLOBFS_DEFAULT_OPTS_CLUSTER_SZ (1024 * 1024) + +#define SPDK_BLOBFS_SIGNATURE "BLOBFS" + +static uint64_t g_fs_cache_size = BLOBFS_DEFAULT_CACHE_SIZE; +static struct spdk_mempool *g_cache_pool; +static TAILQ_HEAD(, spdk_file) g_caches; +static struct spdk_poller *g_cache_pool_mgmt_poller; +static struct spdk_thread *g_cache_pool_thread; +#define BLOBFS_CACHE_POOL_POLL_PERIOD_IN_US 1000ULL +static int g_fs_count = 0; +static pthread_mutex_t g_cache_init_lock = PTHREAD_MUTEX_INITIALIZER; + +#define TRACE_GROUP_BLOBFS 0x7 +#define TRACE_BLOBFS_XATTR_START SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x0) +#define TRACE_BLOBFS_XATTR_END SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x1) +#define TRACE_BLOBFS_OPEN SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x2) +#define TRACE_BLOBFS_CLOSE SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x3) +#define TRACE_BLOBFS_DELETE_START SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x4) +#define TRACE_BLOBFS_DELETE_DONE SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x5) + +SPDK_TRACE_REGISTER_FN(blobfs_trace, "blobfs", TRACE_GROUP_BLOBFS) +{ + spdk_trace_register_description("BLOBFS_XATTR_START", + TRACE_BLOBFS_XATTR_START, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_XATTR_END", + TRACE_BLOBFS_XATTR_END, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_OPEN", + TRACE_BLOBFS_OPEN, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_CLOSE", + TRACE_BLOBFS_CLOSE, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_DELETE_START", + TRACE_BLOBFS_DELETE_START, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_DELETE_DONE", + TRACE_BLOBFS_DELETE_DONE, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); +} + +void +cache_buffer_free(struct cache_buffer *cache_buffer) +{ + spdk_mempool_put(g_cache_pool, cache_buffer->buf); + free(cache_buffer); +} + +#define CACHE_READAHEAD_THRESHOLD (128 * 1024) + +struct spdk_file { + struct spdk_filesystem *fs; + struct spdk_blob *blob; + char *name; + uint64_t trace_arg_name; + uint64_t length; + bool is_deleted; + bool open_for_writing; + uint64_t length_flushed; + uint64_t length_xattr; + uint64_t append_pos; + uint64_t seq_byte_count; + uint64_t next_seq_offset; + uint32_t priority; + TAILQ_ENTRY(spdk_file) tailq; + spdk_blob_id blobid; + uint32_t ref_count; + pthread_spinlock_t lock; + struct cache_buffer *last; + struct cache_tree *tree; + TAILQ_HEAD(open_requests_head, spdk_fs_request) open_requests; + TAILQ_HEAD(sync_requests_head, spdk_fs_request) sync_requests; + TAILQ_ENTRY(spdk_file) cache_tailq; +}; + +struct spdk_deleted_file { + spdk_blob_id id; + TAILQ_ENTRY(spdk_deleted_file) tailq; +}; + +struct spdk_filesystem { + struct spdk_blob_store *bs; + TAILQ_HEAD(, spdk_file) files; + struct spdk_bs_opts bs_opts; + struct spdk_bs_dev *bdev; + fs_send_request_fn send_request; + + struct { + uint32_t max_ops; + struct spdk_io_channel *sync_io_channel; + struct spdk_fs_channel *sync_fs_channel; + } sync_target; + + struct { + uint32_t max_ops; + struct spdk_io_channel *md_io_channel; + struct spdk_fs_channel *md_fs_channel; + } md_target; + + struct { + uint32_t max_ops; + } io_target; +}; + +struct 
spdk_fs_cb_args { + union { + spdk_fs_op_with_handle_complete fs_op_with_handle; + spdk_fs_op_complete fs_op; + spdk_file_op_with_handle_complete file_op_with_handle; + spdk_file_op_complete file_op; + spdk_file_stat_op_complete stat_op; + } fn; + void *arg; + sem_t *sem; + struct spdk_filesystem *fs; + struct spdk_file *file; + int rc; + struct iovec *iovs; + uint32_t iovcnt; + struct iovec iov; + union { + struct { + TAILQ_HEAD(, spdk_deleted_file) deleted_files; + } fs_load; + struct { + uint64_t length; + } truncate; + struct { + struct spdk_io_channel *channel; + void *pin_buf; + int is_read; + off_t offset; + size_t length; + uint64_t start_lba; + uint64_t num_lba; + uint32_t blocklen; + } rw; + struct { + const char *old_name; + const char *new_name; + } rename; + struct { + struct cache_buffer *cache_buffer; + uint64_t length; + } flush; + struct { + struct cache_buffer *cache_buffer; + uint64_t length; + uint64_t offset; + } readahead; + struct { + /* offset of the file when the sync request was made */ + uint64_t offset; + TAILQ_ENTRY(spdk_fs_request) tailq; + bool xattr_in_progress; + /* length written to the xattr for this file - this should + * always be the same as the offset if only one thread is + * writing to the file, but could differ if multiple threads + * are appending + */ + uint64_t length; + } sync; + struct { + uint32_t num_clusters; + } resize; + struct { + const char *name; + uint32_t flags; + TAILQ_ENTRY(spdk_fs_request) tailq; + } open; + struct { + const char *name; + struct spdk_blob *blob; + } create; + struct { + const char *name; + } delete; + struct { + const char *name; + } stat; + } op; +}; + +static void file_free(struct spdk_file *file); +static void fs_io_device_unregister(struct spdk_filesystem *fs); +static void fs_free_io_channels(struct spdk_filesystem *fs); + +void +spdk_fs_opts_init(struct spdk_blobfs_opts *opts) +{ + opts->cluster_sz = SPDK_BLOBFS_DEFAULT_OPTS_CLUSTER_SZ; +} + +static int _blobfs_cache_pool_reclaim(void *arg); + +static bool +blobfs_cache_pool_need_reclaim(void) +{ + size_t count; + + count = spdk_mempool_count(g_cache_pool); + /* We define a aggressive policy here as the requirements from db_bench are batched, so start the poller + * when the number of available cache buffer is less than 1/5 of total buffers. 
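 * Reclaim itself (_blobfs_cache_pool_reclaim()) frees buffers from low-priority files that are not open for writing first, then from any file not open for writing, and finally from any file.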
+ */ + if (count > (size_t)g_fs_cache_size / CACHE_BUFFER_SIZE / 5) { + return false; + } + + return true; +} + +static void +__start_cache_pool_mgmt(void *ctx) +{ + assert(g_cache_pool == NULL); + + g_cache_pool = spdk_mempool_create("spdk_fs_cache", + g_fs_cache_size / CACHE_BUFFER_SIZE, + CACHE_BUFFER_SIZE, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!g_cache_pool) { + SPDK_ERRLOG("Create mempool failed, you may " + "increase the memory and try again\n"); + assert(false); + } + TAILQ_INIT(&g_caches); + + assert(g_cache_pool_mgmt_poller == NULL); + g_cache_pool_mgmt_poller = SPDK_POLLER_REGISTER(_blobfs_cache_pool_reclaim, NULL, + BLOBFS_CACHE_POOL_POLL_PERIOD_IN_US); +} + +static void +__stop_cache_pool_mgmt(void *ctx) +{ + spdk_poller_unregister(&g_cache_pool_mgmt_poller); + + assert(g_cache_pool != NULL); + assert(spdk_mempool_count(g_cache_pool) == g_fs_cache_size / CACHE_BUFFER_SIZE); + spdk_mempool_free(g_cache_pool); + g_cache_pool = NULL; + + spdk_thread_exit(g_cache_pool_thread); +} + +static void +initialize_global_cache(void) +{ + pthread_mutex_lock(&g_cache_init_lock); + if (g_fs_count == 0) { + g_cache_pool_thread = spdk_thread_create("cache_pool_mgmt", NULL); + assert(g_cache_pool_thread != NULL); + spdk_thread_send_msg(g_cache_pool_thread, __start_cache_pool_mgmt, NULL); + } + g_fs_count++; + pthread_mutex_unlock(&g_cache_init_lock); +} + +static void +free_global_cache(void) +{ + pthread_mutex_lock(&g_cache_init_lock); + g_fs_count--; + if (g_fs_count == 0) { + spdk_thread_send_msg(g_cache_pool_thread, __stop_cache_pool_mgmt, NULL); + } + pthread_mutex_unlock(&g_cache_init_lock); +} + +static uint64_t +__file_get_blob_size(struct spdk_file *file) +{ + uint64_t cluster_sz; + + cluster_sz = file->fs->bs_opts.cluster_sz; + return cluster_sz * spdk_blob_get_num_clusters(file->blob); +} + +struct spdk_fs_request { + struct spdk_fs_cb_args args; + TAILQ_ENTRY(spdk_fs_request) link; + struct spdk_fs_channel *channel; +}; + +struct spdk_fs_channel { + struct spdk_fs_request *req_mem; + TAILQ_HEAD(, spdk_fs_request) reqs; + sem_t sem; + struct spdk_filesystem *fs; + struct spdk_io_channel *bs_channel; + fs_send_request_fn send_request; + bool sync; + uint32_t outstanding_reqs; + pthread_spinlock_t lock; +}; + +/* For now, this is effectively an alias. But eventually we'll shift + * some data members over. 
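 * spdk_fs_alloc_thread_ctx() creates one of these with ch.sync set, so the blocking wrappers below can park the caller on ch.sem.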
*/ +struct spdk_fs_thread_ctx { + struct spdk_fs_channel ch; +}; + +static struct spdk_fs_request * +alloc_fs_request_with_iov(struct spdk_fs_channel *channel, uint32_t iovcnt) +{ + struct spdk_fs_request *req; + struct iovec *iovs = NULL; + + if (iovcnt > 1) { + iovs = calloc(iovcnt, sizeof(struct iovec)); + if (!iovs) { + return NULL; + } + } + + if (channel->sync) { + pthread_spin_lock(&channel->lock); + } + + req = TAILQ_FIRST(&channel->reqs); + if (req) { + channel->outstanding_reqs++; + TAILQ_REMOVE(&channel->reqs, req, link); + } + + if (channel->sync) { + pthread_spin_unlock(&channel->lock); + } + + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate req on spdk_fs_channel =%p\n", channel); + free(iovs); + return NULL; + } + memset(req, 0, sizeof(*req)); + req->channel = channel; + if (iovcnt > 1) { + req->args.iovs = iovs; + } else { + req->args.iovs = &req->args.iov; + } + req->args.iovcnt = iovcnt; + + return req; +} + +static struct spdk_fs_request * +alloc_fs_request(struct spdk_fs_channel *channel) +{ + return alloc_fs_request_with_iov(channel, 0); +} + +static void +free_fs_request(struct spdk_fs_request *req) +{ + struct spdk_fs_channel *channel = req->channel; + + if (req->args.iovcnt > 1) { + free(req->args.iovs); + } + + if (channel->sync) { + pthread_spin_lock(&channel->lock); + } + + TAILQ_INSERT_HEAD(&req->channel->reqs, req, link); + channel->outstanding_reqs--; + + if (channel->sync) { + pthread_spin_unlock(&channel->lock); + } +} + +static int +fs_channel_create(struct spdk_filesystem *fs, struct spdk_fs_channel *channel, + uint32_t max_ops) +{ + uint32_t i; + + channel->req_mem = calloc(max_ops, sizeof(struct spdk_fs_request)); + if (!channel->req_mem) { + return -1; + } + + channel->outstanding_reqs = 0; + TAILQ_INIT(&channel->reqs); + sem_init(&channel->sem, 0, 0); + + for (i = 0; i < max_ops; i++) { + TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); + } + + channel->fs = fs; + + return 0; +} + +static int +fs_md_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_filesystem *fs; + struct spdk_fs_channel *channel = ctx_buf; + + fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, md_target); + + return fs_channel_create(fs, channel, fs->md_target.max_ops); +} + +static int +fs_sync_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_filesystem *fs; + struct spdk_fs_channel *channel = ctx_buf; + + fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, sync_target); + + return fs_channel_create(fs, channel, fs->sync_target.max_ops); +} + +static int +fs_io_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_filesystem *fs; + struct spdk_fs_channel *channel = ctx_buf; + + fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, io_target); + + return fs_channel_create(fs, channel, fs->io_target.max_ops); +} + +static void +fs_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_fs_channel *channel = ctx_buf; + + if (channel->outstanding_reqs > 0) { + SPDK_ERRLOG("channel freed with %" PRIu32 " outstanding requests!\n", + channel->outstanding_reqs); + } + + free(channel->req_mem); + if (channel->bs_channel != NULL) { + spdk_bs_free_io_channel(channel->bs_channel); + } +} + +static void +__send_request_direct(fs_request_fn fn, void *arg) +{ + fn(arg); +} + +static void +common_fs_bs_init(struct spdk_filesystem *fs, struct spdk_blob_store *bs) +{ + fs->bs = bs; + fs->bs_opts.cluster_sz = spdk_bs_get_cluster_size(bs); + fs->md_target.md_fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs); + 
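	/* The metadata and sync channels are owned by the thread that initialized or loaded the blobstore, so their requests are dispatched inline via __send_request_direct(). */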
fs->md_target.md_fs_channel->send_request = __send_request_direct; + fs->sync_target.sync_fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs); + fs->sync_target.sync_fs_channel->send_request = __send_request_direct; + + initialize_global_cache(); +} + +static void +init_cb(void *ctx, struct spdk_blob_store *bs, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + + if (bserrno == 0) { + common_fs_bs_init(fs, bs); + } else { + free(fs); + fs = NULL; + } + + args->fn.fs_op_with_handle(args->arg, fs, bserrno); + free_fs_request(req); +} + +static void +fs_conf_parse(void) +{ + struct spdk_conf_section *sp; + int cache_buffer_shift; + + sp = spdk_conf_find_section(NULL, "Blobfs"); + if (sp == NULL) { + g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + return; + } + + cache_buffer_shift = spdk_conf_section_get_intval(sp, "CacheBufferShift"); + if (cache_buffer_shift <= 0) { + g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + } else { + g_fs_cache_buffer_shift = cache_buffer_shift; + } +} + +static struct spdk_filesystem * +fs_alloc(struct spdk_bs_dev *dev, fs_send_request_fn send_request_fn) +{ + struct spdk_filesystem *fs; + + fs = calloc(1, sizeof(*fs)); + if (fs == NULL) { + return NULL; + } + + fs->bdev = dev; + fs->send_request = send_request_fn; + TAILQ_INIT(&fs->files); + + fs->md_target.max_ops = 512; + spdk_io_device_register(&fs->md_target, fs_md_channel_create, fs_channel_destroy, + sizeof(struct spdk_fs_channel), "blobfs_md"); + fs->md_target.md_io_channel = spdk_get_io_channel(&fs->md_target); + fs->md_target.md_fs_channel = spdk_io_channel_get_ctx(fs->md_target.md_io_channel); + + fs->sync_target.max_ops = 512; + spdk_io_device_register(&fs->sync_target, fs_sync_channel_create, fs_channel_destroy, + sizeof(struct spdk_fs_channel), "blobfs_sync"); + fs->sync_target.sync_io_channel = spdk_get_io_channel(&fs->sync_target); + fs->sync_target.sync_fs_channel = spdk_io_channel_get_ctx(fs->sync_target.sync_io_channel); + + fs->io_target.max_ops = 512; + spdk_io_device_register(&fs->io_target, fs_io_channel_create, fs_channel_destroy, + sizeof(struct spdk_fs_channel), "blobfs_io"); + + return fs; +} + +static void +__wake_caller(void *arg, int fserrno) +{ + struct spdk_fs_cb_args *args = arg; + + args->rc = fserrno; + sem_post(args->sem); +} + +void +spdk_fs_init(struct spdk_bs_dev *dev, struct spdk_blobfs_opts *opt, + fs_send_request_fn send_request_fn, + spdk_fs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_filesystem *fs; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + struct spdk_bs_opts opts = {}; + + fs = fs_alloc(dev, send_request_fn); + if (fs == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + fs_conf_parse(); + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + fs_free_io_channels(fs); + fs_io_device_unregister(fs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op_with_handle = cb_fn; + args->arg = cb_arg; + args->fs = fs; + + spdk_bs_opts_init(&opts); + snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), SPDK_BLOBFS_SIGNATURE); + if (opt) { + opts.cluster_sz = opt->cluster_sz; + } + spdk_bs_init(dev, &opts, init_cb, req); +} + +static struct spdk_file * +file_alloc(struct spdk_filesystem *fs) +{ + struct spdk_file *file; + + file = calloc(1, sizeof(*file)); + if (file == NULL) { + return NULL; + } + + file->tree = calloc(1, sizeof(*file->tree)); + if (file->tree == 
NULL) { + free(file); + return NULL; + } + + if (pthread_spin_init(&file->lock, 0)) { + free(file->tree); + free(file); + return NULL; + } + + file->fs = fs; + TAILQ_INIT(&file->open_requests); + TAILQ_INIT(&file->sync_requests); + TAILQ_INSERT_TAIL(&fs->files, file, tailq); + file->priority = SPDK_FILE_PRIORITY_LOW; + return file; +} + +static void fs_load_done(void *ctx, int bserrno); + +static int +_handle_deleted_files(struct spdk_fs_request *req) +{ + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + + if (!TAILQ_EMPTY(&args->op.fs_load.deleted_files)) { + struct spdk_deleted_file *deleted_file; + + deleted_file = TAILQ_FIRST(&args->op.fs_load.deleted_files); + TAILQ_REMOVE(&args->op.fs_load.deleted_files, deleted_file, tailq); + spdk_bs_delete_blob(fs->bs, deleted_file->id, fs_load_done, req); + free(deleted_file); + return 0; + } + + return 1; +} + +static void +fs_load_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + + /* The filesystem has been loaded. Now check if there are any files that + * were marked for deletion before last unload. Do not complete the + * fs_load callback until all of them have been deleted on disk. + */ + if (_handle_deleted_files(req) == 0) { + /* We found a file that's been marked for deleting but not actually + * deleted yet. This function will get called again once the delete + * operation is completed. + */ + return; + } + + args->fn.fs_op_with_handle(args->arg, fs, 0); + free_fs_request(req); + +} + +static void +_file_build_trace_arg_name(struct spdk_file *f) +{ + f->trace_arg_name = 0; + memcpy(&f->trace_arg_name, f->name, + spdk_min(sizeof(f->trace_arg_name), strlen(f->name))); +} + +static void +iter_cb(void *ctx, struct spdk_blob *blob, int rc) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + uint64_t *length; + const char *name; + uint32_t *is_deleted; + size_t value_len; + + if (rc < 0) { + args->fn.fs_op_with_handle(args->arg, fs, rc); + free_fs_request(req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&name, &value_len); + if (rc < 0) { + args->fn.fs_op_with_handle(args->arg, fs, rc); + free_fs_request(req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "length", (const void **)&length, &value_len); + if (rc < 0) { + args->fn.fs_op_with_handle(args->arg, fs, rc); + free_fs_request(req); + return; + } + + assert(value_len == 8); + + /* This file could be deleted last time without close it, then app crashed, so we delete it now */ + rc = spdk_blob_get_xattr_value(blob, "is_deleted", (const void **)&is_deleted, &value_len); + if (rc < 0) { + struct spdk_file *f; + + f = file_alloc(fs); + if (f == NULL) { + SPDK_ERRLOG("Cannot allocate file to handle deleted file on disk\n"); + args->fn.fs_op_with_handle(args->arg, fs, -ENOMEM); + free_fs_request(req); + return; + } + + f->name = strdup(name); + _file_build_trace_arg_name(f); + f->blobid = spdk_blob_get_id(blob); + f->length = *length; + f->length_flushed = *length; + f->length_xattr = *length; + f->append_pos = *length; + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "added file %s length=%ju\n", f->name, f->length); + } else { + struct spdk_deleted_file *deleted_file; + + deleted_file = calloc(1, sizeof(*deleted_file)); + if (deleted_file == NULL) { + args->fn.fs_op_with_handle(args->arg, fs, -ENOMEM); + free_fs_request(req); + return; + } + 
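		/* Only the blob ID is recorded here; fs_load_done() walks this list and deletes the leftover blobs before completing the load callback. */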
deleted_file->id = spdk_blob_get_id(blob); + TAILQ_INSERT_TAIL(&args->op.fs_load.deleted_files, deleted_file, tailq); + } +} + +static void +load_cb(void *ctx, struct spdk_blob_store *bs, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + struct spdk_bs_type bstype; + static const struct spdk_bs_type blobfs_type = {SPDK_BLOBFS_SIGNATURE}; + static const struct spdk_bs_type zeros; + + if (bserrno != 0) { + args->fn.fs_op_with_handle(args->arg, NULL, bserrno); + free_fs_request(req); + fs_free_io_channels(fs); + fs_io_device_unregister(fs); + return; + } + + bstype = spdk_bs_get_bstype(bs); + + if (!memcmp(&bstype, &zeros, sizeof(bstype))) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "assigning bstype\n"); + spdk_bs_set_bstype(bs, blobfs_type); + } else if (memcmp(&bstype, &blobfs_type, sizeof(bstype))) { + SPDK_ERRLOG("not blobfs\n"); + SPDK_LOGDUMP(SPDK_LOG_BLOBFS, "bstype", &bstype, sizeof(bstype)); + args->fn.fs_op_with_handle(args->arg, NULL, -EINVAL); + free_fs_request(req); + fs_free_io_channels(fs); + fs_io_device_unregister(fs); + return; + } + + common_fs_bs_init(fs, bs); + fs_load_done(req, 0); +} + +static void +fs_io_device_unregister(struct spdk_filesystem *fs) +{ + assert(fs != NULL); + spdk_io_device_unregister(&fs->md_target, NULL); + spdk_io_device_unregister(&fs->sync_target, NULL); + spdk_io_device_unregister(&fs->io_target, NULL); + free(fs); +} + +static void +fs_free_io_channels(struct spdk_filesystem *fs) +{ + assert(fs != NULL); + spdk_fs_free_io_channel(fs->md_target.md_io_channel); + spdk_fs_free_io_channel(fs->sync_target.sync_io_channel); +} + +void +spdk_fs_load(struct spdk_bs_dev *dev, fs_send_request_fn send_request_fn, + spdk_fs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_filesystem *fs; + struct spdk_fs_cb_args *args; + struct spdk_fs_request *req; + struct spdk_bs_opts bs_opts; + + fs = fs_alloc(dev, send_request_fn); + if (fs == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + fs_conf_parse(); + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + fs_free_io_channels(fs); + fs_io_device_unregister(fs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op_with_handle = cb_fn; + args->arg = cb_arg; + args->fs = fs; + TAILQ_INIT(&args->op.fs_load.deleted_files); + spdk_bs_opts_init(&bs_opts); + bs_opts.iter_cb_fn = iter_cb; + bs_opts.iter_cb_arg = req; + spdk_bs_load(dev, &bs_opts, load_cb, req); +} + +static void +unload_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + struct spdk_file *file, *tmp; + + TAILQ_FOREACH_SAFE(file, &fs->files, tailq, tmp) { + TAILQ_REMOVE(&fs->files, file, tailq); + file_free(file); + } + + free_global_cache(); + + args->fn.fs_op(args->arg, bserrno); + free(req); + + fs_io_device_unregister(fs); +} + +void +spdk_fs_unload(struct spdk_filesystem *fs, spdk_fs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + /* + * We must free the md_channel before unloading the blobstore, so just + * allocate this request from the general heap. 
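 * A request from md_fs_channel could not be used here: fs_free_io_channels() tears that channel and its request pool down before unload_cb() runs, which is also why unload_cb() releases the request with free().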
+ */ + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op = cb_fn; + args->arg = cb_arg; + args->fs = fs; + + fs_free_io_channels(fs); + spdk_bs_unload(fs->bs, unload_cb, req); +} + +static struct spdk_file * +fs_find_file(struct spdk_filesystem *fs, const char *name) +{ + struct spdk_file *file; + + TAILQ_FOREACH(file, &fs->files, tailq) { + if (!strncmp(name, file->name, SPDK_FILE_NAME_MAX)) { + return file; + } + } + + return NULL; +} + +void +spdk_fs_file_stat_async(struct spdk_filesystem *fs, const char *name, + spdk_file_stat_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file_stat stat; + struct spdk_file *f = NULL; + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, NULL, -ENAMETOOLONG); + return; + } + + f = fs_find_file(fs, name); + if (f != NULL) { + stat.blobid = f->blobid; + stat.size = f->append_pos >= f->length ? f->append_pos : f->length; + cb_fn(cb_arg, &stat, 0); + return; + } + + cb_fn(cb_arg, NULL, -ENOENT); +} + +static void +__copy_stat(void *arg, struct spdk_file_stat *stat, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + args->rc = fserrno; + if (fserrno == 0) { + memcpy(args->arg, stat, sizeof(*stat)); + } + sem_post(args->sem); +} + +static void +__file_stat(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_fs_file_stat_async(args->fs, args->op.stat.name, + args->fn.stat_op, req); +} + +int +spdk_fs_file_stat(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, + const char *name, struct spdk_file_stat *stat) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate stat req on file=%s\n", name); + return -ENOMEM; + } + + req->args.fs = fs; + req->args.op.stat.name = name; + req->args.fn.stat_op = __copy_stat; + req->args.arg = stat; + req->args.sem = &channel->sem; + channel->send_request(__file_stat, req); + sem_wait(&channel->sem); + + rc = req->args.rc; + free_fs_request(req); + + return rc; +} + +static void +fs_create_blob_close_cb(void *ctx, int bserrno) +{ + int rc; + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + rc = args->rc ? 
args->rc : bserrno; + args->fn.file_op(args->arg, rc); + free_fs_request(req); +} + +static void +fs_create_blob_resize_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f = args->file; + struct spdk_blob *blob = args->op.create.blob; + uint64_t length = 0; + + args->rc = bserrno; + if (bserrno) { + spdk_blob_close(blob, fs_create_blob_close_cb, args); + return; + } + + spdk_blob_set_xattr(blob, "name", f->name, strlen(f->name) + 1); + spdk_blob_set_xattr(blob, "length", &length, sizeof(length)); + + spdk_blob_close(blob, fs_create_blob_close_cb, args); +} + +static void +fs_create_blob_open_cb(void *ctx, struct spdk_blob *blob, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + if (bserrno) { + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); + return; + } + + args->op.create.blob = blob; + spdk_blob_resize(blob, 1, fs_create_blob_resize_cb, req); +} + +static void +fs_create_blob_create_cb(void *ctx, spdk_blob_id blobid, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f = args->file; + + if (bserrno) { + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); + return; + } + + f->blobid = blobid; + spdk_bs_open_blob(f->fs->bs, blobid, fs_create_blob_open_cb, req); +} + +void +spdk_fs_create_file_async(struct spdk_filesystem *fs, const char *name, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file *file; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, -ENAMETOOLONG); + return; + } + + file = fs_find_file(fs, name); + if (file != NULL) { + cb_fn(cb_arg, -EEXIST); + return; + } + + file = file_alloc(fs); + if (file == NULL) { + SPDK_ERRLOG("Cannot allocate new file for creation\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate create async req for file=%s\n", name); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->file = file; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + + file->name = strdup(name); + _file_build_trace_arg_name(file); + spdk_bs_create_blob(fs->bs, fs_create_blob_create_cb, args); +} + +static void +__fs_create_file_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + __wake_caller(args, fserrno); + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.create.name); +} + +static void +__fs_create_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.create.name); + spdk_fs_create_file_async(args->fs, args->op.create.name, __fs_create_file_done, req); +} + +int +spdk_fs_create_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, const char *name) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name); + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate req to create file=%s\n", name); + return -ENOMEM; + } + + args = &req->args; + args->fs = fs; + args->op.create.name = name; + args->sem = &channel->sem; + fs->send_request(__fs_create_file, req); + 
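	/* __fs_create_file() is dispatched through fs->send_request (typically to the SPDK thread owning the filesystem); when the create completes, __fs_create_file_done() calls __wake_caller(), which stores rc and posts the semaphore waited on below. */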
sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + + return rc; +} + +static void +fs_open_blob_done(void *ctx, struct spdk_blob *blob, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f = args->file; + + f->blob = blob; + while (!TAILQ_EMPTY(&f->open_requests)) { + req = TAILQ_FIRST(&f->open_requests); + args = &req->args; + TAILQ_REMOVE(&f->open_requests, req, args.op.open.tailq); + spdk_trace_record(TRACE_BLOBFS_OPEN, 0, 0, 0, f->trace_arg_name); + args->fn.file_op_with_handle(args->arg, f, bserrno); + free_fs_request(req); + } +} + +static void +fs_open_blob_create_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + struct spdk_filesystem *fs = args->fs; + + if (file == NULL) { + /* + * This is from an open with CREATE flag - the file + * is now created so look it up in the file list for this + * filesystem. + */ + file = fs_find_file(fs, args->op.open.name); + assert(file != NULL); + args->file = file; + } + + file->ref_count++; + TAILQ_INSERT_TAIL(&file->open_requests, req, args.op.open.tailq); + if (file->ref_count == 1) { + assert(file->blob == NULL); + spdk_bs_open_blob(fs->bs, file->blobid, fs_open_blob_done, req); + } else if (file->blob != NULL) { + fs_open_blob_done(req, file->blob, 0); + } else { + /* + * The blob open for this file is in progress due to a previous + * open request. When that open completes, it will invoke the + * open callback for this request. + */ + } +} + +void +spdk_fs_open_file_async(struct spdk_filesystem *fs, const char *name, uint32_t flags, + spdk_file_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_file *f = NULL; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, NULL, -ENAMETOOLONG); + return; + } + + f = fs_find_file(fs, name); + if (f == NULL && !(flags & SPDK_BLOBFS_OPEN_CREATE)) { + cb_fn(cb_arg, NULL, -ENOENT); + return; + } + + if (f != NULL && f->is_deleted == true) { + cb_fn(cb_arg, NULL, -ENOENT); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate async open req for file=%s\n", name); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + args = &req->args; + args->fn.file_op_with_handle = cb_fn; + args->arg = cb_arg; + args->file = f; + args->fs = fs; + args->op.open.name = name; + + if (f == NULL) { + spdk_fs_create_file_async(fs, name, fs_open_blob_create_cb, req); + } else { + fs_open_blob_create_cb(req, 0); + } +} + +static void +__fs_open_file_done(void *arg, struct spdk_file *file, int bserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + args->file = file; + __wake_caller(args, bserrno); + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.open.name); +} + +static void +__fs_open_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.open.name); + spdk_fs_open_file_async(args->fs, args->op.open.name, args->op.open.flags, + __fs_open_file_done, req); +} + +int +spdk_fs_open_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, + const char *name, uint32_t flags, struct spdk_file **file) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args 
*args; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name); + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate req for opening file=%s\n", name); + return -ENOMEM; + } + + args = &req->args; + args->fs = fs; + args->op.open.name = name; + args->op.open.flags = flags; + args->sem = &channel->sem; + fs->send_request(__fs_open_file, req); + sem_wait(&channel->sem); + rc = args->rc; + if (rc == 0) { + *file = args->file; + } else { + *file = NULL; + } + free_fs_request(req); + + return rc; +} + +static void +fs_rename_blob_close_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + args->fn.fs_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +fs_rename_blob_open_cb(void *ctx, struct spdk_blob *blob, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + const char *new_name = args->op.rename.new_name; + + spdk_blob_set_xattr(blob, "name", new_name, strlen(new_name) + 1); + spdk_blob_close(blob, fs_rename_blob_close_cb, req); +} + +static void +_fs_md_rename_file(struct spdk_fs_request *req) +{ + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f; + + f = fs_find_file(args->fs, args->op.rename.old_name); + if (f == NULL) { + args->fn.fs_op(args->arg, -ENOENT); + free_fs_request(req); + return; + } + + free(f->name); + f->name = strdup(args->op.rename.new_name); + _file_build_trace_arg_name(f); + args->file = f; + spdk_bs_open_blob(args->fs->bs, f->blobid, fs_rename_blob_open_cb, req); +} + +static void +fs_rename_delete_done(void *arg, int fserrno) +{ + _fs_md_rename_file(arg); +} + +void +spdk_fs_rename_file_async(struct spdk_filesystem *fs, + const char *old_name, const char *new_name, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file *f; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "old=%s new=%s\n", old_name, new_name); + if (strnlen(new_name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, -ENAMETOOLONG); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate rename async req for renaming file from %s to %s\n", old_name, + new_name); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op = cb_fn; + args->fs = fs; + args->arg = cb_arg; + args->op.rename.old_name = old_name; + args->op.rename.new_name = new_name; + + f = fs_find_file(fs, new_name); + if (f == NULL) { + _fs_md_rename_file(req); + return; + } + + /* + * The rename overwrites an existing file. So delete the existing file, then + * do the actual rename. 
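 * fs_rename_delete_done() re-enters _fs_md_rename_file() once that delete completes.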
+ */ + spdk_fs_delete_file_async(fs, new_name, fs_rename_delete_done, req); +} + +static void +__fs_rename_file_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + __wake_caller(args, fserrno); +} + +static void +__fs_rename_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_fs_rename_file_async(args->fs, args->op.rename.old_name, args->op.rename.new_name, + __fs_rename_file_done, req); +} + +int +spdk_fs_rename_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, + const char *old_name, const char *new_name) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate rename req for file=%s\n", old_name); + return -ENOMEM; + } + + args = &req->args; + + args->fs = fs; + args->op.rename.old_name = old_name; + args->op.rename.new_name = new_name; + args->sem = &channel->sem; + fs->send_request(__fs_rename_file, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + return rc; +} + +static void +blob_delete_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +void +spdk_fs_delete_file_async(struct spdk_filesystem *fs, const char *name, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file *f; + spdk_blob_id blobid; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name); + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, -ENAMETOOLONG); + return; + } + + f = fs_find_file(fs, name); + if (f == NULL) { + SPDK_ERRLOG("Cannot find file=%s to delete\n", name); + cb_fn(cb_arg, -ENOENT); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate a request to delete file=%s\n", name); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + + if (f->ref_count > 0) { + /* If the ref count is > 0, mark the file as deleted and delete it when it is closed.
*/ + f->is_deleted = true; + spdk_blob_set_xattr(f->blob, "is_deleted", &f->is_deleted, sizeof(bool)); + spdk_blob_sync_md(f->blob, blob_delete_cb, req); + return; + } + + blobid = f->blobid; + TAILQ_REMOVE(&fs->files, f, tailq); + + file_free(f); + + spdk_bs_delete_blob(fs->bs, blobid, blob_delete_cb, req); +} + +static uint64_t +fs_name_to_uint64(const char *name) +{ + uint64_t result = 0; + memcpy(&result, name, spdk_min(sizeof(result), strlen(name))); + return result; +} + +static void +__fs_delete_file_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_trace_record(TRACE_BLOBFS_DELETE_DONE, 0, 0, 0, fs_name_to_uint64(args->op.delete.name)); + __wake_caller(args, fserrno); +} + +static void +__fs_delete_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_trace_record(TRACE_BLOBFS_DELETE_START, 0, 0, 0, fs_name_to_uint64(args->op.delete.name)); + spdk_fs_delete_file_async(args->fs, args->op.delete.name, __fs_delete_file_done, req); +} + +int +spdk_fs_delete_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, + const char *name) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "Cannot allocate req to delete file=%s\n", name); + return -ENOMEM; + } + + args = &req->args; + args->fs = fs; + args->op.delete.name = name; + args->sem = &channel->sem; + fs->send_request(__fs_delete_file, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + + return rc; +} + +spdk_fs_iter +spdk_fs_iter_first(struct spdk_filesystem *fs) +{ + struct spdk_file *f; + + f = TAILQ_FIRST(&fs->files); + return f; +} + +spdk_fs_iter +spdk_fs_iter_next(spdk_fs_iter iter) +{ + struct spdk_file *f = iter; + + if (f == NULL) { + return NULL; + } + + f = TAILQ_NEXT(f, tailq); + return f; +} + +const char * +spdk_file_get_name(struct spdk_file *file) +{ + return file->name; +} + +uint64_t +spdk_file_get_length(struct spdk_file *file) +{ + uint64_t length; + + assert(file != NULL); + + length = file->append_pos >= file->length ? 
file->append_pos : file->length; + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s length=0x%jx\n", file->name, length); + return length; +} + +static void +fs_truncate_complete_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +fs_truncate_resize_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + uint64_t *length = &args->op.truncate.length; + + if (bserrno) { + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); + return; + } + + spdk_blob_set_xattr(file->blob, "length", length, sizeof(*length)); + + file->length = *length; + if (file->append_pos > file->length) { + file->append_pos = file->length; + } + + spdk_blob_sync_md(file->blob, fs_truncate_complete_cb, req); +} + +static uint64_t +__bytes_to_clusters(uint64_t length, uint64_t cluster_sz) +{ + return (length + cluster_sz - 1) / cluster_sz; +} + +void +spdk_file_truncate_async(struct spdk_file *file, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_filesystem *fs; + size_t num_clusters; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s old=0x%jx new=0x%jx\n", file->name, file->length, length); + if (length == file->length) { + cb_fn(cb_arg, 0); + return; + } + + req = alloc_fs_request(file->fs->md_target.md_fs_channel); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + args->file = file; + args->op.truncate.length = length; + fs = file->fs; + + num_clusters = __bytes_to_clusters(length, fs->bs_opts.cluster_sz); + + spdk_blob_resize(file->blob, num_clusters, fs_truncate_resize_cb, req); +} + +static void +__truncate(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_file_truncate_async(args->file, args->op.truncate.length, + args->fn.file_op, args); +} + +int +spdk_file_truncate(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx, + uint64_t length) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + return -ENOMEM; + } + + args = &req->args; + + args->file = file; + args->op.truncate.length = length; + args->fn.file_op = __wake_caller; + args->sem = &channel->sem; + + channel->send_request(__truncate, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + + return rc; +} + +static void +__rw_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + spdk_free(args->op.rw.pin_buf); + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) +{ + int i; + size_t len; + + for (i = 0; i < iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_len); + memcpy(buf, iovs[i].iov_base, len); + buf += len; + assert(buf_len >= len); + buf_len -= len; + } +} + +static void +_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) +{ + int i; + size_t len; + + for (i = 0; i < iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_len); + memcpy(iovs[i].iov_base, buf, len); + buf += len; + assert(buf_len >= len); + buf_len -= len; + } 
+} + +static void +__read_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + void *buf; + + assert(req != NULL); + buf = (void *)((uintptr_t)args->op.rw.pin_buf + (args->op.rw.offset & (args->op.rw.blocklen - 1))); + if (args->op.rw.is_read) { + _copy_buf_to_iovs(args->iovs, args->iovcnt, buf, args->op.rw.length); + __rw_done(req, 0); + } else { + _copy_iovs_to_buf(buf, args->op.rw.length, args->iovs, args->iovcnt); + spdk_blob_io_write(args->file->blob, args->op.rw.channel, + args->op.rw.pin_buf, + args->op.rw.start_lba, args->op.rw.num_lba, + __rw_done, req); + } +} + +static void +__do_blob_read(void *ctx, int fserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + if (fserrno) { + __rw_done(req, fserrno); + return; + } + spdk_blob_io_read(args->file->blob, args->op.rw.channel, + args->op.rw.pin_buf, + args->op.rw.start_lba, args->op.rw.num_lba, + __read_done, req); +} + +static void +__get_page_parameters(struct spdk_file *file, uint64_t offset, uint64_t length, + uint64_t *start_lba, uint32_t *lba_size, uint64_t *num_lba) +{ + uint64_t end_lba; + + *lba_size = spdk_bs_get_io_unit_size(file->fs->bs); + *start_lba = offset / *lba_size; + end_lba = (offset + length - 1) / *lba_size; + *num_lba = (end_lba - *start_lba + 1); +} + +static bool +__is_lba_aligned(struct spdk_file *file, uint64_t offset, uint64_t length) +{ + uint32_t lba_size = spdk_bs_get_io_unit_size(file->fs->bs); + + if ((offset % lba_size == 0) && (length % lba_size == 0)) { + return true; + } + + return false; +} + +static void +_fs_request_setup_iovs(struct spdk_fs_request *req, struct iovec *iovs, uint32_t iovcnt) +{ + uint32_t i; + + for (i = 0; i < iovcnt; i++) { + req->args.iovs[i].iov_base = iovs[i].iov_base; + req->args.iovs[i].iov_len = iovs[i].iov_len; + } +} + +static void +__readvwritev(struct spdk_file *file, struct spdk_io_channel *_channel, + struct iovec *iovs, uint32_t iovcnt, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg, int is_read) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + uint64_t start_lba, num_lba, pin_buf_length; + uint32_t lba_size; + + if (is_read && offset + length > file->length) { + cb_fn(cb_arg, -EINVAL); + return; + } + + req = alloc_fs_request_with_iov(channel, iovcnt); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba); + + args = &req->args; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + args->file = file; + args->op.rw.channel = channel->bs_channel; + _fs_request_setup_iovs(req, iovs, iovcnt); + args->op.rw.is_read = is_read; + args->op.rw.offset = offset; + args->op.rw.blocklen = lba_size; + + pin_buf_length = num_lba * lba_size; + args->op.rw.length = pin_buf_length; + args->op.rw.pin_buf = spdk_malloc(pin_buf_length, lba_size, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (args->op.rw.pin_buf == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "Failed to allocate buf for: file=%s offset=%jx length=%jx\n", + file->name, offset, length); + free_fs_request(req); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args->op.rw.start_lba = start_lba; + args->op.rw.num_lba = num_lba; + + if (!is_read && file->length < offset + length) { + spdk_file_truncate_async(file, offset + length, __do_blob_read, req); + } else if (!is_read && __is_lba_aligned(file, offset, length)) 
{ + _copy_iovs_to_buf(args->op.rw.pin_buf, args->op.rw.length, args->iovs, args->iovcnt); + spdk_blob_io_write(args->file->blob, args->op.rw.channel, + args->op.rw.pin_buf, + args->op.rw.start_lba, args->op.rw.num_lba, + __rw_done, req); + } else { + __do_blob_read(req, 0); + } +} + +static void +__readwrite(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg, int is_read) +{ + struct iovec iov; + + iov.iov_base = payload; + iov.iov_len = (size_t)length; + + __readvwritev(file, channel, &iov, 1, offset, length, cb_fn, cb_arg, is_read); +} + +void +spdk_file_write_async(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + __readwrite(file, channel, payload, offset, length, cb_fn, cb_arg, 0); +} + +void +spdk_file_writev_async(struct spdk_file *file, struct spdk_io_channel *channel, + struct iovec *iovs, uint32_t iovcnt, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n", + file->name, offset, length); + + __readvwritev(file, channel, iovs, iovcnt, offset, length, cb_fn, cb_arg, 0); +} + +void +spdk_file_read_async(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n", + file->name, offset, length); + __readwrite(file, channel, payload, offset, length, cb_fn, cb_arg, 1); +} + +void +spdk_file_readv_async(struct spdk_file *file, struct spdk_io_channel *channel, + struct iovec *iovs, uint32_t iovcnt, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n", + file->name, offset, length); + + __readvwritev(file, channel, iovs, iovcnt, offset, length, cb_fn, cb_arg, 1); +} + +struct spdk_io_channel * +spdk_fs_alloc_io_channel(struct spdk_filesystem *fs) +{ + struct spdk_io_channel *io_channel; + struct spdk_fs_channel *fs_channel; + + io_channel = spdk_get_io_channel(&fs->io_target); + fs_channel = spdk_io_channel_get_ctx(io_channel); + fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs); + fs_channel->send_request = __send_request_direct; + + return io_channel; +} + +void +spdk_fs_free_io_channel(struct spdk_io_channel *channel) +{ + spdk_put_io_channel(channel); +} + +struct spdk_fs_thread_ctx * +spdk_fs_alloc_thread_ctx(struct spdk_filesystem *fs) +{ + struct spdk_fs_thread_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return NULL; + } + + if (pthread_spin_init(&ctx->ch.lock, 0)) { + free(ctx); + return NULL; + } + + fs_channel_create(fs, &ctx->ch, 512); + + ctx->ch.send_request = fs->send_request; + ctx->ch.sync = 1; + + return ctx; +} + + +void +spdk_fs_free_thread_ctx(struct spdk_fs_thread_ctx *ctx) +{ + assert(ctx->ch.sync == 1); + + while (true) { + pthread_spin_lock(&ctx->ch.lock); + if (ctx->ch.outstanding_reqs == 0) { + pthread_spin_unlock(&ctx->ch.lock); + break; + } + pthread_spin_unlock(&ctx->ch.lock); + usleep(1000); + } + + fs_channel_destroy(NULL, &ctx->ch); + free(ctx); +} + +int +spdk_fs_set_cache_size(uint64_t size_in_mb) +{ + /* setting g_fs_cache_size is only permitted if cache pool + * is already freed or hasn't been initialized + */ + if (g_cache_pool != NULL) { + return -EPERM; + } + + 
g_fs_cache_size = size_in_mb * 1024 * 1024; + + return 0; +} + +uint64_t +spdk_fs_get_cache_size(void) +{ + return g_fs_cache_size / (1024 * 1024); +} + +static void __file_flush(void *ctx); + +/* Try to free some cache buffers from this file. + */ +static int +reclaim_cache_buffers(struct spdk_file *file) +{ + int rc; + + BLOBFS_TRACE(file, "free=%s\n", file->name); + + /* The function is safe to be called with any threads, while the file + * lock maybe locked by other thread for now, so try to get the file + * lock here. + */ + rc = pthread_spin_trylock(&file->lock); + if (rc != 0) { + return -1; + } + + if (file->tree->present_mask == 0) { + pthread_spin_unlock(&file->lock); + return -1; + } + tree_free_buffers(file->tree); + + TAILQ_REMOVE(&g_caches, file, cache_tailq); + /* If not freed, put it in the end of the queue */ + if (file->tree->present_mask != 0) { + TAILQ_INSERT_TAIL(&g_caches, file, cache_tailq); + } else { + file->last = NULL; + } + pthread_spin_unlock(&file->lock); + + return 0; +} + +static int +_blobfs_cache_pool_reclaim(void *arg) +{ + struct spdk_file *file, *tmp; + int rc; + + if (!blobfs_cache_pool_need_reclaim()) { + return SPDK_POLLER_IDLE; + } + + TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) { + if (!file->open_for_writing && + file->priority == SPDK_FILE_PRIORITY_LOW) { + rc = reclaim_cache_buffers(file); + if (rc < 0) { + continue; + } + if (!blobfs_cache_pool_need_reclaim()) { + return SPDK_POLLER_BUSY; + } + break; + } + } + + TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) { + if (!file->open_for_writing) { + rc = reclaim_cache_buffers(file); + if (rc < 0) { + continue; + } + if (!blobfs_cache_pool_need_reclaim()) { + return SPDK_POLLER_BUSY; + } + break; + } + } + + TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) { + rc = reclaim_cache_buffers(file); + if (rc < 0) { + continue; + } + break; + } + + return SPDK_POLLER_BUSY; +} + +static void +_add_file_to_cache_pool(void *ctx) +{ + struct spdk_file *file = ctx; + + TAILQ_INSERT_TAIL(&g_caches, file, cache_tailq); +} + +static void +_remove_file_from_cache_pool(void *ctx) +{ + struct spdk_file *file = ctx; + + TAILQ_REMOVE(&g_caches, file, cache_tailq); +} + +static struct cache_buffer * +cache_insert_buffer(struct spdk_file *file, uint64_t offset) +{ + struct cache_buffer *buf; + int count = 0; + bool need_update = false; + + buf = calloc(1, sizeof(*buf)); + if (buf == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "calloc failed\n"); + return NULL; + } + + do { + buf->buf = spdk_mempool_get(g_cache_pool); + if (buf->buf) { + break; + } + if (count++ == 100) { + SPDK_ERRLOG("Could not allocate cache buffer for file=%p on offset=%jx\n", + file, offset); + free(buf); + return NULL; + } + usleep(BLOBFS_CACHE_POOL_POLL_PERIOD_IN_US); + } while (true); + + buf->buf_size = CACHE_BUFFER_SIZE; + buf->offset = offset; + + if (file->tree->present_mask == 0) { + need_update = true; + } + file->tree = tree_insert_buffer(file->tree, buf); + + if (need_update) { + spdk_thread_send_msg(g_cache_pool_thread, _add_file_to_cache_pool, file); + } + + return buf; +} + +static struct cache_buffer * +cache_append_buffer(struct spdk_file *file) +{ + struct cache_buffer *last; + + assert(file->last == NULL || file->last->bytes_filled == file->last->buf_size); + assert((file->append_pos % CACHE_BUFFER_SIZE) == 0); + + last = cache_insert_buffer(file, file->append_pos); + if (last == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "cache_insert_buffer failed\n"); + return NULL; + } + + file->last = last; + + return last; +} 
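/*
 * Usage sketch (a minimal illustration, not part of blobfs.c): how a caller
 * might apply the spdk_fs_set_cache_size()/spdk_fs_get_cache_size() pair
 * exported by this library. Only those two calls come from this diff; the
 * helper name configure_cache_or_default() is hypothetical. The sketch
 * assumes it runs before the blobfs cache pool (g_cache_pool) is created,
 * since spdk_fs_set_cache_size() returns -EPERM once the pool exists.
 */
#include "spdk/stdinc.h"
#include "spdk/blobfs.h"

void
configure_cache_or_default(uint64_t requested_mb)
{
	/* Permitted only while the blobfs cache pool has not been allocated. */
	if (spdk_fs_set_cache_size(requested_mb) != 0) {
		printf("cache pool already created, keeping %" PRIu64 " MiB\n",
		       spdk_fs_get_cache_size());
	}
}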
+ +static void __check_sync_reqs(struct spdk_file *file); + +static void +__file_cache_finish_sync(void *ctx, int bserrno) +{ + struct spdk_file *file; + struct spdk_fs_request *sync_req = ctx; + struct spdk_fs_cb_args *sync_args; + + sync_args = &sync_req->args; + file = sync_args->file; + pthread_spin_lock(&file->lock); + file->length_xattr = sync_args->op.sync.length; + assert(sync_args->op.sync.offset <= file->length_flushed); + spdk_trace_record(TRACE_BLOBFS_XATTR_END, 0, sync_args->op.sync.offset, + 0, file->trace_arg_name); + BLOBFS_TRACE(file, "sync done offset=%jx\n", sync_args->op.sync.offset); + TAILQ_REMOVE(&file->sync_requests, sync_req, args.op.sync.tailq); + pthread_spin_unlock(&file->lock); + + sync_args->fn.file_op(sync_args->arg, bserrno); + + free_fs_request(sync_req); + __check_sync_reqs(file); +} + +static void +__check_sync_reqs(struct spdk_file *file) +{ + struct spdk_fs_request *sync_req; + + pthread_spin_lock(&file->lock); + + TAILQ_FOREACH(sync_req, &file->sync_requests, args.op.sync.tailq) { + if (sync_req->args.op.sync.offset <= file->length_flushed) { + break; + } + } + + if (sync_req != NULL && !sync_req->args.op.sync.xattr_in_progress) { + BLOBFS_TRACE(file, "set xattr length 0x%jx\n", file->length_flushed); + sync_req->args.op.sync.xattr_in_progress = true; + sync_req->args.op.sync.length = file->length_flushed; + spdk_blob_set_xattr(file->blob, "length", &file->length_flushed, + sizeof(file->length_flushed)); + + pthread_spin_unlock(&file->lock); + spdk_trace_record(TRACE_BLOBFS_XATTR_START, 0, file->length_flushed, + 0, file->trace_arg_name); + spdk_blob_sync_md(file->blob, __file_cache_finish_sync, sync_req); + } else { + pthread_spin_unlock(&file->lock); + } +} + +static void +__file_flush_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + struct cache_buffer *next = args->op.flush.cache_buffer; + + BLOBFS_TRACE(file, "length=%jx\n", args->op.flush.length); + + pthread_spin_lock(&file->lock); + next->in_progress = false; + next->bytes_flushed += args->op.flush.length; + file->length_flushed += args->op.flush.length; + if (file->length_flushed > file->length) { + file->length = file->length_flushed; + } + if (next->bytes_flushed == next->buf_size) { + BLOBFS_TRACE(file, "write buffer fully flushed 0x%jx\n", file->length_flushed); + next = tree_find_buffer(file->tree, file->length_flushed); + } + + /* + * Assert that there is no cached data that extends past the end of the underlying + * blob. + */ + assert(next == NULL || next->offset < __file_get_blob_size(file) || + next->bytes_filled == 0); + + pthread_spin_unlock(&file->lock); + + __check_sync_reqs(file); + + __file_flush(req); +} + +static void +__file_flush(void *ctx) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + struct cache_buffer *next; + uint64_t offset, length, start_lba, num_lba; + uint32_t lba_size; + + pthread_spin_lock(&file->lock); + next = tree_find_buffer(file->tree, file->length_flushed); + if (next == NULL || next->in_progress || + ((next->bytes_filled < next->buf_size) && TAILQ_EMPTY(&file->sync_requests))) { + /* + * There is either no data to flush, a flush I/O is already in + * progress, or the next buffer is partially filled but there's no + * outstanding request to sync it. 
+ * So return immediately - if a flush I/O is in progress we will flush + * more data after that is completed, or a partial buffer will get flushed + * when it is either filled or the file is synced. + */ + free_fs_request(req); + if (next == NULL) { + /* + * For cases where a file's cache was evicted, and then the + * file was later appended, we will write the data directly + * to disk and bypass cache. So just update length_flushed + * here to reflect that all data was already written to disk. + */ + file->length_flushed = file->append_pos; + } + pthread_spin_unlock(&file->lock); + if (next == NULL) { + /* + * There is no data to flush, but we still need to check for any + * outstanding sync requests to make sure metadata gets updated. + */ + __check_sync_reqs(file); + } + return; + } + + offset = next->offset + next->bytes_flushed; + length = next->bytes_filled - next->bytes_flushed; + if (length == 0) { + free_fs_request(req); + pthread_spin_unlock(&file->lock); + /* + * There is no data to flush, but we still need to check for any + * outstanding sync requests to make sure metadata gets updated. + */ + __check_sync_reqs(file); + return; + } + args->op.flush.length = length; + args->op.flush.cache_buffer = next; + + __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba); + + next->in_progress = true; + BLOBFS_TRACE(file, "offset=0x%jx length=0x%jx page start=0x%jx num=0x%jx\n", + offset, length, start_lba, num_lba); + pthread_spin_unlock(&file->lock); + spdk_blob_io_write(file->blob, file->fs->sync_target.sync_fs_channel->bs_channel, + next->buf + (start_lba * lba_size) - next->offset, + start_lba, num_lba, __file_flush_done, req); +} + +static void +__file_extend_done(void *arg, int bserrno) +{ + struct spdk_fs_cb_args *args = arg; + + __wake_caller(args, bserrno); +} + +static void +__file_extend_resize_cb(void *_args, int bserrno) +{ + struct spdk_fs_cb_args *args = _args; + struct spdk_file *file = args->file; + + if (bserrno) { + __wake_caller(args, bserrno); + return; + } + + spdk_blob_sync_md(file->blob, __file_extend_done, args); +} + +static void +__file_extend_blob(void *_args) +{ + struct spdk_fs_cb_args *args = _args; + struct spdk_file *file = args->file; + + spdk_blob_resize(file->blob, args->op.resize.num_clusters, __file_extend_resize_cb, args); +} + +static void +__rw_from_file_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + + __wake_caller(&req->args, bserrno); + free_fs_request(req); +} + +static void +__rw_from_file(void *ctx) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + + if (args->op.rw.is_read) { + spdk_file_read_async(file, file->fs->sync_target.sync_io_channel, args->iovs[0].iov_base, + args->op.rw.offset, (uint64_t)args->iovs[0].iov_len, + __rw_from_file_done, req); + } else { + spdk_file_write_async(file, file->fs->sync_target.sync_io_channel, args->iovs[0].iov_base, + args->op.rw.offset, (uint64_t)args->iovs[0].iov_len, + __rw_from_file_done, req); + } +} + +static int +__send_rw_from_file(struct spdk_file *file, void *payload, + uint64_t offset, uint64_t length, bool is_read, + struct spdk_fs_channel *channel) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + req = alloc_fs_request_with_iov(channel, 1); + if (req == NULL) { + sem_post(&channel->sem); + return -ENOMEM; + } + + args = &req->args; + args->file = file; + args->sem = &channel->sem; + args->iovs[0].iov_base = payload; + args->iovs[0].iov_len = 
(size_t)length; + args->op.rw.offset = offset; + args->op.rw.is_read = is_read; + file->fs->send_request(__rw_from_file, req); + return 0; +} + +int +spdk_file_write(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx, + void *payload, uint64_t offset, uint64_t length) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *flush_req; + uint64_t rem_length, copy, blob_size, cluster_sz; + uint32_t cache_buffers_filled = 0; + uint8_t *cur_payload; + struct cache_buffer *last; + + BLOBFS_TRACE_RW(file, "offset=%jx length=%jx\n", offset, length); + + if (length == 0) { + return 0; + } + + if (offset != file->append_pos) { + BLOBFS_TRACE(file, " error offset=%jx append_pos=%jx\n", offset, file->append_pos); + return -EINVAL; + } + + pthread_spin_lock(&file->lock); + file->open_for_writing = true; + + if ((file->last == NULL) && (file->append_pos % CACHE_BUFFER_SIZE == 0)) { + cache_append_buffer(file); + } + + if (file->last == NULL) { + int rc; + + file->append_pos += length; + pthread_spin_unlock(&file->lock); + rc = __send_rw_from_file(file, payload, offset, length, false, channel); + sem_wait(&channel->sem); + return rc; + } + + blob_size = __file_get_blob_size(file); + + if ((offset + length) > blob_size) { + struct spdk_fs_cb_args extend_args = {}; + + cluster_sz = file->fs->bs_opts.cluster_sz; + extend_args.sem = &channel->sem; + extend_args.op.resize.num_clusters = __bytes_to_clusters((offset + length), cluster_sz); + extend_args.file = file; + BLOBFS_TRACE(file, "start resize to %u clusters\n", extend_args.op.resize.num_clusters); + pthread_spin_unlock(&file->lock); + file->fs->send_request(__file_extend_blob, &extend_args); + sem_wait(&channel->sem); + if (extend_args.rc) { + return extend_args.rc; + } + } + + flush_req = alloc_fs_request(channel); + if (flush_req == NULL) { + pthread_spin_unlock(&file->lock); + return -ENOMEM; + } + + last = file->last; + rem_length = length; + cur_payload = payload; + while (rem_length > 0) { + copy = last->buf_size - last->bytes_filled; + if (copy > rem_length) { + copy = rem_length; + } + BLOBFS_TRACE_RW(file, " fill offset=%jx length=%jx\n", file->append_pos, copy); + memcpy(&last->buf[last->bytes_filled], cur_payload, copy); + file->append_pos += copy; + if (file->length < file->append_pos) { + file->length = file->append_pos; + } + cur_payload += copy; + last->bytes_filled += copy; + rem_length -= copy; + if (last->bytes_filled == last->buf_size) { + cache_buffers_filled++; + last = cache_append_buffer(file); + if (last == NULL) { + BLOBFS_TRACE(file, "nomem\n"); + free_fs_request(flush_req); + pthread_spin_unlock(&file->lock); + return -ENOMEM; + } + } + } + + pthread_spin_unlock(&file->lock); + + if (cache_buffers_filled == 0) { + free_fs_request(flush_req); + return 0; + } + + flush_req->args.file = file; + file->fs->send_request(__file_flush, flush_req); + return 0; +} + +static void +__readahead_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct cache_buffer *cache_buffer = args->op.readahead.cache_buffer; + struct spdk_file *file = args->file; + + BLOBFS_TRACE(file, "offset=%jx\n", cache_buffer->offset); + + pthread_spin_lock(&file->lock); + cache_buffer->bytes_filled = args->op.readahead.length; + cache_buffer->bytes_flushed = args->op.readahead.length; + cache_buffer->in_progress = false; + pthread_spin_unlock(&file->lock); + + free_fs_request(req); +} + +static void +__readahead(void *ctx) +{ + struct spdk_fs_request *req 
= ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + uint64_t offset, length, start_lba, num_lba; + uint32_t lba_size; + + offset = args->op.readahead.offset; + length = args->op.readahead.length; + assert(length > 0); + + __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba); + + BLOBFS_TRACE(file, "offset=%jx length=%jx page start=%jx num=%jx\n", + offset, length, start_lba, num_lba); + spdk_blob_io_read(file->blob, file->fs->sync_target.sync_fs_channel->bs_channel, + args->op.readahead.cache_buffer->buf, + start_lba, num_lba, __readahead_done, req); +} + +static uint64_t +__next_cache_buffer_offset(uint64_t offset) +{ + return (offset + CACHE_BUFFER_SIZE) & ~(CACHE_TREE_LEVEL_MASK(0)); +} + +static void +check_readahead(struct spdk_file *file, uint64_t offset, + struct spdk_fs_channel *channel) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + offset = __next_cache_buffer_offset(offset); + if (tree_find_buffer(file->tree, offset) != NULL || file->length <= offset) { + return; + } + + req = alloc_fs_request(channel); + if (req == NULL) { + return; + } + args = &req->args; + + BLOBFS_TRACE(file, "offset=%jx\n", offset); + + args->file = file; + args->op.readahead.offset = offset; + args->op.readahead.cache_buffer = cache_insert_buffer(file, offset); + if (!args->op.readahead.cache_buffer) { + BLOBFS_TRACE(file, "Cannot allocate buf for offset=%jx\n", offset); + free_fs_request(req); + return; + } + + args->op.readahead.cache_buffer->in_progress = true; + if (file->length < (offset + CACHE_BUFFER_SIZE)) { + args->op.readahead.length = file->length & (CACHE_BUFFER_SIZE - 1); + } else { + args->op.readahead.length = CACHE_BUFFER_SIZE; + } + file->fs->send_request(__readahead, req); +} + +int64_t +spdk_file_read(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx, + void *payload, uint64_t offset, uint64_t length) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + uint64_t final_offset, final_length; + uint32_t sub_reads = 0; + struct cache_buffer *buf; + uint64_t read_len; + int rc = 0; + + pthread_spin_lock(&file->lock); + + BLOBFS_TRACE_RW(file, "offset=%ju length=%ju\n", offset, length); + + file->open_for_writing = false; + + if (length == 0 || offset >= file->append_pos) { + pthread_spin_unlock(&file->lock); + return 0; + } + + if (offset + length > file->append_pos) { + length = file->append_pos - offset; + } + + if (offset != file->next_seq_offset) { + file->seq_byte_count = 0; + } + file->seq_byte_count += length; + file->next_seq_offset = offset + length; + if (file->seq_byte_count >= CACHE_READAHEAD_THRESHOLD) { + check_readahead(file, offset, channel); + check_readahead(file, offset + CACHE_BUFFER_SIZE, channel); + } + + final_length = 0; + final_offset = offset + length; + while (offset < final_offset) { + length = NEXT_CACHE_BUFFER_OFFSET(offset) - offset; + if (length > (final_offset - offset)) { + length = final_offset - offset; + } + + buf = tree_find_filled_buffer(file->tree, offset); + if (buf == NULL) { + pthread_spin_unlock(&file->lock); + rc = __send_rw_from_file(file, payload, offset, length, true, channel); + pthread_spin_lock(&file->lock); + if (rc == 0) { + sub_reads++; + } + } else { + read_len = length; + if ((offset + length) > (buf->offset + buf->bytes_filled)) { + read_len = buf->offset + buf->bytes_filled - offset; + } + BLOBFS_TRACE(file, "read %p offset=%ju length=%ju\n", payload, offset, read_len); + memcpy(payload, &buf->buf[offset - buf->offset], 
read_len); + if ((offset + read_len) % CACHE_BUFFER_SIZE == 0) { + tree_remove_buffer(file->tree, buf); + if (file->tree->present_mask == 0) { + spdk_thread_send_msg(g_cache_pool_thread, _remove_file_from_cache_pool, file); + } + } + } + + if (rc == 0) { + final_length += length; + } else { + break; + } + payload += length; + offset += length; + } + pthread_spin_unlock(&file->lock); + while (sub_reads > 0) { + sem_wait(&channel->sem); + sub_reads--; + } + if (rc == 0) { + return final_length; + } else { + return rc; + } +} + +static void +_file_sync(struct spdk_file *file, struct spdk_fs_channel *channel, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_request *sync_req; + struct spdk_fs_request *flush_req; + struct spdk_fs_cb_args *sync_args; + struct spdk_fs_cb_args *flush_args; + + BLOBFS_TRACE(file, "offset=%jx\n", file->append_pos); + + pthread_spin_lock(&file->lock); + if (file->append_pos <= file->length_xattr) { + BLOBFS_TRACE(file, "done - file already synced\n"); + pthread_spin_unlock(&file->lock); + cb_fn(cb_arg, 0); + return; + } + + sync_req = alloc_fs_request(channel); + if (!sync_req) { + SPDK_ERRLOG("Cannot allocate sync req for file=%s\n", file->name); + pthread_spin_unlock(&file->lock); + cb_fn(cb_arg, -ENOMEM); + return; + } + sync_args = &sync_req->args; + + flush_req = alloc_fs_request(channel); + if (!flush_req) { + SPDK_ERRLOG("Cannot allocate flush req for file=%s\n", file->name); + free_fs_request(sync_req); + pthread_spin_unlock(&file->lock); + cb_fn(cb_arg, -ENOMEM); + return; + } + flush_args = &flush_req->args; + + sync_args->file = file; + sync_args->fn.file_op = cb_fn; + sync_args->arg = cb_arg; + sync_args->op.sync.offset = file->append_pos; + sync_args->op.sync.xattr_in_progress = false; + TAILQ_INSERT_TAIL(&file->sync_requests, sync_req, args.op.sync.tailq); + pthread_spin_unlock(&file->lock); + + flush_args->file = file; + channel->send_request(__file_flush, flush_req); +} + +int +spdk_file_sync(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_cb_args args = {}; + + args.sem = &channel->sem; + _file_sync(file, channel, __wake_caller, &args); + sem_wait(&channel->sem); + + return args.rc; +} + +void +spdk_file_sync_async(struct spdk_file *file, struct spdk_io_channel *_channel, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + + _file_sync(file, channel, cb_fn, cb_arg); +} + +void +spdk_file_set_priority(struct spdk_file *file, uint32_t priority) +{ + BLOBFS_TRACE(file, "priority=%u\n", priority); + file->priority = priority; + +} + +/* + * Close routines + */ + +static void +__file_close_async_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + + spdk_trace_record(TRACE_BLOBFS_CLOSE, 0, 0, 0, file->trace_arg_name); + + if (file->is_deleted) { + spdk_fs_delete_file_async(file->fs, file->name, blob_delete_cb, ctx); + return; + } + + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +__file_close_async(struct spdk_file *file, struct spdk_fs_request *req) +{ + struct spdk_blob *blob; + + pthread_spin_lock(&file->lock); + if (file->ref_count == 0) { + pthread_spin_unlock(&file->lock); + __file_close_async_done(req, -EBADF); + return; + } + + file->ref_count--; + if (file->ref_count > 0) { + pthread_spin_unlock(&file->lock); + 
req->args.fn.file_op(req->args.arg, 0); + free_fs_request(req); + return; + } + + pthread_spin_unlock(&file->lock); + + blob = file->blob; + file->blob = NULL; + spdk_blob_close(blob, __file_close_async_done, req); +} + +static void +__file_close_async__sync_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + __file_close_async(args->file, req); +} + +void +spdk_file_close_async(struct spdk_file *file, spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + req = alloc_fs_request(file->fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate close async req for file=%s\n", file->name); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->file = file; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + + spdk_file_sync_async(file, file->fs->md_target.md_io_channel, __file_close_async__sync_done, req); +} + +static void +__file_close(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + + __file_close_async(file, req); +} + +int +spdk_file_close(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate close req for file=%s\n", file->name); + return -ENOMEM; + } + + args = &req->args; + + spdk_file_sync(file, ctx); + BLOBFS_TRACE(file, "name=%s\n", file->name); + args->file = file; + args->sem = &channel->sem; + args->fn.file_op = __wake_caller; + args->arg = args; + channel->send_request(__file_close, req); + sem_wait(&channel->sem); + + return args->rc; +} + +int +spdk_file_get_id(struct spdk_file *file, void *id, size_t size) +{ + if (size < sizeof(spdk_blob_id)) { + return -EINVAL; + } + + memcpy(id, &file->blobid, sizeof(spdk_blob_id)); + + return sizeof(spdk_blob_id); +} + +static void +_file_free(void *ctx) +{ + struct spdk_file *file = ctx; + + TAILQ_REMOVE(&g_caches, file, cache_tailq); + + free(file->name); + free(file->tree); + free(file); +} + +static void +file_free(struct spdk_file *file) +{ + BLOBFS_TRACE(file, "free=%s\n", file->name); + pthread_spin_lock(&file->lock); + if (file->tree->present_mask == 0) { + pthread_spin_unlock(&file->lock); + free(file->name); + free(file->tree); + free(file); + return; + } + + tree_free_buffers(file->tree); + assert(file->tree->present_mask == 0); + spdk_thread_send_msg(g_cache_pool_thread, _file_free, file); + pthread_spin_unlock(&file->lock); +} + +SPDK_LOG_REGISTER_COMPONENT("blobfs", SPDK_LOG_BLOBFS) +SPDK_LOG_REGISTER_COMPONENT("blobfs_rw", SPDK_LOG_BLOBFS_RW) diff --git a/src/spdk/lib/blobfs/spdk_blobfs.map b/src/spdk/lib/blobfs/spdk_blobfs.map new file mode 100644 index 000000000..91c02f61e --- /dev/null +++ b/src/spdk/lib/blobfs/spdk_blobfs.map @@ -0,0 +1,45 @@ +{ + global: + + # public functions + spdk_fs_opts_init; + spdk_fs_init; + spdk_fs_load; + spdk_fs_unload; + spdk_fs_alloc_io_channel; + spdk_fs_free_io_channel; + spdk_fs_alloc_thread_ctx; + spdk_fs_free_thread_ctx; + spdk_fs_file_stat; + spdk_fs_create_file; + spdk_fs_open_file; + spdk_file_close; + spdk_fs_rename_file; + spdk_fs_delete_file; + spdk_fs_iter_first; + spdk_fs_iter_next; + spdk_file_truncate; + spdk_file_get_name; + spdk_file_get_length; + spdk_file_write; + spdk_file_read; + spdk_fs_set_cache_size; 
+ spdk_fs_get_cache_size; + spdk_file_set_priority; + spdk_file_sync; + spdk_file_get_id; + spdk_file_readv_async; + spdk_file_writev_async; + spdk_fs_file_stat_async; + spdk_fs_create_file_async; + spdk_fs_open_file_async; + spdk_file_close_async; + spdk_fs_rename_file_async; + spdk_fs_delete_file_async; + spdk_file_truncate_async; + spdk_file_write_async; + spdk_file_read_async; + spdk_file_sync_async; + + local: *; +}; diff --git a/src/spdk/lib/blobfs/tree.c b/src/spdk/lib/blobfs/tree.c new file mode 100644 index 000000000..32779766f --- /dev/null +++ b/src/spdk/lib/blobfs/tree.c @@ -0,0 +1,181 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/blobfs.h" +#include "tree.h" + +#include "spdk/queue.h" +#include "spdk/assert.h" +#include "spdk/env.h" +#include "spdk_internal/log.h" + +uint32_t g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + +struct cache_buffer * +tree_find_buffer(struct cache_tree *tree, uint64_t offset) +{ + uint64_t index; + + while (tree != NULL) { + index = offset / CACHE_TREE_LEVEL_SIZE(tree->level); + if (index >= CACHE_TREE_WIDTH) { + return NULL; + } + if (tree->level == 0) { + return tree->u.buffer[index]; + } else { + offset &= CACHE_TREE_LEVEL_MASK(tree->level); + tree = tree->u.tree[index]; + } + } + + return NULL; +} + +struct cache_buffer * +tree_find_filled_buffer(struct cache_tree *tree, uint64_t offset) +{ + struct cache_buffer *buf; + + buf = tree_find_buffer(tree, offset); + if (buf != NULL && buf->bytes_filled > 0) { + return buf; + } else { + return NULL; + } +} + +struct cache_tree * +tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer) +{ + struct cache_tree *tree; + uint64_t index, offset; + + offset = buffer->offset; + while (offset >= CACHE_TREE_LEVEL_SIZE(root->level + 1)) { + if (root->present_mask != 0) { + tree = calloc(1, sizeof(*tree)); + tree->level = root->level + 1; + tree->u.tree[0] = root; + root = tree; + root->present_mask = 0x1ULL; + } else { + root->level++; + } + } + + tree = root; + while (tree->level > 0) { + index = offset / CACHE_TREE_LEVEL_SIZE(tree->level); + assert(index < CACHE_TREE_WIDTH); + offset &= CACHE_TREE_LEVEL_MASK(tree->level); + if (tree->u.tree[index] == NULL) { + tree->u.tree[index] = calloc(1, sizeof(*tree)); + tree->u.tree[index]->level = tree->level - 1; + tree->present_mask |= (1ULL << index); + } + tree = tree->u.tree[index]; + } + + index = offset / CACHE_BUFFER_SIZE; + assert(index < CACHE_TREE_WIDTH); + assert(tree->u.buffer[index] == NULL); + tree->u.buffer[index] = buffer; + tree->present_mask |= (1ULL << index); + return root; +} + +void +tree_remove_buffer(struct cache_tree *tree, struct cache_buffer *buffer) +{ + struct cache_tree *child; + uint64_t index; + + index = CACHE_TREE_INDEX(tree->level, buffer->offset); + + if (tree->level == 0) { + assert(tree->u.buffer[index] != NULL); + assert(buffer == tree->u.buffer[index]); + tree->present_mask &= ~(1ULL << index); + tree->u.buffer[index] = NULL; + cache_buffer_free(buffer); + return; + } + + child = tree->u.tree[index]; + assert(child != NULL); + tree_remove_buffer(child, buffer); + if (child->present_mask == 0) { + tree->present_mask &= ~(1ULL << index); + tree->u.tree[index] = NULL; + free(child); + } +} + +void +tree_free_buffers(struct cache_tree *tree) +{ + struct cache_buffer *buffer; + struct cache_tree *child; + uint32_t i; + + if (tree->present_mask == 0) { + return; + } + + if (tree->level == 0) { + for (i = 0; i < CACHE_TREE_WIDTH; i++) { + buffer = tree->u.buffer[i]; + if (buffer != NULL && buffer->in_progress == false && + buffer->bytes_filled == buffer->bytes_flushed) { + cache_buffer_free(buffer); + tree->u.buffer[i] = NULL; + tree->present_mask &= ~(1ULL << i); + } + } + } else { + for (i = 0; i < CACHE_TREE_WIDTH; i++) { + child = tree->u.tree[i]; + if (child != NULL) { + tree_free_buffers(child); + if (child->present_mask == 0) { + free(child); + tree->u.tree[i] = NULL; + tree->present_mask &= ~(1ULL << i); + } + } + } + } +} diff --git a/src/spdk/lib/blobfs/tree.h b/src/spdk/lib/blobfs/tree.h new file mode 100644 index 000000000..71df71090 --- /dev/null +++ b/src/spdk/lib/blobfs/tree.h @@ -0,0 
+1,77 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_TREE_H_ +#define SPDK_TREE_H_ + +struct cache_buffer { + uint8_t *buf; + uint64_t offset; + uint32_t buf_size; + uint32_t bytes_filled; + uint32_t bytes_flushed; + bool in_progress; +}; + +extern uint32_t g_fs_cache_buffer_shift; + +#define CACHE_BUFFER_SHIFT_DEFAULT 18 +#define CACHE_BUFFER_SIZE (1U << g_fs_cache_buffer_shift) +#define NEXT_CACHE_BUFFER_OFFSET(offset) \ + (((offset + CACHE_BUFFER_SIZE) >> g_fs_cache_buffer_shift) << g_fs_cache_buffer_shift) + +#define CACHE_TREE_SHIFT 6 +#define CACHE_TREE_WIDTH (1U << CACHE_TREE_SHIFT) +#define CACHE_TREE_LEVEL_SHIFT(level) (g_fs_cache_buffer_shift + (level) * CACHE_TREE_SHIFT) +#define CACHE_TREE_LEVEL_SIZE(level) (1ULL << CACHE_TREE_LEVEL_SHIFT(level)) +#define CACHE_TREE_LEVEL_MASK(level) (CACHE_TREE_LEVEL_SIZE(level) - 1) +#define CACHE_TREE_INDEX(level, offset) ((offset >> CACHE_TREE_LEVEL_SHIFT(level)) & (CACHE_TREE_WIDTH - 1)) + +struct cache_tree { + uint8_t level; + uint64_t present_mask; + union { + struct cache_buffer *buffer[CACHE_TREE_WIDTH]; + struct cache_tree *tree[CACHE_TREE_WIDTH]; + } u; +}; + +void cache_buffer_free(struct cache_buffer *cache_buffer); + +struct cache_tree *tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer); +void tree_free_buffers(struct cache_tree *tree); +struct cache_buffer *tree_find_buffer(struct cache_tree *tree, uint64_t offset); +struct cache_buffer *tree_find_filled_buffer(struct cache_tree *tree, uint64_t offset); +void tree_remove_buffer(struct cache_tree *tree, struct cache_buffer *buffer); + +#endif /* SPDK_TREE_H_ */ diff --git a/src/spdk/lib/conf/Makefile b/src/spdk/lib/conf/Makefile new file mode 100644 index 000000000..09966ea12 --- /dev/null +++ b/src/spdk/lib/conf/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 1 + +C_SRCS = conf.c +LIBNAME = conf + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_conf.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/conf/conf.c b/src/spdk/lib/conf/conf.c new file mode 100644 index 000000000..287e157a5 --- /dev/null +++ b/src/spdk/lib/conf/conf.c @@ -0,0 +1,704 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/string.h" +#include "spdk/log.h" + +struct spdk_conf_value { + struct spdk_conf_value *next; + char *value; +}; + +struct spdk_conf_item { + struct spdk_conf_item *next; + char *key; + struct spdk_conf_value *val; +}; + +struct spdk_conf_section { + struct spdk_conf_section *next; + char *name; + int num; + struct spdk_conf_item *item; +}; + +struct spdk_conf { + char *file; + struct spdk_conf_section *current_section; + struct spdk_conf_section *section; + bool merge_sections; +}; + +#define CF_DELIM " \t" +#define CF_DELIM_KEY " \t=" + +#define LIB_MAX_TMPBUF 1024 + +static struct spdk_conf *default_config = NULL; + +struct spdk_conf * +spdk_conf_allocate(void) +{ + struct spdk_conf *ret = calloc(1, sizeof(struct spdk_conf)); + + if (ret) { + ret->merge_sections = true; + } + + return ret; +} + +static void +free_conf_value(struct spdk_conf_value *vp) +{ + if (vp == NULL) { + return; + } + + if (vp->value) { + free(vp->value); + } + + free(vp); +} + +static void +free_all_conf_value(struct spdk_conf_value *vp) +{ + struct spdk_conf_value *next; + + if (vp == NULL) { + return; + } + + while (vp != NULL) { + next = vp->next; + free_conf_value(vp); + vp = next; + } +} + +static void +free_conf_item(struct spdk_conf_item *ip) +{ + if (ip == NULL) { + return; + } + + if (ip->val != NULL) { + free_all_conf_value(ip->val); + } + + if (ip->key != NULL) { + free(ip->key); + } + + free(ip); +} + +static void +free_all_conf_item(struct spdk_conf_item *ip) +{ + struct spdk_conf_item *next; + + if (ip == NULL) { + return; + } + + while (ip != NULL) { + next = ip->next; + free_conf_item(ip); + ip = next; + } +} + +static void +free_conf_section(struct spdk_conf_section *sp) +{ + if (sp == NULL) { + return; + } + + if (sp->item) { + free_all_conf_item(sp->item); + } + + if (sp->name) { + free(sp->name); + } + + free(sp); +} + +static void +free_all_conf_section(struct spdk_conf_section *sp) +{ + struct spdk_conf_section *next; + + if (sp == NULL) { + return; + } + + while (sp != NULL) { + next = sp->next; + free_conf_section(sp); + sp = next; + } +} + +void +spdk_conf_free(struct spdk_conf *cp) +{ + if (cp == NULL) { + return; + } + + if (cp->section != NULL) { + free_all_conf_section(cp->section); + } + + if (cp->file != NULL) { + free(cp->file); + } + + free(cp); +} + +static struct spdk_conf_section * +allocate_cf_section(void) +{ + return calloc(1, sizeof(struct spdk_conf_section)); +} + +static struct spdk_conf_item * +allocate_cf_item(void) +{ + return calloc(1, sizeof(struct spdk_conf_item)); +} + +static struct spdk_conf_value * +allocate_cf_value(void) +{ + return calloc(1, sizeof(struct spdk_conf_value)); +} + + +#define CHECK_CP_OR_USE_DEFAULT(cp) (((cp) == NULL) && (default_config != NULL)) ? 
default_config : (cp) + +struct spdk_conf_section * +spdk_conf_find_section(struct spdk_conf *cp, const char *name) +{ + struct spdk_conf_section *sp; + + if (name == NULL || name[0] == '\0') { + return NULL; + } + + cp = CHECK_CP_OR_USE_DEFAULT(cp); + if (cp == NULL) { + return NULL; + } + + for (sp = cp->section; sp != NULL; sp = sp->next) { + if (sp->name != NULL && sp->name[0] == name[0] + && strcasecmp(sp->name, name) == 0) { + return sp; + } + } + + return NULL; +} + +struct spdk_conf_section * +spdk_conf_first_section(struct spdk_conf *cp) +{ + cp = CHECK_CP_OR_USE_DEFAULT(cp); + if (cp == NULL) { + return NULL; + } + + return cp->section; +} + +struct spdk_conf_section * +spdk_conf_next_section(struct spdk_conf_section *sp) +{ + if (sp == NULL) { + return NULL; + } + + return sp->next; +} + +static void +append_cf_section(struct spdk_conf *cp, struct spdk_conf_section *sp) +{ + struct spdk_conf_section *last; + + cp = CHECK_CP_OR_USE_DEFAULT(cp); + if (cp == NULL) { + SPDK_ERRLOG("cp == NULL\n"); + return; + } + + if (cp->section == NULL) { + cp->section = sp; + return; + } + + for (last = cp->section; last->next != NULL; last = last->next) + ; + last->next = sp; +} + +static struct spdk_conf_item * +find_cf_nitem(struct spdk_conf_section *sp, const char *key, int idx) +{ + struct spdk_conf_item *ip; + int i; + + if (key == NULL || key[0] == '\0') { + return NULL; + } + + i = 0; + for (ip = sp->item; ip != NULL; ip = ip->next) { + if (ip->key != NULL && ip->key[0] == key[0] + && strcasecmp(ip->key, key) == 0) { + if (i == idx) { + return ip; + } + i++; + } + } + + return NULL; +} + +static void +append_cf_item(struct spdk_conf_section *sp, struct spdk_conf_item *ip) +{ + struct spdk_conf_item *last; + + if (sp == NULL) { + return; + } + + if (sp->item == NULL) { + sp->item = ip; + return; + } + + for (last = sp->item; last->next != NULL; last = last->next) + ; + last->next = ip; +} + +static void +append_cf_value(struct spdk_conf_item *ip, struct spdk_conf_value *vp) +{ + struct spdk_conf_value *last; + + if (ip == NULL) { + return; + } + + if (ip->val == NULL) { + ip->val = vp; + return; + } + + for (last = ip->val; last->next != NULL; last = last->next) + ; + last->next = vp; +} + +bool +spdk_conf_section_match_prefix(const struct spdk_conf_section *sp, const char *name_prefix) +{ + return strncasecmp(sp->name, name_prefix, strlen(name_prefix)) == 0; +} + +const char * +spdk_conf_section_get_name(const struct spdk_conf_section *sp) +{ + return sp->name; +} + +int +spdk_conf_section_get_num(const struct spdk_conf_section *sp) +{ + return sp->num; +} + +char * +spdk_conf_section_get_nmval(struct spdk_conf_section *sp, const char *key, int idx1, int idx2) +{ + struct spdk_conf_item *ip; + struct spdk_conf_value *vp; + int i; + + ip = find_cf_nitem(sp, key, idx1); + if (ip == NULL) { + return NULL; + } + + vp = ip->val; + if (vp == NULL) { + return NULL; + } + + for (i = 0; vp != NULL; vp = vp->next, i++) { + if (i == idx2) { + return vp->value; + } + } + + return NULL; +} + +char * +spdk_conf_section_get_nval(struct spdk_conf_section *sp, const char *key, int idx) +{ + struct spdk_conf_item *ip; + struct spdk_conf_value *vp; + + ip = find_cf_nitem(sp, key, idx); + if (ip == NULL) { + return NULL; + } + + vp = ip->val; + if (vp == NULL) { + return NULL; + } + + return vp->value; +} + +char * +spdk_conf_section_get_val(struct spdk_conf_section *sp, const char *key) +{ + return spdk_conf_section_get_nval(sp, key, 0); +} + +int +spdk_conf_section_get_intval(struct spdk_conf_section *sp, 
const char *key) +{ + const char *v; + int value; + + v = spdk_conf_section_get_nval(sp, key, 0); + if (v == NULL) { + return -1; + } + + value = (int)spdk_strtol(v, 10); + return value; +} + +bool +spdk_conf_section_get_boolval(struct spdk_conf_section *sp, const char *key, bool default_val) +{ + const char *v; + + v = spdk_conf_section_get_nval(sp, key, 0); + if (v == NULL) { + return default_val; + } + + if (!strcasecmp(v, "Yes") || !strcasecmp(v, "Y") || !strcasecmp(v, "True")) { + return true; + } + + if (!strcasecmp(v, "No") || !strcasecmp(v, "N") || !strcasecmp(v, "False")) { + return false; + } + + return default_val; +} + +static int +parse_line(struct spdk_conf *cp, char *lp) +{ + struct spdk_conf_section *sp; + struct spdk_conf_item *ip; + struct spdk_conf_value *vp; + char *arg; + char *key; + char *val; + char *p; + int num; + + arg = spdk_str_trim(lp); + if (arg == NULL) { + SPDK_ERRLOG("no section\n"); + return -1; + } + + if (arg[0] == '[') { + /* section */ + arg++; + key = spdk_strsepq(&arg, "]"); + if (key == NULL || arg != NULL) { + SPDK_ERRLOG("broken section\n"); + return -1; + } + /* determine section number */ + for (p = key; *p != '\0' && !isdigit((int) *p); p++) + ; + if (*p != '\0') { + num = (int)spdk_strtol(p, 10); + } else { + num = 0; + } + + if (cp->merge_sections) { + sp = spdk_conf_find_section(cp, key); + } else { + sp = NULL; + } + + if (sp == NULL) { + sp = allocate_cf_section(); + append_cf_section(cp, sp); + + sp->name = strdup(key); + if (sp->name == NULL) { + SPDK_ERRLOG("cannot duplicate %s to sp->name\n", key); + return -1; + } + } + cp->current_section = sp; + + + sp->num = num; + } else { + /* parameters */ + sp = cp->current_section; + if (sp == NULL) { + SPDK_ERRLOG("unknown section\n"); + return -1; + } + key = spdk_strsepq(&arg, CF_DELIM_KEY); + if (key == NULL) { + SPDK_ERRLOG("broken key\n"); + return -1; + } + + ip = allocate_cf_item(); + if (ip == NULL) { + SPDK_ERRLOG("cannot allocate cf item\n"); + return -1; + } + append_cf_item(sp, ip); + ip->key = strdup(key); + if (ip->key == NULL) { + SPDK_ERRLOG("cannot make duplicate of %s\n", key); + return -1; + } + ip->val = NULL; + if (arg != NULL) { + /* key has value(s) */ + while (arg != NULL) { + val = spdk_strsepq(&arg, CF_DELIM); + vp = allocate_cf_value(); + if (vp == NULL) { + SPDK_ERRLOG("cannot allocate cf value\n"); + return -1; + } + append_cf_value(ip, vp); + vp->value = strdup(val); + if (vp->value == NULL) { + SPDK_ERRLOG("cannot duplicate %s to vp->value\n", val); + return -1; + } + } + } + } + + return 0; +} + +static char * +fgets_line(FILE *fp) +{ + char *dst, *dst2, *p; + size_t total, len; + + dst = p = malloc(LIB_MAX_TMPBUF); + if (!dst) { + return NULL; + } + + dst[0] = '\0'; + total = 0; + + while (fgets(p, LIB_MAX_TMPBUF, fp) != NULL) { + len = strlen(p); + total += len; + if (len + 1 < LIB_MAX_TMPBUF || dst[total - 1] == '\n') { + dst2 = realloc(dst, total + 1); + if (!dst2) { + free(dst); + return NULL; + } else { + return dst2; + } + } + + dst2 = realloc(dst, total + LIB_MAX_TMPBUF); + if (!dst2) { + free(dst); + return NULL; + } else { + dst = dst2; + } + + p = dst + total; + } + + if (feof(fp) && total != 0) { + dst2 = realloc(dst, total + 2); + if (!dst2) { + free(dst); + return NULL; + } else { + dst = dst2; + } + + dst[total] = '\n'; + dst[total + 1] = '\0'; + return dst; + } + + free(dst); + + return NULL; +} + +int +spdk_conf_read(struct spdk_conf *cp, const char *file) +{ + FILE *fp; + char *lp, *p; + char *lp2, *q; + int line; + int n, n2; + + if (file == 
NULL || file[0] == '\0') { + return -1; + } + SPDK_ERRLOG("INI configuration has been deprecated and will be removed in a future release. Please switch to JSON-RPC.\n"); + + fp = fopen(file, "r"); + if (fp == NULL) { + SPDK_ERRLOG("open error: %s\n", file); + return -1; + } + + cp->file = strdup(file); + if (cp->file == NULL) { + SPDK_ERRLOG("cannot duplicate %s to cp->file\n", file); + fclose(fp); + return -1; + } + + line = 1; + while ((lp = fgets_line(fp)) != NULL) { + /* skip spaces */ + for (p = lp; *p != '\0' && isspace((int) *p); p++) + ; + /* skip comment, empty line */ + if (p[0] == '#' || p[0] == '\0') { + goto next_line; + } + + /* concatenate line end with '\' */ + n = strlen(p); + while (n > 2 && p[n - 1] == '\n' && p[n - 2] == '\\') { + n -= 2; + lp2 = fgets_line(fp); + if (lp2 == NULL) { + break; + } + + line++; + n2 = strlen(lp2); + + q = malloc(n + n2 + 1); + if (!q) { + free(lp2); + free(lp); + SPDK_ERRLOG("malloc failed at line %d of %s\n", line, cp->file); + fclose(fp); + return -1; + } + + memcpy(q, p, n); + memcpy(q + n, lp2, n2); + q[n + n2] = '\0'; + free(lp2); + free(lp); + p = lp = q; + n += n2; + } + + /* parse one line */ + if (parse_line(cp, p) < 0) { + SPDK_ERRLOG("parse error at line %d of %s\n", line, cp->file); + } +next_line: + line++; + free(lp); + } + + fclose(fp); + return 0; +} + +void +spdk_conf_set_as_default(struct spdk_conf *cp) +{ + default_config = cp; +} + +void +spdk_conf_disable_sections_merge(struct spdk_conf *cp) +{ + cp->merge_sections = false; +} diff --git a/src/spdk/lib/conf/spdk_conf.map b/src/spdk/lib/conf/spdk_conf.map new file mode 100644 index 000000000..0fc01c8aa --- /dev/null +++ b/src/spdk/lib/conf/spdk_conf.map @@ -0,0 +1,23 @@ +{ + global: + + # Public functions + spdk_conf_allocate; + spdk_conf_free; + spdk_conf_read; + spdk_conf_find_section; + spdk_conf_first_section; + spdk_conf_next_section; + spdk_conf_section_match_prefix; + spdk_conf_section_get_name; + spdk_conf_section_get_num; + spdk_conf_section_get_nmval; + spdk_conf_section_get_nval; + spdk_conf_section_get_val; + spdk_conf_section_get_intval; + spdk_conf_section_get_boolval; + spdk_conf_set_as_default; + spdk_conf_disable_sections_merge; + + local: *; +}; diff --git a/src/spdk/lib/env_dpdk/Makefile b/src/spdk/lib/env_dpdk/Makefile new file mode 100644 index 000000000..11433fe86 --- /dev/null +++ b/src/spdk/lib/env_dpdk/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 5 +SO_MINOR := 0 + +CFLAGS += $(ENV_CFLAGS) +C_SRCS = env.c memory.c pci.c init.c threads.c +C_SRCS += pci_ioat.c pci_virtio.c pci_vmd.c pci_idxd.c +LIBNAME = env_dpdk + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_env_dpdk.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/env_dpdk/env.c b/src/spdk/lib/env_dpdk/env.c new file mode 100644 index 000000000..94b709de9 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.c @@ -0,0 +1,451 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/util.h" +#include "spdk/env_dpdk.h" + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_cycles.h> +#include <rte_malloc.h> +#include <rte_mempool.h> +#include <rte_memzone.h> +#include <rte_version.h> + +static uint64_t +virt_to_phys(void *vaddr) +{ + uint64_t ret; + + ret = rte_malloc_virt2iova(vaddr); + if (ret != RTE_BAD_IOVA) { + return ret; + } + + return spdk_vtophys(vaddr, NULL); +} + +void * +spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + void *buf; + + if (flags == 0) { + return NULL; + } + + align = spdk_max(align, RTE_CACHE_LINE_SIZE); + buf = rte_malloc_socket(NULL, size, align, socket_id); + if (buf && phys_addr) { +#ifdef DEBUG + fprintf(stderr, "phys_addr param in spdk_*malloc() is deprecated\n"); +#endif + *phys_addr = virt_to_phys(buf); + } + return buf; +} + +void * +spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + void *buf = spdk_malloc(size, align, phys_addr, socket_id, flags); + if (buf) { + memset(buf, 0, size); + } + return buf; +} + +void * +spdk_realloc(void *buf, size_t size, size_t align) +{ + align = spdk_max(align, RTE_CACHE_LINE_SIZE); + return rte_realloc(buf, size, align); +} + +void +spdk_free(void *buf) +{ + rte_free(buf); +} + +void * +spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_malloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_zmalloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_malloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_zmalloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr) +{ + void *new_buf; + + align = spdk_max(align, RTE_CACHE_LINE_SIZE); + new_buf = rte_realloc(buf, size, align); + if (new_buf && phys_addr) { + *phys_addr = virt_to_phys(new_buf); + } + return new_buf; +} + +void +spdk_dma_free(void *buf) +{ + spdk_free(buf); +} + +void * +spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align) +{ + const struct rte_memzone *mz; + unsigned dpdk_flags = 0; + + if ((flags & SPDK_MEMZONE_NO_IOVA_CONTIG) == 0) { + dpdk_flags |= RTE_MEMZONE_IOVA_CONTIG; + } + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + mz = rte_memzone_reserve_aligned(name, len, socket_id, dpdk_flags, align); + + if (mz != NULL) { + memset(mz->addr, 0, len); + return mz->addr; + } else { + return NULL; + } +} + +void * +spdk_memzone_reserve(const char *name, size_t len, int socket_id, unsigned flags) +{ + return spdk_memzone_reserve_aligned(name, len, socket_id, flags, + RTE_CACHE_LINE_SIZE); +} + +void * +spdk_memzone_lookup(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return mz->addr; + } else { + return NULL; + } +} + +int +spdk_memzone_free(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return rte_memzone_free(mz); + } + + return -1; +} + +void +spdk_memzone_dump(FILE *f) 
+{ + rte_memzone_dump(f); +} + +struct spdk_mempool * +spdk_mempool_create_ctor(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id, + spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg) +{ + struct rte_mempool *mp; + size_t tmp; + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + /* No more than half of all elements can be in cache */ + tmp = (count / 2) / rte_lcore_count(); + if (cache_size > tmp) { + cache_size = tmp; + } + + if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) { + cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE; + } + + mp = rte_mempool_create(name, count, ele_size, cache_size, + 0, NULL, NULL, (rte_mempool_obj_cb_t *)obj_init, obj_init_arg, + socket_id, MEMPOOL_F_NO_PHYS_CONTIG); + + return (struct spdk_mempool *)mp; +} + + +struct spdk_mempool * +spdk_mempool_create(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id) +{ + return spdk_mempool_create_ctor(name, count, ele_size, cache_size, socket_id, + NULL, NULL); +} + +char * +spdk_mempool_get_name(struct spdk_mempool *mp) +{ + return ((struct rte_mempool *)mp)->name; +} + +void +spdk_mempool_free(struct spdk_mempool *mp) +{ + rte_mempool_free((struct rte_mempool *)mp); +} + +void * +spdk_mempool_get(struct spdk_mempool *mp) +{ + void *ele = NULL; + int rc; + + rc = rte_mempool_get((struct rte_mempool *)mp, &ele); + if (rc != 0) { + return NULL; + } + return ele; +} + +int +spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + return rte_mempool_get_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +void +spdk_mempool_put(struct spdk_mempool *mp, void *ele) +{ + rte_mempool_put((struct rte_mempool *)mp, ele); +} + +void +spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + rte_mempool_put_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +size_t +spdk_mempool_count(const struct spdk_mempool *pool) +{ + return rte_mempool_avail_count((struct rte_mempool *)pool); +} + +uint32_t +spdk_mempool_obj_iter(struct spdk_mempool *mp, spdk_mempool_obj_cb_t obj_cb, + void *obj_cb_arg) +{ + return rte_mempool_obj_iter((struct rte_mempool *)mp, (rte_mempool_obj_cb_t *)obj_cb, + obj_cb_arg); +} + +struct spdk_mempool * +spdk_mempool_lookup(const char *name) +{ + return (struct spdk_mempool *)rte_mempool_lookup(name); +} + +bool +spdk_process_is_primary(void) +{ + return (rte_eal_process_type() == RTE_PROC_PRIMARY); +} + +uint64_t spdk_get_ticks(void) +{ + return rte_get_timer_cycles(); +} + +uint64_t spdk_get_ticks_hz(void) +{ + return rte_get_timer_hz(); +} + +void spdk_delay_us(unsigned int us) +{ + rte_delay_us(us); +} + +void spdk_pause(void) +{ + rte_pause(); +} + +void +spdk_unaffinitize_thread(void) +{ + rte_cpuset_t new_cpuset, orig_cpuset; + long num_cores, i, orig_num_cores; + + CPU_ZERO(&new_cpuset); + + num_cores = sysconf(_SC_NPROCESSORS_CONF); + + /* Create a mask containing all CPUs */ + for (i = 0; i < num_cores; i++) { + CPU_SET(i, &new_cpuset); + } + + rte_thread_get_affinity(&orig_cpuset); + orig_num_cores = CPU_COUNT(&orig_cpuset); + if (orig_num_cores < num_cores) { + for (i = 0; i < orig_num_cores; i++) { + if (CPU_ISSET(i, &orig_cpuset)) { + CPU_CLR(i, &new_cpuset); + } + } + } + + rte_thread_set_affinity(&new_cpuset); +} + +void * +spdk_call_unaffinitized(void *cb(void *arg), void *arg) +{ + rte_cpuset_t orig_cpuset; + void *ret; + + if (cb == NULL) { + return NULL; + } + + rte_thread_get_affinity(&orig_cpuset); + + spdk_unaffinitize_thread(); + + ret = cb(arg); + + 
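The mempool wrappers above map directly onto rte_mempool. A hypothetical caller might use them as sketched below; the pool name, element count, element size, and cache size are arbitrary illustration values:

#include <errno.h>
#include "spdk/env.h"

/* Create a pool of 1024 elements of 4 KiB each with a per-core cache of 64,
 * take one element, return it, and destroy the pool. */
static int
example_mempool(void)
{
        struct spdk_mempool *mp;
        void *ele;

        mp = spdk_mempool_create("example_bufs", 1024, 4096, 64, SPDK_ENV_SOCKET_ID_ANY);
        if (mp == NULL) {
                return -ENOMEM;
        }

        ele = spdk_mempool_get(mp);
        if (ele == NULL) {
                spdk_mempool_free(mp);
                return -ENOMEM;
        }

        spdk_mempool_put(mp, ele);
        spdk_mempool_free(mp);
        return 0;
}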
rte_thread_set_affinity(&orig_cpuset); + + return ret; +} + +struct spdk_ring * +spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id) +{ + char ring_name[64]; + static uint32_t ring_num = 0; + unsigned flags = RING_F_EXACT_SZ; + + switch (type) { + case SPDK_RING_TYPE_SP_SC: + flags |= RING_F_SP_ENQ | RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_SC: + flags |= RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_MC: + flags |= 0; + break; + default: + return NULL; + } + + snprintf(ring_name, sizeof(ring_name), "ring_%u_%d", + __atomic_fetch_add(&ring_num, 1, __ATOMIC_RELAXED), getpid()); + + return (struct spdk_ring *)rte_ring_create(ring_name, count, socket_id, flags); +} + +void +spdk_ring_free(struct spdk_ring *ring) +{ + rte_ring_free((struct rte_ring *)ring); +} + +size_t +spdk_ring_count(struct spdk_ring *ring) +{ + return rte_ring_count((struct rte_ring *)ring); +} + +size_t +spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count, + size_t *free_space) +{ + return rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count, + (unsigned int *)free_space); +} + +size_t +spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count) +{ + return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count, NULL); +} + +void +spdk_env_dpdk_dump_mem_stats(FILE *file) +{ + fprintf(file, "DPDK memory size %lu\n", rte_eal_get_physmem_size()); + fprintf(file, "DPDK memory layout\n"); + rte_dump_physmem_layout(file); + fprintf(file, "DPDK memzones.\n"); + rte_memzone_dump(file); + fprintf(file, "DPDK mempools.\n"); + rte_mempool_list_dump(file); + fprintf(file, "DPDK malloc stats.\n"); + rte_malloc_dump_stats(file, NULL); + fprintf(file, "DPDK malloc heaps.\n"); + rte_malloc_dump_heaps(file); +} diff --git a/src/spdk/lib/env_dpdk/env.mk b/src/spdk/lib/env_dpdk/env.mk new file mode 100644 index 000000000..c2bfb0d19 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.mk @@ -0,0 +1,176 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
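The spdk_ring wrappers defined in env.c above follow the same thin-shim pattern over rte_ring. A purely illustrative usage sketch follows; the function name, ring size, and pointer values are placeholders and not part of this diff:

#include "spdk/env.h"

/* Multi-producer/single-consumer ring with room for 1024 pointers. */
static void
example_ring(void)
{
        struct spdk_ring *ring;
        void *objs[4] = { (void *)0x1, (void *)0x2, (void *)0x3, (void *)0x4 };
        void *out[4];
        size_t n;

        ring = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 1024, SPDK_ENV_SOCKET_ID_ANY);
        if (ring == NULL) {
                return;
        }

        n = spdk_ring_enqueue(ring, objs, 4, NULL);     /* number of objects enqueued */
        n = spdk_ring_dequeue(ring, out, 4);            /* number of objects dequeued */
        (void)n;

        spdk_ring_free(ring);
}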
+# + +# This makefile snippet must define the following flags: +# ENV_CFLAGS +# ENV_CXXFLAGS +# ENV_LIBS +# ENV_LINKER_ARGS + +DPDK_DIR = $(CONFIG_DPDK_DIR) + +export DPDK_ABS_DIR = $(abspath $(DPDK_DIR)) + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/include/rte_config.h)) +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include +else +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include/dpdk +endif +DPDK_INC := -I$(DPDK_INC_DIR) + +ifeq ($(CONFIG_SHARED),y) +DPDK_LIB_EXT = .so +else +DPDK_LIB_EXT = .a +endif + +DPDK_LIB_LIST = rte_eal rte_mempool rte_ring rte_mbuf + +# librte_mempool_ring was new added from DPDK 17.05. Link this library used for +# ring based mempool management API. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_mempool_ring.*)) +DPDK_LIB_LIST += rte_mempool_ring +endif + +# librte_malloc was removed after DPDK 2.1. Link this library conditionally based on its +# existence to maintain backward compatibility. +ifneq ($(wildcard $(DPDK_ABS_DIR)/lib/librte_malloc.*),) +DPDK_LIB_LIST += rte_malloc +endif + +# librte_pci and librte_bus_pci were added in DPDK 17.11. Link these libraries conditionally +# based on their existence to maintain backward compatibility. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_pci.*)) +DPDK_LIB_LIST += rte_pci +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_bus_pci.*)) +DPDK_LIB_LIST += rte_bus_pci +endif + +# DPDK 20.05 eal dependency +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_telemetry.*)) +DPDK_LIB_LIST += rte_telemetry +endif + +# There are some complex dependencies when using crypto, reduce or both so +# here we add the feature specific ones and set a flag to add the common +# ones after that. +DPDK_FRAMEWORK=n +ifeq ($(CONFIG_CRYPTO),y) +DPDK_FRAMEWORK=y +DPDK_LIB_LIST += rte_pmd_aesni_mb rte_reorder +endif + +ifeq ($(CONFIG_REDUCE),y) +DPDK_FRAMEWORK=y +DPDK_LIB_LIST += rte_pmd_isal +endif + +ifeq ($(DPDK_FRAMEWORK),y) +DPDK_LIB_LIST += rte_cryptodev rte_compressdev rte_bus_vdev rte_pmd_qat +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_kvargs.*)) +DPDK_LIB_LIST += rte_kvargs +endif + +LINK_HASH=n + +ifeq ($(CONFIG_VHOST),y) +ifneq ($(CONFIG_VHOST_INTERNAL_LIB),y) +DPDK_LIB_LIST += rte_vhost rte_net +LINK_HASH=y +ifneq ($(DPDK_FRAMEWORK),y) +DPDK_LIB_LIST += rte_cryptodev +endif +endif +endif + +ifeq ($(CONFIG_RAID5),y) +LINK_HASH=y +endif + +ifeq ($(LINK_HASH),y) +DPDK_LIB_LIST += rte_hash +endif + +define dpdk_lib_list_to_libs +$(1:%=$(DPDK_ABS_DIR)/lib/lib%$(DPDK_LIB_EXT)) +endef + +define dpdk_env_linker_args +$(ENV_DPDK_FILE) -Wl,--whole-archive,--no-as-needed $(call dpdk_lib_list_to_libs,$1) -Wl,--no-whole-archive +endef + +DPDK_LIB = $(call dpdk_lib_list_to_libs,$(DPDK_LIB_LIST)) + +# SPDK memory registration requires experimental (deprecated) rte_memory API for DPDK 18.05 +ENV_CFLAGS = $(DPDK_INC) -Wno-deprecated-declarations +ENV_CXXFLAGS = $(ENV_CFLAGS) +ifeq ($(CONFIG_SHARED),y) +ENV_DPDK_FILE = $(call spdk_lib_list_to_shared_libs,env_dpdk) +else +ENV_DPDK_FILE = $(call spdk_lib_list_to_static_libs,env_dpdk) +endif +ENV_LIBS = $(ENV_DPDK_FILE) $(DPDK_LIB) +ENV_LINKER_ARGS = -Wl,-rpath-link $(DPDK_ABS_DIR)/lib +ENV_LINKER_ARGS += $(call dpdk_env_linker_args,$(DPDK_LIB_LIST)) + +ifeq ($(CONFIG_IPSEC_MB),y) +ENV_LINKER_ARGS += -lIPSec_MB -L$(IPSEC_MB_DIR) +endif + +ifeq ($(CONFIG_REDUCE),y) +ENV_LINKER_ARGS += -lisal -L$(ISAL_DIR)/.libs +endif + +ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_config.h)) +ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_config.h)) 
+ENV_LINKER_ARGS += -lnuma +endif +endif + +# DPDK built with meson puts those defines elsewhere +ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_build_config.h)) +ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_build_config.h)) +ENV_LINKER_ARGS += -lnuma +endif +endif + +ifeq ($(OS),Linux) +ENV_LINKER_ARGS += -ldl +endif +ifeq ($(OS),FreeBSD) +ENV_LINKER_ARGS += -lexecinfo +endif diff --git a/src/spdk/lib/env_dpdk/env_internal.h b/src/spdk/lib/env_dpdk/env_internal.h new file mode 100644 index 000000000..c7900d9d3 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env_internal.h @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ENV_INTERNAL_H +#define SPDK_ENV_INTERNAL_H + +#include "spdk/stdinc.h" + +#include "spdk/env.h" + +#include <rte_config.h> +#include <rte_version.h> +#include <rte_eal.h> +#include <rte_bus.h> +#include <rte_pci.h> +#include <rte_bus_pci.h> +#include <rte_dev.h> + +#if RTE_VERSION < RTE_VERSION_NUM(18, 11, 0, 0) +#error RTE_VERSION is too old! Minimum 18.11 is required. +#endif + +/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47], + * which is enough to cover 256 TB. 
+ */ +#define SHIFT_256TB 48 /* (1 << 48) == 256 TB */ +#define MASK_256TB ((1ULL << SHIFT_256TB) - 1) + +#define SHIFT_1GB 30 /* (1 << 30) == 1 GB */ +#define MASK_1GB ((1ULL << SHIFT_1GB) - 1) + +#define SPDK_PCI_DRIVER_MAX_NAME_LEN 32 +struct spdk_pci_driver { + struct rte_pci_driver driver; + + const char *name; + const struct spdk_pci_id *id_table; + uint32_t drv_flags; + + spdk_pci_enum_cb cb_fn; + void *cb_arg; + TAILQ_ENTRY(spdk_pci_driver) tailq; +}; + +int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device); +int pci_device_fini(struct rte_pci_device *device); + +void pci_env_init(void); +void pci_env_reinit(void); +void pci_env_fini(void); +int mem_map_init(bool legacy_mem); +int vtophys_init(void); + +/** + * Report a DMA-capable PCI device to the vtophys translation code. + * Increases the refcount of active DMA-capable devices managed by SPDK. + * This must be called after a `rte_pci_device` is created. + */ +void vtophys_pci_device_added(struct rte_pci_device *pci_device); + +/** + * Report the removal of a DMA-capable PCI device to the vtophys translation code. + * Decreases the refcount of active DMA-capable devices managed by SPDK. + * This must be called before a `rte_pci_device` is destroyed. + */ +void vtophys_pci_device_removed(struct rte_pci_device *pci_device); + +#endif diff --git a/src/spdk/lib/env_dpdk/init.c b/src/spdk/lib/env_dpdk/init.c new file mode 100644 index 000000000..0376dbe7b --- /dev/null +++ b/src/spdk/lib/env_dpdk/init.c @@ -0,0 +1,604 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
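As a rough illustration of how the SHIFT/MASK constants from env_internal.h are used, the sketch below performs the same 48-bit user-address sanity check that memory.c applies before accepting a registration. The helper name is hypothetical and the snippet assumes the env_internal.h context above:

#include "env_internal.h"

/* Reject anything outside the canonical 48-bit user virtual address range. */
static bool
example_addr_in_range(const void *vaddr)
{
        return ((uintptr_t)vaddr & ~MASK_256TB) == 0;
}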
+ */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include "spdk/version.h" +#include "spdk/env_dpdk.h" + +#include <rte_config.h> +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_vfio.h> + +#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" +#define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 +#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 +#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" +#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 + +static char **g_eal_cmdline; +static int g_eal_cmdline_argcount; +static bool g_external_init = true; + +static char * +_sprintf_alloc(const char *format, ...) +{ + va_list args; + va_list args_copy; + char *buf; + size_t bufsize; + int rc; + + va_start(args, format); + + /* Try with a small buffer first. */ + bufsize = 32; + + /* Limit maximum buffer size to something reasonable so we don't loop forever. */ + while (bufsize <= 1024 * 1024) { + buf = malloc(bufsize); + if (buf == NULL) { + va_end(args); + return NULL; + } + + va_copy(args_copy, args); + rc = vsnprintf(buf, bufsize, format, args_copy); + va_end(args_copy); + + /* + * If vsnprintf() returned a count within our current buffer size, we are done. + * The count does not include the \0 terminator, so rc == bufsize is not OK. + */ + if (rc >= 0 && (size_t)rc < bufsize) { + va_end(args); + return buf; + } + + /* + * vsnprintf() should return the required space, but some libc versions do not + * implement this correctly, so just double the buffer size and try again. + * + * We don't need the data in buf, so rather than realloc(), use free() and malloc() + * again to avoid a copy. + */ + free(buf); + bufsize *= 2; + } + + va_end(args); + return NULL; +} + +void +spdk_env_opts_init(struct spdk_env_opts *opts) +{ + if (!opts) { + return; + } + + memset(opts, 0, sizeof(*opts)); + + opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; + opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; + opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; + opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; + opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; + opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; + opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; +} + +static void +free_args(char **args, int argcount) +{ + int i; + + if (args == NULL) { + return; + } + + for (i = 0; i < argcount; i++) { + free(args[i]); + } + + if (argcount) { + free(args); + } +} + +static char ** +push_arg(char *args[], int *argcount, char *arg) +{ + char **tmp; + + if (arg == NULL) { + fprintf(stderr, "%s: NULL arg supplied\n", __func__); + free_args(args, *argcount); + return NULL; + } + + tmp = realloc(args, sizeof(char *) * (*argcount + 1)); + if (tmp == NULL) { + free(arg); + free_args(args, *argcount); + return NULL; + } + + tmp[*argcount] = arg; + (*argcount)++; + + return tmp; +} + +#if defined(__linux__) && defined(__x86_64__) + +/* TODO: Can likely get this value from rlimits in the future */ +#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 +#define VTD_CAP_MGAW_SHIFT 16 +#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) + +static int +get_iommu_width(void) +{ + DIR *dir; + FILE *file; + struct dirent *entry; + char mgaw_path[64]; + char buf[64]; + char *end; + long long int val; + int width, tmp; + + dir = opendir("/sys/devices/virtual/iommu/"); + if (dir == NULL) { + return -EINVAL; + } + + width = 0; + + while ((entry = readdir(dir)) != NULL) { + /* Find directories named "dmar0", "dmar1", etc */ + if (strncmp(entry->d_name, 
"dmar", sizeof("dmar") - 1) != 0) { + continue; + } + + tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", + entry->d_name); + if ((unsigned)tmp >= sizeof(mgaw_path)) { + continue; + } + + file = fopen(mgaw_path, "r"); + if (file == NULL) { + continue; + } + + if (fgets(buf, sizeof(buf), file) == NULL) { + fclose(file); + continue; + } + + val = strtoll(buf, &end, 16); + if (val == LLONG_MIN || val == LLONG_MAX) { + fclose(file); + continue; + } + + tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; + if (width == 0 || tmp < width) { + width = tmp; + } + + fclose(file); + } + + closedir(dir); + + return width; +} + +#endif + +static int +build_eal_cmdline(const struct spdk_env_opts *opts) +{ + int argcount = 0; + char **args; + + args = NULL; + + /* set the program name */ + args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); + if (args == NULL) { + return -1; + } + + /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ + if (opts->shm_id < 0) { + args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); + if (args == NULL) { + return -1; + } + } + + /* set the coremask */ + /* NOTE: If coremask starts with '[' and ends with ']' it is a core list + */ + if (opts->core_mask[0] == '[') { + char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); + + if (l_arg != NULL) { + int len = strlen(l_arg); + + if (l_arg[len - 1] == ']') { + l_arg[len - 1] = '\0'; + } + } + args = push_arg(args, &argcount, l_arg); + } else { + args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); + } + + if (args == NULL) { + return -1; + } + + /* set the memory channel number */ + if (opts->mem_channel > 0) { + args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); + if (args == NULL) { + return -1; + } + } + + /* set the memory size */ + if (opts->mem_size >= 0) { + args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); + if (args == NULL) { + return -1; + } + } + + /* set the master core */ + if (opts->master_core > 0) { + args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", + opts->master_core)); + if (args == NULL) { + return -1; + } + } + + /* set no pci if enabled */ + if (opts->no_pci) { + args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); + if (args == NULL) { + return -1; + } + } + + /* create just one hugetlbfs file */ + if (opts->hugepage_single_segments) { + args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); + if (args == NULL) { + return -1; + } + } + + /* unlink hugepages after initialization */ + if (opts->unlink_hugepage) { + args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); + if (args == NULL) { + return -1; + } + } + + /* use a specific hugetlbfs mount */ + if (opts->hugedir) { + args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); + if (args == NULL) { + return -1; + } + } + + if (opts->num_pci_addr) { + size_t i; + char bdf[32]; + struct spdk_pci_addr *pci_addr = + opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; + + for (i = 0; i < opts->num_pci_addr; i++) { + spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); + args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", + (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), + bdf)); + if (args == NULL) { + return -1; + } + } + } + + /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 
+ * This can be overridden by specifying the same option in opts->env_context + */ + args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); + if (args == NULL) { + return -1; + } + + /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. + * This can be overridden by specifying the same option in opts->env_context + */ + args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); + if (args == NULL) { + return -1; + } + + /* `user1` log type is used by rte_vhost, which prints an INFO log for each received + * vhost user message. We don't want that. The same log type is also used by a couple + * of other DPDK libs, but none of which we make use right now. If necessary, this can + * be overridden via opts->env_context. + */ + args = push_arg(args, &argcount, strdup("--log-level=user1:6")); + if (args == NULL) { + return -1; + } + + if (opts->env_context) { + args = push_arg(args, &argcount, strdup(opts->env_context)); + if (args == NULL) { + return -1; + } + } + +#ifdef __linux__ + + if (opts->iova_mode) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); + if (args == NULL) { + return -1; + } + } else { + /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, + * but DPDK guesses it should be iova-mode=va. Add a check and force + * iova-mode=pa here. */ + if (rte_vfio_noiommu_is_enabled()) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } + +#if defined(__x86_64__) + /* DPDK by default guesses that it should be using iova-mode=va so that it can + * support running as an unprivileged user. However, some systems (especially + * virtual machines) don't have an IOMMU capable of handling the full virtual + * address space and DPDK doesn't currently catch that. Add a check in SPDK + * and force iova-mode=pa here. */ + if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } +#elif defined(__PPC64__) + /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly + * auto-detect at the moment, so we'll just force it here. */ + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } +#endif + } + + + /* Set the base virtual address - it must be an address that is not in the + * ASAN shadow region, otherwise ASAN-enabled builds will ignore the + * mmap hint. + * + * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm + */ + args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); + if (args == NULL) { + return -1; + } + + /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. + * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two + * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split + * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 
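With the defaults from spdk_env_opts_init() and no env_context, the argument list assembled here typically ends up looking roughly like the following on Linux with a recent DPDK; the PID suffix and any --iova-mode override depend on the host, so treat this only as an illustration:

        spdk --no-shconf -c 0x1 --log-level=lib.eal:6 --log-level=lib.cryptodev:5
             --log-level=user1:6 --base-virtaddr=0x200000000000 --match-allocations
             --file-prefix=spdk_pid<pid>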
+ */ +#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) + if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { + args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); + if (args == NULL) { + return -1; + } + } +#endif + + if (opts->shm_id < 0) { + args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", + getpid())); + if (args == NULL) { + return -1; + } + } else { + args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", + opts->shm_id)); + if (args == NULL) { + return -1; + } + + /* set the process type */ + args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); + if (args == NULL) { + return -1; + } + } +#endif + + g_eal_cmdline = args; + g_eal_cmdline_argcount = argcount; + return argcount; +} + +int +spdk_env_dpdk_post_init(bool legacy_mem) +{ + int rc; + + pci_env_init(); + + rc = mem_map_init(legacy_mem); + if (rc < 0) { + fprintf(stderr, "Failed to allocate mem_map\n"); + return rc; + } + + rc = vtophys_init(); + if (rc < 0) { + fprintf(stderr, "Failed to initialize vtophys\n"); + return rc; + } + + return 0; +} + +void +spdk_env_dpdk_post_fini(void) +{ + pci_env_fini(); + + free_args(g_eal_cmdline, g_eal_cmdline_argcount); + g_eal_cmdline = NULL; + g_eal_cmdline_argcount = 0; +} + +int +spdk_env_init(const struct spdk_env_opts *opts) +{ + char **dpdk_args = NULL; + int i, rc; + int orig_optind; + bool legacy_mem; + + /* If SPDK env has been initialized before, then only pci env requires + * reinitialization. + */ + if (g_external_init == false) { + if (opts != NULL) { + fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); + return -EINVAL; + } + + printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); + pci_env_reinit(); + + return 0; + } + + if (opts == NULL) { + fprintf(stderr, "NULL arguments to initialize DPDK\n"); + return -EINVAL; + } + + rc = build_eal_cmdline(opts); + if (rc < 0) { + fprintf(stderr, "Invalid arguments to initialize DPDK\n"); + return -EINVAL; + } + + printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); + printf("[ DPDK EAL parameters: "); + for (i = 0; i < g_eal_cmdline_argcount; i++) { + printf("%s ", g_eal_cmdline[i]); + } + printf("]\n"); + + /* DPDK rearranges the array we pass to it, so make a copy + * before passing so we can still free the individual strings + * correctly. 
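A typical application brings the environment up through the functions above: populate the defaults, override a few fields, then hand control to DPDK via spdk_env_init(). The application name and core mask below are illustrative values, not part of this diff:

#include "spdk/env.h"

static int
example_env_init(void)
{
        struct spdk_env_opts opts;

        spdk_env_opts_init(&opts);
        opts.name = "example_app";
        opts.core_mask = "0x3";

        if (spdk_env_init(&opts) < 0) {
                return -1;
        }

        /* ... run the application ... */

        spdk_env_fini();
        return 0;
}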
+ */ + dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); + if (dpdk_args == NULL) { + fprintf(stderr, "Failed to allocate dpdk_args\n"); + return -ENOMEM; + } + memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); + + fflush(stdout); + orig_optind = optind; + optind = 1; + rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); + optind = orig_optind; + + free(dpdk_args); + + if (rc < 0) { + if (rte_errno == EALREADY) { + fprintf(stderr, "DPDK already initialized\n"); + } else { + fprintf(stderr, "Failed to initialize DPDK\n"); + } + return -rte_errno; + } + + legacy_mem = false; + if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { + legacy_mem = true; + } + + rc = spdk_env_dpdk_post_init(legacy_mem); + if (rc == 0) { + g_external_init = false; + } + + return rc; +} + +void +spdk_env_fini(void) +{ + spdk_env_dpdk_post_fini(); +} + +bool +spdk_env_dpdk_external_init(void) +{ + return g_external_init; +} diff --git a/src/spdk/lib/env_dpdk/memory.c b/src/spdk/lib/env_dpdk/memory.c new file mode 100644 index 000000000..4c2205a46 --- /dev/null +++ b/src/spdk/lib/env_dpdk/memory.c @@ -0,0 +1,1442 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_memory.h> +#include <rte_eal_memconfig.h> + +#include "spdk_internal/assert.h" + +#include "spdk/assert.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/env_dpdk.h" + +#ifdef __FreeBSD__ +#define VFIO_ENABLED 0 +#else +#include <linux/version.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#define VFIO_ENABLED 1 +#include <linux/vfio.h> +#include <rte_vfio.h> + +struct spdk_vfio_dma_map { + struct vfio_iommu_type1_dma_map map; + struct vfio_iommu_type1_dma_unmap unmap; + TAILQ_ENTRY(spdk_vfio_dma_map) tailq; +}; + +struct vfio_cfg { + int fd; + bool enabled; + bool noiommu_enabled; + unsigned device_ref; + TAILQ_HEAD(, spdk_vfio_dma_map) maps; + pthread_mutex_t mutex; +}; + +static struct vfio_cfg g_vfio = { + .fd = -1, + .enabled = false, + .noiommu_enabled = false, + .device_ref = 0, + .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps), + .mutex = PTHREAD_MUTEX_INITIALIZER +}; + +#else +#define VFIO_ENABLED 0 +#endif +#endif + +#if DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +#define FN_2MB_TO_4KB(fn) (fn << (SHIFT_2MB - SHIFT_4KB)) +#define FN_4KB_TO_2MB(fn) (fn >> (SHIFT_2MB - SHIFT_4KB)) + +#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB)) +#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1)) + +/* Page is registered */ +#define REG_MAP_REGISTERED (1ULL << 62) + +/* A notification region barrier. The 2MB translation entry that's marked + * with this flag must be unregistered separately. This allows contiguous + * regions to be unregistered in the same chunks they were registered. + */ +#define REG_MAP_NOTIFY_START (1ULL << 63) + +/* Translation of a single 2MB page. */ +struct map_2mb { + uint64_t translation_2mb; +}; + +/* Second-level map table indexed by bits [21..29] of the virtual address. + * Each entry contains the address translation or error for entries that haven't + * been retrieved yet. + */ +struct map_1gb { + struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)]; +}; + +/* Top-level map table indexed by bits [30..47] of the virtual address. + * Each entry points to a second-level map table or NULL. + */ +struct map_256tb { + struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)]; +}; + +/* Page-granularity memory address translation */ +struct spdk_mem_map { + struct map_256tb map_256tb; + pthread_mutex_t mutex; + uint64_t default_translation; + struct spdk_mem_map_ops ops; + void *cb_ctx; + TAILQ_ENTRY(spdk_mem_map) tailq; +}; + +/* Registrations map. The 64 bit translations are bit fields with the + * following layout (starting with the low bits): + * 0 - 61 : reserved + * 62 - 63 : flags + */ +static struct spdk_mem_map *g_mem_reg_map; +static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps = + TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps); +static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER; + +static bool g_legacy_mem; + +/* + * Walk the currently registered memory via the main memory registration map + * and call the new map's notify callback for each virtually contiguous region. 
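A condensed sketch of the lookup these structures support, written against the file-local types above. The real spdk_mem_map_translate() further down also validates the address and can extend the returned size across contiguous translations; the helper name here is hypothetical:

/* Shift the address down to a 2 MB frame number, then split it into a
 * top-level (1 GB) index and a second-level (2 MB) index. */
static uint64_t
example_lookup(struct spdk_mem_map *map, uint64_t vaddr)
{
        uint64_t vfn_2mb = vaddr >> SHIFT_2MB;
        struct map_1gb *map_1gb = map->map_256tb.map[MAP_256TB_IDX(vfn_2mb)];

        if (map_1gb == NULL) {
                return map->default_translation;
        }
        return map_1gb->map[MAP_1GB_IDX(vfn_2mb)].translation_2mb;
}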
+ */ +static int +mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action) +{ + size_t idx_256tb; + uint64_t idx_1gb; + uint64_t contig_start = UINT64_MAX; + uint64_t contig_end = UINT64_MAX; + struct map_1gb *map_1gb; + int rc; + + if (!g_mem_reg_map) { + return -EINVAL; + } + + /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */ + pthread_mutex_lock(&g_mem_reg_map->mutex); + + for (idx_256tb = 0; + idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]); + idx_256tb++) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + } + contig_start = UINT64_MAX; + continue; + } + + for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_start == UINT64_MAX || + (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_start == UINT64_MAX) { + contig_start = vaddr; + } + + contig_end = vaddr; + } else { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + + /* This page might be a part of a neighbour region, so process + * it again. The idx_1gb will be incremented immediately. + */ + idx_1gb--; + } + contig_start = UINT64_MAX; + } + } + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return 0; + +err_unregister: + /* Unwind to the first empty translation so we don't unregister + * a region that just failed to register. 
+ */ + idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1); + idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1); + contig_start = UINT64_MAX; + contig_end = UINT64_MAX; + + /* Unregister any memory we managed to register before the failure */ + for (; idx_256tb < SIZE_MAX; idx_256tb--) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + } + contig_end = UINT64_MAX; + continue; + } + + for (; idx_1gb < UINT64_MAX; idx_1gb--) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_end == UINT64_MAX) { + contig_end = vaddr; + } + contig_start = vaddr; + } else { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + idx_1gb++; + } + contig_end = UINT64_MAX; + } + } + idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1; + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return rc; +} + +struct spdk_mem_map * +spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx) +{ + struct spdk_mem_map *map; + int rc; + + map = calloc(1, sizeof(*map)); + if (map == NULL) { + return NULL; + } + + if (pthread_mutex_init(&map->mutex, NULL)) { + free(map); + return NULL; + } + + map->default_translation = default_translation; + map->cb_ctx = cb_ctx; + if (ops) { + map->ops = *ops; + } + + if (ops && ops->notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + DEBUG_PRINT("Initial mem_map notify failed\n"); + pthread_mutex_destroy(&map->mutex); + free(map); + return NULL; + } + TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + return map; +} + +void +spdk_mem_map_free(struct spdk_mem_map **pmap) +{ + struct spdk_mem_map *map; + size_t i; + + if (!pmap) { + return; + } + + map = *pmap; + + if (!map) { + return; + } + + if (map->ops.notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER); + TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) { + free(map->map_256tb.map[i]); + } + + pthread_mutex_destroy(&map->mutex); + + free(map); + *pmap = NULL; +} + +int +spdk_mem_register(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + if (len == 0) { + return 0; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { 
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if (reg & REG_MAP_REGISTERED) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EBUSY; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + seg_vaddr = vaddr; + seg_len = 0; + while (len > 0) { + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, + seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED); + seg_len += VALUE_2MB; + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +int +spdk_mem_unregister(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg, newreg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + /* The first page must be a start of a region. Also check if it's + * registered to make sure we don't return -ERANGE for non-registered + * regions. + */ + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EINVAL; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + /* If the next page is registered, it must be a start of a region as well, + * otherwise we'd be unregistering only a part of a region. 
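The registration API above only accepts 2 MB-aligned addresses and lengths. The sketch below shows that contract; whether the registration itself succeeds also depends on the platform and the vtophys backend, and real callers would normally use pinned (hugepage-backed) memory. All names here are illustrative:

#include <errno.h>
#include <stdlib.h>
#include "spdk/env.h"
#include "spdk/memory.h"

/* Register (and later unregister) one 2 MB-aligned, 2 MB-long region with the
 * env layer. Misaligned addresses or lengths fail with -EINVAL. */
static int
example_mem_register(void)
{
        void *buf = NULL;
        int rc;

        if (posix_memalign(&buf, VALUE_2MB, VALUE_2MB) != 0) {
                return -ENOMEM;
        }

        rc = spdk_mem_register(buf, VALUE_2MB);
        if (rc == 0) {
                spdk_mem_unregister(buf, VALUE_2MB);
        }

        free(buf);
        return rc;
}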
+ */ + if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + seg_vaddr = vaddr; + seg_len = 0; + + while (len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0); + + if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) { + TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + seg_vaddr = vaddr; + seg_len = VALUE_2MB; + } else { + seg_len += VALUE_2MB; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + if (seg_len > 0) { + TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +int +spdk_mem_reserve(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + void *seg_vaddr; + size_t seg_len; + uint64_t reg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + if (len == 0) { + return 0; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + /* Check if any part of this range is already registered */ + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if (reg & REG_MAP_REGISTERED) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EBUSY; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + /* Simply set the translation to the memory map's default. This allocates the space in the + * map but does not provide a valid translation. */ + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len, + g_mem_reg_map->default_translation); + + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation); + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +static struct map_1gb * +mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb) +{ + struct map_1gb *map_1gb; + uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb); + size_t i; + + if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) { + return NULL; + } + + map_1gb = map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + pthread_mutex_lock(&map->mutex); + + /* Recheck to make sure nobody else got the mutex first. 
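A minimal sketch of a consumer map in the style of the vtophys map defined later in this file: the notify callback is invoked for every registered 2 MB-granular region and records a translation of the caller's choosing (here simply the virtual address itself). The example_* names are hypothetical; spdk_mem_map_clear_translation() is defined further below:

#include <errno.h>
#include "spdk/env.h"

static int
example_notify(void *cb_ctx, struct spdk_mem_map *map,
               enum spdk_mem_map_notify_action action,
               void *vaddr, size_t size)
{
        switch (action) {
        case SPDK_MEM_MAP_NOTIFY_REGISTER:
                return spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)vaddr);
        case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
                return spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
        default:
                return -EINVAL;
        }
}

static const struct spdk_mem_map_ops example_ops = {
        .notify_cb = example_notify,
        .are_contiguous = NULL,
};

/* spdk_mem_map_alloc() replays all existing registrations through
 * example_notify() before adding the new map to the global list. */
static struct spdk_mem_map *
example_map_create(void)
{
        return spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &example_ops, NULL);
}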
*/ + map_1gb = map->map_256tb.map[idx_256tb]; + if (!map_1gb) { + map_1gb = malloc(sizeof(struct map_1gb)); + if (map_1gb) { + /* initialize all entries to default translation */ + for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) { + map_1gb->map[i].translation_2mb = map->default_translation; + } + map->map_256tb.map[idx_256tb] = map_1gb; + } + } + + pthread_mutex_unlock(&map->mutex); + + if (!map_1gb) { + DEBUG_PRINT("allocation failed\n"); + return NULL; + } + } + + return map_1gb; +} + +int +spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size, + uint64_t translation) +{ + uint64_t vfn_2mb; + struct map_1gb *map_1gb; + uint64_t idx_1gb; + struct map_2mb *map_2mb; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr); + return -EINVAL; + } + + /* For now, only 2 MB-aligned registrations are supported */ + if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n", + __func__, vaddr, size); + return -EINVAL; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + + while (size) { + map_1gb = mem_map_get_map_1gb(map, vfn_2mb); + if (!map_1gb) { + DEBUG_PRINT("could not get %p map\n", (void *)vaddr); + return -ENOMEM; + } + + idx_1gb = MAP_1GB_IDX(vfn_2mb); + map_2mb = &map_1gb->map[idx_1gb]; + map_2mb->translation_2mb = translation; + + size -= VALUE_2MB; + vfn_2mb++; + } + + return 0; +} + +int +spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size) +{ + return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation); +} + +inline uint64_t +spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size) +{ + const struct map_1gb *map_1gb; + const struct map_2mb *map_2mb; + uint64_t idx_256tb; + uint64_t idx_1gb; + uint64_t vfn_2mb; + uint64_t cur_size; + uint64_t prev_translation; + uint64_t orig_translation; + + if (spdk_unlikely(vaddr & ~MASK_256TB)) { + DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr); + return map->default_translation; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + return map->default_translation; + } + + cur_size = VALUE_2MB - _2MB_OFFSET(vaddr); + map_2mb = &map_1gb->map[idx_1gb]; + if (size == NULL || map->ops.are_contiguous == NULL || + map_2mb->translation_2mb == map->default_translation) { + if (size != NULL) { + *size = spdk_min(*size, cur_size); + } + return map_2mb->translation_2mb; + } + + orig_translation = map_2mb->translation_2mb; + prev_translation = orig_translation; + while (cur_size < *size) { + vfn_2mb++; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + break; + } + + map_2mb = &map_1gb->map[idx_1gb]; + if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) { + break; + } + + cur_size += VALUE_2MB; + prev_translation = map_2mb->translation_2mb; + } + + *size = spdk_min(*size, cur_size); + return orig_translation; +} + +static void +memory_hotplug_cb(enum rte_mem_event event_type, + const void *addr, size_t len, void *arg) +{ + if (event_type == RTE_MEM_EVENT_ALLOC) { + spdk_mem_register((void *)addr, len); + +#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) + if (!spdk_env_dpdk_external_init()) { + return; + } +#endif + + /* Prior to DPDK 19.02, we have to worry about DPDK + * 
freeing memory in different units than it was allocated. + * That doesn't work with things like RDMA MRs. So for + * those versions of DPDK, mark each segment so that DPDK + * won't later free it. That ensures we don't have to deal + * with that scenario. + * + * DPDK 19.02 added the --match-allocations RTE flag to + * avoid this condition. + * + * Note: if the user initialized DPDK separately, we can't + * be sure that --match-allocations was specified, so need + * to still mark the segments so they aren't freed. + */ + while (len > 0) { + struct rte_memseg *seg; + + seg = rte_mem_virt2memseg(addr, NULL); + assert(seg != NULL); + seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; + addr = (void *)((uintptr_t)addr + seg->hugepage_sz); + len -= seg->hugepage_sz; + } + } else if (event_type == RTE_MEM_EVENT_FREE) { + spdk_mem_unregister((void *)addr, len); + } +} + +static int +memory_iter_cb(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg) +{ + return spdk_mem_register(ms->addr, len); +} + +int +mem_map_init(bool legacy_mem) +{ + g_legacy_mem = legacy_mem; + + g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL); + if (g_mem_reg_map == NULL) { + DEBUG_PRINT("memory registration map allocation failed\n"); + return -ENOMEM; + } + + /* + * Walk all DPDK memory segments and register them + * with the master memory map + */ + rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL); + rte_memseg_contig_walk(memory_iter_cb, NULL); + return 0; +} + +bool +spdk_iommu_is_enabled(void) +{ +#if VFIO_ENABLED + return g_vfio.enabled && !g_vfio.noiommu_enabled; +#else + return false; +#endif +} + +struct spdk_vtophys_pci_device { + struct rte_pci_device *pci_device; + TAILQ_ENTRY(spdk_vtophys_pci_device) tailq; +}; + +static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices = + TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices); + +static struct spdk_mem_map *g_vtophys_map; +static struct spdk_mem_map *g_phys_ref_map; + +#if VFIO_ENABLED +static int +vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size) +{ + struct spdk_vfio_dma_map *dma_map; + uint64_t refcount; + int ret; + + refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL); + assert(refcount < UINT64_MAX); + if (refcount > 0) { + spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1); + return 0; + } + + dma_map = calloc(1, sizeof(*dma_map)); + if (dma_map == NULL) { + return -ENOMEM; + } + + dma_map->map.argsz = sizeof(dma_map->map); + dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + dma_map->map.vaddr = vaddr; + dma_map->map.iova = iova; + dma_map->map.size = size; + + dma_map->unmap.argsz = sizeof(dma_map->unmap); + dma_map->unmap.flags = 0; + dma_map->unmap.iova = iova; + dma_map->unmap.size = size; + + pthread_mutex_lock(&g_vfio.mutex); + if (g_vfio.device_ref == 0) { + /* VFIO requires at least one device (IOMMU group) to be added to + * a VFIO container before it is possible to perform any IOMMU + * operations on that container. This memory will be mapped once + * the first device (IOMMU group) is hotplugged. + * + * Since the vfio container is managed internally by DPDK, it is + * also possible that some device is already in that container, but + * it's not managed by SPDK - e.g. an NIC attached internally + * inside DPDK. We could map the memory straight away in such + * scenario, but there's no need to do it. 
DPDK devices clearly + * don't need our mappings and hence we defer the mapping + * unconditionally until the first SPDK-managed device is + * hotplugged. + */ + goto out_insert; + } + + ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map); + if (ret) { + DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno); + pthread_mutex_unlock(&g_vfio.mutex); + free(dma_map); + return ret; + } + +out_insert: + TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq); + pthread_mutex_unlock(&g_vfio.mutex); + spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1); + return 0; +} + +static int +vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size) +{ + struct spdk_vfio_dma_map *dma_map; + uint64_t refcount; + int ret; + + pthread_mutex_lock(&g_vfio.mutex); + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + if (dma_map->map.iova == iova) { + break; + } + } + + if (dma_map == NULL) { + DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova); + pthread_mutex_unlock(&g_vfio.mutex); + return -ENXIO; + } + + refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL); + assert(refcount < UINT64_MAX); + if (refcount > 0) { + spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1); + } + + /* We still have outstanding references, don't clear it. */ + if (refcount > 1) { + pthread_mutex_unlock(&g_vfio.mutex); + return 0; + } + + /** don't support partial or multiple-page unmap for now */ + assert(dma_map->map.size == size); + + if (g_vfio.device_ref == 0) { + /* Memory is not mapped anymore, just remove it's references */ + goto out_remove; + } + + + ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap); + if (ret) { + DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno); + pthread_mutex_unlock(&g_vfio.mutex); + return ret; + } + +out_remove: + TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq); + pthread_mutex_unlock(&g_vfio.mutex); + free(dma_map); + return 0; +} +#endif + +static uint64_t +vtophys_get_paddr_memseg(uint64_t vaddr) +{ + uintptr_t paddr; + struct rte_memseg *seg; + + seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL); + if (seg != NULL) { + paddr = seg->phys_addr; + if (paddr == RTE_BAD_IOVA) { + return SPDK_VTOPHYS_ERROR; + } + paddr += (vaddr - (uintptr_t)seg->addr); + return paddr; + } + + return SPDK_VTOPHYS_ERROR; +} + +/* Try to get the paddr from /proc/self/pagemap */ +static uint64_t +vtophys_get_paddr_pagemap(uint64_t vaddr) +{ + uintptr_t paddr; + + /* Silence static analyzers */ + assert(vaddr != 0); + paddr = rte_mem_virt2iova((void *)vaddr); + if (paddr == RTE_BAD_IOVA) { + /* + * The vaddr may be valid but doesn't have a backing page + * assigned yet. Touch the page to ensure a backing page + * gets assigned, then try to translate again. + */ + rte_atomic64_read((rte_atomic64_t *)vaddr); + paddr = rte_mem_virt2iova((void *)vaddr); + } + if (paddr == RTE_BAD_IOVA) { + /* Unable to get to the physical address. 
*/ + return SPDK_VTOPHYS_ERROR; + } + + return paddr; +} + +/* Try to get the paddr from pci devices */ +static uint64_t +vtophys_get_paddr_pci(uint64_t vaddr) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + uintptr_t paddr; + struct rte_pci_device *dev; + struct rte_mem_resource *res; + unsigned r; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) { + dev = vtophys_dev->pci_device; + + for (r = 0; r < PCI_MAX_RESOURCE; r++) { + res = &dev->mem_resource[r]; + if (res->phys_addr && vaddr >= (uint64_t)res->addr && + vaddr < (uint64_t)res->addr + res->len) { + paddr = res->phys_addr + (vaddr - (uint64_t)res->addr); + DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr, + (void *)paddr); + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + return paddr; + } + } + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + + return SPDK_VTOPHYS_ERROR; +} + +static int +vtophys_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t len) +{ + int rc = 0, pci_phys = 0; + uint64_t paddr; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n", + vaddr, len); + return -EINVAL; + } + + /* Get the physical address from the DPDK memsegs */ + paddr = vtophys_get_paddr_memseg((uint64_t)vaddr); + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + if (paddr == SPDK_VTOPHYS_ERROR) { + /* This is not an address that DPDK is managing. */ +#if VFIO_ENABLED + enum rte_iova_mode iova_mode; + +#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0) + iova_mode = rte_eal_iova_mode(); +#else + iova_mode = rte_eal_get_configuration()->iova_mode; +#endif + + if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) { + /* We'll use the virtual address as the iova to match DPDK. */ + paddr = (uint64_t)vaddr; + rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len); + if (rc) { + return -EFAULT; + } + while (len > 0) { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + vaddr += VALUE_2MB; + paddr += VALUE_2MB; + len -= VALUE_2MB; + } + } else +#endif + { + /* Get the physical address from /proc/self/pagemap. */ + paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + /* Get the physical address from PCI devices */ + paddr = vtophys_get_paddr_pci((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + /* The beginning of this address range points to a PCI resource, + * so the rest must point to a PCI resource as well. + */ + pci_phys = 1; + } + + /* Get paddr for each 2MB chunk in this address range */ + while (len > 0) { + /* Get the physical address from /proc/self/pagemap. */ + if (pci_phys) { + paddr = vtophys_get_paddr_pci((uint64_t)vaddr); + } else { + paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); + } + + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + + /* Since PCI paddr can break the 2MiB physical alignment skip this check for that. 
*/ + if (!pci_phys && (paddr & MASK_2MB)) { + DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr); + return -EINVAL; + } +#if VFIO_ENABLED + /* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory + * with the IOMMU using the physical address to match. */ + if (spdk_iommu_is_enabled()) { + rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB); + if (rc) { + DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr); + return -EFAULT; + } + } +#endif + + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + } + } else { + /* This is an address managed by DPDK. Just setup the translations. */ + while (len > 0) { + paddr = vtophys_get_paddr_memseg((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + } + + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: +#if VFIO_ENABLED + if (paddr == SPDK_VTOPHYS_ERROR) { + /* + * This is not an address that DPDK is managing. If vfio is enabled, + * we need to unmap the range from the IOMMU + */ + if (spdk_iommu_is_enabled()) { + uint64_t buffer_len = len; + uint8_t *va = vaddr; + enum rte_iova_mode iova_mode; + +#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0) + iova_mode = rte_eal_iova_mode(); +#else + iova_mode = rte_eal_get_configuration()->iova_mode; +#endif + /* + * In virtual address mode, the region is contiguous and can be done in + * one unmap. + */ + if (iova_mode == RTE_IOVA_VA) { + paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len); + if (buffer_len != len || paddr != (uintptr_t)va) { + DEBUG_PRINT("Unmapping %p with length %lu failed because " + "translation had address 0x%" PRIx64 " and length %lu\n", + va, len, paddr, buffer_len); + return -EINVAL; + } + rc = vtophys_iommu_unmap_dma(paddr, len); + if (rc) { + DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr); + return -EFAULT; + } + } else if (iova_mode == RTE_IOVA_PA) { + /* Get paddr for each 2MB chunk in this address range */ + while (buffer_len > 0) { + paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL); + + if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) { + DEBUG_PRINT("could not get phys addr for %p\n", va); + return -EFAULT; + } + + rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB); + if (rc) { + DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr); + return -EFAULT; + } + + va += VALUE_2MB; + buffer_len -= VALUE_2MB; + } + } + } + } +#endif + while (len > 0) { + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + break; + default: + SPDK_UNREACHABLE(); + } + + return rc; +} + +static int +vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2) +{ + /* This function is always called with paddrs for two subsequent + * 2MB chunks in virtual address space, so those chunks will be only + * physically contiguous if the physical addresses are 2MB apart + * from each other as well. 
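+	 * For illustration (hypothetical values): paddr1 = 0x240000000 and
+	 * paddr2 = 0x240200000 differ by exactly VALUE_2MB (0x200000), so the
+	 * two 2 MB chunks form one physically contiguous region; any other
+	 * difference means they do not.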
+ */ + return (paddr2 - paddr1 == VALUE_2MB); +} + +#if VFIO_ENABLED + +static bool +vfio_enabled(void) +{ + return rte_vfio_is_enabled("vfio_pci"); +} + +/* Check if IOMMU is enabled on the system */ +static bool +has_iommu_groups(void) +{ + struct dirent *d; + int count = 0; + DIR *dir = opendir("/sys/kernel/iommu_groups"); + + if (dir == NULL) { + return false; + } + + while (count < 3 && (d = readdir(dir)) != NULL) { + count++; + } + + closedir(dir); + /* there will always be ./ and ../ entries */ + return count > 2; +} + +static bool +vfio_noiommu_enabled(void) +{ + return rte_vfio_noiommu_is_enabled(); +} + +static void +vtophys_iommu_init(void) +{ + char proc_fd_path[PATH_MAX + 1]; + char link_path[PATH_MAX + 1]; + const char vfio_path[] = "/dev/vfio/vfio"; + DIR *dir; + struct dirent *d; + + if (!vfio_enabled()) { + return; + } + + if (vfio_noiommu_enabled()) { + g_vfio.noiommu_enabled = true; + } else if (!has_iommu_groups()) { + return; + } + + dir = opendir("/proc/self/fd"); + if (!dir) { + DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno); + return; + } + + while ((d = readdir(dir)) != NULL) { + if (d->d_type != DT_LNK) { + continue; + } + + snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name); + if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) { + continue; + } + + if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) { + sscanf(d->d_name, "%d", &g_vfio.fd); + break; + } + } + + closedir(dir); + + if (g_vfio.fd < 0) { + DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n"); + return; + } + + g_vfio.enabled = true; + + return; +} +#endif + +void +vtophys_pci_device_added(struct rte_pci_device *pci_device) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + + vtophys_dev = calloc(1, sizeof(*vtophys_dev)); + if (vtophys_dev) { + vtophys_dev->pci_device = pci_device; + TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq); + } else { + DEBUG_PRINT("Memory allocation error\n"); + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + +#if VFIO_ENABLED + struct spdk_vfio_dma_map *dma_map; + int ret; + + if (!g_vfio.enabled) { + return; + } + + pthread_mutex_lock(&g_vfio.mutex); + g_vfio.device_ref++; + if (g_vfio.device_ref > 1) { + pthread_mutex_unlock(&g_vfio.mutex); + return; + } + + /* This is the first SPDK device using DPDK vfio. This means that the first + * IOMMU group might have been just been added to the DPDK vfio container. + * From this point it is certain that the memory can be mapped now. 
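+	 * The loop below replays each VFIO_IOMMU_MAP_DMA request that
+	 * vtophys_iommu_map_dma() deferred while device_ref was still 0.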
+ */ + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map); + if (ret) { + DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno); + break; + } + } + pthread_mutex_unlock(&g_vfio.mutex); +#endif +} + +void +vtophys_pci_device_removed(struct rte_pci_device *pci_device) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) { + if (vtophys_dev->pci_device == pci_device) { + TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq); + free(vtophys_dev); + break; + } + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + +#if VFIO_ENABLED + struct spdk_vfio_dma_map *dma_map; + int ret; + + if (!g_vfio.enabled) { + return; + } + + pthread_mutex_lock(&g_vfio.mutex); + assert(g_vfio.device_ref > 0); + g_vfio.device_ref--; + if (g_vfio.device_ref > 0) { + pthread_mutex_unlock(&g_vfio.mutex); + return; + } + + /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have + * any additional devices using it's vfio container, all the mappings + * will be automatically removed by the Linux vfio driver. We unmap + * the memory manually to be able to easily re-map it later regardless + * of other, external factors. + */ + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap); + if (ret) { + DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno); + break; + } + } + pthread_mutex_unlock(&g_vfio.mutex); +#endif +} + +int +vtophys_init(void) +{ + const struct spdk_mem_map_ops vtophys_map_ops = { + .notify_cb = vtophys_notify, + .are_contiguous = vtophys_check_contiguous_entries, + }; + + const struct spdk_mem_map_ops phys_ref_map_ops = { + .notify_cb = NULL, + .are_contiguous = NULL, + }; + +#if VFIO_ENABLED + vtophys_iommu_init(); +#endif + + g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL); + if (g_phys_ref_map == NULL) { + DEBUG_PRINT("phys_ref map allocation failed.\n"); + return -ENOMEM; + } + + g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL); + if (g_vtophys_map == NULL) { + DEBUG_PRINT("vtophys map allocation failed\n"); + return -ENOMEM; + } + return 0; +} + +uint64_t +spdk_vtophys(void *buf, uint64_t *size) +{ + uint64_t vaddr, paddr_2mb; + + vaddr = (uint64_t)buf; + paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size); + + /* + * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR, + * we will still bitwise-or it with the buf offset below, but the result will still be + * SPDK_VTOPHYS_ERROR. However now that we do + rather than | (due to PCI vtophys being + * unaligned) we must now check the return value before addition. + */ + SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s"); + if (paddr_2mb == SPDK_VTOPHYS_ERROR) { + return SPDK_VTOPHYS_ERROR; + } else { + return paddr_2mb + (vaddr & MASK_2MB); + } +} diff --git a/src/spdk/lib/env_dpdk/pci.c b/src/spdk/lib/env_dpdk/pci.c new file mode 100644 index 000000000..5fd1b4abd --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci.c @@ -0,0 +1,1063 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include <rte_alarm.h> +#include <rte_devargs.h> +#include "spdk/env.h" + +#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers" + +#define PCI_CFG_SIZE 256 +#define PCI_EXT_CAP_ID_SN 0x03 + +/* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time + * might cause the internal IPC to misbehave. Just retry in such case. + */ +#define DPDK_HOTPLUG_RETRY_COUNT 4 + +/* DPDK alarm/interrupt thread */ +static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices); +/* devices hotplugged on a dpdk thread */ +static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices = + TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices); +static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers); + +static int +map_bar_rte(struct spdk_pci_device *device, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + struct rte_pci_device *dev = device->dev_handle; + + *mapped_addr = dev->mem_resource[bar].addr; + *phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr; + *size = (uint64_t)dev->mem_resource[bar].len; + + return 0; +} + +static int +unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr) +{ + return 0; +} + +static int +cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + + rc = rte_pci_read_config(dev->dev_handle, value, len, offset); + + return (rc > 0 && (uint32_t) rc == len) ? 0 : -1; +} + +static int +cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + + rc = rte_pci_write_config(dev->dev_handle, value, len, offset); + +#ifdef __FreeBSD__ + /* DPDK returns 0 on success and -1 on failure */ + return rc; +#endif + return (rc > 0 && (uint32_t) rc == len) ? 
0 : -1; +} + +static void +remove_rte_dev(struct rte_pci_device *rte_dev) +{ + char bdf[32]; + int i = 0, rc; + + snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name); + do { + rc = rte_eal_hotplug_remove("pci", bdf); + } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT); +} + +static void +detach_rte_cb(void *_dev) +{ + remove_rte_dev(_dev); +} + +static void +detach_rte(struct spdk_pci_device *dev) +{ + struct rte_pci_device *rte_dev = dev->dev_handle; + int i; + bool removed; + + if (!spdk_process_is_primary()) { + remove_rte_dev(rte_dev); + return; + } + + pthread_mutex_lock(&g_pci_mutex); + dev->internal.attached = false; + /* prevent the hotremove notification from removing this device */ + dev->internal.pending_removal = true; + pthread_mutex_unlock(&g_pci_mutex); + + rte_eal_alarm_set(1, detach_rte_cb, rte_dev); + + /* wait up to 2s for the cb to execute */ + for (i = 2000; i > 0; i--) { + + spdk_delay_us(1000); + pthread_mutex_lock(&g_pci_mutex); + removed = dev->internal.removed; + pthread_mutex_unlock(&g_pci_mutex); + + if (removed) { + break; + } + } + + /* besides checking the removed flag, we also need to wait + * for the dpdk detach function to unwind, as it's doing some + * operations even after calling our detach callback. Simply + * cancel the alarm - if it started executing already, this + * call will block and wait for it to finish. + */ + rte_eal_alarm_cancel(detach_rte_cb, rte_dev); + + /* the device could have been finally removed, so just check + * it again. + */ + pthread_mutex_lock(&g_pci_mutex); + removed = dev->internal.removed; + pthread_mutex_unlock(&g_pci_mutex); + if (!removed) { + fprintf(stderr, "Timeout waiting for DPDK to remove PCI device %s.\n", + rte_dev->name); + /* If we reach this state, then the device couldn't be removed and most likely + a subsequent hot add of a device in the same BDF will fail */ + } +} + +void +spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags) +{ + struct spdk_pci_driver *driver; + + driver = calloc(1, sizeof(*driver)); + if (!driver) { + /* we can't do any better than bailing atm */ + return; + } + + driver->name = name; + driver->id_table = id_table; + driver->drv_flags = flags; + TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq); +} + +struct spdk_pci_driver * +spdk_pci_nvme_get_driver(void) +{ + return spdk_pci_get_driver("nvme"); +} + +struct spdk_pci_driver * +spdk_pci_get_driver(const char *name) +{ + struct spdk_pci_driver *driver; + + TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { + if (strcmp(driver->name, name) == 0) { + return driver; + } + } + + return NULL; +} + +static void +pci_device_rte_hotremove(const char *device_name, + enum rte_dev_event_type event, + void *cb_arg) +{ + struct spdk_pci_device *dev; + bool can_detach = false; + + if (event != RTE_DEV_EVENT_REMOVE) { + return; + } + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + struct rte_pci_device *rte_dev = dev->dev_handle; + + if (strcmp(rte_dev->name, device_name) == 0 && + !dev->internal.pending_removal) { + can_detach = !dev->internal.attached; + /* prevent any further attaches */ + dev->internal.pending_removal = true; + break; + } + } + pthread_mutex_unlock(&g_pci_mutex); + + if (dev != NULL && can_detach) { + /* if device is not attached we can remove it right away. + * Otherwise it will be removed at detach. 
+ */ + remove_rte_dev(dev->dev_handle); + } +} + +static void +cleanup_pci_devices(void) +{ + struct spdk_pci_device *dev, *tmp; + + pthread_mutex_lock(&g_pci_mutex); + /* cleanup removed devices */ + TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) { + if (!dev->internal.removed) { + continue; + } + + vtophys_pci_device_removed(dev->dev_handle); + TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq); + free(dev); + } + + /* add newly-attached devices */ + TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) { + TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq); + TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq); + vtophys_pci_device_added(dev->dev_handle); + } + pthread_mutex_unlock(&g_pci_mutex); +} + +static int scan_pci_bus(bool delay_init); + +/* translate spdk_pci_driver to an rte_pci_driver and register it to dpdk */ +static int +register_rte_driver(struct spdk_pci_driver *driver) +{ + unsigned pci_id_count = 0; + struct rte_pci_id *rte_id_table; + char *rte_name; + size_t rte_name_len; + uint32_t rte_flags; + + assert(driver->id_table); + while (driver->id_table[pci_id_count].vendor_id) { + pci_id_count++; + } + assert(pci_id_count > 0); + + rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table)); + if (!rte_id_table) { + return -ENOMEM; + } + + while (pci_id_count > 0) { + struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1]; + const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1]; + + rte_id->class_id = spdk_id->class_id; + rte_id->vendor_id = spdk_id->vendor_id; + rte_id->device_id = spdk_id->device_id; + rte_id->subsystem_vendor_id = spdk_id->subvendor_id; + rte_id->subsystem_device_id = spdk_id->subdevice_id; + pci_id_count--; + } + + assert(driver->name); + rte_name_len = strlen(driver->name) + strlen("spdk_") + 1; + rte_name = calloc(rte_name_len, 1); + if (!rte_name) { + free(rte_id_table); + return -ENOMEM; + } + + snprintf(rte_name, rte_name_len, "spdk_%s", driver->name); + driver->driver.driver.name = rte_name; + driver->driver.id_table = rte_id_table; + + rte_flags = 0; + if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) { + rte_flags |= RTE_PCI_DRV_NEED_MAPPING; + } + if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) { + rte_flags |= RTE_PCI_DRV_WC_ACTIVATE; + } + driver->driver.drv_flags = rte_flags; + + driver->driver.probe = pci_device_init; + driver->driver.remove = pci_device_fini; + + rte_pci_register(&driver->driver); + return 0; +} + +static inline void +_pci_env_init(void) +{ + /* We assume devices were present on the bus for more than 2 seconds + * before initializing SPDK and there's no need to wait more. We scan + * the bus, but we don't blacklist any devices. + */ + scan_pci_bus(false); + + /* Register a single hotremove callback for all devices. */ + if (spdk_process_is_primary()) { + rte_dev_event_callback_register(NULL, pci_device_rte_hotremove, NULL); + } +} + +void +pci_env_init(void) +{ + struct spdk_pci_driver *driver; + + TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { + register_rte_driver(driver); + } + + _pci_env_init(); +} + +void +pci_env_reinit(void) +{ + /* There is no need to register pci drivers again, since they were + * already pre-registered in pci_env_init. 
+ */ + + _pci_env_init(); +} + +void +pci_env_fini(void) +{ + struct spdk_pci_device *dev; + char bdf[32]; + + cleanup_pci_devices(); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (dev->internal.attached) { + spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr); + fprintf(stderr, "Device %s is still attached at shutdown!\n", bdf); + } + } + + if (spdk_process_is_primary()) { + rte_dev_event_callback_unregister(NULL, pci_device_rte_hotremove, NULL); + } +} + +int +pci_device_init(struct rte_pci_driver *_drv, + struct rte_pci_device *_dev) +{ + struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv; + struct spdk_pci_device *dev; + int rc; + + dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + return -1; + } + + dev->dev_handle = _dev; + + dev->addr.domain = _dev->addr.domain; + dev->addr.bus = _dev->addr.bus; + dev->addr.dev = _dev->addr.devid; + dev->addr.func = _dev->addr.function; + dev->id.class_id = _dev->id.class_id; + dev->id.vendor_id = _dev->id.vendor_id; + dev->id.device_id = _dev->id.device_id; + dev->id.subvendor_id = _dev->id.subsystem_vendor_id; + dev->id.subdevice_id = _dev->id.subsystem_device_id; + dev->socket_id = _dev->device.numa_node; + dev->type = "pci"; + + dev->map_bar = map_bar_rte; + dev->unmap_bar = unmap_bar_rte; + dev->cfg_read = cfg_read_rte; + dev->cfg_write = cfg_write_rte; + + dev->internal.driver = driver; + dev->internal.claim_fd = -1; + + if (driver->cb_fn != NULL) { + rc = driver->cb_fn(driver->cb_arg, dev); + if (rc != 0) { + free(dev); + return rc; + } + dev->internal.attached = true; + } + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq); + pthread_mutex_unlock(&g_pci_mutex); + return 0; +} + +int +pci_device_fini(struct rte_pci_device *_dev) +{ + struct spdk_pci_device *dev; + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (dev->dev_handle == _dev) { + break; + } + } + + if (dev == NULL || dev->internal.attached) { + /* The device might be still referenced somewhere in SPDK. */ + pthread_mutex_unlock(&g_pci_mutex); + return -1; + } + + /* remove our whitelist_at option */ + if (_dev->device.devargs) { + _dev->device.devargs->data = NULL; + } + + assert(!dev->internal.removed); + dev->internal.removed = true; + pthread_mutex_unlock(&g_pci_mutex); + return 0; + +} + +void +spdk_pci_device_detach(struct spdk_pci_device *dev) +{ + assert(dev->internal.attached); + + if (dev->internal.claim_fd >= 0) { + spdk_pci_device_unclaim(dev); + } + + if (strcmp(dev->type, "pci") == 0) { + /* if it's a physical device we need to deal with DPDK on + * a different process and we can't just unset one flag + * here. We also want to stop using any device resources + * so that the device isn't "in use" by the userspace driver + * once we detach it. This would allow attaching the device + * to a different process, or to a kernel driver like nvme. 
+ */ + detach_rte(dev); + } else { + dev->internal.attached = false; + } + + cleanup_pci_devices(); +} + +static int +scan_pci_bus(bool delay_init) +{ + struct spdk_pci_driver *driver; + struct rte_pci_device *rte_dev; + uint64_t now; + + rte_bus_scan(); + now = spdk_get_ticks(); + + driver = TAILQ_FIRST(&g_pci_drivers); + if (!driver) { + return 0; + } + + TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) { + struct rte_devargs *da; + + da = rte_dev->device.devargs; + if (!da) { + char devargs_str[128]; + + /* the device was never blacklisted or whitelisted */ + da = calloc(1, sizeof(*da)); + if (!da) { + return -1; + } + + snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name); + if (rte_devargs_parse(da, devargs_str) != 0) { + free(da); + return -1; + } + + rte_devargs_insert(&da); + rte_dev->device.devargs = da; + } + + if (da->data) { + uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data; + + /* this device was seen by spdk before... */ + if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) { + da->policy = RTE_DEV_WHITELISTED; + } + } else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST && + da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) { + /* override the policy only if not permanently blacklisted */ + + if (delay_init) { + da->policy = RTE_DEV_BLACKLISTED; + da->data = (void *)(now + 2 * spdk_get_ticks_hz()); + } else { + da->policy = RTE_DEV_WHITELISTED; + da->data = (void *)(uintptr_t)now; + } + } + } + + return 0; +} + +int +spdk_pci_device_attach(struct spdk_pci_driver *driver, + spdk_pci_enum_cb enum_cb, + void *enum_ctx, struct spdk_pci_addr *pci_address) +{ + struct spdk_pci_device *dev; + struct rte_pci_device *rte_dev; + struct rte_devargs *da; + int rc; + char bdf[32]; + + spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address); + + cleanup_pci_devices(); + + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { + break; + } + } + + if (dev != NULL && dev->internal.driver == driver) { + pthread_mutex_lock(&g_pci_mutex); + if (dev->internal.attached || dev->internal.pending_removal) { + pthread_mutex_unlock(&g_pci_mutex); + return -1; + } + + rc = enum_cb(enum_ctx, dev); + if (rc == 0) { + dev->internal.attached = true; + } + pthread_mutex_unlock(&g_pci_mutex); + return rc; + } + + driver->cb_fn = enum_cb; + driver->cb_arg = enum_ctx; + + int i = 0; + + do { + rc = rte_eal_hotplug_add("pci", bdf, ""); + } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT); + + if (i > 1 && rc == -EEXIST) { + /* Even though the previous request timed out, the device + * was attached successfully. + */ + rc = 0; + } + + driver->cb_arg = NULL; + driver->cb_fn = NULL; + + cleanup_pci_devices(); + + if (rc != 0) { + return -1; + } + + /* explicit attach ignores the whitelist, so if we blacklisted this + * device before let's enable it now - just for clarity. + */ + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { + break; + } + } + assert(dev != NULL); + + rte_dev = dev->dev_handle; + da = rte_dev->device.devargs; + if (da && da->data) { + da->data = (void *)(uintptr_t)spdk_get_ticks(); + da->policy = RTE_DEV_WHITELISTED; + } + + return 0; +} + +/* Note: You can call spdk_pci_enumerate from more than one thread + * simultaneously safely, but you cannot call spdk_pci_enumerate + * and rte_eal_pci_probe simultaneously. 
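+ *
+ * A minimal usage sketch (illustrative only; attach_cb and its printf are
+ * hypothetical, the SPDK calls are the ones defined in this file):
+ *
+ *	static int
+ *	attach_cb(void *ctx, struct spdk_pci_device *dev)
+ *	{
+ *		struct spdk_pci_addr addr = spdk_pci_device_get_addr(dev);
+ *		char bdf[32];
+ *
+ *		spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr);
+ *		printf("probed %s\n", bdf);
+ *		return 0;	/* 0 = attach, > 0 = skip, < 0 = abort enumeration */
+ *	}
+ *
+ *	...
+ *	spdk_pci_enumerate(spdk_pci_nvme_get_driver(), attach_cb, NULL);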
+ */ +int +spdk_pci_enumerate(struct spdk_pci_driver *driver, + spdk_pci_enum_cb enum_cb, + void *enum_ctx) +{ + struct spdk_pci_device *dev; + int rc; + + cleanup_pci_devices(); + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (dev->internal.attached || + dev->internal.driver != driver || + dev->internal.pending_removal) { + continue; + } + + rc = enum_cb(enum_ctx, dev); + if (rc == 0) { + dev->internal.attached = true; + } else if (rc < 0) { + pthread_mutex_unlock(&g_pci_mutex); + return -1; + } + } + pthread_mutex_unlock(&g_pci_mutex); + + if (scan_pci_bus(true) != 0) { + return -1; + } + + driver->cb_fn = enum_cb; + driver->cb_arg = enum_ctx; + + if (rte_bus_probe() != 0) { + driver->cb_arg = NULL; + driver->cb_fn = NULL; + return -1; + } + + driver->cb_arg = NULL; + driver->cb_fn = NULL; + + cleanup_pci_devices(); + return 0; +} + +struct spdk_pci_device * +spdk_pci_get_first_device(void) +{ + return TAILQ_FIRST(&g_pci_devices); +} + +struct spdk_pci_device * +spdk_pci_get_next_device(struct spdk_pci_device *prev) +{ + return TAILQ_NEXT(prev, internal.tailq); +} + +int +spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + return dev->map_bar(dev, bar, mapped_addr, phys_addr, size); +} + +int +spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr) +{ + return dev->unmap_bar(dev, bar, addr); +} + +uint32_t +spdk_pci_device_get_domain(struct spdk_pci_device *dev) +{ + return dev->addr.domain; +} + +uint8_t +spdk_pci_device_get_bus(struct spdk_pci_device *dev) +{ + return dev->addr.bus; +} + +uint8_t +spdk_pci_device_get_dev(struct spdk_pci_device *dev) +{ + return dev->addr.dev; +} + +uint8_t +spdk_pci_device_get_func(struct spdk_pci_device *dev) +{ + return dev->addr.func; +} + +uint16_t +spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev) +{ + return dev->id.vendor_id; +} + +uint16_t +spdk_pci_device_get_device_id(struct spdk_pci_device *dev) +{ + return dev->id.device_id; +} + +uint16_t +spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev) +{ + return dev->id.subvendor_id; +} + +uint16_t +spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev) +{ + return dev->id.subdevice_id; +} + +struct spdk_pci_id +spdk_pci_device_get_id(struct spdk_pci_device *dev) +{ + return dev->id; +} + +int +spdk_pci_device_get_socket_id(struct spdk_pci_device *dev) +{ + return dev->socket_id; +} + +int +spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + return dev->cfg_read(dev, value, len, offset); +} + +int +spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + return dev->cfg_write(dev, value, len, offset); +} + +int +spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 1, offset); +} + +int +spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 1, offset); +} + +int +spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 2, offset); +} + +int +spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 2, offset); +} + +int +spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, 
uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 4, offset); +} + +int +spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 4, offset); +} + +int +spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len) +{ + int err; + uint32_t pos, header = 0; + uint32_t i, buf[2]; + + if (len < 17) { + return -1; + } + + err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE); + if (err || !header) { + return -1; + } + + pos = PCI_CFG_SIZE; + while (1) { + if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) { + if (pos) { + /* skip the header */ + pos += 4; + for (i = 0; i < 2; i++) { + err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i); + if (err) { + return -1; + } + } + snprintf(sn, len, "%08x%08x", buf[1], buf[0]); + return 0; + } + } + pos = (header >> 20) & 0xffc; + /* 0 if no other items exist */ + if (pos < PCI_CFG_SIZE) { + return -1; + } + err = spdk_pci_device_cfg_read32(dev, &header, pos); + if (err) { + return -1; + } + } + return -1; +} + +struct spdk_pci_addr +spdk_pci_device_get_addr(struct spdk_pci_device *dev) +{ + return dev->addr; +} + +bool +spdk_pci_device_is_removed(struct spdk_pci_device *dev) +{ + return dev->internal.pending_removal; +} + +int +spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2) +{ + if (a1->domain > a2->domain) { + return 1; + } else if (a1->domain < a2->domain) { + return -1; + } else if (a1->bus > a2->bus) { + return 1; + } else if (a1->bus < a2->bus) { + return -1; + } else if (a1->dev > a2->dev) { + return 1; + } else if (a1->dev < a2->dev) { + return -1; + } else if (a1->func > a2->func) { + return 1; + } else if (a1->func < a2->func) { + return -1; + } + + return 0; +} + +#ifdef __linux__ +int +spdk_pci_device_claim(struct spdk_pci_device *dev) +{ + int dev_fd; + char dev_name[64]; + int pid; + void *dev_map; + struct flock pcidev_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + + snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", + dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func); + + dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (dev_fd == -1) { + fprintf(stderr, "could not open %s\n", dev_name); + return -errno; + } + + if (ftruncate(dev_fd, sizeof(int)) != 0) { + fprintf(stderr, "could not truncate %s\n", dev_name); + close(dev_fd); + return -errno; + } + + dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED, dev_fd, 0); + if (dev_map == MAP_FAILED) { + fprintf(stderr, "could not mmap dev %s (%d)\n", dev_name, errno); + close(dev_fd); + return -errno; + } + + if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) { + pid = *(int *)dev_map; + fprintf(stderr, "Cannot create lock on device %s, probably" + " process %d has claimed it\n", dev_name, pid); + munmap(dev_map, sizeof(int)); + close(dev_fd); + /* F_SETLK returns unspecified errnos, normalize them */ + return -EACCES; + } + + *(int *)dev_map = (int)getpid(); + munmap(dev_map, sizeof(int)); + dev->internal.claim_fd = dev_fd; + /* Keep dev_fd open to maintain the lock. 
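+	 * A second process calling spdk_pci_device_claim() on the same BDF will
+	 * fail the F_SETLK above with -EACCES for as long as this lock is held;
+	 * spdk_pci_device_unclaim() releases it and unlinks the
+	 * /tmp/spdk_pci_lock_* file.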
*/ + return 0; +} + +void +spdk_pci_device_unclaim(struct spdk_pci_device *dev) +{ + char dev_name[64]; + + snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", + dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func); + + close(dev->internal.claim_fd); + dev->internal.claim_fd = -1; + unlink(dev_name); +} +#endif /* __linux__ */ + +#ifdef __FreeBSD__ +int +spdk_pci_device_claim(struct spdk_pci_device *dev) +{ + /* TODO */ + return 0; +} + +void +spdk_pci_device_unclaim(struct spdk_pci_device *dev) +{ + /* TODO */ +} +#endif /* __FreeBSD__ */ + +int +spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf) +{ + unsigned domain, bus, dev, func; + + if (addr == NULL || bdf == NULL) { + return -EINVAL; + } + + if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) || + (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) { + /* Matched a full address - all variables are initialized */ + } else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) { + func = 0; + } else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) || + (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) { + domain = 0; + } else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) || + (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) { + domain = 0; + func = 0; + } else { + return -EINVAL; + } + + if (bus > 0xFF || dev > 0x1F || func > 7) { + return -EINVAL; + } + + addr->domain = domain; + addr->bus = bus; + addr->dev = dev; + addr->func = func; + + return 0; +} + +int +spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr) +{ + int rc; + + rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x", + addr->domain, addr->bus, + addr->dev, addr->func); + + if (rc > 0 && (size_t)rc < sz) { + return 0; + } + + return -1; +} + +void +spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev) +{ + assert(dev->map_bar != NULL); + assert(dev->unmap_bar != NULL); + assert(dev->cfg_read != NULL); + assert(dev->cfg_write != NULL); + dev->internal.driver = drv; + TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq); +} + +void +spdk_pci_unhook_device(struct spdk_pci_device *dev) +{ + assert(!dev->internal.attached); + TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq); +} + +const char * +spdk_pci_device_get_type(const struct spdk_pci_device *dev) +{ + return dev->type; +} diff --git a/src/spdk/lib/env_dpdk/pci_idxd.c b/src/spdk/lib/env_dpdk/pci_idxd.c new file mode 100644 index 000000000..eddbfa4af --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_idxd.c @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +#define SPDK_IDXD_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct spdk_pci_id idxd_driver_id[] = { + {SPDK_IDXD_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IDXD)}, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_idxd_get_driver(void) +{ + return spdk_pci_get_driver("idxd"); +} + +SPDK_PCI_DRIVER_REGISTER("idxd", idxd_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING); diff --git a/src/spdk/lib/env_dpdk/pci_ioat.c b/src/spdk/lib/env_dpdk/pci_ioat.c new file mode 100644 index 000000000..28b7bdb44 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_ioat.c @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct spdk_pci_id ioat_driver_id[] = { + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SKX)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_ICX)}, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_ioat_get_driver(void) +{ + return spdk_pci_get_driver("ioat"); +} + +SPDK_PCI_DRIVER_REGISTER("ioat", ioat_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING); diff --git a/src/spdk/lib/env_dpdk/pci_virtio.c b/src/spdk/lib/env_dpdk/pci_virtio.c new file mode 100644 index 000000000..e525a4a8e --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_virtio.c @@ -0,0 +1,53 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct spdk_pci_id virtio_pci_driver_id[] = { + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_LEGACY) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_LEGACY) }, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_virtio_get_driver(void) +{ + return spdk_pci_get_driver("virtio"); +} + +SPDK_PCI_DRIVER_REGISTER("virtio", virtio_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); diff --git a/src/spdk/lib/env_dpdk/pci_vmd.c b/src/spdk/lib/env_dpdk/pci_vmd.c new file mode 100644 index 000000000..fb6860873 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_vmd.c @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct spdk_pci_id vmd_pci_driver_id[] = { + { SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, PCI_DEVICE_ID_INTEL_VMD) }, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_vmd_get_driver(void) +{ + return spdk_pci_get_driver("vmd"); +} + +SPDK_PCI_DRIVER_REGISTER("vmd", vmd_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); diff --git a/src/spdk/lib/env_dpdk/spdk_env_dpdk.map b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map new file mode 100644 index 000000000..a465f0938 --- /dev/null +++ b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map @@ -0,0 +1,114 @@ +{ + global: + + # Public functions in env.h + spdk_malloc; + spdk_zmalloc; + spdk_realloc; + spdk_free; + spdk_env_opts_init; + spdk_env_init; + spdk_env_fini; + spdk_dma_malloc; + spdk_dma_malloc_socket; + spdk_dma_zmalloc; + spdk_dma_zmalloc_socket; + spdk_dma_realloc; + spdk_dma_free; + spdk_memzone_reserve; + spdk_memzone_reserve_aligned; + spdk_memzone_lookup; + spdk_memzone_free; + spdk_memzone_dump; + spdk_mempool_create; + spdk_mempool_create_ctor; + spdk_mempool_get_name; + spdk_mempool_free; + spdk_mempool_get; + spdk_mempool_get_bulk; + spdk_mempool_put; + spdk_mempool_put_bulk; + spdk_mempool_count; + spdk_mempool_obj_iter; + spdk_mempool_lookup; + spdk_env_get_core_count; + spdk_env_get_current_core; + spdk_env_get_first_core; + spdk_env_get_last_core; + spdk_env_get_next_core; + spdk_env_get_socket_id; + spdk_env_thread_launch_pinned; + spdk_env_thread_wait_all; + spdk_process_is_primary; + spdk_get_ticks; + spdk_get_ticks_hz; + spdk_delay_us; + spdk_pause; + spdk_ring_create; + spdk_ring_free; + spdk_ring_count; + spdk_ring_enqueue; + spdk_ring_dequeue; + spdk_iommu_is_enabled; + spdk_vtophys; + spdk_pci_get_driver; + spdk_pci_driver_register; + spdk_pci_nvme_get_driver; + spdk_pci_vmd_get_driver; + spdk_pci_idxd_get_driver; + spdk_pci_ioat_get_driver; + spdk_pci_virtio_get_driver; + spdk_pci_enumerate; + spdk_pci_get_first_device; + spdk_pci_get_next_device; + spdk_pci_device_map_bar; + spdk_pci_device_unmap_bar; + spdk_pci_device_get_domain; + spdk_pci_device_get_bus; + spdk_pci_device_get_dev; + spdk_pci_device_get_func; + spdk_pci_device_get_addr; + spdk_pci_device_get_vendor_id; + spdk_pci_device_get_device_id; + spdk_pci_device_get_subvendor_id; + spdk_pci_device_get_subdevice_id; + spdk_pci_device_get_id; + spdk_pci_device_get_socket_id; + spdk_pci_device_get_serial_number; + spdk_pci_device_claim; + spdk_pci_device_unclaim; + spdk_pci_device_detach; + spdk_pci_device_attach; + spdk_pci_device_cfg_read; + spdk_pci_device_cfg_write; + spdk_pci_device_cfg_read8; + spdk_pci_device_cfg_write8; + spdk_pci_device_cfg_read16; + spdk_pci_device_cfg_write16; + spdk_pci_device_cfg_read32; + spdk_pci_device_cfg_write32; + spdk_pci_device_is_removed; + spdk_pci_addr_compare; + spdk_pci_addr_parse; + spdk_pci_addr_fmt; + spdk_pci_hook_device; + spdk_pci_unhook_device; + spdk_pci_device_get_type; 
+ spdk_unaffinitize_thread; + spdk_call_unaffinitized; + spdk_mem_map_alloc; + spdk_mem_map_free; + spdk_mem_map_set_translation; + spdk_mem_map_clear_translation; + spdk_mem_map_translate; + spdk_mem_register; + spdk_mem_unregister; + + # Public functions in env_dpdk.h + spdk_env_dpdk_post_init; + spdk_env_dpdk_post_fini; + spdk_env_dpdk_external_init; + spdk_env_dpdk_dump_mem_stats; + + local: *; +}; diff --git a/src/spdk/lib/env_dpdk/threads.c b/src/spdk/lib/env_dpdk/threads.c new file mode 100644 index 000000000..01c7b8d9f --- /dev/null +++ b/src/spdk/lib/env_dpdk/threads.c @@ -0,0 +1,108 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_lcore.h> + +uint32_t +spdk_env_get_core_count(void) +{ + return rte_lcore_count(); +} + +uint32_t +spdk_env_get_current_core(void) +{ + return rte_lcore_id(); +} + +uint32_t +spdk_env_get_first_core(void) +{ + return rte_get_next_lcore(-1, 0, 0); +} + +uint32_t +spdk_env_get_last_core(void) +{ + uint32_t i; + uint32_t last_core = UINT32_MAX; + + SPDK_ENV_FOREACH_CORE(i) { + last_core = i; + } + + assert(last_core != UINT32_MAX); + + return last_core; +} + +uint32_t +spdk_env_get_next_core(uint32_t prev_core) +{ + unsigned lcore; + + lcore = rte_get_next_lcore(prev_core, 0, 0); + if (lcore == RTE_MAX_LCORE) { + return UINT32_MAX; + } + return lcore; +} + +uint32_t +spdk_env_get_socket_id(uint32_t core) +{ + if (core >= RTE_MAX_LCORE) { + return SPDK_ENV_SOCKET_ID_ANY; + } + + return rte_lcore_to_socket_id(core); +} + +int +spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg) +{ + int rc; + + rc = rte_eal_remote_launch(fn, arg, core); + + return rc; +} + +void +spdk_env_thread_wait_all(void) +{ + rte_eal_mp_wait_lcore(); +} diff --git a/src/spdk/lib/env_ocf/.gitignore b/src/spdk/lib/env_ocf/.gitignore new file mode 100644 index 000000000..f5452c248 --- /dev/null +++ b/src/spdk/lib/env_ocf/.gitignore @@ -0,0 +1,2 @@ +src/ +include/ diff --git a/src/spdk/lib/env_ocf/Makefile b/src/spdk/lib/env_ocf/Makefile new file mode 100644 index 000000000..0ac51eecd --- /dev/null +++ b/src/spdk/lib/env_ocf/Makefile @@ -0,0 +1,108 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# OCF requires users to build with their sources +# If SPDK is configured with OCF source directory, +# we export its files and then compile SPDK LIB with them +# Else if SPDK is configured with OCF precompiled library +# we just use it as SPDK lib by copying it to /build/lib/ + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+OCFDIR=$(CONFIG_OCF_DIR) + +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +LIBNAME := ocfenv + +CFLAGS += $(ENV_CFLAGS) -I$(CURDIR) -I$(CURDIR)/include -w +C_SRCS = $(shell find -name \*.c) + +LIB = $(call spdk_lib_list_to_static_libs,$(LIBNAME)) + + +ifeq ($(CONFIG_CUSTOMOCF),y) + +.PHONY: all clean install + +all: + $(Q)$(MAKE) $(LIB) + +clean: + $(Q)rm -f $(LIB) + +$(LIB): + cp $(CONFIG_OCF_PATH) $(LIB) + +install: + +uninstall: + $(UNINSTALL_LIB) + +else + +.PHONY: all clean install ocf_inc ocf_src ocf_distclean all exportlib + +all: ocf_inc ocf_src + $(Q)$(MAKE) $(LIB) + +ocf_inc: + $(Q)$(MAKE) -C "$(CONFIG_OCF_PATH)" inc O="$(SPDK_ROOT_DIR)/lib/env_ocf/" ENV= --quiet + +ocf_src: ocf_inc + $(Q)$(MAKE) -C "$(CONFIG_OCF_PATH)" src O="$(SPDK_ROOT_DIR)/lib/env_ocf/" CMD=cp ENV= --quiet + +ocf_distclean: + $(Q)$(MAKE) -C "$(CONFIG_OCF_PATH)" distclean O="$(SPDK_ROOT_DIR)/lib/env_ocf/" ENV= --quiet + +clean: ocf_distclean + $(Q)rm -rf "$(SPDK_ROOT_DIR)/lib/env_ocf/include" \ + "$(SPDK_ROOT_DIR)/lib/env_ocf/src" \ + $(LIB) $(OBJS); + +$(LIB): $(OBJS) + $(LIB_C) + +install: + +uninstall: + $(UNINSTALL_LIB) + +endif + +exportlib: all + @ if [ -z $(O) ]; then echo "No output specified"; exit 1; fi + cp $(LIB) $(O) + +help: + @ echo "all Default" + @ echo "exportlib O=<outpath> Default build to specified outpath" diff --git a/src/spdk/lib/env_ocf/ocf_env.c b/src/spdk/lib/env_ocf/ocf_env.c new file mode 100644 index 000000000..ab5445203 --- /dev/null +++ b/src/spdk/lib/env_ocf/ocf_env.c @@ -0,0 +1,176 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "ocf/ocf_def.h" +#include "ocf_env.h" + +#include "spdk/crc32.h" +#include "spdk/env.h" +#include "spdk_internal/log.h" + +/* Number of buffers for mempool + * Need to be power of two - 1 for better memory utilization + * It depends on memory usage of OCF which + * in itself depends on the workload + * It is a big number because OCF uses allocators + * for every request it sends and recieves + */ +#define ENV_ALLOCATOR_NBUFS 32767 + +/* Use unique index for env allocators */ +static env_atomic g_env_allocator_index = 0; + +void * +env_allocator_new(env_allocator *allocator) +{ + void *mem = spdk_mempool_get(allocator->mempool); + + if (spdk_likely(mem)) { + memset(mem, 0, allocator->element_size); + } + + return mem; +} + +env_allocator * +env_allocator_create(uint32_t size, const char *name) +{ + env_allocator *allocator; + char qualified_name[128] = {0}; + + snprintf(qualified_name, 128, "ocf_env_%d", env_atomic_inc_return(&g_env_allocator_index)); + + allocator = calloc(1, sizeof(*allocator)); + if (!allocator) { + return NULL; + } + + allocator->mempool = spdk_mempool_create(qualified_name, + ENV_ALLOCATOR_NBUFS, size, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + + if (!allocator->mempool) { + free(allocator); + return NULL; + } + + allocator->element_size = size; + + return allocator; +} + +void +env_allocator_del(env_allocator *allocator, void *item) +{ + spdk_mempool_put(allocator->mempool, item); +} + +void +env_allocator_destroy(env_allocator *allocator) +{ + if (allocator) { + if (ENV_ALLOCATOR_NBUFS - spdk_mempool_count(allocator->mempool)) { + SPDK_ERRLOG("Not all objects deallocated\n"); + assert(false); + } + + spdk_mempool_free(allocator->mempool); + free(allocator); + } +} +/* *** CRC *** */ + +uint32_t +env_crc32(uint32_t crc, uint8_t const *message, size_t len) +{ + return spdk_crc32_ieee_update(message, len, crc); +} + +/* EXECUTION CONTEXTS */ +pthread_mutex_t *exec_context_mutex; + +static void __attribute__((constructor)) init_execution_context(void) +{ + unsigned count = env_get_execution_context_count(); + unsigned i; + + ENV_BUG_ON(count == 0); + exec_context_mutex = malloc(count * sizeof(exec_context_mutex[0])); + ENV_BUG_ON(exec_context_mutex == NULL); + for (i = 0; i < count; i++) { + ENV_BUG_ON(pthread_mutex_init(&exec_context_mutex[i], NULL)); + } +} + +static void __attribute__((destructor)) deinit_execution_context(void) +{ + unsigned count = env_get_execution_context_count(); + unsigned i; + + ENV_BUG_ON(count == 0); + ENV_BUG_ON(exec_context_mutex == NULL); + + for (i = 0; i < count; i++) { + ENV_BUG_ON(pthread_mutex_destroy(&exec_context_mutex[i])); + } + free(exec_context_mutex); +} + +/* get_execuction_context must assure that after the call finishes, the caller + * will not get preempted from current execution context. For userspace env + * we simulate this behavior by acquiring per execution context mutex. As a + * result the caller might actually get preempted, but no other thread will + * execute in this context by the time the caller puts current execution ctx. */ +unsigned env_get_execution_context(void) +{ + unsigned cpu; + + cpu = sched_getcpu(); + cpu = (cpu == -1) ? 0 : cpu; + + ENV_BUG_ON(pthread_mutex_lock(&exec_context_mutex[cpu])); + + return cpu; +} + +void env_put_execution_context(unsigned ctx) +{ + pthread_mutex_unlock(&exec_context_mutex[ctx]); +} + +unsigned env_get_execution_context_count(void) +{ + int num = sysconf(_SC_NPROCESSORS_ONLN); + + return (num == -1) ? 
0 : num; +} diff --git a/src/spdk/lib/env_ocf/ocf_env.h b/src/spdk/lib/env_ocf/ocf_env.h new file mode 100644 index 000000000..81d2e814b --- /dev/null +++ b/src/spdk/lib/env_ocf/ocf_env.h @@ -0,0 +1,834 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef __LIBOCF_ENV_H__ +#define __LIBOCF_ENV_H__ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#ifndef __USE_GNU +#define __USE_GNU +#endif + +#include <linux/limits.h> +#include <linux/stddef.h> + +#include "spdk/stdinc.h" +#include "spdk/likely.h" +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" + +#include "ocf_env_list.h" +#include "ocf/ocf_err.h" + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef uint64_t sector_t; + +#define __packed __attribute__((packed)) +#define __aligned(x) __attribute__((aligned(x))) + +/* linux sector 512-bytes */ +#define ENV_SECTOR_SHIFT 9 +#define ENV_SECTOR_SIZE (1<<ENV_SECTOR_SHIFT) +#define BYTES_TO_SECTOR(x) ((x) >> ENV_SECTOR_SHIFT) + +/* *** MEMORY MANAGEMENT *** */ + +#define ENV_MEM_NORMAL 0 +#define ENV_MEM_NOIO 0 +#define ENV_MEM_ATOMIC 0 + +#define likely spdk_likely +#define unlikely spdk_unlikely + +#define min(x, y) MIN(x, y) +#ifndef MIN +#define MIN(x, y) spdk_min(x, y) +#endif + +#define ARRAY_SIZE(x) SPDK_COUNTOF(x) + +/* LOGGING */ +#define ENV_PRIu64 PRIu64 + +#define ENV_WARN(cond, fmt, args...) 
({ \ + if (spdk_unlikely((uintptr_t)(cond))) \ + SPDK_NOTICELOG("WARNING" fmt, ##args); \ + }) + +#define ENV_WARN_ON(cond) ({ \ + if (spdk_unlikely((uintptr_t)(cond))) \ + SPDK_NOTICELOG("WARNING\n"); \ + }) + +#define ENV_BUG() ({ \ + SPDK_ERRLOG("BUG\n"); \ + assert(0); \ + abort(); \ + }) + +#define ENV_BUG_ON(cond) ({ \ + if (spdk_unlikely((uintptr_t)(cond))) { \ + SPDK_ERRLOG("BUG\n"); \ + assert(0); \ + abort(); \ + } \ + }) + +#define ENV_BUILD_BUG_ON(cond) _Static_assert(!(cond), "static "\ + "assertion failure") + +#define container_of(ptr, type, member) SPDK_CONTAINEROF(ptr, type, member) + +static inline void *env_malloc(size_t size, int flags) +{ + return spdk_malloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void *env_zalloc(size_t size, int flags) +{ + return spdk_zmalloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void env_free(const void *ptr) +{ + return spdk_free((void *)ptr); +} + +static inline void *env_vmalloc(size_t size) +{ + return spdk_malloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void *env_vzalloc(size_t size) +{ + /* TODO: raw_ram init can request huge amount of memory to store + * hashtable in it. need to ensure that allocation succedds */ + return spdk_zmalloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void *env_vzalloc_flags(size_t size, int flags) +{ + return env_vzalloc(size); +} + +static inline void *env_secure_alloc(size_t size) +{ + return spdk_zmalloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void env_secure_free(const void *ptr, size_t size) +{ + return spdk_free((void *)ptr); +} + +static inline void env_vfree(const void *ptr) +{ + return spdk_free((void *)ptr); +} + +static inline uint64_t env_get_free_memory(void) +{ + return -1; +} + +/* *** ALLOCATOR *** */ + +#define OCF_ALLOCATOR_NAME_MAX 128 + +typedef struct { + struct spdk_mempool *mempool; + size_t element_size; +} env_allocator; + +env_allocator *env_allocator_create(uint32_t size, const char *name); + +void env_allocator_destroy(env_allocator *allocator); + +void *env_allocator_new(env_allocator *allocator); + +void env_allocator_del(env_allocator *allocator, void *item); + +uint32_t env_allocator_item_count(env_allocator *allocator); + +/* *** MUTEX *** */ + +typedef struct { + pthread_mutex_t m; +} env_mutex; + +static inline int env_mutex_init(env_mutex *mutex) +{ + return !!pthread_mutex_init(&mutex->m, NULL); +} + +static inline void env_mutex_lock(env_mutex *mutex) +{ + ENV_BUG_ON(pthread_mutex_lock(&mutex->m)); +} + +static inline int env_mutex_lock_interruptible(env_mutex *mutex) +{ + env_mutex_lock(mutex); + return 0; +} + +static inline int env_mutex_trylock(env_mutex *mutex) +{ + return pthread_mutex_trylock(&mutex->m) ? 
-OCF_ERR_NO_LOCK : 0; +} + +static inline void env_mutex_unlock(env_mutex *mutex) +{ + ENV_BUG_ON(pthread_mutex_unlock(&mutex->m)); +} + +static inline int env_mutex_is_locked(env_mutex *mutex) +{ + if (env_mutex_trylock(mutex) == 0) { + env_mutex_unlock(mutex); + return 0; + } + + return 1; +} + +static inline int env_mutex_destroy(env_mutex *mutex) +{ + if (pthread_mutex_destroy(&mutex->m)) { + return 1; + } + + return 0; +} + +/* *** RECURSIVE MUTEX *** */ + +typedef env_mutex env_rmutex; + +static inline int env_rmutex_init(env_rmutex *rmutex) +{ + pthread_mutexattr_t attr; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&rmutex->m, &attr); + + return 0; +} + +static inline void env_rmutex_lock(env_rmutex *rmutex) +{ + env_mutex_lock(rmutex); +} + +static inline int env_rmutex_lock_interruptible(env_rmutex *rmutex) +{ + return env_mutex_lock_interruptible(rmutex); +} + +static inline int env_rmutex_trylock(env_rmutex *rmutex) +{ + return env_mutex_trylock(rmutex); +} + +static inline void env_rmutex_unlock(env_rmutex *rmutex) +{ + env_mutex_unlock(rmutex); +} + +static inline int env_rmutex_is_locked(env_rmutex *rmutex) +{ + return env_mutex_is_locked(rmutex); +} + +static inline int env_rmutex_destroy(env_rmutex *rmutex) +{ + return env_mutex_destroy(rmutex); +} + +/* *** RW SEMAPHORE *** */ +typedef struct { + pthread_rwlock_t lock; +} env_rwsem; + +static inline int env_rwsem_init(env_rwsem *s) +{ + return !!pthread_rwlock_init(&s->lock, NULL); +} + +static inline void env_rwsem_up_read(env_rwsem *s) +{ + ENV_BUG_ON(pthread_rwlock_unlock(&s->lock)); +} + +static inline void env_rwsem_down_read(env_rwsem *s) +{ + ENV_BUG_ON(pthread_rwlock_rdlock(&s->lock)); +} + +static inline int env_rwsem_down_read_trylock(env_rwsem *s) +{ + return pthread_rwlock_tryrdlock(&s->lock) ? -OCF_ERR_NO_LOCK : 0; +} + +static inline void env_rwsem_up_write(env_rwsem *s) +{ + ENV_BUG_ON(pthread_rwlock_unlock(&s->lock)); +} + +static inline void env_rwsem_down_write(env_rwsem *s) +{ + ENV_BUG_ON(pthread_rwlock_wrlock(&s->lock)); +} + +static inline int env_rwsem_down_write_trylock(env_rwsem *s) +{ + return pthread_rwlock_trywrlock(&s->lock) ? 
-OCF_ERR_NO_LOCK : 0; +} + +static inline int env_rwsem_is_locked(env_rwsem *s) +{ + if (env_rwsem_down_read_trylock(s) == 0) { + env_rwsem_up_read(s); + return 0; + } + + return 1; +} + +static inline int env_rwsem_down_read_interruptible(env_rwsem *s) +{ + return pthread_rwlock_rdlock(&s->lock); +} +static inline int env_rwsem_down_write_interruptible(env_rwsem *s) +{ + return pthread_rwlock_wrlock(&s->lock); +} + +static inline int env_rwsem_destroy(env_rwsem *s) +{ + return pthread_rwlock_destroy(&s->lock); +} + +/* *** ATOMIC VARIABLES *** */ + +typedef int env_atomic; + +typedef long env_atomic64; + +#ifndef atomic_read +#define atomic_read(ptr) (*(__typeof__(*ptr) *volatile) (ptr)) +#endif + +#ifndef atomic_set +#define atomic_set(ptr, i) ((*(__typeof__(*ptr) *volatile) (ptr)) = (i)) +#endif + +#define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1)) +#define atomic_dec(ptr) ((void) __sync_fetch_and_add(ptr, -1)) +#define atomic_add(ptr, n) ((void) __sync_fetch_and_add(ptr, n)) +#define atomic_sub(ptr, n) ((void) __sync_fetch_and_sub(ptr, n)) + +#define atomic_cmpxchg __sync_val_compare_and_swap + +static inline int env_atomic_read(const env_atomic *a) +{ + return atomic_read(a); +} + +static inline void env_atomic_set(env_atomic *a, int i) +{ + atomic_set(a, i); +} + +static inline void env_atomic_add(int i, env_atomic *a) +{ + atomic_add(a, i); +} + +static inline void env_atomic_sub(int i, env_atomic *a) +{ + atomic_sub(a, i); +} + +static inline bool env_atomic_sub_and_test(int i, env_atomic *a) +{ + return __sync_sub_and_fetch(a, i) == 0; +} + +static inline void env_atomic_inc(env_atomic *a) +{ + atomic_inc(a); +} + +static inline void env_atomic_dec(env_atomic *a) +{ + atomic_dec(a); +} + +static inline bool env_atomic_dec_and_test(env_atomic *a) +{ + return __sync_sub_and_fetch(a, 1) == 0; +} + +static inline bool env_atomic_inc_and_test(env_atomic *a) +{ + return __sync_add_and_fetch(a, 1) == 0; +} + +static inline int env_atomic_add_return(int i, env_atomic *a) +{ + return __sync_add_and_fetch(a, i); +} + +static inline int env_atomic_sub_return(int i, env_atomic *a) +{ + return __sync_sub_and_fetch(a, i); +} + +static inline int env_atomic_inc_return(env_atomic *a) +{ + return env_atomic_add_return(1, a); +} + +static inline int env_atomic_dec_return(env_atomic *a) +{ + return env_atomic_sub_return(1, a); +} + +static inline int env_atomic_cmpxchg(env_atomic *a, int old, int new_value) +{ + return atomic_cmpxchg(a, old, new_value); +} + +static inline int env_atomic_add_unless(env_atomic *a, int i, int u) +{ + int c, old; + c = env_atomic_read(a); + for (;;) { + if (spdk_unlikely(c == (u))) { + break; + } + old = env_atomic_cmpxchg((a), c, c + (i)); + if (spdk_likely(old == c)) { + break; + } + c = old; + } + return c != (u); +} + +static inline long env_atomic64_read(const env_atomic64 *a) +{ + return atomic_read(a); +} + +static inline void env_atomic64_set(env_atomic64 *a, long i) +{ + atomic_set(a, i); +} + +static inline void env_atomic64_add(long i, env_atomic64 *a) +{ + atomic_add(a, i); +} + +static inline void env_atomic64_sub(long i, env_atomic64 *a) +{ + atomic_sub(a, i); +} + +static inline void env_atomic64_inc(env_atomic64 *a) +{ + atomic_inc(a); +} + +static inline void env_atomic64_dec(env_atomic64 *a) +{ + atomic_dec(a); +} + +static inline int env_atomic64_add_return(int i, env_atomic *a) +{ + return __sync_add_and_fetch(a, i); +} + +static inline int env_atomic64_sub_return(int i, env_atomic *a) +{ + return __sync_sub_and_fetch(a, i); +} + +static 
inline int env_atomic64_inc_return(env_atomic *a) +{ + return env_atomic64_add_return(1, a); +} + +static inline int env_atomic64_dec_return(env_atomic *a) +{ + return env_atomic_sub_return(1, a); +} + +static inline long env_atomic64_cmpxchg(env_atomic64 *a, long old, long new) +{ + return atomic_cmpxchg(a, old, new); +} + +/* *** COMPLETION *** */ +typedef struct completion { + sem_t sem; +} env_completion; + +static inline void env_completion_init(env_completion *completion) +{ + sem_init(&completion->sem, 0, 0); +} + +static inline void env_completion_wait(env_completion *completion) +{ + sem_wait(&completion->sem); +} + +static inline void env_completion_complete(env_completion *completion) +{ + sem_post(&completion->sem); +} + +static inline void env_completion_destroy(env_completion *completion) +{ + sem_destroy(&completion->sem); +} + +/* *** SPIN LOCKS *** */ + +typedef struct { + pthread_spinlock_t lock; +} env_spinlock; + +static inline int env_spinlock_init(env_spinlock *l) +{ + return pthread_spin_init(&l->lock, 0); +} + +static inline int env_spinlock_trylock(env_spinlock *l) +{ + return pthread_spin_trylock(&l->lock) ? -OCF_ERR_NO_LOCK : 0; +} + +static inline void env_spinlock_lock(env_spinlock *l) +{ + ENV_BUG_ON(pthread_spin_lock(&l->lock)); +} + +static inline void env_spinlock_unlock(env_spinlock *l) +{ + ENV_BUG_ON(pthread_spin_unlock(&l->lock)); +} + +#define env_spinlock_lock_irqsave(l, flags) \ + (void)flags; \ + env_spinlock_lock(l) + +#define env_spinlock_unlock_irqrestore(l, flags) \ + (void)flags; \ + env_spinlock_unlock(l) + +static inline void env_spinlock_destroy(env_spinlock *l) +{ + ENV_BUG_ON(pthread_spin_destroy(&l->lock)); +} + +/* *** RW LOCKS *** */ + +typedef struct { + pthread_rwlock_t lock; +} env_rwlock; + +static inline void env_rwlock_init(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_init(&l->lock, NULL)); +} + +static inline void env_rwlock_read_lock(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_rdlock(&l->lock)); +} + +static inline void env_rwlock_read_unlock(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_unlock(&l->lock)); +} + +static inline void env_rwlock_write_lock(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_wrlock(&l->lock)); +} + +static inline void env_rwlock_write_unlock(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_unlock(&l->lock)); +} + +static inline void env_rwlock_destroy(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_destroy(&l->lock)); +} + +static inline void env_bit_set(int nr, volatile void *addr) +{ + char *byte = (char *)addr + (nr >> 3); + char mask = 1 << (nr & 7); + + __sync_or_and_fetch(byte, mask); +} + +static inline void env_bit_clear(int nr, volatile void *addr) +{ + char *byte = (char *)addr + (nr >> 3); + char mask = 1 << (nr & 7); + + mask = ~mask; + __sync_and_and_fetch(byte, mask); +} + +static inline bool env_bit_test(int nr, const volatile unsigned long *addr) +{ + const char *byte = (char *)addr + (nr >> 3); + char mask = 1 << (nr & 7); + + return !!(*byte & mask); +} + +/* *** WAITQUEUE *** */ + +typedef struct { + sem_t sem; +} env_waitqueue; + +static inline void env_waitqueue_init(env_waitqueue *w) +{ + sem_init(&w->sem, 0, 0); +} + +static inline void env_waitqueue_wake_up(env_waitqueue *w) +{ + sem_post(&w->sem); +} + +#define env_waitqueue_wait(w, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + sem_wait(&w.sem); \ + __ret = __ret; \ +}) + +/* *** SCHEDULING *** */ + +/* CAS does not need this while in user-space */ +static inline void env_schedule(void) +{ +} + +#define 
env_cond_resched env_schedule + +static inline int env_in_interrupt(void) +{ + return 0; +} + +static inline uint64_t env_get_tick_count(void) +{ + return spdk_get_ticks(); +} + +static inline uint64_t env_ticks_to_secs(uint64_t j) +{ + return j / spdk_get_ticks_hz(); +} + +static inline uint64_t env_ticks_to_msecs(uint64_t j) +{ + return env_ticks_to_secs(j) * 1000; +} + +static inline uint64_t env_ticks_to_nsecs(uint64_t j) +{ + return env_ticks_to_secs(j) * 1000 * 1000; +} + +static inline uint64_t env_ticks_to_usecs(uint64_t j) +{ + return env_ticks_to_secs(j) * 1000 * 1000 * 1000; +} + +static inline uint64_t env_secs_to_ticks(uint64_t j) +{ + return j * spdk_get_ticks_hz(); +} + +/* *** STRING OPERATIONS *** */ + +/* 512 KB is sufficient amount of memory for OCF operations */ +#define ENV_MAX_MEM (512 * 1024) + +static inline int env_memset(void *dest, size_t len, uint8_t value) +{ + if (dest == NULL || len == 0) { + return 1; + } + + memset(dest, value, len); + return 0; +} + +static inline int env_memcpy(void *dest, size_t dmax, const void *src, size_t len) +{ + if (dest == NULL || src == NULL) { + return 1; + } + if (dmax == 0 || dmax > ENV_MAX_MEM) { + return 1; + } + if (len == 0 || len > dmax) { + return 1; + } + + memcpy(dest, src, len); + return 0; +} + +static inline int env_memcmp(const void *aptr, size_t dmax, const void *bptr, size_t len, + int *diff) +{ + if (diff == NULL || aptr == NULL || bptr == NULL) { + return 1; + } + if (dmax == 0 || dmax > ENV_MAX_MEM) { + return 1; + } + if (len == 0 || len > dmax) { + return 1; + } + + *diff = memcmp(aptr, bptr, len); + return 0; +} + +/* 4096 is sufficient max length for any OCF operation on string */ +#define ENV_MAX_STR (4 * 1024) + +static inline size_t env_strnlen(const char *src, size_t dmax) +{ + return strnlen(src, dmax); +} + +static inline int env_strncpy(char *dest, size_t dmax, const char *src, size_t len) +{ + if (dest == NULL || src == NULL) { + return 1; + } + if (dmax == 0 || dmax > ENV_MAX_STR) { + return 1; + } + if (len == 0) { + return 1; + } + /* Just copy as many characters as we can instead of return failure */ + len = min(len, dmax); + + strncpy(dest, src, len); + return 0; +} + +#define env_strncmp(s1, slen1, s2, slen2) strncmp(s1, s2, min(slen1, slen2)) + +static inline char *env_strdup(const char *src, int flags) +{ + int len; + char *ret; + + if (src == NULL) { + return NULL; + } + + len = env_strnlen(src, ENV_MAX_STR) + 1; + ret = env_malloc(len, flags); + + if (env_strncpy(ret, ENV_MAX_STR, src, len)) { + return NULL; + } else { + return ret; + } +} + +/* *** SORTING *** */ + +static inline void env_sort(void *base, size_t num, size_t size, + int (*cmp_fn)(const void *, const void *), + void (*swap_fn)(void *, void *, int size)) +{ + qsort(base, num, size, cmp_fn); +} + +static inline void env_msleep(uint64_t n) +{ + usleep(n * 1000); +} + +static inline void env_touch_softlockup_wd(void) +{ +} + +/* *** CRC *** */ + +uint32_t env_crc32(uint32_t crc, uint8_t const *data, size_t len); + +/* EXECUTION CONTEXTS */ +unsigned env_get_execution_context(void); +void env_put_execution_context(unsigned ctx); +unsigned env_get_execution_context_count(void); + +#endif /* __OCF_ENV_H__ */ diff --git a/src/spdk/lib/env_ocf/ocf_env_headers.h b/src/spdk/lib/env_ocf/ocf_env_headers.h new file mode 100644 index 000000000..742479374 --- /dev/null +++ b/src/spdk/lib/env_ocf/ocf_env_headers.h @@ -0,0 +1,43 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
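The env_allocator implemented in ocf_env.c above is a fixed-size object cache built on spdk_mempool: env_allocator_create() sizes the pool at ENV_ALLOCATOR_NBUFS elements and derives its own pool name (the name argument is informational only), env_allocator_new() hands out zeroed objects, and env_allocator_destroy() asserts that every object has been returned. A hypothetical usage sketch, assuming the SPDK env is already initialized; the struct and function names are illustrative:

#include "ocf_env.h"

struct cache_line_meta {
	uint64_t core_line;
	uint32_t flags;
};

static int
meta_pool_demo(void)
{
	env_allocator *alloc;
	struct cache_line_meta *meta;

	alloc = env_allocator_create(sizeof(*meta), "cache_line_meta");
	if (alloc == NULL) {
		return -ENOMEM;
	}

	meta = env_allocator_new(alloc);         /* zeroed object, or NULL if the pool is exhausted */
	if (meta != NULL) {
		meta->flags = 1;
		env_allocator_del(alloc, meta);  /* every object must be returned before destroy */
	}

	env_allocator_destroy(alloc);
	return 0;
}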
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OCF_ENV_HEADERS_H__ +#define __OCF_ENV_HEADERS_H__ + +#include "spdk/stdinc.h" + +#define OCF_VERSION_MAIN 20 +#define OCF_VERSION_MAJOR 3 +#define OCF_VERSION_MINOR 0 + +#endif /* __OCF_ENV_HEADERS_H__ */ diff --git a/src/spdk/lib/env_ocf/ocf_env_list.h b/src/spdk/lib/env_ocf/ocf_env_list.h new file mode 100644 index 000000000..e5f60d6c3 --- /dev/null +++ b/src/spdk/lib/env_ocf/ocf_env_list.h @@ -0,0 +1,185 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OCF_LIST_H__ +#define __OCF_LIST_H__ + +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +/** + * List entry structure mimicking linux kernel based one. + */ +struct list_head { + struct list_head *next; + struct list_head *prev; +}; + +/** + * start an empty list + */ +#define INIT_LIST_HEAD(l) { (l)->prev = l; (l)->next = l; } + +/** + * Add item to list head. + * @param it list entry to be added + * @param l1 list main node (head) + */ +static inline void list_add(struct list_head *it, struct list_head *l1) +{ + it->prev = l1; + it->next = l1->next; + + l1->next->prev = it; + l1->next = it; +} + +/** + * Add item it to tail. + * @param it list entry to be added + * @param l1 list main node (head) + */ +static inline void list_add_tail(struct list_head *it, struct list_head *l1) +{ + it->prev = l1->prev; + it->next = l1; + + l1->prev->next = it; + l1->prev = it; +} + +/** + * check if a list is empty (return true) + */ +static inline int list_empty(struct list_head *it) +{ + return it->next == it; +} + +/** + * delete an entry from a list + */ +static inline void list_del(struct list_head *it) +{ + it->next->prev = it->prev; + it->prev->next = it->next; +} + +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + list_del(list); + list_add_tail(list, head); +} + +static inline void list_move(struct list_head *list, + struct list_head *head) +{ + list_del(list); + list_add(list, head); +} + +/** + * Extract an entry. + * @param list_head_i list head item, from which entry is extracted + * @param item_type type (struct) of list entry + * @param field_name name of list_head field within item_type + */ +#define list_entry(list_head_i, item_type, field_name) \ + (item_type *)(((void*)(list_head_i)) - offsetof(item_type, field_name)) + +#define list_first_entry(list_head_i, item_type, field_name) \ + list_entry((list_head_i)->next, item_type, field_name) + +/** + * @param iterator uninitialized list_head pointer, to be used as iterator + * @param plist list head (main node) + */ +#define list_for_each(iterator, plist) \ + for (iterator = (plist)->next; \ + (iterator)->next != (plist)->next; \ + iterator = (iterator)->next) + +/** + * Safe version of list_for_each which works even if entries are deleted during + * loop. + * @param iterator uninitialized list_head pointer, to be used as iterator + * @param q another uninitialized list_head, used as helper + * @param plist list head (main node) + */ +/* + * Algorithm handles situation, where q is deleted. + * consider in example 3 element list with header h: + * + * h -> 1 -> 2 -> 3 -> + *1. i q + * + *2. i q + * + *3. q i + */ +#define list_for_each_safe(iterator, q, plist) \ + for (iterator = (q = (plist)->next->next)->prev; \ + (q) != (plist)->next; \ + iterator = (q = (q)->next)->prev) + +#define _list_entry_helper(item, head, field_name) list_entry(head, typeof(*item), field_name) + +/** + * Iterate over list entries. 
+ * @param list pointer to list item (iterator) + * @param plist pointer to list_head item + * @param field_name name of list_head field in list entry + */ +#define list_for_each_entry(item, plist, field_name) \ + for (item = _list_entry_helper(item, (plist)->next, field_name); \ + _list_entry_helper(item, (item)->field_name.next, field_name) !=\ + _list_entry_helper(item, (plist)->next, field_name); \ + item = _list_entry_helper(item, (item)->field_name.next, field_name)) + +/** + * Safe version of list_for_each_entry which works even if entries are deleted + * during loop. + * @param list pointer to list item (iterator) + * @param q another pointer to list item, used as helper + * @param plist pointer to list_head item + * @param field_name name of list_head field in list entry + */ +#define list_for_each_entry_safe(item, q, plist, field_name) \ + for (item = _list_entry_helper(item, (plist)->next, field_name), \ + q = _list_entry_helper(item, (item)->field_name.next, field_name); \ + _list_entry_helper(item, (item)->field_name.next, field_name) != \ + _list_entry_helper(item, (plist)->next, field_name); \ + item = q, q = _list_entry_helper(q, (q)->field_name.next, field_name)) + +#endif diff --git a/src/spdk/lib/event/Makefile b/src/spdk/lib/event/Makefile new file mode 100644 index 000000000..87a6209c7 --- /dev/null +++ b/src/spdk/lib/event/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 5 +SO_MINOR := 0 + +LIBNAME = event +C_SRCS = app.c reactor.c rpc.c subsystem.c json_config.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_event.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/app.c b/src/spdk/lib/event/app.c new file mode 100644 index 000000000..b6cab05a3 --- /dev/null +++ b/src/spdk/lib/event/app.c @@ -0,0 +1,1177 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. 
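The ocf_env_list.h helpers earlier in this patch reimplement the kernel-style intrusive list: an entry embeds a struct list_head, and list_entry()/list_for_each_entry_safe() recover the containing object via offsetof, so the safe variant tolerates unlinking and freeing the current entry. A hypothetical sketch of the pattern (type and field names are illustrative):

#include "ocf_env.h"   /* pulls in ocf_env_list.h and spdk/stdinc.h */

struct pending_io {
	int id;
	struct list_head link;          /* embedded list node */
};

static void
drain_pending(struct list_head *queue)
{
	struct pending_io *io, *tmp;

	/* Safe iteration: the current entry may be unlinked and freed. */
	list_for_each_entry_safe(io, tmp, queue, link) {
		list_del(&io->link);
		free(io);
	}
}

static void
queue_demo(void)
{
	struct list_head queue;
	struct pending_io *io = calloc(1, sizeof(*io));

	INIT_LIST_HEAD(&queue);
	if (io != NULL) {
		io->id = 1;
		list_add_tail(&io->link, &queue);   /* keeps FIFO order */
	}
	drain_pending(&queue);
}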
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/version.h" + +#include "spdk_internal/event.h" + +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" + +#define SPDK_APP_DEFAULT_LOG_LEVEL SPDK_LOG_NOTICE +#define SPDK_APP_DEFAULT_LOG_PRINT_LEVEL SPDK_LOG_INFO +#define SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES SPDK_DEFAULT_NUM_TRACE_ENTRIES + +#define SPDK_APP_DPDK_DEFAULT_MEM_SIZE -1 +#define SPDK_APP_DPDK_DEFAULT_MASTER_CORE -1 +#define SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL -1 +#define SPDK_APP_DPDK_DEFAULT_CORE_MASK "0x1" +#define SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 +#define SPDK_APP_DEFAULT_CORE_LIMIT 0x140000000 /* 5 GiB */ + +struct spdk_app { + struct spdk_conf *config; + const char *json_config_file; + bool json_config_ignore_errors; + const char *rpc_addr; + int shm_id; + spdk_app_shutdown_cb shutdown_cb; + int rc; +}; + +static struct spdk_app g_spdk_app; +static spdk_msg_fn g_start_fn = NULL; +static void *g_start_arg = NULL; +static struct spdk_thread *g_app_thread = NULL; +static bool g_delay_subsystem_init = false; +static bool g_shutdown_sig_received = false; +static char *g_executable_name; +static struct spdk_app_opts g_default_opts; + +int +spdk_app_get_shm_id(void) +{ + return g_spdk_app.shm_id; +} + +/* append one empty option to indicate the end of the array */ +static const struct option g_cmdline_options[] = { +#define CONFIG_FILE_OPT_IDX 'c' + {"config", required_argument, NULL, CONFIG_FILE_OPT_IDX}, +#define LIMIT_COREDUMP_OPT_IDX 'd' + {"limit-coredump", no_argument, NULL, LIMIT_COREDUMP_OPT_IDX}, +#define TPOINT_GROUP_MASK_OPT_IDX 'e' + {"tpoint-group-mask", required_argument, NULL, TPOINT_GROUP_MASK_OPT_IDX}, +#define SINGLE_FILE_SEGMENTS_OPT_IDX 'g' + {"single-file-segments", no_argument, NULL, SINGLE_FILE_SEGMENTS_OPT_IDX}, +#define HELP_OPT_IDX 'h' + 
{"help", no_argument, NULL, HELP_OPT_IDX}, +#define SHM_ID_OPT_IDX 'i' + {"shm-id", required_argument, NULL, SHM_ID_OPT_IDX}, +#define CPUMASK_OPT_IDX 'm' + {"cpumask", required_argument, NULL, CPUMASK_OPT_IDX}, +#define MEM_CHANNELS_OPT_IDX 'n' + {"mem-channels", required_argument, NULL, MEM_CHANNELS_OPT_IDX}, +#define MASTER_CORE_OPT_IDX 'p' + {"master-core", required_argument, NULL, MASTER_CORE_OPT_IDX}, +#define RPC_SOCKET_OPT_IDX 'r' + {"rpc-socket", required_argument, NULL, RPC_SOCKET_OPT_IDX}, +#define MEM_SIZE_OPT_IDX 's' + {"mem-size", required_argument, NULL, MEM_SIZE_OPT_IDX}, +#define NO_PCI_OPT_IDX 'u' + {"no-pci", no_argument, NULL, NO_PCI_OPT_IDX}, +#define VERSION_OPT_IDX 'v' + {"version", no_argument, NULL, VERSION_OPT_IDX}, +#define PCI_BLACKLIST_OPT_IDX 'B' + {"pci-blacklist", required_argument, NULL, PCI_BLACKLIST_OPT_IDX}, +#define LOGFLAG_OPT_IDX 'L' + {"logflag", required_argument, NULL, LOGFLAG_OPT_IDX}, +#define HUGE_UNLINK_OPT_IDX 'R' + {"huge-unlink", no_argument, NULL, HUGE_UNLINK_OPT_IDX}, +#define PCI_WHITELIST_OPT_IDX 'W' + {"pci-whitelist", required_argument, NULL, PCI_WHITELIST_OPT_IDX}, +#define SILENCE_NOTICELOG_OPT_IDX 257 + {"silence-noticelog", no_argument, NULL, SILENCE_NOTICELOG_OPT_IDX}, +#define WAIT_FOR_RPC_OPT_IDX 258 + {"wait-for-rpc", no_argument, NULL, WAIT_FOR_RPC_OPT_IDX}, +#define HUGE_DIR_OPT_IDX 259 + {"huge-dir", required_argument, NULL, HUGE_DIR_OPT_IDX}, +#define NUM_TRACE_ENTRIES_OPT_IDX 260 + {"num-trace-entries", required_argument, NULL, NUM_TRACE_ENTRIES_OPT_IDX}, +#define MAX_REACTOR_DELAY_OPT_IDX 261 + {"max-delay", required_argument, NULL, MAX_REACTOR_DELAY_OPT_IDX}, +#define JSON_CONFIG_OPT_IDX 262 + {"json", required_argument, NULL, JSON_CONFIG_OPT_IDX}, +#define JSON_CONFIG_IGNORE_INIT_ERRORS_IDX 263 + {"json-ignore-init-errors", no_argument, NULL, JSON_CONFIG_IGNORE_INIT_ERRORS_IDX}, +#define IOVA_MODE_OPT_IDX 264 + {"iova-mode", required_argument, NULL, IOVA_MODE_OPT_IDX}, +#define BASE_VIRTADDR_OPT_IDX 265 + {"base-virtaddr", required_argument, NULL, BASE_VIRTADDR_OPT_IDX}, +}; + +/* Global section */ +#define GLOBAL_CONFIG_TMPL \ +"# Configuration file\n" \ +"#\n" \ +"# Please write all parameters using ASCII.\n" \ +"# The parameter must be quoted if it includes whitespace.\n" \ +"#\n" \ +"# Configuration syntax:\n" \ +"# Spaces at head of line are deleted, other spaces are as separator\n" \ +"# Lines starting with '#' are comments and not evaluated.\n" \ +"# Lines ending with '\\' are concatenated with the next line.\n" \ +"# Bracketed keys are section keys grouping the following value keys.\n" \ +"# Number of section key is used as a tag number.\n" \ +"# Ex. [TargetNode1] = TargetNode section key with tag number 1\n" \ +"[Global]\n" \ +" Comment \"Global section\"\n" \ +"\n" \ +" # Users can restrict work items to only run on certain cores by\n" \ +" # specifying a ReactorMask. Default is to allow work items to run\n" \ +" # on all cores. 
Core 0 must be set in the mask if one is specified.\n" \ +" # Default: 0xFFFF (cores 0-15)\n" \ +" ReactorMask \"0x%s\"\n" \ +"\n" \ +" # Tracepoint group mask for spdk trace buffers\n" \ +" # Default: 0x0 (all tracepoint groups disabled)\n" \ +" # Set to 0xFFFF to enable all tracepoint groups.\n" \ +" TpointGroupMask \"0x%" PRIX64 "\"\n" \ +"\n" \ + +static void +app_config_dump_global_section(FILE *fp) +{ + struct spdk_cpuset *coremask; + + if (NULL == fp) { + return; + } + + coremask = spdk_app_get_core_mask(); + + fprintf(fp, GLOBAL_CONFIG_TMPL, spdk_cpuset_fmt(coremask), + spdk_trace_get_tpoint_group_mask()); +} + +int +spdk_app_get_running_config(char **config_str, char *name) +{ + FILE *fp = NULL; + int fd = -1; + long length = 0, ret = 0; + char vbuf[BUFSIZ]; + char config_template[64]; + + snprintf(config_template, sizeof(config_template), "/tmp/%s.XXXXXX", name); + /* Create temporary file to hold config */ + fd = mkstemp(config_template); + if (fd == -1) { + SPDK_ERRLOG("mkstemp failed\n"); + return -1; + } + fp = fdopen(fd, "wb+"); + if (NULL == fp) { + SPDK_ERRLOG("error opening tmpfile fd = %d\n", fd); + return -1; + } + + /* Buffered IO */ + setvbuf(fp, vbuf, _IOFBF, BUFSIZ); + + app_config_dump_global_section(fp); + spdk_subsystem_config(fp); + + length = ftell(fp); + + *config_str = malloc(length + 1); + if (!*config_str) { + SPDK_ERRLOG("out-of-memory for config\n"); + fclose(fp); + return -1; + } + fseek(fp, 0, SEEK_SET); + ret = fread(*config_str, sizeof(char), length, fp); + if (ret < length) { + SPDK_ERRLOG("short read\n"); + } + fclose(fp); + (*config_str)[length] = '\0'; + + return 0; +} + +static void +app_start_shutdown(void *ctx) +{ + if (g_spdk_app.shutdown_cb) { + g_spdk_app.shutdown_cb(); + g_spdk_app.shutdown_cb = NULL; + } else { + spdk_app_stop(0); + } +} + +void +spdk_app_start_shutdown(void) +{ + spdk_thread_send_critical_msg(g_app_thread, app_start_shutdown); +} + +static void +__shutdown_signal(int signo) +{ + if (!g_shutdown_sig_received) { + g_shutdown_sig_received = true; + spdk_app_start_shutdown(); + } +} + +static int +app_opts_validate(const char *app_opts) +{ + int i = 0, j; + + for (i = 0; app_opts[i] != '\0'; i++) { + /* ignore getopt control characters */ + if (app_opts[i] == ':' || app_opts[i] == '+' || app_opts[i] == '-') { + continue; + } + + for (j = 0; SPDK_APP_GETOPT_STRING[j] != '\0'; j++) { + if (app_opts[i] == SPDK_APP_GETOPT_STRING[j]) { + return app_opts[i]; + } + } + } + return 0; +} + +void +spdk_app_opts_init(struct spdk_app_opts *opts) +{ + if (!opts) { + return; + } + + memset(opts, 0, sizeof(*opts)); + + opts->enable_coredump = true; + opts->shm_id = -1; + opts->mem_size = SPDK_APP_DPDK_DEFAULT_MEM_SIZE; + opts->master_core = SPDK_APP_DPDK_DEFAULT_MASTER_CORE; + opts->mem_channel = SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL; + opts->reactor_mask = NULL; + opts->base_virtaddr = SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR; + opts->print_level = SPDK_APP_DEFAULT_LOG_PRINT_LEVEL; + opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR; + opts->num_entries = SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES; + opts->delay_subsystem_init = false; +} + +static int +app_setup_signal_handlers(struct spdk_app_opts *opts) +{ + struct sigaction sigact; + sigset_t sigmask; + int rc; + + sigemptyset(&sigmask); + memset(&sigact, 0, sizeof(sigact)); + sigemptyset(&sigact.sa_mask); + + sigact.sa_handler = SIG_IGN; + rc = sigaction(SIGPIPE, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGPIPE) failed\n"); + return rc; + } + + /* Install the same handler for SIGINT and SIGTERM 
*/ + g_shutdown_sig_received = false; + sigact.sa_handler = __shutdown_signal; + rc = sigaction(SIGINT, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGINT) failed\n"); + return rc; + } + sigaddset(&sigmask, SIGINT); + + rc = sigaction(SIGTERM, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGTERM) failed\n"); + return rc; + } + sigaddset(&sigmask, SIGTERM); + + if (opts->usr1_handler != NULL) { + sigact.sa_handler = opts->usr1_handler; + rc = sigaction(SIGUSR1, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGUSR1) failed\n"); + return rc; + } + sigaddset(&sigmask, SIGUSR1); + } + + pthread_sigmask(SIG_UNBLOCK, &sigmask, NULL); + + return 0; +} + +static void +app_start_application(void) +{ + assert(spdk_get_thread() == g_app_thread); + + g_start_fn(g_start_arg); +} + +static void +app_start_rpc(int rc, void *arg1) +{ + if (rc) { + spdk_app_stop(rc); + return; + } + + spdk_rpc_initialize(g_spdk_app.rpc_addr); + if (!g_delay_subsystem_init) { + spdk_rpc_set_state(SPDK_RPC_RUNTIME); + app_start_application(); + } +} + +static struct spdk_conf * +app_setup_conf(const char *config_file) +{ + struct spdk_conf *config; + int rc; + + config = spdk_conf_allocate(); + assert(config != NULL); + if (config_file) { + rc = spdk_conf_read(config, config_file); + if (rc != 0) { + SPDK_ERRLOG("Could not read config file %s\n", config_file); + goto error; + } + if (spdk_conf_first_section(config) == NULL) { + SPDK_ERRLOG("Invalid config file %s\n", config_file); + goto error; + } + } + spdk_conf_set_as_default(config); + return config; + +error: + spdk_conf_free(config); + return NULL; +} + +static int +app_opts_add_pci_addr(struct spdk_app_opts *opts, struct spdk_pci_addr **list, char *bdf) +{ + struct spdk_pci_addr *tmp = *list; + size_t i = opts->num_pci_addr; + + tmp = realloc(tmp, sizeof(*tmp) * (i + 1)); + if (tmp == NULL) { + SPDK_ERRLOG("realloc error\n"); + return -ENOMEM; + } + + *list = tmp; + if (spdk_pci_addr_parse(*list + i, bdf) < 0) { + SPDK_ERRLOG("Invalid address %s\n", bdf); + return -EINVAL; + } + + opts->num_pci_addr++; + return 0; +} + +static int +app_read_config_file_global_params(struct spdk_app_opts *opts) +{ + struct spdk_conf_section *sp; + char *bdf; + int i, rc = 0; + + sp = spdk_conf_find_section(NULL, "Global"); + + if (opts->shm_id == -1) { + if (sp != NULL) { + opts->shm_id = spdk_conf_section_get_intval(sp, "SharedMemoryID"); + } + } + + if (opts->reactor_mask == NULL) { + if (sp && spdk_conf_section_get_val(sp, "ReactorMask")) { + SPDK_ERRLOG("ReactorMask config option is deprecated. 
Use -m/--cpumask\n" + "command line parameter instead.\n"); + opts->reactor_mask = spdk_conf_section_get_val(sp, "ReactorMask"); + } else { + opts->reactor_mask = SPDK_APP_DPDK_DEFAULT_CORE_MASK; + } + } + + if (!opts->no_pci && sp) { + opts->no_pci = spdk_conf_section_get_boolval(sp, "NoPci", false); + } + + if (opts->tpoint_group_mask == NULL) { + if (sp != NULL) { + opts->tpoint_group_mask = spdk_conf_section_get_val(sp, "TpointGroupMask"); + } + } + + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + bdf = spdk_conf_section_get_nmval(sp, "PciBlacklist", i, 0); + if (!bdf) { + break; + } + + rc = app_opts_add_pci_addr(opts, &opts->pci_blacklist, bdf); + if (rc != 0) { + free(opts->pci_blacklist); + return rc; + } + } + + for (i = 0; ; i++) { + bdf = spdk_conf_section_get_nmval(sp, "PciWhitelist", i, 0); + if (!bdf) { + break; + } + + if (opts->pci_blacklist != NULL) { + SPDK_ERRLOG("PciBlacklist and PciWhitelist cannot be used at the same time\n"); + free(opts->pci_blacklist); + return -EINVAL; + } + + rc = app_opts_add_pci_addr(opts, &opts->pci_whitelist, bdf); + if (rc != 0) { + free(opts->pci_whitelist); + return rc; + } + } + return 0; +} + +static int +app_setup_env(struct spdk_app_opts *opts) +{ + struct spdk_env_opts env_opts = {}; + int rc; + + if (opts == NULL) { + rc = spdk_env_init(NULL); + if (rc != 0) { + SPDK_ERRLOG("Unable to reinitialize SPDK env\n"); + } + + return rc; + } + + + spdk_env_opts_init(&env_opts); + + env_opts.name = opts->name; + env_opts.core_mask = opts->reactor_mask; + env_opts.shm_id = opts->shm_id; + env_opts.mem_channel = opts->mem_channel; + env_opts.master_core = opts->master_core; + env_opts.mem_size = opts->mem_size; + env_opts.hugepage_single_segments = opts->hugepage_single_segments; + env_opts.unlink_hugepage = opts->unlink_hugepage; + env_opts.hugedir = opts->hugedir; + env_opts.no_pci = opts->no_pci; + env_opts.num_pci_addr = opts->num_pci_addr; + env_opts.pci_blacklist = opts->pci_blacklist; + env_opts.pci_whitelist = opts->pci_whitelist; + env_opts.env_context = opts->env_context; + env_opts.iova_mode = opts->iova_mode; + + rc = spdk_env_init(&env_opts); + free(env_opts.pci_blacklist); + free(env_opts.pci_whitelist); + + + if (rc < 0) { + SPDK_ERRLOG("Unable to initialize SPDK env\n"); + } + + return rc; +} + +static int +app_setup_trace(struct spdk_app_opts *opts) +{ + char shm_name[64]; + uint64_t tpoint_group_mask; + char *end; + + if (opts->shm_id >= 0) { + snprintf(shm_name, sizeof(shm_name), "/%s_trace.%d", opts->name, opts->shm_id); + } else { + snprintf(shm_name, sizeof(shm_name), "/%s_trace.pid%d", opts->name, (int)getpid()); + } + + if (spdk_trace_init(shm_name, opts->num_entries) != 0) { + return -1; + } + + if (opts->tpoint_group_mask != NULL) { + errno = 0; + tpoint_group_mask = strtoull(opts->tpoint_group_mask, &end, 16); + if (*end != '\0' || errno) { + SPDK_ERRLOG("invalid tpoint mask %s\n", opts->tpoint_group_mask); + } else { + SPDK_NOTICELOG("Tracepoint Group Mask %s specified.\n", opts->tpoint_group_mask); + SPDK_NOTICELOG("Use 'spdk_trace -s %s %s %d' to capture a snapshot of events at runtime.\n", + opts->name, + opts->shm_id >= 0 ? "-i" : "-p", + opts->shm_id >= 0 ? 
opts->shm_id : getpid()); +#if defined(__linux__) + SPDK_NOTICELOG("Or copy /dev/shm%s for offline analysis/debug.\n", shm_name); +#endif + spdk_trace_set_tpoint_group_mask(tpoint_group_mask); + } + } + + return 0; +} + +static void +bootstrap_fn(void *arg1) +{ + if (g_spdk_app.json_config_file) { + g_delay_subsystem_init = false; + spdk_app_json_config_load(g_spdk_app.json_config_file, g_spdk_app.rpc_addr, app_start_rpc, + NULL, !g_spdk_app.json_config_ignore_errors); + } else { + if (!g_delay_subsystem_init) { + spdk_subsystem_init(app_start_rpc, NULL); + } else { + spdk_rpc_initialize(g_spdk_app.rpc_addr); + } + } +} + +int +spdk_app_start(struct spdk_app_opts *opts, spdk_msg_fn start_fn, + void *arg1) +{ + struct spdk_conf *config = NULL; + int rc; + char *tty; + struct spdk_cpuset tmp_cpumask = {}; + static bool g_env_was_setup = false; + + if (!opts) { + SPDK_ERRLOG("opts should not be NULL\n"); + return 1; + } + + if (!start_fn) { + SPDK_ERRLOG("start_fn should not be NULL\n"); + return 1; + } + + tty = ttyname(STDERR_FILENO); + if (opts->print_level > SPDK_LOG_WARN && + isatty(STDERR_FILENO) && + tty && + !strncmp(tty, "/dev/tty", strlen("/dev/tty"))) { + printf("Warning: printing stderr to console terminal without -q option specified.\n"); + printf("Suggest using --silence-noticelog to disable logging to stderr and\n"); + printf("monitor syslog, or redirect stderr to a file.\n"); + printf("(Delaying for 10 seconds...)\n"); + sleep(10); + } + + spdk_log_set_print_level(opts->print_level); + +#ifndef SPDK_NO_RLIMIT + if (opts->enable_coredump) { + struct rlimit core_limits; + + core_limits.rlim_cur = core_limits.rlim_max = SPDK_APP_DEFAULT_CORE_LIMIT; + setrlimit(RLIMIT_CORE, &core_limits); + } +#endif + + config = app_setup_conf(opts->config_file); + if (config == NULL) { + return 1; + } + + if (app_read_config_file_global_params(opts) < 0) { + spdk_conf_free(config); + return 1; + } + + memset(&g_spdk_app, 0, sizeof(g_spdk_app)); + g_spdk_app.config = config; + g_spdk_app.json_config_file = opts->json_config_file; + g_spdk_app.json_config_ignore_errors = opts->json_config_ignore_errors; + g_spdk_app.rpc_addr = opts->rpc_addr; + g_spdk_app.shm_id = opts->shm_id; + g_spdk_app.shutdown_cb = opts->shutdown_cb; + g_spdk_app.rc = 0; + + spdk_log_set_level(SPDK_APP_DEFAULT_LOG_LEVEL); + + /* Pass NULL to app_setup_env if SPDK app has been set up, in order to + * indicate that this is a reinitialization. + */ + if (app_setup_env(g_env_was_setup ? NULL : opts) < 0) { + return 1; + } + + spdk_log_open(opts->log); + SPDK_NOTICELOG("Total cores available: %d\n", spdk_env_get_core_count()); + + /* + * If mask not specified on command line or in configuration file, + * reactor_mask will be 0x1 which will enable core 0 to run one + * reactor. + */ + if ((rc = spdk_reactors_init()) != 0) { + SPDK_ERRLOG("Reactor Initilization failed: rc = %d\n", rc); + return 1; + } + + spdk_cpuset_set_cpu(&tmp_cpumask, spdk_env_get_current_core(), true); + + /* Now that the reactors have been initialized, we can create an + * initialization thread. */ + g_app_thread = spdk_thread_create("app_thread", &tmp_cpumask); + if (!g_app_thread) { + SPDK_ERRLOG("Unable to create an spdk_thread for initialization\n"); + return 1; + } + + /* + * Note the call to app_setup_trace() is located here + * ahead of app_setup_signal_handlers(). + * That's because there is not an easy/direct clean + * way of unwinding alloc'd resources that can occur + * in app_setup_signal_handlers(). 
+ */ + if (app_setup_trace(opts) != 0) { + return 1; + } + + if ((rc = app_setup_signal_handlers(opts)) != 0) { + return 1; + } + + g_delay_subsystem_init = opts->delay_subsystem_init; + g_start_fn = start_fn; + g_start_arg = arg1; + + spdk_thread_send_msg(g_app_thread, bootstrap_fn, NULL); + + /* This blocks until spdk_app_stop is called */ + spdk_reactors_start(); + + g_env_was_setup = true; + + return g_spdk_app.rc; +} + +void +spdk_app_fini(void) +{ + spdk_trace_cleanup(); + spdk_reactors_fini(); + spdk_env_fini(); + spdk_conf_free(g_spdk_app.config); + spdk_log_close(); +} + +static void +app_stop(void *arg1) +{ + spdk_rpc_finish(); + spdk_subsystem_fini(spdk_reactors_stop, NULL); +} + +void +spdk_app_stop(int rc) +{ + if (rc) { + SPDK_WARNLOG("spdk_app_stop'd on non-zero\n"); + } + g_spdk_app.rc = rc; + /* + * We want to run spdk_subsystem_fini() from the same thread where spdk_subsystem_init() + * was called. + */ + spdk_thread_send_msg(g_app_thread, app_stop, NULL); +} + +static void +usage(void (*app_usage)(void)) +{ + printf("%s [options]\n", g_executable_name); + printf("options:\n"); + printf(" -c, --config <config> config file (default %s)\n", + g_default_opts.config_file != NULL ? g_default_opts.config_file : "none"); + printf(" --json <config> JSON config file (default %s)\n", + g_default_opts.json_config_file != NULL ? g_default_opts.json_config_file : "none"); + printf(" --json-ignore-init-errors\n"); + printf(" don't exit on invalid config entry\n"); + printf(" -d, --limit-coredump do not set max coredump size to RLIM_INFINITY\n"); + printf(" -g, --single-file-segments\n"); + printf(" force creating just one hugetlbfs file\n"); + printf(" -h, --help show this usage\n"); + printf(" -i, --shm-id <id> shared memory ID (optional)\n"); + printf(" -m, --cpumask <mask> core mask for DPDK\n"); + printf(" -n, --mem-channels <num> channel number of memory channels used for DPDK\n"); + printf(" -p, --master-core <id> master (primary) core for DPDK\n"); + printf(" -r, --rpc-socket <path> RPC listen address (default %s)\n", SPDK_DEFAULT_RPC_ADDR); + printf(" -s, --mem-size <size> memory size in MB for DPDK (default: "); +#ifndef __linux__ + if (g_default_opts.mem_size <= 0) { + printf("all hugepage memory)\n"); + } else +#endif + { + printf("%dMB)\n", g_default_opts.mem_size >= 0 ? g_default_opts.mem_size : 0); + } + printf(" --silence-noticelog disable notice level logging to stderr\n"); + printf(" -u, --no-pci disable PCI access\n"); + printf(" --wait-for-rpc wait for RPCs to initialize subsystems\n"); + printf(" --max-delay <num> maximum reactor delay (in microseconds)\n"); + printf(" -B, --pci-blacklist <bdf>\n"); + printf(" pci addr to blacklist (can be used more than once)\n"); + printf(" -R, --huge-unlink unlink huge files after initialization\n"); + printf(" -v, --version print SPDK version\n"); + printf(" -W, --pci-whitelist <bdf>\n"); + printf(" pci addr to whitelist (-B and -W cannot be used at the same time)\n"); + printf(" --huge-dir <path> use a specific hugetlbfs mount to reserve memory from\n"); + printf(" --iova-mode <pa/va> set IOVA mode ('pa' for IOVA_PA and 'va' for IOVA_VA)\n"); + printf(" --base-virtaddr <addr> the base virtual address for DPDK (default: 0x200000000000)\n"); + printf(" --num-trace-entries <num> number of trace entries for each core, must be power of 2. 
(default %d)\n", + SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES); + spdk_log_usage(stdout, "-L"); + spdk_trace_mask_usage(stdout, "-e"); + if (app_usage) { + app_usage(); + } +} + +spdk_app_parse_args_rvals_t +spdk_app_parse_args(int argc, char **argv, struct spdk_app_opts *opts, + const char *app_getopt_str, struct option *app_long_opts, + int (*app_parse)(int ch, char *arg), + void (*app_usage)(void)) +{ + int ch, rc, opt_idx, global_long_opts_len, app_long_opts_len; + struct option *cmdline_options; + char *cmdline_short_opts = NULL; + enum spdk_app_parse_args_rvals retval = SPDK_APP_PARSE_ARGS_FAIL; + long int tmp; + + memcpy(&g_default_opts, opts, sizeof(g_default_opts)); + + if (opts->config_file && access(opts->config_file, R_OK) != 0) { + SPDK_WARNLOG("Can't read legacy configuration file '%s'\n", opts->config_file); + opts->config_file = NULL; + } + + if (opts->json_config_file && access(opts->json_config_file, R_OK) != 0) { + SPDK_WARNLOG("Can't read JSON configuration file '%s'\n", opts->json_config_file); + opts->json_config_file = NULL; + } + + if (app_long_opts == NULL) { + app_long_opts_len = 0; + } else { + for (app_long_opts_len = 0; + app_long_opts[app_long_opts_len].name != NULL; + app_long_opts_len++); + } + + global_long_opts_len = SPDK_COUNTOF(g_cmdline_options); + + cmdline_options = calloc(global_long_opts_len + app_long_opts_len + 1, sizeof(*cmdline_options)); + if (!cmdline_options) { + SPDK_ERRLOG("Out of memory\n"); + return SPDK_APP_PARSE_ARGS_FAIL; + } + + memcpy(&cmdline_options[0], g_cmdline_options, sizeof(g_cmdline_options)); + if (app_long_opts) { + memcpy(&cmdline_options[global_long_opts_len], app_long_opts, + app_long_opts_len * sizeof(*app_long_opts)); + } + + if (app_getopt_str != NULL) { + ch = app_opts_validate(app_getopt_str); + if (ch) { + SPDK_ERRLOG("Duplicated option '%c' between the generic and application specific spdk opts.\n", + ch); + goto out; + } + } + + cmdline_short_opts = spdk_sprintf_alloc("%s%s", app_getopt_str, SPDK_APP_GETOPT_STRING); + if (!cmdline_short_opts) { + SPDK_ERRLOG("Out of memory\n"); + goto out; + } + + g_executable_name = argv[0]; + + while ((ch = getopt_long(argc, argv, cmdline_short_opts, cmdline_options, &opt_idx)) != -1) { + switch (ch) { + case CONFIG_FILE_OPT_IDX: + opts->config_file = optarg; + break; + case JSON_CONFIG_OPT_IDX: + opts->json_config_file = optarg; + break; + case JSON_CONFIG_IGNORE_INIT_ERRORS_IDX: + opts->json_config_ignore_errors = true; + break; + case LIMIT_COREDUMP_OPT_IDX: + opts->enable_coredump = false; + break; + case TPOINT_GROUP_MASK_OPT_IDX: + opts->tpoint_group_mask = optarg; + break; + case SINGLE_FILE_SEGMENTS_OPT_IDX: + opts->hugepage_single_segments = true; + break; + case HELP_OPT_IDX: + usage(app_usage); + retval = SPDK_APP_PARSE_ARGS_HELP; + goto out; + case SHM_ID_OPT_IDX: + opts->shm_id = spdk_strtol(optarg, 0); + if (opts->shm_id < 0) { + SPDK_ERRLOG("Invalid shared memory ID %s\n", optarg); + goto out; + } + break; + case CPUMASK_OPT_IDX: + opts->reactor_mask = optarg; + break; + case MEM_CHANNELS_OPT_IDX: + opts->mem_channel = spdk_strtol(optarg, 0); + if (opts->mem_channel < 0) { + SPDK_ERRLOG("Invalid memory channel %s\n", optarg); + goto out; + } + break; + case MASTER_CORE_OPT_IDX: + opts->master_core = spdk_strtol(optarg, 0); + if (opts->master_core < 0) { + SPDK_ERRLOG("Invalid master core %s\n", optarg); + goto out; + } + break; + case SILENCE_NOTICELOG_OPT_IDX: + opts->print_level = SPDK_LOG_WARN; + break; + case RPC_SOCKET_OPT_IDX: + opts->rpc_addr = optarg; + break; + 
case MEM_SIZE_OPT_IDX: { + uint64_t mem_size_mb; + bool mem_size_has_prefix; + + rc = spdk_parse_capacity(optarg, &mem_size_mb, &mem_size_has_prefix); + if (rc != 0) { + SPDK_ERRLOG("invalid memory pool size `-s %s`\n", optarg); + usage(app_usage); + goto out; + } + + if (mem_size_has_prefix) { + /* the mem size is in MB by default, so if a prefix was + * specified, we need to manually convert to MB. + */ + mem_size_mb /= 1024 * 1024; + } + + if (mem_size_mb > INT_MAX) { + SPDK_ERRLOG("invalid memory pool size `-s %s`\n", optarg); + usage(app_usage); + goto out; + } + + opts->mem_size = (int) mem_size_mb; + break; + } + case NO_PCI_OPT_IDX: + opts->no_pci = true; + break; + case WAIT_FOR_RPC_OPT_IDX: + opts->delay_subsystem_init = true; + break; + case PCI_BLACKLIST_OPT_IDX: + if (opts->pci_whitelist) { + free(opts->pci_whitelist); + opts->pci_whitelist = NULL; + SPDK_ERRLOG("-B and -W cannot be used at the same time\n"); + usage(app_usage); + goto out; + } + + rc = app_opts_add_pci_addr(opts, &opts->pci_blacklist, optarg); + if (rc != 0) { + free(opts->pci_blacklist); + opts->pci_blacklist = NULL; + goto out; + } + break; + case LOGFLAG_OPT_IDX: +#ifndef DEBUG + SPDK_ERRLOG("%s must be configured with --enable-debug for -L flag\n", + argv[0]); + usage(app_usage); + goto out; +#else + rc = spdk_log_set_flag(optarg); + if (rc < 0) { + SPDK_ERRLOG("unknown flag\n"); + usage(app_usage); + goto out; + } + opts->print_level = SPDK_LOG_DEBUG; + break; +#endif + case HUGE_UNLINK_OPT_IDX: + opts->unlink_hugepage = true; + break; + case PCI_WHITELIST_OPT_IDX: + if (opts->pci_blacklist) { + free(opts->pci_blacklist); + opts->pci_blacklist = NULL; + SPDK_ERRLOG("-B and -W cannot be used at the same time\n"); + usage(app_usage); + goto out; + } + + rc = app_opts_add_pci_addr(opts, &opts->pci_whitelist, optarg); + if (rc != 0) { + free(opts->pci_whitelist); + opts->pci_whitelist = NULL; + goto out; + } + break; + case BASE_VIRTADDR_OPT_IDX: + tmp = spdk_strtoll(optarg, 0); + if (tmp <= 0) { + SPDK_ERRLOG("Invalid base-virtaddr %s\n", optarg); + usage(app_usage); + goto out; + } + opts->base_virtaddr = (uint64_t)tmp; + break; + case HUGE_DIR_OPT_IDX: + opts->hugedir = optarg; + break; + case IOVA_MODE_OPT_IDX: + opts->iova_mode = optarg; + break; + case NUM_TRACE_ENTRIES_OPT_IDX: + tmp = spdk_strtoll(optarg, 0); + if (tmp <= 0) { + SPDK_ERRLOG("Invalid num-trace-entries %s\n", optarg); + usage(app_usage); + goto out; + } + opts->num_entries = (uint64_t)tmp; + if (opts->num_entries & (opts->num_entries - 1)) { + SPDK_ERRLOG("num-trace-entries must be power of 2\n"); + usage(app_usage); + goto out; + } + break; + case MAX_REACTOR_DELAY_OPT_IDX: + SPDK_ERRLOG("Deprecation warning: The maximum allowed latency parameter is no longer supported.\n"); + break; + case VERSION_OPT_IDX: + printf(SPDK_VERSION_STRING"\n"); + retval = SPDK_APP_PARSE_ARGS_HELP; + goto out; + case '?': + /* + * In the event getopt() above detects an option + * in argv that is NOT in the getopt_str, + * getopt() will return a '?' indicating failure. 
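+	 * For example, an unrecognized flag such as --no-such-option lands here:
+	 * the usage text is printed and parsing fails, since retval still holds
+	 * its initial value of SPDK_APP_PARSE_ARGS_FAIL when we jump to out.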
+ */ + usage(app_usage); + goto out; + default: + rc = app_parse(ch, optarg); + if (rc) { + SPDK_ERRLOG("Parsing application specific arguments failed: %d\n", rc); + goto out; + } + } + } + + if (opts->config_file && opts->json_config_file) { + SPDK_ERRLOG("ERROR: Legacy config and JSON config can't be used together.\n"); + goto out; + } + + if (opts->json_config_file && opts->delay_subsystem_init) { + SPDK_ERRLOG("ERROR: JSON configuration file can't be used together with --wait-for-rpc.\n"); + goto out; + } + + /* TBD: Replace warning by failure when RPCs for startup are prepared. */ + if (opts->config_file && opts->delay_subsystem_init) { + fprintf(stderr, + "WARNING: --wait-for-rpc and config file are used at the same time. " + "- Please be careful one options might overwrite others.\n"); + } + + retval = SPDK_APP_PARSE_ARGS_SUCCESS; +out: + if (retval != SPDK_APP_PARSE_ARGS_SUCCESS) { + free(opts->pci_blacklist); + opts->pci_blacklist = NULL; + free(opts->pci_whitelist); + opts->pci_whitelist = NULL; + } + free(cmdline_short_opts); + free(cmdline_options); + return retval; +} + +void +spdk_app_usage(void) +{ + if (g_executable_name == NULL) { + SPDK_ERRLOG("%s not valid before calling spdk_app_parse_args()\n", __func__); + return; + } + + usage(NULL); +} + +static void +rpc_framework_start_init_cpl(int rc, void *arg1) +{ + struct spdk_jsonrpc_request *request = arg1; + struct spdk_json_write_ctx *w; + + assert(spdk_get_thread() == g_app_thread); + + if (rc) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "framework_initialization failed"); + return; + } + + spdk_rpc_set_state(SPDK_RPC_RUNTIME); + app_start_application(); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_framework_start_init(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "framework_start_init requires no parameters"); + return; + } + + spdk_subsystem_init(rpc_framework_start_init_cpl, request); +} +SPDK_RPC_REGISTER("framework_start_init", rpc_framework_start_init, SPDK_RPC_STARTUP) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(framework_start_init, start_subsystem_init) + +struct subsystem_init_poller_ctx { + struct spdk_poller *init_poller; + struct spdk_jsonrpc_request *request; +}; + +static int +rpc_subsystem_init_poller_ctx(void *ctx) +{ + struct spdk_json_write_ctx *w; + struct subsystem_init_poller_ctx *poller_ctx = ctx; + + if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) { + w = spdk_jsonrpc_begin_result(poller_ctx->request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(poller_ctx->request, w); + spdk_poller_unregister(&poller_ctx->init_poller); + free(poller_ctx); + } + + return SPDK_POLLER_BUSY; +} + +static void +rpc_framework_wait_init(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct subsystem_init_poller_ctx *ctx; + + if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) { + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } else { + ctx = malloc(sizeof(struct subsystem_init_poller_ctx)); + if (ctx == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to allocate memory for the request context\n"); + return; + } + ctx->request = request; + ctx->init_poller = 
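+	/*
+	 * Illustrative JSON-RPC 2.0 exchange for the two methods registered in this
+	 * file (the payloads shown are examples, not tied to any particular client):
+	 *
+	 *     -> {"jsonrpc": "2.0", "id": 1, "method": "framework_start_init"}
+	 *     <- {"jsonrpc": "2.0", "id": 1, "result": true}
+	 *
+	 *     -> {"jsonrpc": "2.0", "id": 2, "method": "framework_wait_init"}
+	 *     <- {"jsonrpc": "2.0", "id": 2, "result": true}
+	 *
+	 * framework_start_init accepts no parameters and only answers once subsystem
+	 * initialization has finished; framework_wait_init may be called in either
+	 * RPC state and, if initialization is still in progress, defers its response
+	 * via the poller registered just below until the state reaches SPDK_RPC_RUNTIME.
+	 */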
SPDK_POLLER_REGISTER(rpc_subsystem_init_poller_ctx, ctx, 0); + } +} +SPDK_RPC_REGISTER("framework_wait_init", rpc_framework_wait_init, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(framework_wait_init, wait_subsystem_init) diff --git a/src/spdk/lib/event/json_config.c b/src/spdk/lib/event/json_config.c new file mode 100644 index 000000000..69a95097a --- /dev/null +++ b/src/spdk/lib/event/json_config.c @@ -0,0 +1,630 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/util.h" +#include "spdk/file.h" +#include "spdk/log.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/jsonrpc.h" +#include "spdk/rpc.h" + +#include "spdk_internal/event.h" +#include "spdk_internal/log.h" + +#define SPDK_DEBUG_APP_CFG(...) SPDK_DEBUGLOG(SPDK_LOG_APP_CONFIG, __VA_ARGS__) + +/* JSON configuration format is as follows + * + * { + * "subsystems" : [ <<== *subsystems JSON array + * { <<== *subsystems_it array entry pointer (iterator) + * "subsystem": "<< SUBSYSTEM NAME >>", + * "config": [ <<== *config JSON array + * { <<== *config_it array entry pointer (iterator) + * "method": "<< METHOD NAME >>", <<== *method + * "params": { << PARAMS >> } <<== *params + * }, + * << MORE "config" ARRY ENTRIES >> + * ] + * }, + * << MORE "subsystems" ARRAY ENTRIES >> + * ] + * + * << ANYTHING ELSE IS IGNORRED IN ROOT OBJECT>> + * } + * + */ + +struct load_json_config_ctx; +typedef void (*client_resp_handler)(struct load_json_config_ctx *, + struct spdk_jsonrpc_client_response *); + +#define RPC_SOCKET_PATH_MAX sizeof(((struct sockaddr_un *)0)->sun_path) + +/* 1s connections timeout */ +#define RPC_CLIENT_CONNECT_TIMEOUT_US (1U * 1000U * 1000U) + +/* + * Currently there is no timeout in SPDK for any RPC command. This result that + * we can't put a hard limit during configuration load as it most likely randomly fail. + * So just print WARNLOG every 10s. 
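+ * Each outstanding request's deadline is tracked in TSC ticks; see
+ * rpc_client_set_timeout() below, which computes roughly
+ *   deadline = spdk_get_ticks() + timeout_us * spdk_get_ticks_hz() / 1000000.
+ * While waiting for a response, rpc_client_check_timeout() only emits the
+ * warning and the poller re-arms the deadline instead of failing the load.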
*/ +#define RPC_CLIENT_REQUEST_TIMEOUT_US (10U * 1000 * 1000) + +struct load_json_config_ctx { + /* Thread used during configuration. */ + struct spdk_thread *thread; + spdk_subsystem_init_fn cb_fn; + void *cb_arg; + bool stop_on_error; + + /* Current subsystem */ + struct spdk_json_val *subsystems; /* "subsystems" array */ + struct spdk_json_val *subsystems_it; /* current subsystem array position in "subsystems" array */ + + struct spdk_json_val *subsystem_name; /* current subsystem name */ + + /* Current "config" entry we are processing */ + struct spdk_json_val *config; /* "config" array */ + struct spdk_json_val *config_it; /* current config position in "config" array */ + + /* Current request id we are sending. */ + uint32_t rpc_request_id; + + /* Whole configuration file read and parsed. */ + size_t json_data_size; + char *json_data; + + size_t values_cnt; + struct spdk_json_val *values; + + char rpc_socket_path_temp[RPC_SOCKET_PATH_MAX + 1]; + + struct spdk_jsonrpc_client *client_conn; + struct spdk_poller *client_conn_poller; + + client_resp_handler client_resp_cb; + + /* Timeout for current RPC client action. */ + uint64_t timeout; +}; + +static void app_json_config_load_subsystem(void *_ctx); + +static void +app_json_config_load_done(struct load_json_config_ctx *ctx, int rc) +{ + spdk_poller_unregister(&ctx->client_conn_poller); + if (ctx->client_conn != NULL) { + spdk_jsonrpc_client_close(ctx->client_conn); + } + + spdk_rpc_finish(); + + SPDK_DEBUG_APP_CFG("Config load finished with rc %d\n", rc); + ctx->cb_fn(rc, ctx->cb_arg); + + free(ctx->json_data); + free(ctx->values); + free(ctx); +} + +static void +rpc_client_set_timeout(struct load_json_config_ctx *ctx, uint64_t timeout_us) +{ + ctx->timeout = spdk_get_ticks() + timeout_us * spdk_get_ticks_hz() / (1000 * 1000); +} + +static int +rpc_client_check_timeout(struct load_json_config_ctx *ctx) +{ + if (ctx->timeout < spdk_get_ticks()) { + SPDK_WARNLOG("RPC client command timeout.\n"); + return -ETIMEDOUT; + } + + return 0; +} + +struct json_write_buf { + char data[1024]; + unsigned cur_off; +}; + +static int +json_write_stdout(void *cb_ctx, const void *data, size_t size) +{ + struct json_write_buf *buf = cb_ctx; + size_t rc; + + rc = snprintf(buf->data + buf->cur_off, sizeof(buf->data) - buf->cur_off, + "%s", (const char *)data); + if (rc > 0) { + buf->cur_off += rc; + } + return rc == size ? 
0 : -1; +} + +static int +rpc_client_poller(void *arg) +{ + struct load_json_config_ctx *ctx = arg; + struct spdk_jsonrpc_client_response *resp; + client_resp_handler cb; + int rc; + + assert(spdk_get_thread() == ctx->thread); + + rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0); + if (rc == 0) { + rc = rpc_client_check_timeout(ctx); + if (rc == -ETIMEDOUT) { + rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US); + rc = 0; + } + } + + if (rc == 0) { + /* No response yet */ + return SPDK_POLLER_BUSY; + } else if (rc < 0) { + app_json_config_load_done(ctx, rc); + return SPDK_POLLER_BUSY; + } + + resp = spdk_jsonrpc_client_get_response(ctx->client_conn); + assert(resp); + + if (resp->error) { + struct json_write_buf buf = {}; + struct spdk_json_write_ctx *w = spdk_json_write_begin(json_write_stdout, + &buf, SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); + + if (w == NULL) { + SPDK_ERRLOG("error response: (?)\n"); + } else { + spdk_json_write_val(w, resp->error); + spdk_json_write_end(w); + SPDK_ERRLOG("error response: \n%s\n", buf.data); + } + } + + if (resp->error && ctx->stop_on_error) { + spdk_jsonrpc_client_free_response(resp); + app_json_config_load_done(ctx, -EINVAL); + } else { + /* We have response so we must have callback for it. */ + cb = ctx->client_resp_cb; + assert(cb != NULL); + + /* Mark we are done with this handler. */ + ctx->client_resp_cb = NULL; + cb(ctx, resp); + } + + + return SPDK_POLLER_BUSY; +} + +static int +rpc_client_connect_poller(void *_ctx) +{ + struct load_json_config_ctx *ctx = _ctx; + int rc; + + rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0); + if (rc != -ENOTCONN) { + /* We are connected. Start regular poller and issue first request */ + spdk_poller_unregister(&ctx->client_conn_poller); + ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_poller, ctx, 100); + app_json_config_load_subsystem(ctx); + } else { + rc = rpc_client_check_timeout(ctx); + if (rc) { + app_json_config_load_done(ctx, rc); + } + + return SPDK_POLLER_IDLE; + } + + return SPDK_POLLER_BUSY; +} + +static int +client_send_request(struct load_json_config_ctx *ctx, struct spdk_jsonrpc_client_request *request, + client_resp_handler client_resp_cb) +{ + int rc; + + assert(spdk_get_thread() == ctx->thread); + + ctx->client_resp_cb = client_resp_cb; + rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US); + rc = spdk_jsonrpc_client_send_request(ctx->client_conn, request); + + if (rc) { + SPDK_DEBUG_APP_CFG("Sending request to client failed (%d)\n", rc); + } + + return rc; +} + +static int +cap_string(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (val->type != SPDK_JSON_VAL_STRING) { + return -EINVAL; + } + + *vptr = val; + return 0; +} + +static int +cap_object(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (val->type != SPDK_JSON_VAL_OBJECT_BEGIN) { + return -EINVAL; + } + + *vptr = val; + return 0; +} + + +static int +cap_array_or_null(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (val->type != SPDK_JSON_VAL_ARRAY_BEGIN && val->type != SPDK_JSON_VAL_NULL) { + return -EINVAL; + } + + *vptr = val; + return 0; +} + +struct config_entry { + char *method; + struct spdk_json_val *params; +}; + +static struct spdk_json_object_decoder jsonrpc_cmd_decoders[] = { + {"method", offsetof(struct config_entry, method), spdk_json_decode_string}, + {"params", offsetof(struct config_entry, params), cap_object, true} +}; + +static void 
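+/*
+ * For illustration, given a "config" array entry such as
+ *
+ *     { "method": "bdev_malloc_create", "params": { "name": "Malloc0", ... } }
+ *
+ * spdk_json_decode_object() with the table above leaves cfg.method pointing at
+ * a heap-allocated copy of the method name (freed after the entry is handled)
+ * and cfg.params pointing at the OBJECT_BEGIN token of "params" inside the
+ * parsed value array; "params" is marked optional in the decoder table, so it
+ * may be absent. The method name here is only an example.
+ */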
app_json_config_load_subsystem_config_entry(void *_ctx); + +static void +app_json_config_load_subsystem_config_entry_next(struct load_json_config_ctx *ctx, + struct spdk_jsonrpc_client_response *resp) +{ + /* Don't care about the response */ + spdk_jsonrpc_client_free_response(resp); + + ctx->config_it = spdk_json_next(ctx->config_it); + app_json_config_load_subsystem_config_entry(ctx); +} + +/* Load "config" entry */ +static void +app_json_config_load_subsystem_config_entry(void *_ctx) +{ + struct load_json_config_ctx *ctx = _ctx; + struct spdk_jsonrpc_client_request *rpc_request; + struct spdk_json_write_ctx *w; + struct config_entry cfg = {}; + struct spdk_json_val *params_end; + size_t params_len; + int rc; + + if (ctx->config_it == NULL) { + SPDK_DEBUG_APP_CFG("Subsystem '%.*s': configuration done.\n", ctx->subsystem_name->len, + (char *)ctx->subsystem_name->start); + ctx->subsystems_it = spdk_json_next(ctx->subsystems_it); + /* Invoke later to avoid recurrency */ + spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem, ctx); + return; + } + + if (spdk_json_decode_object(ctx->config_it, jsonrpc_cmd_decoders, + SPDK_COUNTOF(jsonrpc_cmd_decoders), &cfg)) { + params_end = spdk_json_next(ctx->config_it); + assert(params_end != NULL); + params_len = params_end->start - ctx->config->start + 1; + SPDK_ERRLOG("Failed to decode config entry: %.*s!\n", (int)params_len, (char *)ctx->config_it); + app_json_config_load_done(ctx, -EINVAL); + goto out; + } + + rc = spdk_rpc_is_method_allowed(cfg.method, spdk_rpc_get_state()); + if (rc == -EPERM) { + SPDK_DEBUG_APP_CFG("Method '%s' not allowed -> skipping\n", cfg.method); + /* Invoke later to avoid recurrency */ + ctx->config_it = spdk_json_next(ctx->config_it); + spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem_config_entry, ctx); + goto out; + } + + /* Get _END by skipping params and going back by one element. */ + params_end = cfg.params + spdk_json_val_len(cfg.params) - 1; + + /* Need to add one character to include '}' */ + params_len = params_end->start - cfg.params->start + 1; + + SPDK_DEBUG_APP_CFG("\tmethod: %s\n", cfg.method); + SPDK_DEBUG_APP_CFG("\tparams: %.*s\n", (int)params_len, (char *)cfg.params->start); + + rpc_request = spdk_jsonrpc_client_create_request(); + if (!rpc_request) { + app_json_config_load_done(ctx, -errno); + goto out; + } + + w = spdk_jsonrpc_begin_request(rpc_request, ctx->rpc_request_id, NULL); + if (!w) { + spdk_jsonrpc_client_free_request(rpc_request); + app_json_config_load_done(ctx, -ENOMEM); + goto out; + } + + spdk_json_write_named_string(w, "method", cfg.method); + + /* No need to parse "params". Just dump the whole content of "params" + * directly into the request and let the remote side verify it. */ + spdk_json_write_name(w, "params"); + spdk_json_write_val_raw(w, cfg.params->start, params_len); + spdk_jsonrpc_end_request(rpc_request, w); + + rc = client_send_request(ctx, rpc_request, app_json_config_load_subsystem_config_entry_next); + if (rc != 0) { + app_json_config_load_done(ctx, -rc); + goto out; + } +out: + free(cfg.method); +} + +static void +subsystem_init_done(int rc, void *arg1) +{ + struct load_json_config_ctx *ctx = arg1; + + if (rc) { + app_json_config_load_done(ctx, rc); + return; + } + + spdk_rpc_set_state(SPDK_RPC_RUNTIME); + /* Another round. 
This time for RUNTIME methods */ + SPDK_DEBUG_APP_CFG("'framework_start_init' done - continuing configuration\n"); + + assert(ctx != NULL); + if (ctx->subsystems) { + ctx->subsystems_it = spdk_json_array_first(ctx->subsystems); + } + + app_json_config_load_subsystem(ctx); +} + +static struct spdk_json_object_decoder subsystem_decoders[] = { + {"subsystem", offsetof(struct load_json_config_ctx, subsystem_name), cap_string}, + {"config", offsetof(struct load_json_config_ctx, config), cap_array_or_null} +}; + +/* + * Start loading subsystem pointed by ctx->subsystems_it. This must point to the + * beginning of the "subsystem" object in "subsystems" array or be NULL. If it is + * NULL then no more subsystems to load. + * + * There are two iterations: + * + * In first iteration only STARTUP RPC methods are used, other methods are ignored. When + * allsubsystems are walked the ctx->subsystems_it became NULL and "framework_start_init" + * is called to let the SPDK move to RUNTIME state (initialize all subsystems) and + * second iteration begins. + * + * In second iteration "subsystems" array is walked through again, this time only + * RUNTIME RPC methods are used. When ctx->subsystems_it became NULL second time it + * indicate that there is no more subsystems to load. The cb_fn is called to finish + * configuration. + */ +static void +app_json_config_load_subsystem(void *_ctx) +{ + struct load_json_config_ctx *ctx = _ctx; + + if (ctx->subsystems_it == NULL) { + if (spdk_rpc_get_state() == SPDK_RPC_STARTUP) { + SPDK_DEBUG_APP_CFG("No more entries for current state, calling 'framework_start_init'\n"); + spdk_subsystem_init(subsystem_init_done, ctx); + } else { + app_json_config_load_done(ctx, 0); + } + + return; + } + + /* Capture subsystem name and config array */ + if (spdk_json_decode_object(ctx->subsystems_it, subsystem_decoders, + SPDK_COUNTOF(subsystem_decoders), ctx)) { + SPDK_ERRLOG("Failed to parse subsystem configuration\n"); + app_json_config_load_done(ctx, -EINVAL); + return; + } + + SPDK_DEBUG_APP_CFG("Loading subsystem '%.*s' configuration\n", ctx->subsystem_name->len, + (char *)ctx->subsystem_name->start); + + /* Get 'config' array first configuration entry */ + ctx->config_it = spdk_json_array_first(ctx->config); + app_json_config_load_subsystem_config_entry(ctx); +} + +static void * +read_file(const char *filename, size_t *size) +{ + FILE *file = fopen(filename, "r"); + void *data; + + if (file == NULL) { + /* errno is set by fopen */ + return NULL; + } + + data = spdk_posix_file_load(file, size); + fclose(file); + return data; +} + +static int +app_json_config_read(const char *config_file, struct load_json_config_ctx *ctx) +{ + struct spdk_json_val *values = NULL; + void *json = NULL, *end; + ssize_t values_cnt, rc; + size_t json_size; + + json = read_file(config_file, &json_size); + if (!json) { + return -errno; + } + + rc = spdk_json_parse(json, json_size, NULL, 0, &end, + SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); + if (rc < 0) { + SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); + goto err; + } + + values_cnt = rc; + values = calloc(values_cnt, sizeof(struct spdk_json_val)); + if (values == NULL) { + SPDK_ERRLOG("Out of memory\n"); + goto err; + } + + rc = spdk_json_parse(json, json_size, values, values_cnt, &end, + SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); + if (rc != values_cnt) { + SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); + goto err; + } + + ctx->json_data = json; + ctx->json_data_size = json_size; + + ctx->values = values; + ctx->values_cnt = 
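+	/*
+	 * For reference, a minimal configuration file accepted by this loader might
+	 * look as follows (the method and its parameters are just an example):
+	 *
+	 *     {
+	 *       "subsystems": [
+	 *         {
+	 *           "subsystem": "bdev",
+	 *           "config": [
+	 *             {
+	 *               "method": "bdev_malloc_create",
+	 *               "params": { "name": "Malloc0", "num_blocks": 16384, "block_size": 512 }
+	 *             }
+	 *           ]
+	 *         }
+	 *       ]
+	 *     }
+	 *
+	 * Comments in the file are tolerated because it is parsed with
+	 * SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS.
+	 */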
values_cnt; + + return 0; +err: + free(json); + free(values); + return rc; +} + +void +spdk_app_json_config_load(const char *json_config_file, const char *rpc_addr, + spdk_subsystem_init_fn cb_fn, void *cb_arg, + bool stop_on_error) +{ + struct load_json_config_ctx *ctx = calloc(1, sizeof(*ctx)); + int rc; + + assert(cb_fn); + if (!ctx) { + cb_fn(-ENOMEM, cb_arg); + return; + } + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->stop_on_error = stop_on_error; + ctx->thread = spdk_get_thread(); + + rc = app_json_config_read(json_config_file, ctx); + if (rc) { + goto fail; + } + + /* Capture subsystems array */ + rc = spdk_json_find_array(ctx->values, "subsystems", NULL, &ctx->subsystems); + if (rc) { + SPDK_WARNLOG("No 'subsystems' key JSON configuration file.\n"); + } else { + /* Get first subsystem */ + ctx->subsystems_it = spdk_json_array_first(ctx->subsystems); + if (ctx->subsystems_it == NULL) { + SPDK_NOTICELOG("'subsystems' configuration is empty\n"); + } + } + + /* If rpc_addr is not an Unix socket use default address as prefix. */ + if (rpc_addr == NULL || rpc_addr[0] != '/') { + rpc_addr = SPDK_DEFAULT_RPC_ADDR; + } + + /* FIXME: rpc client should use socketpair() instead of this temporary socket nonsense */ + rc = snprintf(ctx->rpc_socket_path_temp, sizeof(ctx->rpc_socket_path_temp), "%s.%d_config", + rpc_addr, getpid()); + if (rc >= (int)sizeof(ctx->rpc_socket_path_temp)) { + SPDK_ERRLOG("Socket name create failed\n"); + goto fail; + } + + /* FIXME: spdk_rpc_initialize() function should return error code. */ + spdk_rpc_initialize(ctx->rpc_socket_path_temp); + ctx->client_conn = spdk_jsonrpc_client_connect(ctx->rpc_socket_path_temp, AF_UNIX); + if (ctx->client_conn == NULL) { + SPDK_ERRLOG("Failed to connect to '%s'\n", ctx->rpc_socket_path_temp); + goto fail; + } + + rpc_client_set_timeout(ctx, RPC_CLIENT_CONNECT_TIMEOUT_US); + ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_connect_poller, ctx, 100); + return; + +fail: + app_json_config_load_done(ctx, -EINVAL); +} + +SPDK_LOG_REGISTER_COMPONENT("app_config", SPDK_LOG_APP_CONFIG) diff --git a/src/spdk/lib/event/reactor.c b/src/spdk/lib/event/reactor.c new file mode 100644 index 000000000..cda4a32b1 --- /dev/null +++ b/src/spdk/lib/event/reactor.c @@ -0,0 +1,664 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/likely.h" + +#include "spdk_internal/event.h" +#include "spdk_internal/log.h" +#include "spdk_internal/thread.h" + +#include "spdk/log.h" +#include "spdk/thread.h" +#include "spdk/env.h" +#include "spdk/util.h" + +#ifdef __linux__ +#include <sys/prctl.h> +#endif + +#ifdef __FreeBSD__ +#include <pthread_np.h> +#endif + +#define SPDK_EVENT_BATCH_SIZE 8 + +static struct spdk_reactor *g_reactors; +static struct spdk_cpuset g_reactor_core_mask; +static enum spdk_reactor_state g_reactor_state = SPDK_REACTOR_STATE_UNINITIALIZED; + +static bool g_framework_context_switch_monitor_enabled = true; + +static struct spdk_mempool *g_spdk_event_mempool = NULL; + +static void +reactor_construct(struct spdk_reactor *reactor, uint32_t lcore) +{ + reactor->lcore = lcore; + reactor->flags.is_valid = true; + + TAILQ_INIT(&reactor->threads); + reactor->thread_count = 0; + + reactor->events = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY); + assert(reactor->events != NULL); +} + +struct spdk_reactor * +spdk_reactor_get(uint32_t lcore) +{ + struct spdk_reactor *reactor; + + if (g_reactors == NULL) { + SPDK_WARNLOG("Called spdk_reactor_get() while the g_reactors array was NULL!\n"); + return NULL; + } + + reactor = &g_reactors[lcore]; + + if (reactor->flags.is_valid == false) { + return NULL; + } + + return reactor; +} + +static int reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op); +static bool reactor_thread_op_supported(enum spdk_thread_op op); + +int +spdk_reactors_init(void) +{ + int rc; + uint32_t i, last_core; + char mempool_name[32]; + + snprintf(mempool_name, sizeof(mempool_name), "evtpool_%d", getpid()); + g_spdk_event_mempool = spdk_mempool_create(mempool_name, + 262144 - 1, /* Power of 2 minus 1 is optimal for memory consumption */ + sizeof(struct spdk_event), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + + if (g_spdk_event_mempool == NULL) { + SPDK_ERRLOG("spdk_event_mempool creation failed\n"); + return -1; + } + + /* struct spdk_reactor must be aligned on 64 byte boundary */ + last_core = spdk_env_get_last_core(); + rc = posix_memalign((void **)&g_reactors, 64, + (last_core + 1) * sizeof(struct spdk_reactor)); + if (rc != 0) { + SPDK_ERRLOG("Could not allocate array size=%u for g_reactors\n", + last_core + 1); + spdk_mempool_free(g_spdk_event_mempool); + return -1; + } + + memset(g_reactors, 0, (last_core + 1) * sizeof(struct spdk_reactor)); + + spdk_thread_lib_init_ext(reactor_thread_op, reactor_thread_op_supported, + sizeof(struct spdk_lw_thread)); + + SPDK_ENV_FOREACH_CORE(i) { + reactor_construct(&g_reactors[i], i); + } + + g_reactor_state = SPDK_REACTOR_STATE_INITIALIZED; + + return 0; +} + +void +spdk_reactors_fini(void) +{ + uint32_t i; + struct spdk_reactor *reactor; + + if (g_reactor_state == SPDK_REACTOR_STATE_UNINITIALIZED) { + return; + } + + spdk_thread_lib_fini(); + + SPDK_ENV_FOREACH_CORE(i) { + reactor = spdk_reactor_get(i); + assert(reactor != 
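+	/*
+	 * Sizing note (worked example): if spdk_env_get_last_core() returned 15,
+	 * spdk_reactors_init() allocated a 64-byte aligned array of 16 reactors
+	 * indexed directly by lcore number, including cores outside the app's core
+	 * mask; those unused slots keep flags.is_valid == false, so
+	 * spdk_reactor_get() returns NULL for them.
+	 */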
NULL); + assert(reactor->thread_count == 0); + if (reactor->events != NULL) { + spdk_ring_free(reactor->events); + } + } + + spdk_mempool_free(g_spdk_event_mempool); + + free(g_reactors); + g_reactors = NULL; +} + +struct spdk_event * +spdk_event_allocate(uint32_t lcore, spdk_event_fn fn, void *arg1, void *arg2) +{ + struct spdk_event *event = NULL; + struct spdk_reactor *reactor = spdk_reactor_get(lcore); + + if (!reactor) { + assert(false); + return NULL; + } + + event = spdk_mempool_get(g_spdk_event_mempool); + if (event == NULL) { + assert(false); + return NULL; + } + + event->lcore = lcore; + event->fn = fn; + event->arg1 = arg1; + event->arg2 = arg2; + + return event; +} + +void +spdk_event_call(struct spdk_event *event) +{ + int rc; + struct spdk_reactor *reactor; + + reactor = spdk_reactor_get(event->lcore); + + assert(reactor != NULL); + assert(reactor->events != NULL); + + rc = spdk_ring_enqueue(reactor->events, (void **)&event, 1, NULL); + if (rc != 1) { + assert(false); + } +} + +static inline uint32_t +event_queue_run_batch(struct spdk_reactor *reactor) +{ + unsigned count, i; + void *events[SPDK_EVENT_BATCH_SIZE]; + struct spdk_thread *thread; + struct spdk_lw_thread *lw_thread; + +#ifdef DEBUG + /* + * spdk_ring_dequeue() fills events and returns how many entries it wrote, + * so we will never actually read uninitialized data from events, but just to be sure + * (and to silence a static analyzer false positive), initialize the array to NULL pointers. + */ + memset(events, 0, sizeof(events)); +#endif + + count = spdk_ring_dequeue(reactor->events, events, SPDK_EVENT_BATCH_SIZE); + if (count == 0) { + return 0; + } + + /* Execute the events. There are still some remaining events + * that must occur on an SPDK thread. To accomodate those, try to + * run them on the first thread in the list, if it exists. */ + lw_thread = TAILQ_FIRST(&reactor->threads); + if (lw_thread) { + thread = spdk_thread_get_from_ctx(lw_thread); + } else { + thread = NULL; + } + + spdk_set_thread(thread); + + for (i = 0; i < count; i++) { + struct spdk_event *event = events[i]; + + assert(event != NULL); + event->fn(event->arg1, event->arg2); + } + + spdk_set_thread(NULL); + + spdk_mempool_put_bulk(g_spdk_event_mempool, events, count); + + return count; +} + +/* 1s */ +#define CONTEXT_SWITCH_MONITOR_PERIOD 1000000 + +static int +get_rusage(struct spdk_reactor *reactor) +{ + struct rusage rusage; + + if (getrusage(RUSAGE_THREAD, &rusage) != 0) { + return -1; + } + + if (rusage.ru_nvcsw != reactor->rusage.ru_nvcsw || rusage.ru_nivcsw != reactor->rusage.ru_nivcsw) { + SPDK_INFOLOG(SPDK_LOG_REACTOR, + "Reactor %d: %ld voluntary context switches and %ld involuntary context switches in the last second.\n", + reactor->lcore, rusage.ru_nvcsw - reactor->rusage.ru_nvcsw, + rusage.ru_nivcsw - reactor->rusage.ru_nivcsw); + } + reactor->rusage = rusage; + + return -1; +} + +void +spdk_framework_enable_context_switch_monitor(bool enable) +{ + /* This global is being read by multiple threads, so this isn't + * strictly thread safe. However, we're toggling between true and + * false here, and if a thread sees the value update later than it + * should, it's no big deal. 
*/ + g_framework_context_switch_monitor_enabled = enable; +} + +bool +spdk_framework_context_switch_monitor_enabled(void) +{ + return g_framework_context_switch_monitor_enabled; +} + +static void +_set_thread_name(const char *thread_name) +{ +#if defined(__linux__) + prctl(PR_SET_NAME, thread_name, 0, 0, 0); +#elif defined(__FreeBSD__) + pthread_set_name_np(pthread_self(), thread_name); +#else +#error missing platform support for thread name +#endif +} + +static int _reactor_schedule_thread(struct spdk_thread *thread); +static uint64_t g_rusage_period; + +static void +_reactor_run(struct spdk_reactor *reactor) +{ + struct spdk_thread *thread; + struct spdk_lw_thread *lw_thread, *tmp; + uint64_t now; + int rc; + + event_queue_run_batch(reactor); + + TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) { + thread = spdk_thread_get_from_ctx(lw_thread); + rc = spdk_thread_poll(thread, 0, reactor->tsc_last); + + now = spdk_thread_get_last_tsc(thread); + if (rc == 0) { + reactor->idle_tsc += now - reactor->tsc_last; + } else if (rc > 0) { + reactor->busy_tsc += now - reactor->tsc_last; + } + reactor->tsc_last = now; + + if (spdk_unlikely(lw_thread->resched)) { + lw_thread->resched = false; + TAILQ_REMOVE(&reactor->threads, lw_thread, link); + assert(reactor->thread_count > 0); + reactor->thread_count--; + _reactor_schedule_thread(thread); + continue; + } + + if (spdk_unlikely(spdk_thread_is_exited(thread) && + spdk_thread_is_idle(thread))) { + TAILQ_REMOVE(&reactor->threads, lw_thread, link); + assert(reactor->thread_count > 0); + reactor->thread_count--; + spdk_thread_destroy(thread); + continue; + } + } + + if (g_framework_context_switch_monitor_enabled) { + if ((reactor->last_rusage + g_rusage_period) < reactor->tsc_last) { + get_rusage(reactor); + reactor->last_rusage = reactor->tsc_last; + } + } +} + +static int +reactor_run(void *arg) +{ + struct spdk_reactor *reactor = arg; + struct spdk_thread *thread; + struct spdk_lw_thread *lw_thread, *tmp; + char thread_name[32]; + + SPDK_NOTICELOG("Reactor started on core %u\n", reactor->lcore); + + /* Rename the POSIX thread because the reactor is tied to the POSIX + * thread in the SPDK event library. 
+ */ + snprintf(thread_name, sizeof(thread_name), "reactor_%u", reactor->lcore); + _set_thread_name(thread_name); + + reactor->tsc_last = spdk_get_ticks(); + + while (1) { + _reactor_run(reactor); + + if (g_reactor_state != SPDK_REACTOR_STATE_RUNNING) { + break; + } + } + + TAILQ_FOREACH(lw_thread, &reactor->threads, link) { + thread = spdk_thread_get_from_ctx(lw_thread); + spdk_set_thread(thread); + spdk_thread_exit(thread); + } + + while (!TAILQ_EMPTY(&reactor->threads)) { + TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) { + thread = spdk_thread_get_from_ctx(lw_thread); + spdk_set_thread(thread); + if (spdk_thread_is_exited(thread)) { + TAILQ_REMOVE(&reactor->threads, lw_thread, link); + assert(reactor->thread_count > 0); + reactor->thread_count--; + spdk_thread_destroy(thread); + } else { + spdk_thread_poll(thread, 0, 0); + } + } + } + + return 0; +} + +int +spdk_app_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask) +{ + int ret; + struct spdk_cpuset *validmask; + + ret = spdk_cpuset_parse(cpumask, mask); + if (ret < 0) { + return ret; + } + + validmask = spdk_app_get_core_mask(); + spdk_cpuset_and(cpumask, validmask); + + return 0; +} + +struct spdk_cpuset * +spdk_app_get_core_mask(void) +{ + return &g_reactor_core_mask; +} + +void +spdk_reactors_start(void) +{ + struct spdk_reactor *reactor; + struct spdk_cpuset tmp_cpumask = {}; + uint32_t i, current_core; + int rc; + char thread_name[32]; + + g_rusage_period = (CONTEXT_SWITCH_MONITOR_PERIOD * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC; + g_reactor_state = SPDK_REACTOR_STATE_RUNNING; + + current_core = spdk_env_get_current_core(); + SPDK_ENV_FOREACH_CORE(i) { + if (i != current_core) { + reactor = spdk_reactor_get(i); + if (reactor == NULL) { + continue; + } + + rc = spdk_env_thread_launch_pinned(reactor->lcore, reactor_run, reactor); + if (rc < 0) { + SPDK_ERRLOG("Unable to start reactor thread on core %u\n", reactor->lcore); + assert(false); + return; + } + + /* For now, for each reactor spawn one thread. 
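+	 * The thread is pinned to this core through its cpumask:
+	 * spdk_thread_create() triggers reactor_thread_op(SPDK_THREAD_OP_NEW), which
+	 * runs _reactor_schedule_thread() and places the new thread on a reactor
+	 * whose core is set in that cpumask, here exactly core i.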
*/ + snprintf(thread_name, sizeof(thread_name), "reactor_%u", reactor->lcore); + + spdk_cpuset_zero(&tmp_cpumask); + spdk_cpuset_set_cpu(&tmp_cpumask, i, true); + + spdk_thread_create(thread_name, &tmp_cpumask); + } + spdk_cpuset_set_cpu(&g_reactor_core_mask, i, true); + } + + /* Start the master reactor */ + reactor = spdk_reactor_get(current_core); + assert(reactor != NULL); + reactor_run(reactor); + + spdk_env_thread_wait_all(); + + g_reactor_state = SPDK_REACTOR_STATE_SHUTDOWN; +} + +void +spdk_reactors_stop(void *arg1) +{ + g_reactor_state = SPDK_REACTOR_STATE_EXITING; +} + +static pthread_mutex_t g_scheduler_mtx = PTHREAD_MUTEX_INITIALIZER; +static uint32_t g_next_core = UINT32_MAX; + +static void +_schedule_thread(void *arg1, void *arg2) +{ + struct spdk_lw_thread *lw_thread = arg1; + struct spdk_thread *thread; + struct spdk_cpuset *cpumask; + struct spdk_reactor *reactor; + uint32_t current_core; + + current_core = spdk_env_get_current_core(); + + thread = spdk_thread_get_from_ctx(lw_thread); + cpumask = spdk_thread_get_cpumask(thread); + if (!spdk_cpuset_get_cpu(cpumask, current_core)) { + SPDK_ERRLOG("Thread was scheduled to the wrong core %d\n", current_core); + assert(false); + } + + reactor = spdk_reactor_get(current_core); + assert(reactor != NULL); + + TAILQ_INSERT_TAIL(&reactor->threads, lw_thread, link); + reactor->thread_count++; +} + +static int +_reactor_schedule_thread(struct spdk_thread *thread) +{ + uint32_t core; + struct spdk_lw_thread *lw_thread; + struct spdk_event *evt = NULL; + struct spdk_cpuset *cpumask; + uint32_t i; + + cpumask = spdk_thread_get_cpumask(thread); + + lw_thread = spdk_thread_get_ctx(thread); + assert(lw_thread != NULL); + memset(lw_thread, 0, sizeof(*lw_thread)); + + pthread_mutex_lock(&g_scheduler_mtx); + for (i = 0; i < spdk_env_get_core_count(); i++) { + if (g_next_core > spdk_env_get_last_core()) { + g_next_core = spdk_env_get_first_core(); + } + core = g_next_core; + g_next_core = spdk_env_get_next_core(g_next_core); + + if (spdk_cpuset_get_cpu(cpumask, core)) { + evt = spdk_event_allocate(core, _schedule_thread, lw_thread, NULL); + break; + } + } + pthread_mutex_unlock(&g_scheduler_mtx); + + assert(evt != NULL); + if (evt == NULL) { + SPDK_ERRLOG("Unable to schedule thread on requested core mask.\n"); + return -1; + } + + lw_thread->tsc_start = spdk_get_ticks(); + + spdk_event_call(evt); + + return 0; +} + +static void +_reactor_request_thread_reschedule(struct spdk_thread *thread) +{ + struct spdk_lw_thread *lw_thread; + + assert(thread == spdk_get_thread()); + + lw_thread = spdk_thread_get_ctx(thread); + + assert(lw_thread != NULL); + + lw_thread->resched = true; +} + +static int +reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op) +{ + switch (op) { + case SPDK_THREAD_OP_NEW: + return _reactor_schedule_thread(thread); + case SPDK_THREAD_OP_RESCHED: + _reactor_request_thread_reschedule(thread); + return 0; + default: + return -ENOTSUP; + } +} + +static bool +reactor_thread_op_supported(enum spdk_thread_op op) +{ + switch (op) { + case SPDK_THREAD_OP_NEW: + case SPDK_THREAD_OP_RESCHED: + return true; + default: + return false; + } +} + +struct call_reactor { + uint32_t cur_core; + spdk_event_fn fn; + void *arg1; + void *arg2; + + uint32_t orig_core; + spdk_event_fn cpl; +}; + +static void +on_reactor(void *arg1, void *arg2) +{ + struct call_reactor *cr = arg1; + struct spdk_event *evt; + + cr->fn(cr->arg1, cr->arg2); + + cr->cur_core = spdk_env_get_next_core(cr->cur_core); + + if (cr->cur_core > 
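+	/*
+	 * Sketch of the intended use of spdk_for_each_reactor() (defined below),
+	 * with hypothetical callbacks; fn runs once on every reactor in turn and
+	 * cpl runs back on the originating core after the last reactor:
+	 *
+	 *     static void say_hi(void *arg1, void *arg2)
+	 *     {
+	 *         printf("on core %u\n", spdk_env_get_current_core());
+	 *     }
+	 *
+	 *     static void all_done(void *arg1, void *arg2)
+	 *     {
+	 *         printf("visited every reactor\n");
+	 *     }
+	 *
+	 *     spdk_for_each_reactor(say_hi, NULL, NULL, all_done);
+	 */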
spdk_env_get_last_core()) { + SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "Completed reactor iteration\n"); + + evt = spdk_event_allocate(cr->orig_core, cr->cpl, cr->arg1, cr->arg2); + free(cr); + } else { + SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "Continuing reactor iteration to %d\n", + cr->cur_core); + + evt = spdk_event_allocate(cr->cur_core, on_reactor, arg1, NULL); + } + assert(evt != NULL); + spdk_event_call(evt); +} + +void +spdk_for_each_reactor(spdk_event_fn fn, void *arg1, void *arg2, spdk_event_fn cpl) +{ + struct call_reactor *cr; + struct spdk_event *evt; + + cr = calloc(1, sizeof(*cr)); + if (!cr) { + SPDK_ERRLOG("Unable to perform reactor iteration\n"); + cpl(arg1, arg2); + return; + } + + cr->fn = fn; + cr->arg1 = arg1; + cr->arg2 = arg2; + cr->cpl = cpl; + cr->orig_core = spdk_env_get_current_core(); + cr->cur_core = spdk_env_get_first_core(); + + SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "Starting reactor iteration from %d\n", cr->orig_core); + + evt = spdk_event_allocate(cr->cur_core, on_reactor, cr, NULL); + assert(evt != NULL); + + spdk_event_call(evt); +} + +SPDK_LOG_REGISTER_COMPONENT("reactor", SPDK_LOG_REACTOR) diff --git a/src/spdk/lib/event/rpc.c b/src/spdk/lib/event/rpc.c new file mode 100644 index 000000000..a42d5ebeb --- /dev/null +++ b/src/spdk/lib/event/rpc.c @@ -0,0 +1,87 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/log.h" +#include "spdk/rpc.h" + +#include "spdk_internal/event.h" + +#define RPC_SELECT_INTERVAL 4000 /* 4ms */ + +static struct spdk_poller *g_rpc_poller = NULL; + +static int +rpc_subsystem_poll(void *arg) +{ + spdk_rpc_accept(); + return SPDK_POLLER_BUSY; +} + +void +spdk_rpc_initialize(const char *listen_addr) +{ + int rc; + + if (listen_addr == NULL) { + return; + } + + if (!spdk_rpc_verify_methods()) { + spdk_app_stop(-EINVAL); + return; + } + + /* Listen on the requested address */ + rc = spdk_rpc_listen(listen_addr); + if (rc != 0) { + SPDK_ERRLOG("Unable to start RPC service at %s\n", listen_addr); + return; + } + + spdk_rpc_set_state(SPDK_RPC_STARTUP); + + /* Register a poller to periodically check for RPCs */ + g_rpc_poller = SPDK_POLLER_REGISTER(rpc_subsystem_poll, NULL, RPC_SELECT_INTERVAL); +} + +void +spdk_rpc_finish(void) +{ + spdk_rpc_close(); + spdk_poller_unregister(&g_rpc_poller); +} diff --git a/src/spdk/lib/event/spdk_event.map b/src/spdk/lib/event/spdk_event.map new file mode 100644 index 000000000..8208c5e1f --- /dev/null +++ b/src/spdk/lib/event/spdk_event.map @@ -0,0 +1,46 @@ +{ + global: + + # Public functions + spdk_app_opts_init; + spdk_app_start; + spdk_app_fini; + spdk_app_start_shutdown; + spdk_app_stop; + spdk_app_get_running_config; + spdk_app_get_shm_id; + spdk_app_parse_core_mask; + spdk_app_get_core_mask; + spdk_app_parse_args; + spdk_app_usage; + spdk_event_allocate; + spdk_event_call; + spdk_framework_enable_context_switch_monitor; + spdk_framework_context_switch_monitor_enabled; + + # Functions used by other SPDK libraries + spdk_reactors_init; + spdk_reactors_fini; + spdk_reactors_start; + spdk_reactors_stop; + spdk_reactor_get; + spdk_for_each_reactor; + spdk_subsystem_find; + spdk_subsystem_get_first; + spdk_subsystem_get_next; + spdk_subsystem_get_first_depend; + spdk_subsystem_get_next_depend; + spdk_add_subsystem; + spdk_add_subsystem_depend; + spdk_subsystem_init; + spdk_subsystem_fini; + spdk_subsystem_init_next; + spdk_subsystem_fini_next; + spdk_subsystem_config; + spdk_app_json_config_load; + spdk_subsystem_config_json; + spdk_rpc_initialize; + spdk_rpc_finish; + + local: *; +}; diff --git a/src/spdk/lib/event/subsystem.c b/src/spdk/lib/event/subsystem.c new file mode 100644 index 000000000..2cff890b2 --- /dev/null +++ b/src/spdk/lib/event/subsystem.c @@ -0,0 +1,288 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/log.h" +#include "spdk/thread.h" + +#include "spdk_internal/event.h" +#include "spdk/env.h" + +TAILQ_HEAD(spdk_subsystem_list, spdk_subsystem); +struct spdk_subsystem_list g_subsystems = TAILQ_HEAD_INITIALIZER(g_subsystems); + +TAILQ_HEAD(spdk_subsystem_depend_list, spdk_subsystem_depend); +struct spdk_subsystem_depend_list g_subsystems_deps = TAILQ_HEAD_INITIALIZER(g_subsystems_deps); +static struct spdk_subsystem *g_next_subsystem; +static bool g_subsystems_initialized = false; +static bool g_subsystems_init_interrupted = false; +static spdk_subsystem_init_fn g_subsystem_start_fn = NULL; +static void *g_subsystem_start_arg = NULL; +static spdk_msg_fn g_subsystem_stop_fn = NULL; +static void *g_subsystem_stop_arg = NULL; +static struct spdk_thread *g_fini_thread = NULL; + +void +spdk_add_subsystem(struct spdk_subsystem *subsystem) +{ + TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq); +} + +void +spdk_add_subsystem_depend(struct spdk_subsystem_depend *depend) +{ + TAILQ_INSERT_TAIL(&g_subsystems_deps, depend, tailq); +} + +static struct spdk_subsystem * +_subsystem_find(struct spdk_subsystem_list *list, const char *name) +{ + struct spdk_subsystem *iter; + + TAILQ_FOREACH(iter, list, tailq) { + if (strcmp(name, iter->name) == 0) { + return iter; + } + } + + return NULL; +} + +struct spdk_subsystem * +spdk_subsystem_find(const char *name) +{ + return _subsystem_find(&g_subsystems, name); +} + +struct spdk_subsystem * +spdk_subsystem_get_first(void) +{ + return TAILQ_FIRST(&g_subsystems); +} + +struct spdk_subsystem * +spdk_subsystem_get_next(struct spdk_subsystem *cur_subsystem) +{ + return TAILQ_NEXT(cur_subsystem, tailq); +} + + +struct spdk_subsystem_depend * +spdk_subsystem_get_first_depend(void) +{ + return TAILQ_FIRST(&g_subsystems_deps); +} + +struct spdk_subsystem_depend * +spdk_subsystem_get_next_depend(struct spdk_subsystem_depend *cur_depend) +{ + return TAILQ_NEXT(cur_depend, tailq); +} + +static void +subsystem_sort(void) +{ + bool depends_on, depends_on_sorted; + struct spdk_subsystem *subsystem, *subsystem_tmp; + struct spdk_subsystem_depend *subsystem_dep; + + struct spdk_subsystem_list subsystems_list = TAILQ_HEAD_INITIALIZER(subsystems_list); + + while (!TAILQ_EMPTY(&g_subsystems)) { + TAILQ_FOREACH_SAFE(subsystem, &g_subsystems, tailq, subsystem_tmp) { + depends_on = false; + TAILQ_FOREACH(subsystem_dep, &g_subsystems_deps, tailq) { + if (strcmp(subsystem->name, subsystem_dep->name) == 0) { + depends_on = true; + depends_on_sorted = !!_subsystem_find(&subsystems_list, subsystem_dep->depends_on); + if (depends_on_sorted) { + continue; + } + break; + } + } + + if (depends_on == false) { + TAILQ_REMOVE(&g_subsystems, subsystem, tailq); + TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq); + } else { + if (depends_on_sorted == true) { + TAILQ_REMOVE(&g_subsystems, subsystem, tailq); + TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq); + } + } + } + } + + TAILQ_FOREACH_SAFE(subsystem, 
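+/*
+ * A sketch of how a module typically ends up on these lists. The field names
+ * match how struct spdk_subsystem and struct spdk_subsystem_depend are used in
+ * this file; real subsystems normally go through the registration helpers in
+ * spdk_internal/event.h, which amount to a constructor like the one shown here:
+ *
+ *     static void my_subsystem_init(void)
+ *     {
+ *         spdk_subsystem_init_next(0);
+ *     }
+ *
+ *     static void my_subsystem_fini(void)
+ *     {
+ *         spdk_subsystem_fini_next();
+ *     }
+ *
+ *     static struct spdk_subsystem g_my_subsystem = {
+ *         .name = "my_subsystem",
+ *         .init = my_subsystem_init,
+ *         .fini = my_subsystem_fini,
+ *     };
+ *
+ *     static struct spdk_subsystem_depend g_my_dep = {
+ *         .name = "my_subsystem",
+ *         .depends_on = "bdev",
+ *     };
+ *
+ *     __attribute__((constructor)) static void register_my_subsystem(void)
+ *     {
+ *         spdk_add_subsystem(&g_my_subsystem);
+ *         spdk_add_subsystem_depend(&g_my_dep);
+ *     }
+ */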
&subsystems_list, tailq, subsystem_tmp) { + TAILQ_REMOVE(&subsystems_list, subsystem, tailq); + TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq); + } +} + +void +spdk_subsystem_init_next(int rc) +{ + /* The initialization is interrupted by the spdk_subsystem_fini, so just return */ + if (g_subsystems_init_interrupted) { + return; + } + + if (rc) { + SPDK_ERRLOG("Init subsystem %s failed\n", g_next_subsystem->name); + g_subsystem_start_fn(rc, g_subsystem_start_arg); + return; + } + + if (!g_next_subsystem) { + g_next_subsystem = TAILQ_FIRST(&g_subsystems); + } else { + g_next_subsystem = TAILQ_NEXT(g_next_subsystem, tailq); + } + + if (!g_next_subsystem) { + g_subsystems_initialized = true; + g_subsystem_start_fn(0, g_subsystem_start_arg); + return; + } + + if (g_next_subsystem->init) { + g_next_subsystem->init(); + } else { + spdk_subsystem_init_next(0); + } +} + +void +spdk_subsystem_init(spdk_subsystem_init_fn cb_fn, void *cb_arg) +{ + struct spdk_subsystem_depend *dep; + + g_subsystem_start_fn = cb_fn; + g_subsystem_start_arg = cb_arg; + + /* Verify that all dependency name and depends_on subsystems are registered */ + TAILQ_FOREACH(dep, &g_subsystems_deps, tailq) { + if (!spdk_subsystem_find(dep->name)) { + SPDK_ERRLOG("subsystem %s is missing\n", dep->name); + g_subsystem_start_fn(-1, g_subsystem_start_arg); + return; + } + if (!spdk_subsystem_find(dep->depends_on)) { + SPDK_ERRLOG("subsystem %s dependency %s is missing\n", + dep->name, dep->depends_on); + g_subsystem_start_fn(-1, g_subsystem_start_arg); + return; + } + } + + subsystem_sort(); + + spdk_subsystem_init_next(0); +} + +static void +subsystem_fini_next(void *arg1) +{ + assert(g_fini_thread == spdk_get_thread()); + + if (!g_next_subsystem) { + /* If the initialized flag is false, then we've failed to initialize + * the very first subsystem and no de-init is needed + */ + if (g_subsystems_initialized) { + g_next_subsystem = TAILQ_LAST(&g_subsystems, spdk_subsystem_list); + } + } else { + if (g_subsystems_initialized || g_subsystems_init_interrupted) { + g_next_subsystem = TAILQ_PREV(g_next_subsystem, spdk_subsystem_list, tailq); + } else { + g_subsystems_init_interrupted = true; + } + } + + while (g_next_subsystem) { + if (g_next_subsystem->fini) { + g_next_subsystem->fini(); + return; + } + g_next_subsystem = TAILQ_PREV(g_next_subsystem, spdk_subsystem_list, tailq); + } + + g_subsystem_stop_fn(g_subsystem_stop_arg); + return; +} + +void +spdk_subsystem_fini_next(void) +{ + if (g_fini_thread != spdk_get_thread()) { + spdk_thread_send_msg(g_fini_thread, subsystem_fini_next, NULL); + } else { + subsystem_fini_next(NULL); + } +} + +void +spdk_subsystem_fini(spdk_msg_fn cb_fn, void *cb_arg) +{ + g_subsystem_stop_fn = cb_fn; + g_subsystem_stop_arg = cb_arg; + + g_fini_thread = spdk_get_thread(); + + spdk_subsystem_fini_next(); +} + +void +spdk_subsystem_config(FILE *fp) +{ + struct spdk_subsystem *subsystem; + + TAILQ_FOREACH(subsystem, &g_subsystems, tailq) { + if (subsystem->config) { + subsystem->config(fp); + } + } +} + +void +spdk_subsystem_config_json(struct spdk_json_write_ctx *w, struct spdk_subsystem *subsystem) +{ + if (subsystem && subsystem->write_config_json) { + subsystem->write_config_json(w); + } else { + spdk_json_write_null(w); + } +} diff --git a/src/spdk/lib/ftl/Makefile b/src/spdk/lib/ftl/Makefile new file mode 100644 index 000000000..c24274622 --- /dev/null +++ b/src/spdk/lib/ftl/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = ftl_band.c ftl_core.c ftl_debug.c ftl_io.c ftl_reloc.c \ + ftl_restore.c ftl_init.c ftl_trace.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_ftl.map) + +LIBNAME = ftl + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/ftl/ftl_addr.h b/src/spdk/lib/ftl/ftl_addr.h new file mode 100644 index 000000000..36d2ffb00 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_addr.h @@ -0,0 +1,76 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_ADDR_H +#define FTL_ADDR_H + +#include "spdk/stdinc.h" + +/* Marks address as invalid */ +#define FTL_ADDR_INVALID (-1) +/* Marks LBA as invalid */ +#define FTL_LBA_INVALID ((uint64_t)-1) +/* Smallest data unit size */ +#define FTL_BLOCK_SIZE 4096 + +/* This structure represents on-disk address. It can have one of the following */ +/* formats: */ +/* - offset inside the disk */ +/* - cache_offset inside the cache (indicated by the cached flag) */ +/* - packed version of the two formats above (can be only used when the */ +/* offset can be represented in less than 32 bits) */ +/* Packed format is used, when possible, to avoid wasting RAM on the L2P table. */ +struct ftl_addr { + union { + struct { + uint64_t cache_offset : 63; + uint64_t cached : 1; + }; + + struct { + union { + struct { + uint32_t cache_offset : 31; + uint32_t cached : 1; + }; + + uint32_t offset; + }; + uint32_t rsvd; + } pack; + + uint64_t offset; + }; +}; + +#endif /* FTL_ADDR_H */ diff --git a/src/spdk/lib/ftl/ftl_band.c b/src/spdk/lib/ftl/ftl_band.c new file mode 100644 index 000000000..62221dcf6 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_band.c @@ -0,0 +1,1097 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/crc32.h" +#include "spdk/likely.h" +#include "spdk/util.h" +#include "spdk/ftl.h" + +#include "ftl_band.h" +#include "ftl_io.h" +#include "ftl_core.h" +#include "ftl_reloc.h" +#include "ftl_debug.h" + +/* TODO: define some signature for meta version */ +#define FTL_MD_VER 1 + +struct __attribute__((packed)) ftl_md_hdr { + /* Device instance */ + struct spdk_uuid uuid; + + /* Meta version */ + uint8_t ver; + + /* Sequence number */ + uint64_t seq; + + /* CRC32 checksum */ + uint32_t checksum; +}; + +/* End metadata layout stored on media (with all three being aligned to block size): */ +/* - header */ +/* - valid bitmap */ +/* - LBA map */ +struct __attribute__((packed)) ftl_tail_md { + struct ftl_md_hdr hdr; + + /* Max number of blocks */ + uint64_t num_blocks; + + uint8_t reserved[4059]; +}; +SPDK_STATIC_ASSERT(sizeof(struct ftl_tail_md) == FTL_BLOCK_SIZE, "Incorrect metadata size"); + +struct __attribute__((packed)) ftl_head_md { + struct ftl_md_hdr hdr; + + /* Number of defrag cycles */ + uint64_t wr_cnt; + + /* Number of surfaced LBAs */ + uint64_t lba_cnt; + + /* Transfer size */ + uint32_t xfer_size; +}; + +size_t +ftl_tail_md_hdr_num_blocks(void) +{ + return spdk_divide_round_up(sizeof(struct ftl_tail_md), FTL_BLOCK_SIZE); +} + +size_t +ftl_vld_map_num_blocks(const struct spdk_ftl_dev *dev) +{ + return spdk_divide_round_up(ftl_vld_map_size(dev), FTL_BLOCK_SIZE); +} + +size_t +ftl_lba_map_num_blocks(const struct spdk_ftl_dev *dev) +{ + return spdk_divide_round_up(ftl_get_num_blocks_in_band(dev) * sizeof(uint64_t), FTL_BLOCK_SIZE); +} + +size_t +ftl_head_md_num_blocks(const struct spdk_ftl_dev *dev) +{ + return dev->xfer_size; +} + +size_t +ftl_tail_md_num_blocks(const struct spdk_ftl_dev *dev) +{ + return spdk_divide_round_up(ftl_tail_md_hdr_num_blocks() + + ftl_vld_map_num_blocks(dev) + + ftl_lba_map_num_blocks(dev), + dev->xfer_size) * dev->xfer_size; +} + +static uint64_t +ftl_band_tail_md_offset(const struct ftl_band *band) +{ + return ftl_band_num_usable_blocks(band) - + ftl_tail_md_num_blocks(band->dev); +} + +int +ftl_band_full(struct ftl_band *band, size_t offset) +{ + return offset == ftl_band_tail_md_offset(band); +} + +void +ftl_band_write_failed(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + band->high_prio = 1; + + ftl_reloc_add(dev->reloc, band, 0, ftl_get_num_blocks_in_band(dev), 1, true); + ftl_band_set_state(band, FTL_BAND_STATE_CLOSED); +} + +static void +ftl_band_free_lba_map(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + + assert(band->state == FTL_BAND_STATE_CLOSED || + band->state == FTL_BAND_STATE_FREE); + assert(lba_map->ref_cnt == 0); + assert(lba_map->map != NULL); + assert(!band->high_prio); + + /* Verify that band's metadata is consistent with l2p */ + if (band->num_zones) { + assert(ftl_band_validate_md(band) == true); + } + + spdk_mempool_put(dev->lba_pool, lba_map->dma_buf); + lba_map->map = NULL; + lba_map->dma_buf = NULL; +} + +static void +_ftl_band_set_free(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_band *lband, *prev; + + /* Remove the band from the closed band list */ + LIST_REMOVE(band, list_entry); + + /* Keep the list sorted by band's write count */ + LIST_FOREACH(lband, &dev->free_bands, list_entry) { + if (lband->wr_cnt > band->wr_cnt) { + LIST_INSERT_BEFORE(lband, band, list_entry); + break; + } + prev = lband; + } + + if (!lband) { + if (LIST_EMPTY(&dev->free_bands)) { + 
LIST_INSERT_HEAD(&dev->free_bands, band, list_entry); + } else { + LIST_INSERT_AFTER(prev, band, list_entry); + } + } + +#if defined(DEBUG) + prev = NULL; + LIST_FOREACH(lband, &dev->free_bands, list_entry) { + if (!prev) { + continue; + } + assert(prev->wr_cnt <= lband->wr_cnt); + } +#endif + dev->num_free++; + ftl_apply_limits(dev); +} + +static void +_ftl_band_set_preparing(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + /* Remove band from free list */ + LIST_REMOVE(band, list_entry); + + band->wr_cnt++; + + assert(dev->num_free > 0); + dev->num_free--; + + ftl_apply_limits(dev); +} + +static void +_ftl_band_set_closed(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + /* Set the state as free_md() checks for that */ + band->state = FTL_BAND_STATE_CLOSED; + + /* Free the lba map if there are no outstanding IOs */ + ftl_band_release_lba_map(band); + + if (spdk_likely(band->num_zones)) { + LIST_INSERT_HEAD(&dev->shut_bands, band, list_entry); + } else { + LIST_REMOVE(band, list_entry); + } +} + +static uint32_t +ftl_md_calc_crc(const struct ftl_md_hdr *hdr, size_t size) +{ + size_t checkoff = offsetof(struct ftl_md_hdr, checksum); + size_t mdoff = checkoff + sizeof(hdr->checksum); + uint32_t crc; + + crc = spdk_crc32c_update(hdr, checkoff, 0); + return spdk_crc32c_update((const char *)hdr + mdoff, size - mdoff, crc); +} + +static void +ftl_set_md_hdr(struct ftl_band *band, struct ftl_md_hdr *hdr, size_t size) +{ + hdr->seq = band->seq; + hdr->ver = FTL_MD_VER; + hdr->uuid = band->dev->uuid; + hdr->checksum = ftl_md_calc_crc(hdr, size); +} + +static int +ftl_pack_head_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_head_md *head = band->lba_map.dma_buf; + + head->wr_cnt = band->wr_cnt; + head->lba_cnt = dev->num_lbas; + head->xfer_size = dev->xfer_size; + ftl_set_md_hdr(band, &head->hdr, sizeof(struct ftl_head_md)); + + return FTL_MD_SUCCESS; +} + +static int +ftl_pack_tail_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_tail_md *tail = lba_map->dma_buf; + void *vld_offset; + + vld_offset = (char *)tail + ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE; + + /* Clear out the buffer */ + memset(tail, 0, ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE); + tail->num_blocks = ftl_get_num_blocks_in_band(dev); + + pthread_spin_lock(&lba_map->lock); + spdk_bit_array_store_mask(lba_map->vld, vld_offset); + pthread_spin_unlock(&lba_map->lock); + + ftl_set_md_hdr(band, &tail->hdr, ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE); + + return FTL_MD_SUCCESS; +} + +static int +ftl_md_hdr_vld(struct spdk_ftl_dev *dev, const struct ftl_md_hdr *hdr, size_t size) +{ + if (spdk_uuid_compare(&dev->uuid, &hdr->uuid) != 0) { + return FTL_MD_NO_MD; + } + + if (hdr->ver != FTL_MD_VER) { + return FTL_MD_INVALID_VER; + } + + if (ftl_md_calc_crc(hdr, size) != hdr->checksum) { + return FTL_MD_INVALID_CRC; + } + + return FTL_MD_SUCCESS; +} + +static int +ftl_unpack_tail_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + void *vld_offset; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_tail_md *tail = lba_map->dma_buf; + int rc; + + vld_offset = (char *)tail + ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE; + + rc = ftl_md_hdr_vld(dev, &tail->hdr, ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE); + if (rc) { + return rc; + } + + /* + * When restoring from a dirty shutdown it's possible old tail meta wasn't yet cleared - + * band had saved 
head meta, but didn't manage to send erase to all zones. + * The already found tail md header is valid, but inconsistent with the head meta. Treat + * such a band as open/without valid tail md. + */ + if (band->seq != tail->hdr.seq) { + return FTL_MD_NO_MD; + } + + if (tail->num_blocks != ftl_get_num_blocks_in_band(dev)) { + return FTL_MD_INVALID_SIZE; + } + + spdk_bit_array_load_mask(lba_map->vld, vld_offset); + + return FTL_MD_SUCCESS; +} + +static int +ftl_unpack_head_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_head_md *head = band->lba_map.dma_buf; + int rc; + + rc = ftl_md_hdr_vld(dev, &head->hdr, sizeof(struct ftl_head_md)); + if (rc) { + return rc; + } + + band->seq = head->hdr.seq; + band->wr_cnt = head->wr_cnt; + + if (dev->global_md.num_lbas == 0) { + dev->global_md.num_lbas = head->lba_cnt; + } + + if (dev->global_md.num_lbas != head->lba_cnt) { + return FTL_MD_INVALID_SIZE; + } + + if (dev->xfer_size != head->xfer_size) { + return FTL_MD_INVALID_SIZE; + } + + return FTL_MD_SUCCESS; +} + +struct ftl_addr +ftl_band_tail_md_addr(struct ftl_band *band) +{ + struct ftl_addr addr = {}; + struct ftl_zone *zone; + struct spdk_ftl_dev *dev = band->dev; + size_t xfer_size = dev->xfer_size; + size_t num_req = ftl_band_tail_md_offset(band) / xfer_size; + size_t i; + + if (spdk_unlikely(!band->num_zones)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + + /* Metadata should be aligned to xfer size */ + assert(ftl_band_tail_md_offset(band) % xfer_size == 0); + + zone = CIRCLEQ_FIRST(&band->zones); + for (i = 0; i < num_req % band->num_zones; ++i) { + zone = ftl_band_next_zone(band, zone); + } + + addr.offset = (num_req / band->num_zones) * xfer_size; + addr.offset += zone->info.zone_id; + + return addr; +} + +struct ftl_addr +ftl_band_head_md_addr(struct ftl_band *band) +{ + if (spdk_unlikely(!band->num_zones)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + + return ftl_to_addr(CIRCLEQ_FIRST(&band->zones)->info.zone_id); +} + +void +ftl_band_set_state(struct ftl_band *band, enum ftl_band_state state) +{ + switch (state) { + case FTL_BAND_STATE_FREE: + assert(band->state == FTL_BAND_STATE_CLOSED); + _ftl_band_set_free(band); + break; + + case FTL_BAND_STATE_PREP: + assert(band->state == FTL_BAND_STATE_FREE); + _ftl_band_set_preparing(band); + break; + + case FTL_BAND_STATE_CLOSED: + if (band->state != FTL_BAND_STATE_CLOSED) { + assert(band->state == FTL_BAND_STATE_CLOSING || band->high_prio); + _ftl_band_set_closed(band); + } + break; + + default: + break; + } + + band->state = state; +} + +void +ftl_band_set_addr(struct ftl_band *band, uint64_t lba, struct ftl_addr addr) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + uint64_t offset; + + assert(lba != FTL_LBA_INVALID); + + offset = ftl_band_block_offset_from_addr(band, addr); + pthread_spin_lock(&lba_map->lock); + + lba_map->num_vld++; + lba_map->map[offset] = lba; + spdk_bit_array_set(lba_map->vld, offset); + + pthread_spin_unlock(&lba_map->lock); +} + +size_t +ftl_band_age(const struct ftl_band *band) +{ + return (size_t)(band->dev->seq - band->seq); +} + +size_t +ftl_band_num_usable_blocks(const struct ftl_band *band) +{ + return band->num_zones * ftl_get_num_blocks_in_zone(band->dev); +} + +size_t +ftl_band_user_blocks_left(const struct ftl_band *band, size_t offset) +{ + size_t tail_md_offset = ftl_band_tail_md_offset(band); + + if (spdk_unlikely(offset <= ftl_head_md_num_blocks(band->dev))) { + return ftl_band_user_blocks(band); + } + + if (spdk_unlikely(offset > tail_md_offset)) { + return 
0; + } + + return tail_md_offset - offset; +} + +size_t +ftl_band_user_blocks(const struct ftl_band *band) +{ + return ftl_band_num_usable_blocks(band) - + ftl_head_md_num_blocks(band->dev) - + ftl_tail_md_num_blocks(band->dev); +} + +struct ftl_band * +ftl_band_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + size_t band_id = ftl_addr_get_band(dev, addr); + + assert(band_id < ftl_get_num_bands(dev)); + return &dev->bands[band_id]; +} + +struct ftl_zone * +ftl_band_zone_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + size_t pu_id = ftl_addr_get_punit(band->dev, addr); + + assert(pu_id < ftl_get_num_punits(band->dev)); + return &band->zone_buf[pu_id]; +} + +uint64_t +ftl_band_block_offset_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + assert(ftl_addr_get_band(band->dev, addr) == band->id); + assert(ftl_addr_get_punit(band->dev, addr) < ftl_get_num_punits(band->dev)); + return addr.offset % ftl_get_num_blocks_in_band(band->dev); +} + +struct ftl_addr +ftl_band_next_xfer_addr(struct ftl_band *band, struct ftl_addr addr, size_t num_blocks) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_zone *zone; + size_t num_xfers, num_stripes; + uint64_t offset; + + assert(ftl_addr_get_band(dev, addr) == band->id); + + offset = ftl_addr_get_zone_offset(dev, addr); + zone = ftl_band_zone_from_addr(band, addr); + + num_blocks += (offset % dev->xfer_size); + offset -= (offset % dev->xfer_size); + +#if defined(DEBUG) + /* Check that the number of zones has not been changed */ + struct ftl_zone *_zone; + size_t _num_zones = 0; + CIRCLEQ_FOREACH(_zone, &band->zones, circleq) { + if (spdk_likely(_zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE)) { + _num_zones++; + } + } + assert(band->num_zones == _num_zones); +#endif + assert(band->num_zones != 0); + num_stripes = (num_blocks / dev->xfer_size) / band->num_zones; + offset += num_stripes * dev->xfer_size; + num_blocks -= num_stripes * dev->xfer_size * band->num_zones; + + if (offset > ftl_get_num_blocks_in_zone(dev)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + + num_xfers = num_blocks / dev->xfer_size; + for (size_t i = 0; i < num_xfers; ++i) { + /* When the last zone is reached the block part of the address */ + /* needs to be increased by xfer_size */ + if (ftl_band_zone_is_last(band, zone)) { + offset += dev->xfer_size; + if (offset > ftl_get_num_blocks_in_zone(dev)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + } + + zone = ftl_band_next_operational_zone(band, zone); + assert(zone); + + num_blocks -= dev->xfer_size; + } + + if (num_blocks) { + offset += num_blocks; + if (offset > ftl_get_num_blocks_in_zone(dev)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + } + + addr.offset = zone->info.zone_id + offset; + return addr; +} + +static size_t +ftl_xfer_offset_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + struct ftl_zone *zone, *current_zone; + unsigned int punit_offset = 0; + size_t num_stripes, xfer_size = band->dev->xfer_size; + uint64_t offset; + + assert(ftl_addr_get_band(band->dev, addr) == band->id); + + offset = ftl_addr_get_zone_offset(band->dev, addr); + num_stripes = (offset / xfer_size) * band->num_zones; + + current_zone = ftl_band_zone_from_addr(band, addr); + CIRCLEQ_FOREACH(zone, &band->zones, circleq) { + if (current_zone == zone) { + break; + } + punit_offset++; + } + + return xfer_size * (num_stripes + punit_offset) + offset % xfer_size; +} + +struct ftl_addr +ftl_band_addr_from_block_offset(struct ftl_band *band, uint64_t block_off) +{ + struct ftl_addr addr = { .offset = 0 
}; + + addr.offset = block_off + band->id * ftl_get_num_blocks_in_band(band->dev); + return addr; +} + +struct ftl_addr +ftl_band_next_addr(struct ftl_band *band, struct ftl_addr addr, size_t offset) +{ + uint64_t block_off = ftl_band_block_offset_from_addr(band, addr); + return ftl_band_addr_from_block_offset(band, block_off + offset); +} + +void +ftl_band_acquire_lba_map(struct ftl_band *band) +{ + assert(band->lba_map.map != NULL); + band->lba_map.ref_cnt++; +} + +int +ftl_band_alloc_lba_map(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + + assert(lba_map->ref_cnt == 0); + assert(lba_map->map == NULL); + + lba_map->dma_buf = spdk_mempool_get(dev->lba_pool); + + if (!lba_map->dma_buf) { + return -1; + } + + memset(lba_map->dma_buf, 0, ftl_lba_map_pool_elem_size(band->dev)); + + lba_map->map = (uint64_t *)((char *)lba_map->dma_buf + FTL_BLOCK_SIZE * + (ftl_tail_md_hdr_num_blocks() + ftl_vld_map_num_blocks(dev))); + + lba_map->segments = (char *)lba_map->dma_buf + ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE; + + ftl_band_acquire_lba_map(band); + return 0; +} + +void +ftl_band_release_lba_map(struct ftl_band *band) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + + assert(lba_map->map != NULL); + assert(lba_map->ref_cnt > 0); + lba_map->ref_cnt--; + + if (lba_map->ref_cnt == 0) { + ftl_band_free_lba_map(band); + } +} + +static void +ftl_read_md_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_md_io *md_io = (struct ftl_md_io *)io; + + if (!status) { + status = md_io->pack_fn(md_io->io.band); + } else { + status = FTL_MD_IO_FAILURE; + } + + md_io->cb_fn(io, md_io->cb_ctx, status); +} + +static struct ftl_md_io * +ftl_io_init_md_read(struct spdk_ftl_dev *dev, struct ftl_addr addr, + struct ftl_band *band, size_t num_blocks, void *buf, + ftl_io_fn fn, ftl_md_pack_fn pack_fn, ftl_io_fn cb_fn, void *cb_ctx) +{ + struct ftl_md_io *io; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .band = band, + .size = sizeof(*io), + .flags = FTL_IO_MD | FTL_IO_PHYSICAL_MODE, + .type = FTL_IO_READ, + .num_blocks = num_blocks, + .cb_fn = fn, + .iovs = { + { + .iov_base = buf, + .iov_len = num_blocks * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + }; + + io = (struct ftl_md_io *)ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->io.addr = addr; + io->pack_fn = pack_fn; + io->cb_fn = cb_fn; + io->cb_ctx = cb_ctx; + + return io; +} + +static struct ftl_io * +ftl_io_init_md_write(struct spdk_ftl_dev *dev, struct ftl_band *band, + void *data, size_t num_blocks, ftl_io_fn cb) +{ + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .band = band, + .size = sizeof(struct ftl_io), + .flags = FTL_IO_MD | FTL_IO_PHYSICAL_MODE, + .type = FTL_IO_WRITE, + .num_blocks = num_blocks, + .cb_fn = cb, + .iovs = { + { + .iov_base = data, + .iov_len = num_blocks * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + .md = NULL, + }; + + return ftl_io_init_internal(&opts); +} + +static int +ftl_band_write_md(struct ftl_band *band, size_t num_blocks, + ftl_md_pack_fn md_fn, ftl_io_fn cb) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_io *io; + + io = ftl_io_init_md_write(dev, band, band->lba_map.dma_buf, num_blocks, cb); + if (!io) { + return -ENOMEM; + } + + md_fn(band); + + ftl_io_write(io); + return 0; +} + +void +ftl_band_md_clear(struct ftl_band *band) +{ + band->seq = 0; + band->wr_cnt = 0; + band->lba_map.num_vld = 0; + band->lba_map.map = NULL; +} + +int +ftl_band_write_head_md(struct ftl_band *band, 
ftl_io_fn cb) +{ + return ftl_band_write_md(band, ftl_head_md_num_blocks(band->dev), + ftl_pack_head_md, cb); +} + +int +ftl_band_write_tail_md(struct ftl_band *band, ftl_io_fn cb) +{ + return ftl_band_write_md(band, ftl_tail_md_num_blocks(band->dev), + ftl_pack_tail_md, cb); +} + +static struct ftl_addr +ftl_band_lba_map_addr(struct ftl_band *band, size_t offset) +{ + return ftl_band_next_xfer_addr(band, band->tail_md_addr, + ftl_tail_md_hdr_num_blocks() + + ftl_vld_map_num_blocks(band->dev) + + offset); +} + +static int +ftl_band_read_md(struct ftl_band *band, size_t num_blocks, struct ftl_addr start_addr, + void *buf, ftl_io_fn fn, ftl_md_pack_fn pack_fn, ftl_io_fn cb_fn, void *cb_ctx) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_md_io *io; + + if (spdk_unlikely(!band->num_zones)) { + return -ENOENT; + } + + io = ftl_io_init_md_read(dev, start_addr, band, num_blocks, buf, fn, pack_fn, cb_fn, cb_ctx); + if (!io) { + return -ENOMEM; + } + + ftl_io_read((struct ftl_io *)io); + return 0; +} + +int +ftl_band_read_tail_md(struct ftl_band *band, struct ftl_addr addr, ftl_io_fn cb_fn, void *cb_ctx) +{ + return ftl_band_read_md(band, ftl_tail_md_num_blocks(band->dev), addr, band->lba_map.dma_buf, + ftl_read_md_cb, ftl_unpack_tail_md, cb_fn, cb_ctx); +} + +static size_t +ftl_lba_map_request_segment_done(struct ftl_lba_map_request *request, size_t offset, + size_t num_segments) +{ + size_t i, num_done = 0; + + for (i = offset; i < offset + num_segments; ++i) { + if (spdk_bit_array_get(request->segments, i)) { + spdk_bit_array_clear(request->segments, offset); + num_done++; + } + } + + assert(request->num_pending >= num_done); + request->num_pending -= num_done; + + return num_done; +} + +static void +ftl_lba_map_set_segment_state(struct ftl_lba_map *lba_map, size_t offset, size_t num_segments, + enum ftl_lba_map_seg_state state) +{ + size_t i; + + for (i = offset; i < offset + num_segments; ++i) { + lba_map->segments[i] = state; + } +} + +static void +ftl_lba_map_request_free(struct spdk_ftl_dev *dev, struct ftl_lba_map_request *request) +{ + spdk_bit_array_clear_mask(request->segments); + spdk_mempool_put(dev->lba_request_pool, request); +} + +static void +ftl_process_lba_map_requests(struct spdk_ftl_dev *dev, struct ftl_lba_map *lba_map, size_t offset, + size_t num_segments, int status) +{ + struct ftl_lba_map_request *request, *trequest; + size_t num_done; + + LIST_FOREACH_SAFE(request, &lba_map->request_list, list_entry, trequest) { + num_done = ftl_lba_map_request_segment_done(request, offset, num_segments); + if (request->num_pending == 0 || (status && num_done)) { + request->cb(NULL, request->cb_ctx, status); + LIST_REMOVE(request, list_entry); + ftl_lba_map_request_free(dev, request); + } + } +} + +static size_t +ftl_lba_map_offset_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + size_t offset; + struct ftl_addr start_addr = ftl_band_lba_map_addr(band, 0); + + offset = ftl_xfer_offset_from_addr(band, addr) - ftl_xfer_offset_from_addr(band, start_addr); + assert(offset < ftl_lba_map_num_blocks(band->dev)); + + return offset; +} + +static void +ftl_read_lba_map_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_lba_map *lba_map = &io->band->lba_map; + uint64_t block_off; + + block_off = ftl_lba_map_offset_from_addr(io->band, io->addr); + assert(block_off + io->num_blocks <= ftl_lba_map_num_blocks(io->dev)); + + if (!status) { + ftl_lba_map_set_segment_state(lba_map, block_off, io->num_blocks, + FTL_LBA_MAP_SEG_CACHED); + } + + 
ftl_process_lba_map_requests(io->dev, lba_map, block_off, io->num_blocks, status); +} + +static struct ftl_lba_map_request * +ftl_lba_map_alloc_request(struct ftl_band *band, size_t offset, size_t num_segments, + ftl_io_fn cb, void *cb_ctx) +{ + struct ftl_lba_map_request *request; + struct spdk_ftl_dev *dev = band->dev; + size_t i; + + request = spdk_mempool_get(dev->lba_request_pool); + if (!request) { + return NULL; + } + + request->cb = cb; + request->cb_ctx = cb_ctx; + request->num_pending = num_segments; + + for (i = offset; i < offset + num_segments; ++i) { + spdk_bit_array_set(request->segments, i); + } + + return request; +} + +static size_t +ftl_lba_map_num_clear_segments(struct ftl_lba_map *lba_map, + size_t offset, size_t num_segments) +{ + size_t i, cnt = 0; + + for (i = offset; i < offset + num_segments; ++i) { + if (lba_map->segments[i] != FTL_LBA_MAP_SEG_CLEAR) { + break; + } + cnt++; + } + + return cnt; +} + +int +ftl_band_read_lba_map(struct ftl_band *band, size_t offset, size_t lba_cnt, + ftl_io_fn cb_fn, void *cb_ctx) +{ + size_t num_blocks, block_off, num_read, num_segments; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_lba_map_request *request; + int rc = 0; + + block_off = offset / FTL_NUM_LBA_IN_BLOCK; + num_segments = spdk_divide_round_up(offset + lba_cnt, FTL_NUM_LBA_IN_BLOCK); + num_blocks = num_segments - block_off; + assert(block_off + num_blocks <= ftl_lba_map_num_blocks(band->dev)); + + request = ftl_lba_map_alloc_request(band, block_off, num_blocks, cb_fn, cb_ctx); + if (!request) { + return -ENOMEM; + } + + while (num_blocks) { + if (lba_map->segments[block_off] != FTL_LBA_MAP_SEG_CLEAR) { + if (lba_map->segments[block_off] == FTL_LBA_MAP_SEG_CACHED) { + ftl_lba_map_request_segment_done(request, block_off, 1); + } + num_blocks--; + block_off++; + continue; + } + + num_read = ftl_lba_map_num_clear_segments(lba_map, block_off, num_blocks); + ftl_lba_map_set_segment_state(lba_map, block_off, num_read, + FTL_LBA_MAP_SEG_PENDING); + + rc = ftl_band_read_md(band, num_read, ftl_band_lba_map_addr(band, block_off), + (char *)band->lba_map.map + block_off * FTL_BLOCK_SIZE, + ftl_read_lba_map_cb, NULL, cb_fn, cb_ctx); + if (rc) { + ftl_lba_map_request_free(band->dev, request); + return rc; + } + + assert(num_blocks >= num_read); + num_blocks -= num_read; + block_off += num_read; + } + + if (request->num_pending) { + LIST_INSERT_HEAD(&lba_map->request_list, request, list_entry); + } else { + cb_fn(NULL, cb_ctx, 0); + ftl_lba_map_request_free(band->dev, request); + } + + return rc; +} + +int +ftl_band_read_head_md(struct ftl_band *band, ftl_io_fn cb_fn, void *cb_ctx) +{ + return ftl_band_read_md(band, + ftl_head_md_num_blocks(band->dev), + ftl_band_head_md_addr(band), + band->lba_map.dma_buf, + ftl_read_md_cb, + ftl_unpack_head_md, + cb_fn, + cb_ctx); +} + +void +ftl_band_remove_zone(struct ftl_band *band, struct ftl_zone *zone) +{ + CIRCLEQ_REMOVE(&band->zones, zone, circleq); + band->num_zones--; +} + +int +ftl_band_write_prep(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + if (ftl_band_alloc_lba_map(band)) { + return -1; + } + + band->seq = ++dev->seq; + return 0; +} + +struct ftl_zone * +ftl_band_next_operational_zone(struct ftl_band *band, struct ftl_zone *zone) +{ + struct ftl_zone *result = NULL; + struct ftl_zone *entry; + + if (spdk_unlikely(!band->num_zones)) { + return NULL; + } + + /* Erasing band may fail after it was assigned to wptr. */ + /* In such a case zone is no longer in band->zones queue. 
*/ + if (spdk_likely(zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE)) { + result = ftl_band_next_zone(band, zone); + } else { + CIRCLEQ_FOREACH_REVERSE(entry, &band->zones, circleq) { + if (entry->info.zone_id > zone->info.zone_id) { + result = entry; + } else { + if (!result) { + result = CIRCLEQ_FIRST(&band->zones); + } + break; + } + } + } + + return result; +} + +void +ftl_band_clear_lba_map(struct ftl_band *band) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + size_t num_segments; + + spdk_bit_array_clear_mask(lba_map->vld); + memset(lba_map->map, 0, ftl_lba_map_num_blocks(band->dev) * FTL_BLOCK_SIZE); + + /* For open band all lba map segments are already cached */ + assert(band->state == FTL_BAND_STATE_PREP); + num_segments = spdk_divide_round_up(ftl_get_num_blocks_in_band(band->dev), FTL_NUM_LBA_IN_BLOCK); + ftl_lba_map_set_segment_state(&band->lba_map, 0, num_segments, FTL_LBA_MAP_SEG_CACHED); + + lba_map->num_vld = 0; +} + +size_t +ftl_lba_map_pool_elem_size(struct spdk_ftl_dev *dev) +{ + /* Map pool element holds the whole tail md + segments map */ + return ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE + + spdk_divide_round_up(ftl_get_num_blocks_in_band(dev), FTL_NUM_LBA_IN_BLOCK); +} diff --git a/src/spdk/lib/ftl/ftl_band.h b/src/spdk/lib/ftl/ftl_band.h new file mode 100644 index 000000000..109b369a5 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_band.h @@ -0,0 +1,287 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FTL_BAND_H +#define FTL_BAND_H + +#include "spdk/stdinc.h" +#include "spdk/bit_array.h" +#include "spdk/queue.h" +#include "spdk/bdev_zone.h" + +#include "ftl_io.h" +#include "ftl_addr.h" +#include "ftl_core.h" + +/* Number of LBAs that could be stored in a single block */ +#define FTL_NUM_LBA_IN_BLOCK (FTL_BLOCK_SIZE / sizeof(uint64_t)) + +struct spdk_ftl_dev; +struct ftl_lba_map_request; + +struct ftl_zone { + struct spdk_bdev_zone_info info; + + /* Indicates that there is inflight write */ + bool busy; + + CIRCLEQ_ENTRY(ftl_zone) circleq; +}; + +enum ftl_md_status { + FTL_MD_SUCCESS, + /* Metadata read failure */ + FTL_MD_IO_FAILURE, + /* Invalid version */ + FTL_MD_INVALID_VER, + /* UUID doesn't match */ + FTL_MD_NO_MD, + /* UUID and version matches but CRC doesn't */ + FTL_MD_INVALID_CRC, + /* Vld or lba map size doesn't match */ + FTL_MD_INVALID_SIZE +}; + +enum ftl_lba_map_seg_state { + FTL_LBA_MAP_SEG_CLEAR, + FTL_LBA_MAP_SEG_PENDING, + FTL_LBA_MAP_SEG_CACHED +}; + +struct ftl_lba_map { + /* LBA/vld map lock */ + pthread_spinlock_t lock; + + /* Number of valid LBAs */ + size_t num_vld; + + /* LBA map's reference count */ + size_t ref_cnt; + + /* Bitmap of valid LBAs */ + struct spdk_bit_array *vld; + + /* LBA map (only valid for open/relocating bands) */ + uint64_t *map; + + /* LBA map segment state map (clear, pending, cached) */ + uint8_t *segments; + + LIST_HEAD(, ftl_lba_map_request) request_list; + + /* Metadata DMA buffer (only valid for open/relocating bands) */ + void *dma_buf; +}; + +enum ftl_band_state { + FTL_BAND_STATE_FREE, + FTL_BAND_STATE_PREP, + FTL_BAND_STATE_OPENING, + FTL_BAND_STATE_OPEN, + FTL_BAND_STATE_FULL, + FTL_BAND_STATE_CLOSING, + FTL_BAND_STATE_CLOSED, + FTL_BAND_STATE_MAX +}; + +struct ftl_lba_map_request { + /* Completion callback */ + ftl_io_fn cb; + + /* Completion callback context */ + void *cb_ctx; + + /* Bit array of requested segments */ + struct spdk_bit_array *segments; + + /* Number of pending segments to read */ + size_t num_pending; + + LIST_ENTRY(ftl_lba_map_request) list_entry; +}; + +struct ftl_band { + /* Device this band belongs to */ + struct spdk_ftl_dev *dev; + + /* Number of operational zones */ + size_t num_zones; + + /* Array of zones */ + struct ftl_zone *zone_buf; + + /* List of operational zones */ + CIRCLEQ_HEAD(, ftl_zone) zones; + + /* LBA map */ + struct ftl_lba_map lba_map; + + /* Band's state */ + enum ftl_band_state state; + + /* Band's index */ + unsigned int id; + + /* Latest merit calculation */ + double merit; + + /* High defrag priority - means that the metadata should be copied and */ + /* the band should be defragged immediately */ + int high_prio; + + /* Sequence number */ + uint64_t seq; + + /* Number of defrag cycles */ + uint64_t wr_cnt; + + /* End metadata start addr */ + struct ftl_addr tail_md_addr; + + /* Bitmap of all bands that have its data moved onto this band */ + struct spdk_bit_array *reloc_bitmap; + /* Number of open bands containing data moved from this band */ + size_t num_reloc_bands; + /* Number of blocks currently being moved from this band */ + size_t num_reloc_blocks; + + /* Free/shut bands' lists */ + LIST_ENTRY(ftl_band) list_entry; + + /* High priority queue link */ + STAILQ_ENTRY(ftl_band) prio_stailq; +}; + +uint64_t ftl_band_block_offset_from_addr(struct ftl_band *band, struct ftl_addr addr); +struct ftl_addr ftl_band_addr_from_block_offset(struct ftl_band *band, uint64_t block_off); +void ftl_band_set_state(struct ftl_band *band, enum ftl_band_state state); +size_t 
ftl_band_age(const struct ftl_band *band); +void ftl_band_acquire_lba_map(struct ftl_band *band); +int ftl_band_alloc_lba_map(struct ftl_band *band); +void ftl_band_clear_lba_map(struct ftl_band *band); +void ftl_band_release_lba_map(struct ftl_band *band); +int ftl_band_read_lba_map(struct ftl_band *band, + size_t offset, size_t lba_cnt, + ftl_io_fn cb_fn, void *cb_ctx); +struct ftl_addr ftl_band_next_xfer_addr(struct ftl_band *band, struct ftl_addr addr, + size_t num_blocks); +struct ftl_addr ftl_band_next_addr(struct ftl_band *band, struct ftl_addr addr, + size_t offset); +size_t ftl_band_num_usable_blocks(const struct ftl_band *band); +size_t ftl_band_user_blocks_left(const struct ftl_band *band, size_t offset); +size_t ftl_band_user_blocks(const struct ftl_band *band); +void ftl_band_set_addr(struct ftl_band *band, uint64_t lba, + struct ftl_addr addr); +struct ftl_band *ftl_band_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr); +struct ftl_zone *ftl_band_zone_from_addr(struct ftl_band *band, struct ftl_addr); +void ftl_band_md_clear(struct ftl_band *band); +int ftl_band_read_tail_md(struct ftl_band *band, struct ftl_addr, + ftl_io_fn cb_fn, void *cb_ctx); +int ftl_band_read_head_md(struct ftl_band *band, ftl_io_fn cb_fn, void *cb_ctx); +int ftl_band_write_tail_md(struct ftl_band *band, ftl_io_fn cb); +int ftl_band_write_head_md(struct ftl_band *band, ftl_io_fn cb); +struct ftl_addr ftl_band_tail_md_addr(struct ftl_band *band); +struct ftl_addr ftl_band_head_md_addr(struct ftl_band *band); +void ftl_band_write_failed(struct ftl_band *band); +int ftl_band_full(struct ftl_band *band, size_t offset); +int ftl_band_write_prep(struct ftl_band *band); +struct ftl_zone *ftl_band_next_operational_zone(struct ftl_band *band, + struct ftl_zone *zone); +size_t ftl_lba_map_pool_elem_size(struct spdk_ftl_dev *dev); +void ftl_band_remove_zone(struct ftl_band *band, struct ftl_zone *zone); + + +static inline int +ftl_band_empty(const struct ftl_band *band) +{ + return band->lba_map.num_vld == 0; +} + +static inline struct ftl_zone * +ftl_band_next_zone(struct ftl_band *band, struct ftl_zone *zone) +{ + assert(zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE); + return CIRCLEQ_LOOP_NEXT(&band->zones, zone, circleq); +} + +static inline void +ftl_band_set_next_state(struct ftl_band *band) +{ + ftl_band_set_state(band, (band->state + 1) % FTL_BAND_STATE_MAX); +} + +static inline int +ftl_band_state_changing(struct ftl_band *band) +{ + return band->state == FTL_BAND_STATE_OPENING || + band->state == FTL_BAND_STATE_CLOSING; +} + +static inline int +ftl_band_block_offset_valid(struct ftl_band *band, size_t block_off) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + + pthread_spin_lock(&lba_map->lock); + if (spdk_bit_array_get(lba_map->vld, block_off)) { + pthread_spin_unlock(&lba_map->lock); + return 1; + } + + pthread_spin_unlock(&lba_map->lock); + return 0; +} + +static inline int +ftl_band_zone_is_last(struct ftl_band *band, struct ftl_zone *zone) +{ + return zone == CIRCLEQ_LAST(&band->zones); +} + +static inline int +ftl_band_zone_is_first(struct ftl_band *band, struct ftl_zone *zone) +{ + return zone == CIRCLEQ_FIRST(&band->zones); +} + +static inline int +ftl_zone_is_writable(const struct spdk_ftl_dev *dev, const struct ftl_zone *zone) +{ + bool busy = ftl_is_append_supported(dev) ? 
false : zone->busy; + + return (zone->info.state == SPDK_BDEV_ZONE_STATE_OPEN || + zone->info.state == SPDK_BDEV_ZONE_STATE_EMPTY) && + !busy; +} + +#endif /* FTL_BAND_H */ diff --git a/src/spdk/lib/ftl/ftl_core.c b/src/spdk/lib/ftl/ftl_core.c new file mode 100644 index 000000000..b0b448806 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_core.c @@ -0,0 +1,2460 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/likely.h" +#include "spdk/stdinc.h" +#include "spdk/nvme.h" +#include "spdk/thread.h" +#include "spdk/bdev_module.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" +#include "spdk/ftl.h" +#include "spdk/crc32.h" + +#include "ftl_core.h" +#include "ftl_band.h" +#include "ftl_io.h" +#include "ftl_debug.h" +#include "ftl_reloc.h" + +struct ftl_band_flush { + struct spdk_ftl_dev *dev; + /* Number of bands left to be flushed */ + size_t num_bands; + /* User callback */ + spdk_ftl_fn cb_fn; + /* Callback's argument */ + void *cb_arg; + /* List link */ + LIST_ENTRY(ftl_band_flush) list_entry; +}; + +struct ftl_wptr { + /* Owner device */ + struct spdk_ftl_dev *dev; + + /* Current address */ + struct ftl_addr addr; + + /* Band currently being written to */ + struct ftl_band *band; + + /* Current logical block's offset */ + uint64_t offset; + + /* Current zone */ + struct ftl_zone *zone; + + /* Pending IO queue */ + TAILQ_HEAD(, ftl_io) pending_queue; + + /* List link */ + LIST_ENTRY(ftl_wptr) list_entry; + + /* + * If setup in direct mode, there will be no offset or band state update after IO. + * The zoned bdev address is not assigned by wptr, and is instead taken directly + * from the request. 
+ */ + bool direct_mode; + + /* Number of outstanding write requests */ + uint32_t num_outstanding; + + /* Marks that the band related to this wptr needs to be closed as soon as possible */ + bool flush; +}; + +struct ftl_flush { + /* Owner device */ + struct spdk_ftl_dev *dev; + + /* Number of batches to wait for */ + size_t num_req; + + /* Callback */ + struct { + spdk_ftl_fn fn; + void *ctx; + } cb; + + /* Batch bitmap */ + struct spdk_bit_array *bmap; + + /* List link */ + LIST_ENTRY(ftl_flush) list_entry; +}; + +static void +ftl_wptr_free(struct ftl_wptr *wptr) +{ + if (!wptr) { + return; + } + + free(wptr); +} + +static void +ftl_remove_wptr(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_band_flush *flush, *tmp; + + if (spdk_unlikely(wptr->flush)) { + LIST_FOREACH_SAFE(flush, &dev->band_flush_list, list_entry, tmp) { + assert(flush->num_bands > 0); + if (--flush->num_bands == 0) { + flush->cb_fn(flush->cb_arg, 0); + LIST_REMOVE(flush, list_entry); + free(flush); + } + } + } + + LIST_REMOVE(wptr, list_entry); + ftl_wptr_free(wptr); +} + +static struct ftl_wbuf_entry * +ftl_acquire_wbuf_entry(struct ftl_io_channel *io_channel, int io_flags) +{ + struct ftl_wbuf_entry *entry = NULL; + uint32_t qdepth; + + if (!(io_flags & FTL_IO_INTERNAL)) { + qdepth = __atomic_fetch_add(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + if (qdepth >= io_channel->qdepth_limit) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + return NULL; + } + } + + if (spdk_ring_dequeue(io_channel->free_queue, (void **)&entry, 1) != 1) { + if (!(io_flags & FTL_IO_INTERNAL)) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + } + + return NULL; + } + + assert(entry != NULL); + + ftl_evict_cache_entry(io_channel->dev, entry); + + entry->io_flags = io_flags; + entry->addr.offset = FTL_ADDR_INVALID; + entry->lba = FTL_LBA_INVALID; + entry->band = NULL; + entry->valid = false; + + return entry; +} + +static void +ftl_release_wbuf_entry(struct ftl_wbuf_entry *entry) +{ + struct ftl_io_channel *io_channel = entry->ioch; + + if (!(entry->io_flags & FTL_IO_INTERNAL)) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + } + + spdk_ring_enqueue(io_channel->free_queue, (void **)&entry, 1, NULL); +} + +static struct ftl_batch * +ftl_get_next_batch(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; +#define FTL_DEQUEUE_ENTRIES 128 + struct ftl_wbuf_entry *entries[FTL_DEQUEUE_ENTRIES]; + TAILQ_HEAD(, ftl_io_channel) ioch_queue; + size_t i, num_dequeued, num_remaining; + uint64_t *metadata; + + if (batch == NULL) { + batch = TAILQ_FIRST(&dev->pending_batches); + if (batch != NULL) { + TAILQ_REMOVE(&dev->pending_batches, batch, tailq); + return batch; + } + + batch = TAILQ_FIRST(&dev->free_batches); + if (spdk_unlikely(batch == NULL)) { + return NULL; + } + + assert(TAILQ_EMPTY(&batch->entries)); + assert(batch->num_entries == 0); + TAILQ_REMOVE(&dev->free_batches, batch, tailq); + } + + /* + * Keep shifting the queue to ensure fairness in IO channel selection. Each time + * ftl_get_next_batch() is called, we're starting to dequeue write buffer entries from a + * different IO channel. 
+ */ + TAILQ_INIT(&ioch_queue); + while (!TAILQ_EMPTY(&dev->ioch_queue)) { + ioch = TAILQ_FIRST(&dev->ioch_queue); + TAILQ_REMOVE(&dev->ioch_queue, ioch, tailq); + TAILQ_INSERT_TAIL(&ioch_queue, ioch, tailq); + + num_remaining = dev->xfer_size - batch->num_entries; + while (num_remaining > 0) { + num_dequeued = spdk_ring_dequeue(ioch->submit_queue, (void **)entries, + spdk_min(num_remaining, + FTL_DEQUEUE_ENTRIES)); + if (num_dequeued == 0) { + break; + } + + for (i = 0; i < num_dequeued; ++i) { + batch->iov[batch->num_entries + i].iov_base = entries[i]->payload; + batch->iov[batch->num_entries + i].iov_len = FTL_BLOCK_SIZE; + + if (batch->metadata != NULL) { + metadata = (uint64_t *)((char *)batch->metadata + + i * dev->md_size); + *metadata = entries[i]->lba; + } + + TAILQ_INSERT_TAIL(&batch->entries, entries[i], tailq); + } + + batch->num_entries += num_dequeued; + num_remaining -= num_dequeued; + } + + if (num_remaining == 0) { + break; + } + } + + TAILQ_CONCAT(&dev->ioch_queue, &ioch_queue, tailq); + + if (batch->num_entries == dev->xfer_size) { + dev->current_batch = NULL; + } else { + dev->current_batch = batch; + batch = NULL; + } + + return batch; +} + +static void +ftl_release_batch(struct spdk_ftl_dev *dev, struct ftl_batch *batch) +{ + struct ftl_wbuf_entry *entry; + + while (!TAILQ_EMPTY(&batch->entries)) { + entry = TAILQ_FIRST(&batch->entries); + TAILQ_REMOVE(&batch->entries, entry, tailq); + ftl_release_wbuf_entry(entry); + } + + batch->num_entries = 0; + TAILQ_INSERT_TAIL(&dev->free_batches, batch, tailq); +} + +static struct ftl_wbuf_entry * +ftl_get_entry_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_io_channel *ioch; + uint64_t ioch_offset, entry_offset; + + ioch_offset = addr.cache_offset & ((1 << dev->ioch_shift) - 1); + entry_offset = addr.cache_offset >> dev->ioch_shift; + ioch = dev->ioch_array[ioch_offset]; + + assert(ioch_offset < dev->conf.max_io_channels); + assert(entry_offset < ioch->num_entries); + assert(addr.cached == 1); + + return &ioch->wbuf_entries[entry_offset]; +} + +static struct ftl_addr +ftl_get_addr_from_entry(struct ftl_wbuf_entry *entry) +{ + struct ftl_io_channel *ioch = entry->ioch; + struct ftl_addr addr = {}; + + addr.cached = 1; + addr.cache_offset = (uint64_t)entry->index << ioch->dev->ioch_shift | ioch->index; + + return addr; +} + +static void +ftl_io_cmpl_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_io *io = cb_arg; + struct spdk_ftl_dev *dev = io->dev; + + if (spdk_unlikely(!success)) { + io->status = -EIO; + } + + ftl_trace_completion(dev, io, FTL_TRACE_COMPLETION_DISK); + + if (io->type == FTL_IO_WRITE && ftl_is_append_supported(dev)) { + assert(io->parent); + io->parent->addr.offset = spdk_bdev_io_get_append_location(bdev_io); + } + + ftl_io_dec_req(io); + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_halt_writes(struct spdk_ftl_dev *dev, struct ftl_band *band) +{ + struct ftl_wptr *wptr = NULL; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + if (wptr->band == band) { + break; + } + } + + /* If the band already has the high_prio flag set, other writes must */ + /* have failed earlier, so it's already taken care of. 
*/ + if (band->high_prio) { + assert(wptr == NULL); + return; + } + + ftl_band_write_failed(band); + ftl_remove_wptr(wptr); +} + +static struct ftl_wptr * +ftl_wptr_from_band(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr = NULL; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + if (wptr->band == band) { + return wptr; + } + } + + return NULL; +} + +static void +ftl_md_write_fail(struct ftl_io *io, int status) +{ + struct ftl_band *band = io->band; + struct ftl_wptr *wptr; + char buf[128]; + + wptr = ftl_wptr_from_band(band); + assert(wptr); + + SPDK_ERRLOG("Metadata write failed @addr: %s, status: %d\n", + ftl_addr2str(wptr->addr, buf, sizeof(buf)), status); + + ftl_halt_writes(io->dev, band); +} + +static void +ftl_md_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_band *band = io->band; + struct ftl_wptr *wptr; + size_t id; + + wptr = ftl_wptr_from_band(band); + assert(wptr); + + if (status) { + ftl_md_write_fail(io, status); + return; + } + + ftl_band_set_next_state(band); + if (band->state == FTL_BAND_STATE_CLOSED) { + if (ftl_dev_has_nv_cache(dev)) { + pthread_spin_lock(&nv_cache->lock); + nv_cache->num_available += ftl_band_user_blocks(band); + + if (spdk_unlikely(nv_cache->num_available > nv_cache->num_data_blocks)) { + nv_cache->num_available = nv_cache->num_data_blocks; + } + pthread_spin_unlock(&nv_cache->lock); + } + + /* + * Go through the reloc_bitmap, checking for all the bands that had its data moved + * onto current band and update their counters to allow them to be used for writing + * (once they're closed and empty). + */ + for (id = 0; id < ftl_get_num_bands(dev); ++id) { + if (spdk_bit_array_get(band->reloc_bitmap, id)) { + assert(dev->bands[id].num_reloc_bands > 0); + dev->bands[id].num_reloc_bands--; + + spdk_bit_array_clear(band->reloc_bitmap, id); + } + } + + ftl_remove_wptr(wptr); + } +} + +static int +ftl_read_next_physical_addr(struct ftl_io *io, struct ftl_addr *addr) +{ + struct spdk_ftl_dev *dev = io->dev; + size_t num_blocks, max_blocks; + + assert(ftl_io_mode_physical(io)); + assert(io->iov_pos < io->iov_cnt); + + if (io->pos == 0) { + *addr = io->addr; + } else { + *addr = ftl_band_next_xfer_addr(io->band, io->addr, io->pos); + } + + assert(!ftl_addr_invalid(*addr)); + + /* Metadata has to be read in the way it's written (jumping across */ + /* the zones in xfer_size increments) */ + if (io->flags & FTL_IO_MD) { + max_blocks = dev->xfer_size - (addr->offset % dev->xfer_size); + num_blocks = spdk_min(ftl_io_iovec_len_left(io), max_blocks); + assert(addr->offset / dev->xfer_size == + (addr->offset + num_blocks - 1) / dev->xfer_size); + } else { + num_blocks = ftl_io_iovec_len_left(io); + } + + return num_blocks; +} + +static int +ftl_wptr_close_band(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + ftl_band_set_state(band, FTL_BAND_STATE_CLOSING); + + return ftl_band_write_tail_md(band, ftl_md_write_cb); +} + +static int +ftl_wptr_open_band(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + assert(ftl_band_zone_is_first(band, wptr->zone)); + assert(band->lba_map.num_vld == 0); + + ftl_band_clear_lba_map(band); + + assert(band->state == FTL_BAND_STATE_PREP); + ftl_band_set_state(band, FTL_BAND_STATE_OPENING); + + return ftl_band_write_head_md(band, ftl_md_write_cb); +} + +static int +ftl_submit_erase(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct 
ftl_band *band = io->band; + struct ftl_addr addr = io->addr; + struct ftl_io_channel *ioch; + struct ftl_zone *zone; + int rc = 0; + size_t i; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + for (i = 0; i < io->num_blocks; ++i) { + if (i != 0) { + zone = ftl_band_next_zone(band, ftl_band_zone_from_addr(band, addr)); + assert(zone->info.state == SPDK_BDEV_ZONE_STATE_FULL); + addr.offset = zone->info.zone_id; + } + + assert(ftl_addr_get_zone_offset(dev, addr) == 0); + + ftl_trace_submission(dev, io, addr, 1); + rc = spdk_bdev_zone_management(dev->base_bdev_desc, ioch->base_ioch, addr.offset, + SPDK_BDEV_ZONE_RESET, ftl_io_cmpl_cb, io); + if (spdk_unlikely(rc)) { + ftl_io_fail(io, rc); + SPDK_ERRLOG("Vector reset failed with status: %d\n", rc); + break; + } + + ftl_io_inc_req(io); + ftl_io_advance(io, 1); + } + + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + return rc; +} + +static bool +ftl_check_core_thread(const struct spdk_ftl_dev *dev) +{ + return dev->core_thread == spdk_get_thread(); +} + +struct spdk_io_channel * +ftl_get_io_channel(const struct spdk_ftl_dev *dev) +{ + if (ftl_check_core_thread(dev)) { + return dev->ioch; + } + + return NULL; +} + +static void +ftl_erase_fail(struct ftl_io *io, int status) +{ + struct ftl_zone *zone; + struct ftl_band *band = io->band; + char buf[128]; + + SPDK_ERRLOG("Erase failed at address: %s, status: %d\n", + ftl_addr2str(io->addr, buf, sizeof(buf)), status); + + zone = ftl_band_zone_from_addr(band, io->addr); + zone->info.state = SPDK_BDEV_ZONE_STATE_OFFLINE; + ftl_band_remove_zone(band, zone); + band->tail_md_addr = ftl_band_tail_md_addr(band); +} + +static void +ftl_zone_erase_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_zone *zone; + + zone = ftl_band_zone_from_addr(io->band, io->addr); + zone->busy = false; + + if (spdk_unlikely(status)) { + ftl_erase_fail(io, status); + return; + } + + zone->info.state = SPDK_BDEV_ZONE_STATE_EMPTY; + zone->info.write_pointer = zone->info.zone_id; +} + +static int +ftl_band_erase(struct ftl_band *band) +{ + struct ftl_zone *zone; + struct ftl_io *io; + int rc = 0; + + assert(band->state == FTL_BAND_STATE_CLOSED || + band->state == FTL_BAND_STATE_FREE); + + ftl_band_set_state(band, FTL_BAND_STATE_PREP); + + CIRCLEQ_FOREACH(zone, &band->zones, circleq) { + if (zone->info.state == SPDK_BDEV_ZONE_STATE_EMPTY) { + continue; + } + + io = ftl_io_erase_init(band, 1, ftl_zone_erase_cb); + if (!io) { + rc = -ENOMEM; + break; + } + + zone->busy = true; + io->addr.offset = zone->info.zone_id; + rc = ftl_submit_erase(io); + if (rc) { + zone->busy = false; + assert(0); + /* TODO: change band's state back to close? 
*/ + break; + } + } + + return rc; +} + +static struct ftl_band * +ftl_next_write_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + /* Find a free band that has all of its data moved onto other closed bands */ + LIST_FOREACH(band, &dev->free_bands, list_entry) { + assert(band->state == FTL_BAND_STATE_FREE); + if (band->num_reloc_bands == 0 && band->num_reloc_blocks == 0) { + break; + } + } + + if (spdk_unlikely(!band)) { + return NULL; + } + + if (ftl_band_erase(band)) { + /* TODO: handle erase failure */ + return NULL; + } + + return band; +} + +static struct ftl_band * +ftl_next_wptr_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + if (!dev->next_band) { + band = ftl_next_write_band(dev); + } else { + assert(dev->next_band->state == FTL_BAND_STATE_PREP); + band = dev->next_band; + dev->next_band = NULL; + } + + return band; +} + +static struct ftl_wptr * +ftl_wptr_init(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr; + + wptr = calloc(1, sizeof(*wptr)); + if (!wptr) { + return NULL; + } + + wptr->dev = dev; + wptr->band = band; + wptr->zone = CIRCLEQ_FIRST(&band->zones); + wptr->addr.offset = wptr->zone->info.zone_id; + TAILQ_INIT(&wptr->pending_queue); + + return wptr; +} + +static int +ftl_add_direct_wptr(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr; + + assert(band->state == FTL_BAND_STATE_OPEN); + + wptr = ftl_wptr_init(band); + if (!wptr) { + return -1; + } + + wptr->direct_mode = true; + + if (ftl_band_alloc_lba_map(band)) { + ftl_wptr_free(wptr); + return -1; + } + + LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: direct band %u\n", band->id); + ftl_trace_write_band(dev, band); + return 0; +} + +static void +ftl_close_direct_wptr(struct ftl_band *band) +{ + struct ftl_wptr *wptr = ftl_wptr_from_band(band); + + assert(wptr); + assert(wptr->direct_mode); + assert(band->state == FTL_BAND_STATE_CLOSED); + + ftl_band_release_lba_map(band); + + ftl_remove_wptr(wptr); +} + +int +ftl_band_set_direct_access(struct ftl_band *band, bool access) +{ + if (access) { + return ftl_add_direct_wptr(band); + } else { + ftl_close_direct_wptr(band); + return 0; + } +} + +static int +ftl_add_wptr(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + struct ftl_wptr *wptr; + + band = ftl_next_wptr_band(dev); + if (!band) { + return -1; + } + + wptr = ftl_wptr_init(band); + if (!wptr) { + return -1; + } + + if (ftl_band_write_prep(band)) { + ftl_wptr_free(wptr); + return -1; + } + + LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: band %u\n", band->id); + ftl_trace_write_band(dev, band); + return 0; +} + +static void +ftl_wptr_advance(struct ftl_wptr *wptr, size_t xfer_size) +{ + struct ftl_band *band = wptr->band; + struct spdk_ftl_dev *dev = wptr->dev; + struct spdk_ftl_conf *conf = &dev->conf; + size_t next_thld; + + if (spdk_unlikely(wptr->direct_mode)) { + return; + } + + wptr->offset += xfer_size; + next_thld = (ftl_band_num_usable_blocks(band) * conf->band_thld) / 100; + + if (ftl_band_full(band, wptr->offset)) { + ftl_band_set_state(band, FTL_BAND_STATE_FULL); + } + + wptr->zone->busy = true; + wptr->addr = ftl_band_next_xfer_addr(band, wptr->addr, xfer_size); + wptr->zone = ftl_band_next_operational_zone(band, wptr->zone); + + assert(!ftl_addr_invalid(wptr->addr)); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: pu:%lu band:%lu, offset:%lu\n", + ftl_addr_get_punit(dev, wptr->addr), + 
ftl_addr_get_band(dev, wptr->addr), + wptr->addr.offset); + + if (wptr->offset >= next_thld && !dev->next_band) { + dev->next_band = ftl_next_write_band(dev); + } +} + +static size_t +ftl_wptr_user_blocks_left(const struct ftl_wptr *wptr) +{ + return ftl_band_user_blocks_left(wptr->band, wptr->offset); +} + +static bool +ftl_wptr_ready(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + /* TODO: add handling of empty bands */ + + if (spdk_unlikely(!ftl_zone_is_writable(wptr->dev, wptr->zone))) { + /* Erasing band may fail after it was assigned to wptr. */ + if (spdk_unlikely(wptr->zone->info.state == SPDK_BDEV_ZONE_STATE_OFFLINE)) { + ftl_wptr_advance(wptr, wptr->dev->xfer_size); + } + return false; + } + + /* If we're in the process of writing metadata, wait till it is */ + /* completed. */ + /* TODO: we should probably change bands once we're writing tail md */ + if (ftl_band_state_changing(band)) { + return false; + } + + if (band->state == FTL_BAND_STATE_FULL) { + if (wptr->num_outstanding == 0) { + if (ftl_wptr_close_band(wptr)) { + /* TODO: need recovery here */ + assert(false); + } + } + + return false; + } + + if (band->state != FTL_BAND_STATE_OPEN) { + if (ftl_wptr_open_band(wptr)) { + /* TODO: need recovery here */ + assert(false); + } + + return false; + } + + return true; +} + +int +ftl_flush_active_bands(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_wptr *wptr; + struct ftl_band_flush *flush; + + assert(ftl_get_core_thread(dev) == spdk_get_thread()); + + flush = calloc(1, sizeof(*flush)); + if (spdk_unlikely(!flush)) { + return -ENOMEM; + } + + LIST_INSERT_HEAD(&dev->band_flush_list, flush, list_entry); + + flush->cb_fn = cb_fn; + flush->cb_arg = cb_arg; + flush->dev = dev; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + wptr->flush = true; + flush->num_bands++; + } + + return 0; +} + +static const struct spdk_ftl_limit * +ftl_get_limit(const struct spdk_ftl_dev *dev, int type) +{ + assert(type < SPDK_FTL_LIMIT_MAX); + return &dev->conf.limits[type]; +} + +static bool +ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) +{ + struct ftl_addr addr; + + /* If the LBA is invalid don't bother checking the md and l2p */ + if (spdk_unlikely(entry->lba == FTL_LBA_INVALID)) { + return false; + } + + addr = ftl_l2p_get(dev, entry->lba); + if (!(ftl_addr_cached(addr) && entry == ftl_get_entry_from_addr(dev, addr))) { + return false; + } + + return true; +} + +void +ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) +{ + pthread_spin_lock(&entry->lock); + + if (!entry->valid) { + goto unlock; + } + + /* If the l2p wasn't updated and still points at the entry, fill it with the */ + /* on-disk address and clear the cache status bit. Otherwise, skip the l2p update */ + /* and just clear the cache status. 
*/ + if (!ftl_cache_lba_valid(dev, entry)) { + goto clear; + } + + ftl_l2p_set(dev, entry->lba, entry->addr); +clear: + entry->valid = false; +unlock: + pthread_spin_unlock(&entry->lock); +} + +static void +ftl_pad_wbuf(struct spdk_ftl_dev *dev, size_t size) +{ + struct ftl_wbuf_entry *entry; + struct ftl_io_channel *ioch; + int flags = FTL_IO_PAD | FTL_IO_INTERNAL; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + for (size_t i = 0; i < size; ++i) { + entry = ftl_acquire_wbuf_entry(ioch, flags); + if (!entry) { + break; + } + + entry->lba = FTL_LBA_INVALID; + entry->addr = ftl_to_addr(FTL_ADDR_INVALID); + memset(entry->payload, 0, FTL_BLOCK_SIZE); + + spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL); + } +} + +static void +ftl_remove_free_bands(struct spdk_ftl_dev *dev) +{ + while (!LIST_EMPTY(&dev->free_bands)) { + LIST_REMOVE(LIST_FIRST(&dev->free_bands), list_entry); + } + + dev->next_band = NULL; +} + +static void +ftl_wptr_pad_band(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size, pad_size, blocks_left; + + size = batch != NULL ? batch->num_entries : 0; + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + blocks_left = ftl_wptr_user_blocks_left(wptr); + assert(size <= blocks_left); + assert(blocks_left % dev->xfer_size == 0); + pad_size = spdk_min(blocks_left - size, spdk_ring_count(ioch->free_queue)); + + ftl_pad_wbuf(dev, pad_size); +} + +static void +ftl_wptr_process_shutdown(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size; + + size = batch != NULL ? 
batch->num_entries : 0; + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + if (size >= dev->xfer_size) { + return; + } + + /* If we reach this point we need to remove free bands */ + /* and pad current wptr band to the end */ + ftl_remove_free_bands(dev); + ftl_wptr_pad_band(wptr); +} + +static int +ftl_shutdown_complete(struct spdk_ftl_dev *dev) +{ + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(dev->ioch); + + return !__atomic_load_n(&dev->num_inflight, __ATOMIC_SEQ_CST) && + dev->num_io_channels == 1 && LIST_EMPTY(&dev->wptr_list) && + TAILQ_EMPTY(&ioch->retry_queue); +} + +void +ftl_apply_limits(struct spdk_ftl_dev *dev) +{ + const struct spdk_ftl_limit *limit; + struct ftl_io_channel *ioch; + struct ftl_stats *stats = &dev->stats; + uint32_t qdepth_limit = 100; + int i; + + /* Clear existing limit */ + dev->limit = SPDK_FTL_LIMIT_MAX; + + for (i = SPDK_FTL_LIMIT_CRIT; i < SPDK_FTL_LIMIT_MAX; ++i) { + limit = ftl_get_limit(dev, i); + + if (dev->num_free <= limit->thld) { + qdepth_limit = limit->limit; + stats->limits[i]++; + dev->limit = i; + break; + } + } + + ftl_trace_limits(dev, dev->limit, dev->num_free); + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + __atomic_store_n(&ioch->qdepth_limit, (qdepth_limit * ioch->num_entries) / 100, + __ATOMIC_SEQ_CST); + } +} + +static int +ftl_invalidate_addr_unlocked(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_band *band = ftl_band_from_addr(dev, addr); + struct ftl_lba_map *lba_map = &band->lba_map; + uint64_t offset; + + offset = ftl_band_block_offset_from_addr(band, addr); + + /* The bit might be already cleared if two writes are scheduled to the */ + /* same LBA at the same time */ + if (spdk_bit_array_get(lba_map->vld, offset)) { + assert(lba_map->num_vld > 0); + spdk_bit_array_clear(lba_map->vld, offset); + lba_map->num_vld--; + return 1; + } + + return 0; +} + +int +ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_band *band; + int rc; + + assert(!ftl_addr_cached(addr)); + band = ftl_band_from_addr(dev, addr); + + pthread_spin_lock(&band->lba_map.lock); + rc = ftl_invalidate_addr_unlocked(dev, addr); + pthread_spin_unlock(&band->lba_map.lock); + + return rc; +} + +static int +ftl_read_retry(int rc) +{ + return rc == -EAGAIN; +} + +static int +ftl_read_canceled(int rc) +{ + return rc == -EFAULT || rc == 0; +} + +static int +ftl_cache_read(struct ftl_io *io, uint64_t lba, + struct ftl_addr addr, void *buf) +{ + struct ftl_wbuf_entry *entry; + struct ftl_addr naddr; + int rc = 0; + + entry = ftl_get_entry_from_addr(io->dev, addr); + pthread_spin_lock(&entry->lock); + + naddr = ftl_l2p_get(io->dev, lba); + if (addr.offset != naddr.offset) { + rc = -1; + goto out; + } + + memcpy(buf, entry->payload, FTL_BLOCK_SIZE); +out: + pthread_spin_unlock(&entry->lock); + return rc; +} + +static int +ftl_read_next_logical_addr(struct ftl_io *io, struct ftl_addr *addr) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_addr next_addr; + size_t i; + + *addr = ftl_l2p_get(dev, ftl_io_current_lba(io)); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Read addr:%lx, lba:%lu\n", + addr->offset, ftl_io_current_lba(io)); + + /* If the address is invalid, skip it (the buffer should already be zero'ed) */ + if (ftl_addr_invalid(*addr)) { + return -EFAULT; + } + + if (ftl_addr_cached(*addr)) { + if (!ftl_cache_read(io, ftl_io_current_lba(io), *addr, ftl_io_iovec_addr(io))) { + return 0; + } + + /* If the state changed, we have to re-read the l2p */ + return 
-EAGAIN; + } + + for (i = 1; i < ftl_io_iovec_len_left(io); ++i) { + next_addr = ftl_l2p_get(dev, ftl_io_get_lba(io, io->pos + i)); + + if (ftl_addr_invalid(next_addr) || ftl_addr_cached(next_addr)) { + break; + } + + if (addr->offset + i != next_addr.offset) { + break; + } + } + + return i; +} + +static int +ftl_submit_read(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_addr addr; + int rc = 0, num_blocks; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + assert(LIST_EMPTY(&io->children)); + + while (io->pos < io->num_blocks) { + if (ftl_io_mode_physical(io)) { + num_blocks = rc = ftl_read_next_physical_addr(io, &addr); + } else { + num_blocks = rc = ftl_read_next_logical_addr(io, &addr); + } + + /* We might need to retry the read from scratch (e.g. */ + /* because write was under way and completed before */ + /* we could read it from the write buffer */ + if (ftl_read_retry(rc)) { + continue; + } + + /* We don't have to schedule the read, as it was read from cache */ + if (ftl_read_canceled(rc)) { + ftl_io_advance(io, 1); + ftl_trace_completion(io->dev, io, rc ? FTL_TRACE_COMPLETION_INVALID : + FTL_TRACE_COMPLETION_CACHE); + rc = 0; + continue; + } + + assert(num_blocks > 0); + + ftl_trace_submission(dev, io, addr, num_blocks); + rc = spdk_bdev_read_blocks(dev->base_bdev_desc, ioch->base_ioch, + ftl_io_iovec_addr(io), + addr.offset, + num_blocks, ftl_io_cmpl_cb, io); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + rc = 0; + } else { + ftl_io_fail(io, rc); + } + break; + } + + ftl_io_inc_req(io); + ftl_io_advance(io, num_blocks); + } + + /* If we didn't have to read anything from the device, */ + /* complete the request right away */ + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + return rc; +} + +static void +ftl_complete_flush(struct ftl_flush *flush) +{ + assert(flush->num_req == 0); + LIST_REMOVE(flush, list_entry); + + flush->cb.fn(flush->cb.ctx, 0); + + spdk_bit_array_free(&flush->bmap); + free(flush); +} + +static void +ftl_process_flush(struct spdk_ftl_dev *dev, struct ftl_batch *batch) +{ + struct ftl_flush *flush, *tflush; + size_t offset; + + LIST_FOREACH_SAFE(flush, &dev->flush_list, list_entry, tflush) { + offset = batch->index; + + if (spdk_bit_array_get(flush->bmap, offset)) { + spdk_bit_array_clear(flush->bmap, offset); + if (!(--flush->num_req)) { + ftl_complete_flush(flush); + } + } + } +} + +static void +ftl_nv_cache_wrap_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache *nv_cache = cb_arg; + + if (!success) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + /* TODO: go into read-only mode */ + assert(0); + } + + pthread_spin_lock(&nv_cache->lock); + nv_cache->ready = true; + pthread_spin_unlock(&nv_cache->lock); + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_nv_cache_wrap(void *ctx) +{ + struct ftl_nv_cache *nv_cache = ctx; + int rc; + + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_wrap_cb, nv_cache); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + /* TODO: go into read-only mode */ + assert(0); + } +} + +static uint64_t +ftl_reserve_nv_cache(struct ftl_nv_cache *nv_cache, size_t *num_blocks, unsigned int *phase) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + 
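/*
 * Illustrative sketch, not part of the patch above: when the L2P lookup does not
 * resolve to the write-buffer cache, ftl_read_next_logical_addr() returns how many
 * logical blocks starting at the current LBA map to physically contiguous
 * addresses, so ftl_submit_read() can cover the whole run with a single
 * spdk_bdev_read_blocks() call. A self-contained model of that run detection
 * (the real helper additionally stops at invalid or cache-resident addresses;
 * the function name and array-based interface are hypothetical, and the caller
 * is assumed to pass cnt >= 1, as the original does):
 *
 *	static size_t
 *	contiguous_run_len(const uint64_t *phys_offset, size_t cnt)
 *	{
 *		size_t i;
 *
 *		for (i = 1; i < cnt; ++i) {
 *			if (phys_offset[i] != phys_offset[0] + i) {
 *				break;
 *			}
 *		}
 *
 *		return i;
 *	}
 */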
uint64_t num_available, cache_size, cache_addr = FTL_LBA_INVALID; + + cache_size = spdk_bdev_get_num_blocks(bdev); + + pthread_spin_lock(&nv_cache->lock); + if (spdk_unlikely(nv_cache->num_available == 0 || !nv_cache->ready)) { + goto out; + } + + num_available = spdk_min(nv_cache->num_available, *num_blocks); + num_available = spdk_min(num_available, dev->conf.nv_cache.max_request_cnt); + + if (spdk_unlikely(nv_cache->current_addr + num_available > cache_size)) { + *num_blocks = cache_size - nv_cache->current_addr; + } else { + *num_blocks = num_available; + } + + cache_addr = nv_cache->current_addr; + nv_cache->current_addr += *num_blocks; + nv_cache->num_available -= *num_blocks; + *phase = nv_cache->phase; + + if (nv_cache->current_addr == spdk_bdev_get_num_blocks(bdev)) { + nv_cache->current_addr = FTL_NV_CACHE_DATA_OFFSET; + nv_cache->phase = ftl_nv_cache_next_phase(nv_cache->phase); + nv_cache->ready = false; + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_nv_cache_wrap, nv_cache); + } +out: + pthread_spin_unlock(&nv_cache->lock); + return cache_addr; +} + +static struct ftl_io * +ftl_alloc_io_nv_cache(struct ftl_io *parent, size_t num_blocks) +{ + struct ftl_io_init_opts opts = { + .dev = parent->dev, + .parent = parent, + .iovcnt = 0, + .num_blocks = num_blocks, + .flags = parent->flags | FTL_IO_CACHE, + }; + + return ftl_io_init_internal(&opts); +} + +static void +ftl_nv_cache_submit_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_io *io = cb_arg; + struct ftl_nv_cache *nv_cache = &io->dev->nv_cache; + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Non-volatile cache write failed at %"PRIx64"\n", io->addr.offset); + io->status = -EIO; + } + + ftl_io_dec_req(io); + if (ftl_io_done(io)) { + spdk_mempool_put(nv_cache->md_pool, io->md); + ftl_io_complete(io); + } + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_submit_nv_cache(void *ctx) +{ + struct ftl_io *io = ctx; + struct spdk_ftl_dev *dev = io->dev; + struct spdk_thread *thread; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_io_channel *ioch; + int rc; + + ioch = ftl_io_channel_get_ctx(io->ioch); + thread = spdk_io_channel_get_thread(io->ioch); + + rc = spdk_bdev_write_blocks_with_md(nv_cache->bdev_desc, ioch->cache_ioch, + ftl_io_iovec_addr(io), io->md, io->addr.offset, + io->num_blocks, ftl_nv_cache_submit_cb, io); + if (rc == -ENOMEM) { + spdk_thread_send_msg(thread, ftl_submit_nv_cache, io); + return; + } else if (rc) { + SPDK_ERRLOG("Write to persistent cache failed: %s (%"PRIu64", %"PRIu64")\n", + spdk_strerror(-rc), io->addr.offset, io->num_blocks); + spdk_mempool_put(nv_cache->md_pool, io->md); + io->status = -EIO; + ftl_io_complete(io); + return; + } + + ftl_io_advance(io, io->num_blocks); + ftl_io_inc_req(io); +} + +static void +ftl_nv_cache_fill_md(struct ftl_io *io, unsigned int phase) +{ + struct spdk_bdev *bdev; + struct ftl_nv_cache *nv_cache = &io->dev->nv_cache; + uint64_t block_off, lba; + void *md_buf = io->md; + + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + for (block_off = 0; block_off < io->num_blocks; ++block_off) { + lba = ftl_nv_cache_pack_lba(ftl_io_get_lba(io, block_off), phase); + memcpy(md_buf, &lba, sizeof(lba)); + md_buf += spdk_bdev_get_md_size(bdev); + } +} + +static void +_ftl_write_nv_cache(void *ctx) +{ + struct ftl_io *child, *io = ctx; + struct spdk_ftl_dev *dev = io->dev; + struct spdk_thread *thread; + unsigned int phase; + uint64_t num_blocks; + + thread = spdk_io_channel_get_thread(io->ioch); + + while (io->pos < 
io->num_blocks) { + num_blocks = ftl_io_iovec_len_left(io); + + child = ftl_alloc_io_nv_cache(io, num_blocks); + if (spdk_unlikely(!child)) { + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + return; + } + + child->md = spdk_mempool_get(dev->nv_cache.md_pool); + if (spdk_unlikely(!child->md)) { + ftl_io_free(child); + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + break; + } + + /* Reserve area on the write buffer cache */ + child->addr.offset = ftl_reserve_nv_cache(&dev->nv_cache, &num_blocks, &phase); + if (child->addr.offset == FTL_LBA_INVALID) { + spdk_mempool_put(dev->nv_cache.md_pool, child->md); + ftl_io_free(child); + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + break; + } + + /* Shrink the IO if there isn't enough room in the cache to fill the whole iovec */ + if (spdk_unlikely(num_blocks != ftl_io_iovec_len_left(io))) { + ftl_io_shrink_iovec(child, num_blocks); + } + + ftl_nv_cache_fill_md(child, phase); + ftl_submit_nv_cache(child); + } + + if (ftl_io_done(io)) { + ftl_io_complete(io); + } +} + +static void +ftl_write_nv_cache(struct ftl_io *parent) +{ + ftl_io_reset(parent); + parent->flags |= FTL_IO_CACHE; + _ftl_write_nv_cache(parent); +} + +int +ftl_nv_cache_write_header(struct ftl_nv_cache *nv_cache, bool shutdown, + spdk_bdev_io_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct ftl_nv_cache_header *hdr = nv_cache->dma_buf; + struct spdk_bdev *bdev; + struct ftl_io_channel *ioch; + + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + memset(hdr, 0, spdk_bdev_get_block_size(bdev)); + + hdr->phase = (uint8_t)nv_cache->phase; + hdr->size = spdk_bdev_get_num_blocks(bdev); + hdr->uuid = dev->uuid; + hdr->version = FTL_NV_CACHE_HEADER_VERSION; + hdr->current_addr = shutdown ? 
nv_cache->current_addr : FTL_LBA_INVALID; + hdr->checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0); + + return spdk_bdev_write_blocks(nv_cache->bdev_desc, ioch->cache_ioch, hdr, 0, 1, + cb_fn, cb_arg); +} + +int +ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct ftl_io_channel *ioch; + struct spdk_bdev *bdev; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + return spdk_bdev_write_zeroes_blocks(nv_cache->bdev_desc, ioch->cache_ioch, 1, + spdk_bdev_get_num_blocks(bdev) - 1, + cb_fn, cb_arg); +} + +static void +ftl_write_fail(struct ftl_io *io, int status) +{ + struct ftl_batch *batch = io->batch; + struct spdk_ftl_dev *dev = io->dev; + struct ftl_wbuf_entry *entry; + struct ftl_band *band; + char buf[128]; + + entry = TAILQ_FIRST(&batch->entries); + + band = ftl_band_from_addr(io->dev, entry->addr); + SPDK_ERRLOG("Write failed @addr: %s, status: %d\n", + ftl_addr2str(entry->addr, buf, sizeof(buf)), status); + + /* Close the band and, halt wptr and defrag */ + ftl_halt_writes(dev, band); + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + /* Invalidate meta set by process_writes() */ + ftl_invalidate_addr(dev, entry->addr); + } + + /* Reset the batch back to the write buffer to resend it later */ + TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq); +} + +static void +ftl_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_batch *batch = io->batch; + struct ftl_wbuf_entry *entry; + struct ftl_band *band; + struct ftl_addr prev_addr, addr = io->addr; + + if (status) { + ftl_write_fail(io, status); + return; + } + + assert(io->num_blocks == dev->xfer_size); + assert(!(io->flags & FTL_IO_MD)); + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + band = entry->band; + if (!(entry->io_flags & FTL_IO_PAD)) { + /* Verify that the LBA is set for user blocks */ + assert(entry->lba != FTL_LBA_INVALID); + } + + if (band != NULL) { + assert(band->num_reloc_blocks > 0); + band->num_reloc_blocks--; + } + + entry->addr = addr; + if (entry->lba != FTL_LBA_INVALID) { + pthread_spin_lock(&entry->lock); + prev_addr = ftl_l2p_get(dev, entry->lba); + + /* If the l2p was updated in the meantime, don't update band's metadata */ + if (ftl_addr_cached(prev_addr) && + entry == ftl_get_entry_from_addr(dev, prev_addr)) { + /* Setting entry's cache bit needs to be done after metadata */ + /* within the band is updated to make sure that writes */ + /* invalidating the entry clear the metadata as well */ + ftl_band_set_addr(io->band, entry->lba, entry->addr); + entry->valid = true; + } + pthread_spin_unlock(&entry->lock); + } + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lu, lba:%lu\n", + entry->addr.offset, entry->lba); + + addr = ftl_band_next_addr(io->band, addr, 1); + } + + ftl_process_flush(dev, batch); + ftl_release_batch(dev, batch); +} + +static void +ftl_update_stats(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry) +{ + if (!(entry->io_flags & FTL_IO_INTERNAL)) { + dev->stats.write_user++; + } + dev->stats.write_total++; +} + +static void +ftl_update_l2p(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry, + struct ftl_addr addr) +{ + struct ftl_addr prev_addr; + struct ftl_wbuf_entry *prev; + struct ftl_band *band; + int valid; + bool io_weak = entry->io_flags & FTL_IO_WEAK; + + 
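/*
 * Illustrative sketch, not part of the patch: the checksum written by
 * ftl_nv_cache_write_header() above covers every field up to, but not including,
 * the checksum member itself, which is why that member has to stay last in
 * struct ftl_nv_cache_header. A recovered header can be validated the same way;
 * the helper name below is hypothetical:
 *
 *	static bool
 *	nv_cache_header_valid(const struct ftl_nv_cache_header *hdr)
 *	{
 *		uint32_t crc;
 *
 *		crc = spdk_crc32c_update(hdr,
 *					 offsetof(struct ftl_nv_cache_header, checksum), 0);
 *
 *		return crc == hdr->checksum &&
 *		       hdr->version == FTL_NV_CACHE_HEADER_VERSION;
 *	}
 */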
prev_addr = ftl_l2p_get(dev, entry->lba); + if (ftl_addr_invalid(prev_addr)) { + ftl_l2p_set(dev, entry->lba, addr); + return; + } + + if (ftl_addr_cached(prev_addr)) { + prev = ftl_get_entry_from_addr(dev, prev_addr); + pthread_spin_lock(&prev->lock); + + /* Re-read the L2P under the lock to protect against updates */ + /* to this LBA from other threads */ + prev_addr = ftl_l2p_get(dev, entry->lba); + + /* If the entry is no longer in cache, another write has been */ + /* scheduled in the meantime, so we can return to evicted path */ + if (!ftl_addr_cached(prev_addr)) { + pthread_spin_unlock(&prev->lock); + goto evicted; + } + + /* + * Relocating block could still reside in cache due to fact that write + * buffers are independent for each IO channel and enough amount of data + * (write unit size) must be collected before it will be submitted to lower + * layer. + * When previous entry wasn't overwritten invalidate old address and entry. + * Otherwise skip relocating block. + */ + if (io_weak && + /* Check if prev_addr was updated in meantime */ + !(ftl_addr_cmp(prev_addr, ftl_get_addr_from_entry(prev)) && + /* Check if relocating address it the same as in previous entry */ + ftl_addr_cmp(prev->addr, entry->addr))) { + pthread_spin_unlock(&prev->lock); + return; + } + + /* + * If previous entry is part of cache and was written into disk remove + * and invalidate it + */ + if (prev->valid) { + ftl_invalidate_addr(dev, prev->addr); + prev->valid = false; + } + + ftl_l2p_set(dev, entry->lba, addr); + pthread_spin_unlock(&prev->lock); + return; + } + +evicted: + /* + * If the L2P's physical address is different than what we expected we don't need to + * do anything (someone's already overwritten our data). + */ + if (io_weak && !ftl_addr_cmp(prev_addr, entry->addr)) { + return; + } + + /* Lock the band containing previous physical address. This assures atomic changes to */ + /* the L2P as wall as metadata. The valid bits in metadata are used to */ + /* check weak writes validity. */ + band = ftl_band_from_addr(dev, prev_addr); + pthread_spin_lock(&band->lba_map.lock); + + valid = ftl_invalidate_addr_unlocked(dev, prev_addr); + + /* If the address has been invalidated already, we don't want to update */ + /* the L2P for weak writes, as it means the write is no longer valid. 
*/ + if (!io_weak || valid) { + ftl_l2p_set(dev, entry->lba, addr); + } + + pthread_spin_unlock(&band->lba_map.lock); +} + +static struct ftl_io * +ftl_io_init_child_write(struct ftl_io *parent, struct ftl_addr addr, ftl_io_fn cb) +{ + struct ftl_io *io; + struct spdk_ftl_dev *dev = parent->dev; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .parent = parent, + .band = parent->band, + .size = sizeof(struct ftl_io), + .flags = 0, + .type = parent->type, + .num_blocks = dev->xfer_size, + .cb_fn = cb, + .iovcnt = 0, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->addr = addr; + + return io; +} + +static void +ftl_io_child_write_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_zone *zone; + struct ftl_wptr *wptr; + + zone = ftl_band_zone_from_addr(io->band, io->addr); + wptr = ftl_wptr_from_band(io->band); + + zone->busy = false; + zone->info.write_pointer += io->num_blocks; + + if (zone->info.write_pointer == zone->info.zone_id + zone->info.capacity) { + zone->info.state = SPDK_BDEV_ZONE_STATE_FULL; + } + + /* If some other write on the same band failed the write pointer would already be freed */ + if (spdk_likely(wptr)) { + wptr->num_outstanding--; + } +} + +static int +ftl_submit_child_write(struct ftl_wptr *wptr, struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_io *child; + struct ftl_addr addr; + int rc; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + if (spdk_likely(!wptr->direct_mode)) { + addr = wptr->addr; + } else { + assert(io->flags & FTL_IO_DIRECT_ACCESS); + assert(ftl_addr_get_band(dev, io->addr) == wptr->band->id); + addr = io->addr; + } + + /* Split IO to child requests and release zone immediately after child is completed */ + child = ftl_io_init_child_write(io, addr, ftl_io_child_write_cb); + if (!child) { + return -EAGAIN; + } + + wptr->num_outstanding++; + + if (ftl_is_append_supported(dev)) { + rc = spdk_bdev_zone_appendv(dev->base_bdev_desc, ioch->base_ioch, + child->iov, child->iov_cnt, + ftl_addr_get_zone_slba(dev, addr), + dev->xfer_size, ftl_io_cmpl_cb, child); + } else { + rc = spdk_bdev_writev_blocks(dev->base_bdev_desc, ioch->base_ioch, + child->iov, child->iov_cnt, addr.offset, + dev->xfer_size, ftl_io_cmpl_cb, child); + } + + if (rc) { + wptr->num_outstanding--; + ftl_io_fail(child, rc); + ftl_io_complete(child); + SPDK_ERRLOG("spdk_bdev_write_blocks_with_md failed with status:%d, addr:%lu\n", + rc, addr.offset); + return -EIO; + } + + ftl_io_inc_req(child); + ftl_io_advance(child, dev->xfer_size); + + return 0; +} + +static int +ftl_submit_write(struct ftl_wptr *wptr, struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + int rc = 0; + + assert(io->num_blocks % dev->xfer_size == 0); + + while (io->iov_pos < io->iov_cnt) { + /* There are no guarantees of the order of completion of NVMe IO submission queue */ + /* so wait until zone is not busy before submitting another write */ + if (!ftl_is_append_supported(dev) && wptr->zone->busy) { + TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry); + rc = -EAGAIN; + break; + } + + rc = ftl_submit_child_write(wptr, io); + if (spdk_unlikely(rc)) { + if (rc == -EAGAIN) { + TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry); + } else { + ftl_io_fail(io, rc); + } + break; + } + + ftl_trace_submission(dev, io, wptr->addr, dev->xfer_size); + ftl_wptr_advance(wptr, dev->xfer_size); + } + + if (ftl_io_done(io)) { + /* Parent IO will complete after all children are completed */ + 
ftl_io_complete(io); + } + + return rc; +} + +static void +ftl_flush_pad_batch(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size = 0, num_entries = 0; + + assert(batch != NULL); + assert(batch->num_entries < dev->xfer_size); + + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + num_entries = dev->xfer_size - batch->num_entries; + if (size < num_entries) { + ftl_pad_wbuf(dev, num_entries - size); + } +} + +static bool +ftl_check_io_channel_flush(struct spdk_ftl_dev *dev) +{ + struct ftl_io_channel *ioch; + + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + if (ioch->flush && spdk_ring_count(ioch->free_queue) != ioch->num_entries) { + return true; + } + } + + return false; +} + +static int +ftl_wptr_process_writes(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch; + struct ftl_wbuf_entry *entry; + struct ftl_io *io; + + if (spdk_unlikely(!TAILQ_EMPTY(&wptr->pending_queue))) { + io = TAILQ_FIRST(&wptr->pending_queue); + TAILQ_REMOVE(&wptr->pending_queue, io, ioch_entry); + + if (ftl_submit_write(wptr, io) == -EAGAIN) { + return 0; + } + } + + /* Make sure the band is prepared for writing */ + if (!ftl_wptr_ready(wptr)) { + return 0; + } + + if (dev->halt) { + ftl_wptr_process_shutdown(wptr); + } + + if (spdk_unlikely(wptr->flush)) { + ftl_wptr_pad_band(wptr); + } + + batch = ftl_get_next_batch(dev); + if (!batch) { + /* If there are queued flush requests we need to pad the write buffer to */ + /* force out remaining entries */ + if (!LIST_EMPTY(&dev->flush_list) || ftl_check_io_channel_flush(dev)) { + ftl_flush_pad_batch(dev); + } + + return 0; + } + + io = ftl_io_wbuf_init(dev, wptr->addr, wptr->band, batch, ftl_write_cb); + if (!io) { + goto error; + } + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + /* Update band's relocation stats if the IO comes from reloc */ + if (entry->io_flags & FTL_IO_WEAK) { + if (!spdk_bit_array_get(wptr->band->reloc_bitmap, entry->band->id)) { + spdk_bit_array_set(wptr->band->reloc_bitmap, entry->band->id); + entry->band->num_reloc_bands++; + } + } + + ftl_trace_wbuf_pop(dev, entry); + ftl_update_stats(dev, entry); + } + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lx\n", wptr->addr.offset); + + if (ftl_submit_write(wptr, io)) { + /* TODO: we need some recovery here */ + assert(0 && "Write submit failed"); + if (ftl_io_done(io)) { + ftl_io_free(io); + } + } + + return dev->xfer_size; +error: + TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq); + return 0; +} + +static int +ftl_process_writes(struct spdk_ftl_dev *dev) +{ + struct ftl_wptr *wptr, *twptr; + size_t num_active = 0; + enum ftl_band_state state; + + LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) { + ftl_wptr_process_writes(wptr); + state = wptr->band->state; + + if (state != FTL_BAND_STATE_FULL && + state != FTL_BAND_STATE_CLOSING && + state != FTL_BAND_STATE_CLOSED) { + num_active++; + } + } + + if (num_active < 1) { + ftl_add_wptr(dev); + } + + return 0; +} + +static void +ftl_fill_wbuf_entry(struct ftl_wbuf_entry *entry, struct ftl_io *io) +{ + memcpy(entry->payload, ftl_io_iovec_addr(io), FTL_BLOCK_SIZE); + + if (entry->io_flags & FTL_IO_WEAK) { + entry->band = ftl_band_from_addr(io->dev, io->addr); + entry->addr = ftl_band_next_addr(entry->band, io->addr, io->pos); + entry->band->num_reloc_blocks++; + } + + entry->trace = io->trace; + entry->lba = ftl_io_current_lba(io); +} + +static int +ftl_wbuf_fill(struct 
ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_wbuf_entry *entry; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + while (io->pos < io->num_blocks) { + if (ftl_io_current_lba(io) == FTL_LBA_INVALID) { + ftl_io_advance(io, 1); + continue; + } + + entry = ftl_acquire_wbuf_entry(ioch, io->flags); + if (!entry) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + return 0; + } + + ftl_fill_wbuf_entry(entry, io); + + ftl_trace_wbuf_fill(dev, io); + ftl_update_l2p(dev, entry, ftl_get_addr_from_entry(entry)); + ftl_io_advance(io, 1); + + /* Needs to be done after L2P is updated to avoid race with */ + /* write completion callback when it's processed faster than */ + /* L2P is set in update_l2p(). */ + spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL); + } + + if (ftl_io_done(io)) { + if (ftl_dev_has_nv_cache(dev) && !(io->flags & FTL_IO_BYPASS_CACHE)) { + ftl_write_nv_cache(io); + } else { + TAILQ_INSERT_TAIL(&ioch->write_cmpl_queue, io, ioch_entry); + } + } + + return 0; +} + +static bool +ftl_dev_needs_defrag(struct spdk_ftl_dev *dev) +{ + const struct spdk_ftl_limit *limit = ftl_get_limit(dev, SPDK_FTL_LIMIT_START); + + if (ftl_reloc_is_halted(dev->reloc)) { + return false; + } + + if (ftl_reloc_is_defrag_active(dev->reloc)) { + return false; + } + + if (dev->num_free <= limit->thld) { + return true; + } + + return false; +} + +static double +ftl_band_calc_merit(struct ftl_band *band, size_t *threshold_valid) +{ + size_t usable, valid, invalid; + double vld_ratio; + + /* If the band doesn't have any usable blocks it's of no use */ + usable = ftl_band_num_usable_blocks(band); + if (usable == 0) { + return 0.0; + } + + valid = threshold_valid ? (usable - *threshold_valid) : band->lba_map.num_vld; + invalid = usable - valid; + + /* Add one to avoid division by 0 */ + vld_ratio = (double)invalid / (double)(valid + 1); + return vld_ratio * ftl_band_age(band); +} + +static bool +ftl_band_needs_defrag(struct ftl_band *band, struct spdk_ftl_dev *dev) +{ + struct spdk_ftl_conf *conf = &dev->conf; + size_t thld_vld; + + /* If we're in dire need of free bands, every band is worth defragging */ + if (ftl_current_limit(dev) == SPDK_FTL_LIMIT_CRIT) { + return true; + } + + thld_vld = (ftl_band_num_usable_blocks(band) * conf->invalid_thld) / 100; + + return band->merit > ftl_band_calc_merit(band, &thld_vld); +} + +static struct ftl_band * +ftl_select_defrag_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *mband = NULL; + double merit = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + assert(band->state == FTL_BAND_STATE_CLOSED); + band->merit = ftl_band_calc_merit(band, NULL); + if (band->merit > merit) { + merit = band->merit; + mband = band; + } + } + + if (mband && !ftl_band_needs_defrag(mband, dev)) { + mband = NULL; + } + + return mband; +} + +static void +ftl_process_relocs(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + if (ftl_dev_needs_defrag(dev)) { + band = ftl_select_defrag_band(dev); + if (band) { + ftl_reloc_add(dev->reloc, band, 0, ftl_get_num_blocks_in_band(dev), 0, true); + ftl_trace_defrag_band(dev, band); + } + } + + ftl_reloc(dev->reloc); +} + +int +ftl_current_limit(const struct spdk_ftl_dev *dev) +{ + return dev->limit; +} + +void +spdk_ftl_dev_get_attrs(const struct spdk_ftl_dev *dev, struct spdk_ftl_attrs *attrs) +{ + attrs->uuid = dev->uuid; + attrs->num_blocks = dev->num_lbas; + attrs->block_size = FTL_BLOCK_SIZE; + attrs->num_zones = ftl_get_num_zones(dev); + attrs->zone_size = 
ftl_get_num_blocks_in_zone(dev); + attrs->conf = dev->conf; + attrs->base_bdev = spdk_bdev_get_name(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); + + attrs->cache_bdev = NULL; + if (dev->nv_cache.bdev_desc) { + attrs->cache_bdev = spdk_bdev_get_name( + spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc)); + } +} + +static void +_ftl_io_write(void *ctx) +{ + ftl_io_write((struct ftl_io *)ctx); +} + +static int +ftl_submit_write_leaf(struct ftl_io *io) +{ + int rc; + + rc = ftl_submit_write(ftl_wptr_from_band(io->band), io); + if (rc == -EAGAIN) { + /* EAGAIN means that the request was put on the pending queue */ + return 0; + } + + return rc; +} + +void +ftl_io_write(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(io->ioch); + + /* Put the IO on retry queue in case IO channel is not initialized */ + if (spdk_unlikely(ioch->index == FTL_IO_CHANNEL_INDEX_INVALID)) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + return; + } + + /* For normal IOs we just need to copy the data onto the write buffer */ + if (!(io->flags & FTL_IO_MD)) { + ftl_io_call_foreach_child(io, ftl_wbuf_fill); + } else { + /* Metadata has its own buffer, so it doesn't have to be copied, so just */ + /* send it the the core thread and schedule the write immediately */ + if (ftl_check_core_thread(dev)) { + ftl_io_call_foreach_child(io, ftl_submit_write_leaf); + } else { + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_write, io); + } + } +} + +int +spdk_ftl_write(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_io *io; + + if (iov_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) { + return -EINVAL; + } + + if (!dev->initialized) { + return -EBUSY; + } + + io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_WRITE); + if (!io) { + return -ENOMEM; + } + + ftl_io_write(io); + + return 0; +} + +void +ftl_io_read(struct ftl_io *io) +{ + ftl_io_call_foreach_child(io, ftl_submit_read); +} + +int +spdk_ftl_read(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_io *io; + + if (iov_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) { + return -EINVAL; + } + + if (!dev->initialized) { + return -EBUSY; + } + + io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_READ); + if (!io) { + return -ENOMEM; + } + + ftl_io_read(io); + return 0; +} + +static struct ftl_flush * +ftl_flush_init(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_flush *flush; + + flush = calloc(1, sizeof(*flush)); + if (!flush) { + return NULL; + } + + flush->bmap = spdk_bit_array_create(FTL_BATCH_COUNT); + if (!flush->bmap) { + goto error; + } + + flush->dev = dev; + flush->cb.fn = cb_fn; + flush->cb.ctx = cb_arg; + + return flush; +error: + free(flush); + return NULL; +} + +static void +_ftl_flush(void *ctx) +{ + struct ftl_flush *flush = ctx; + struct spdk_ftl_dev *dev = flush->dev; + uint32_t i; + + /* Attach flush object to all non-empty batches */ + for (i = 0; i < FTL_BATCH_COUNT; ++i) { + if (dev->batch_array[i].num_entries > 0) { + spdk_bit_array_set(flush->bmap, i); + flush->num_req++; + } + } + + 
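/*
 * Illustrative sketch, not part of the patch: a minimal caller of the user-facing
 * write path defined above. spdk_ftl_write() rejects the request with -EINVAL
 * unless lba_cnt matches the iovec payload exactly
 * (lba_cnt == ftl_iovec_num_blocks(iov, iov_cnt)), so the buffer must be a whole
 * number of FTL_BLOCK_SIZE blocks. The caller-side names are hypothetical and the
 * device and IO channel are assumed to be already set up:
 *
 *	static void
 *	write_done(void *ctx, int status)
 *	{
 *		if (status != 0) {
 *			SPDK_ERRLOG("FTL write failed: %d\n", status);
 *		}
 *	}
 *
 *	static int
 *	write_one_block(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch,
 *			void *buf, uint64_t lba)
 *	{
 *		struct iovec iov = {
 *			.iov_base = buf,
 *			.iov_len = FTL_BLOCK_SIZE,
 *		};
 *
 *		return spdk_ftl_write(dev, ch, lba, 1, &iov, 1, write_done, NULL);
 *	}
 */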
LIST_INSERT_HEAD(&dev->flush_list, flush, list_entry); + + /* If the write buffer was already empty, the flush can be completed right away */ + if (!flush->num_req) { + ftl_complete_flush(flush); + } +} + +int +ftl_flush_wbuf(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_flush *flush; + + flush = ftl_flush_init(dev, cb_fn, cb_arg); + if (!flush) { + return -ENOMEM; + } + + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_flush, flush); + return 0; +} + +int +spdk_ftl_flush(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + if (!dev->initialized) { + return -EBUSY; + } + + return ftl_flush_wbuf(dev, cb_fn, cb_arg); +} + +bool +ftl_addr_is_written(struct ftl_band *band, struct ftl_addr addr) +{ + struct ftl_zone *zone = ftl_band_zone_from_addr(band, addr); + + return addr.offset < zone->info.write_pointer; +} + +static void ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event); + +static void +_ftl_process_media_event(void *ctx) +{ + struct ftl_media_event *event = ctx; + struct spdk_ftl_dev *dev = event->dev; + + ftl_process_media_event(dev, event->event); + spdk_mempool_put(dev->media_events_pool, event); +} + +static void +ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event) +{ + struct ftl_band *band; + struct ftl_addr addr = { .offset = event.offset }; + size_t block_off; + + if (!ftl_check_core_thread(dev)) { + struct ftl_media_event *media_event; + + media_event = spdk_mempool_get(dev->media_events_pool); + if (!media_event) { + SPDK_ERRLOG("Media event lost due to lack of memory"); + return; + } + + media_event->dev = dev; + media_event->event = event; + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_process_media_event, + media_event); + return; + } + + band = ftl_band_from_addr(dev, addr); + block_off = ftl_band_block_offset_from_addr(band, addr); + + ftl_reloc_add(dev->reloc, band, block_off, event.num_blocks, 0, false); +} + +void +ftl_get_media_events(struct spdk_ftl_dev *dev) +{ +#define FTL_MAX_MEDIA_EVENTS 128 + struct spdk_bdev_media_event events[FTL_MAX_MEDIA_EVENTS]; + size_t num_events, i; + + if (!dev->initialized) { + return; + } + + do { + num_events = spdk_bdev_get_media_events(dev->base_bdev_desc, + events, FTL_MAX_MEDIA_EVENTS); + + for (i = 0; i < num_events; ++i) { + ftl_process_media_event(dev, events[i]); + } + + } while (num_events); +} + +int +ftl_io_channel_poll(void *arg) +{ + struct ftl_io_channel *ch = arg; + struct ftl_io *io; + TAILQ_HEAD(, ftl_io) retry_queue; + + if (TAILQ_EMPTY(&ch->write_cmpl_queue) && TAILQ_EMPTY(&ch->retry_queue)) { + return SPDK_POLLER_IDLE; + } + + while (!TAILQ_EMPTY(&ch->write_cmpl_queue)) { + io = TAILQ_FIRST(&ch->write_cmpl_queue); + TAILQ_REMOVE(&ch->write_cmpl_queue, io, ioch_entry); + ftl_io_complete(io); + } + + /* + * Create local copy of the retry queue to prevent from infinite retrying if IO will be + * inserted to the retry queue again + */ + TAILQ_INIT(&retry_queue); + TAILQ_SWAP(&ch->retry_queue, &retry_queue, ftl_io, ioch_entry); + + while (!TAILQ_EMPTY(&retry_queue)) { + io = TAILQ_FIRST(&retry_queue); + TAILQ_REMOVE(&retry_queue, io, ioch_entry); + if (io->type == FTL_IO_WRITE) { + ftl_io_write(io); + } else { + ftl_io_read(io); + } + } + + return SPDK_POLLER_BUSY; +} + +int +ftl_task_core(void *ctx) +{ + struct spdk_ftl_dev *dev = ctx; + + if (dev->halt) { + if (ftl_shutdown_complete(dev)) { + spdk_poller_unregister(&dev->core_poller); + return SPDK_POLLER_IDLE; + } + } + + ftl_process_writes(dev); 
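/*
 * Illustrative sketch, not part of the patch: pairing buffered writes with
 * spdk_ftl_flush(), which is defined above. The flush attaches itself to every
 * batch that currently holds buffered entries and its callback fires only after
 * all of those batches have been written out, so its completion means the data
 * buffered at submission time has reached the underlying bdev. Caller-side names
 * are hypothetical:
 *
 *	static void
 *	flush_done(void *ctx, int status)
 *	{
 *		// data buffered before the spdk_ftl_flush() call has been written
 *	}
 *
 *	static int
 *	flush_after_writes(struct spdk_ftl_dev *dev)
 *	{
 *		// returns -EBUSY if the device is not initialized,
 *		// -ENOMEM if the flush object cannot be allocated
 *		return spdk_ftl_flush(dev, flush_done, NULL);
 *	}
 */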
+ ftl_process_relocs(dev); + + return SPDK_POLLER_BUSY; +} + +SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE) diff --git a/src/spdk/lib/ftl/ftl_core.h b/src/spdk/lib/ftl/ftl_core.h new file mode 100644 index 000000000..b782ba731 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_core.h @@ -0,0 +1,552 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_CORE_H +#define FTL_CORE_H + +#include "spdk/stdinc.h" +#include "spdk/uuid.h" +#include "spdk/thread.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/ftl.h" +#include "spdk/bdev.h" +#include "spdk/bdev_zone.h" + +#include "ftl_addr.h" +#include "ftl_io.h" +#include "ftl_trace.h" + +#ifdef SPDK_CONFIG_PMDK +#include "libpmem.h" +#endif /* SPDK_CONFIG_PMDK */ + +struct spdk_ftl_dev; +struct ftl_band; +struct ftl_zone; +struct ftl_io; +struct ftl_restore; +struct ftl_wptr; +struct ftl_flush; +struct ftl_reloc; +struct ftl_anm_event; +struct ftl_band_flush; + +struct ftl_stats { + /* Number of writes scheduled directly by the user */ + uint64_t write_user; + + /* Total number of writes */ + uint64_t write_total; + + /* Traces */ + struct ftl_trace trace; + + /* Number of limits applied */ + uint64_t limits[SPDK_FTL_LIMIT_MAX]; +}; + +struct ftl_global_md { + /* Device instance */ + struct spdk_uuid uuid; + /* Size of the l2p table */ + uint64_t num_lbas; +}; + +struct ftl_nv_cache { + /* Write buffer cache bdev */ + struct spdk_bdev_desc *bdev_desc; + /* Write pointer */ + uint64_t current_addr; + /* Number of available blocks left */ + uint64_t num_available; + /* Maximum number of blocks */ + uint64_t num_data_blocks; + /* + * Phase of the current cycle of writes. Each time whole cache area is filled, the phase is + * advanced. Current phase is saved in every IO's metadata, as well as in the header saved + * in the first sector. 
By looking at the phase of each block, it's possible to find the + * oldest block and replay the order of the writes when recovering the data from the cache. + */ + unsigned int phase; + /* Indicates that the data can be written to the cache */ + bool ready; + /* Metadata pool */ + struct spdk_mempool *md_pool; + /* DMA buffer for writing the header */ + void *dma_buf; + /* Cache lock */ + pthread_spinlock_t lock; +}; + +struct ftl_batch { + /* Queue of write buffer entries, can reach up to xfer_size entries */ + TAILQ_HEAD(, ftl_wbuf_entry) entries; + /* Number of entries in the queue above */ + uint32_t num_entries; + /* Index within spdk_ftl_dev.batch_array */ + uint32_t index; + struct iovec *iov; + void *metadata; + TAILQ_ENTRY(ftl_batch) tailq; +}; + +struct spdk_ftl_dev { + /* Device instance */ + struct spdk_uuid uuid; + /* Device name */ + char *name; + /* Configuration */ + struct spdk_ftl_conf conf; + + /* Indicates the device is fully initialized */ + int initialized; + /* Indicates the device is about to be stopped */ + int halt; + /* Indicates the device is about to start stopping - use to handle multiple stop request */ + bool halt_started; + + /* Underlying device */ + struct spdk_bdev_desc *base_bdev_desc; + + /* Non-volatile write buffer cache */ + struct ftl_nv_cache nv_cache; + + /* LBA map memory pool */ + struct spdk_mempool *lba_pool; + + /* LBA map requests pool */ + struct spdk_mempool *lba_request_pool; + + /* Media management events pool */ + struct spdk_mempool *media_events_pool; + + /* Statistics */ + struct ftl_stats stats; + + /* Current sequence number */ + uint64_t seq; + + /* Array of bands */ + struct ftl_band *bands; + /* Number of operational bands */ + size_t num_bands; + /* Next write band */ + struct ftl_band *next_band; + /* Free band list */ + LIST_HEAD(, ftl_band) free_bands; + /* Closed bands list */ + LIST_HEAD(, ftl_band) shut_bands; + /* Number of free bands */ + size_t num_free; + + /* List of write pointers */ + LIST_HEAD(, ftl_wptr) wptr_list; + + /* Logical -> physical table */ + void *l2p; + /* Size of the l2p table */ + uint64_t num_lbas; + /* Size of pages mmapped for l2p, valid only for mapping on persistent memory */ + size_t l2p_pmem_len; + + /* Address size */ + size_t addr_len; + + /* Flush list */ + LIST_HEAD(, ftl_flush) flush_list; + /* List of band flush requests */ + LIST_HEAD(, ftl_band_flush) band_flush_list; + + /* Device specific md buffer */ + struct ftl_global_md global_md; + + /* Metadata size */ + size_t md_size; + void *md_buf; + + /* Transfer unit size */ + size_t xfer_size; + + /* Current user write limit */ + int limit; + + /* Inflight IO operations */ + uint32_t num_inflight; + + /* Manages data relocation */ + struct ftl_reloc *reloc; + + /* Thread on which the poller is running */ + struct spdk_thread *core_thread; + /* IO channel */ + struct spdk_io_channel *ioch; + /* Poller */ + struct spdk_poller *core_poller; + + /* IO channel array provides means for retrieving write buffer entries + * from their address stored in L2P. The address is divided into two + * parts - IO channel offset poining at specific IO channel (within this + * array) and entry offset pointing at specific entry within that IO + * channel. + */ + struct ftl_io_channel **ioch_array; + TAILQ_HEAD(, ftl_io_channel) ioch_queue; + uint64_t num_io_channels; + /* Value required to shift address of a write buffer entry to retrieve + * the IO channel it's part of. 
The other part of the address describes + * the offset of an entry within the IO channel's entry array. + */ + uint64_t ioch_shift; + + /* Write buffer batches */ +#define FTL_BATCH_COUNT 4096 + struct ftl_batch batch_array[FTL_BATCH_COUNT]; + /* Iovec buffer used by batches */ + struct iovec *iov_buf; + /* Batch currently being filled */ + struct ftl_batch *current_batch; + /* Full and ready to be sent batches. A batch is put on this queue in + * case it's already filled, but cannot be sent. + */ + TAILQ_HEAD(, ftl_batch) pending_batches; + TAILQ_HEAD(, ftl_batch) free_batches; + + /* Devices' list */ + STAILQ_ENTRY(spdk_ftl_dev) stailq; +}; + +struct ftl_nv_cache_header { + /* Version of the header */ + uint32_t version; + /* UUID of the FTL device */ + struct spdk_uuid uuid; + /* Size of the non-volatile cache (in blocks) */ + uint64_t size; + /* Contains the next address to be written after clean shutdown, invalid LBA otherwise */ + uint64_t current_addr; + /* Current phase */ + uint8_t phase; + /* Checksum of the header, needs to be last element */ + uint32_t checksum; +} __attribute__((packed)); + +struct ftl_media_event { + /* Owner */ + struct spdk_ftl_dev *dev; + /* Media event */ + struct spdk_bdev_media_event event; +}; + +typedef void (*ftl_restore_fn)(struct ftl_restore *, int, void *cb_arg); + +void ftl_apply_limits(struct spdk_ftl_dev *dev); +void ftl_io_read(struct ftl_io *io); +void ftl_io_write(struct ftl_io *io); +int ftl_flush_wbuf(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg); +int ftl_current_limit(const struct spdk_ftl_dev *dev); +int ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr); +int ftl_task_core(void *ctx); +int ftl_task_read(void *ctx); +void ftl_process_anm_event(struct ftl_anm_event *event); +size_t ftl_tail_md_num_blocks(const struct spdk_ftl_dev *dev); +size_t ftl_tail_md_hdr_num_blocks(void); +size_t ftl_vld_map_num_blocks(const struct spdk_ftl_dev *dev); +size_t ftl_lba_map_num_blocks(const struct spdk_ftl_dev *dev); +size_t ftl_head_md_num_blocks(const struct spdk_ftl_dev *dev); +int ftl_restore_md(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg); +int ftl_restore_device(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg); +void ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg); +int ftl_band_set_direct_access(struct ftl_band *band, bool access); +bool ftl_addr_is_written(struct ftl_band *band, struct ftl_addr addr); +int ftl_flush_active_bands(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg); +int ftl_nv_cache_write_header(struct ftl_nv_cache *nv_cache, bool shutdown, + spdk_bdev_io_completion_cb cb_fn, void *cb_arg); +int ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb cb_fn, + void *cb_arg); +void ftl_get_media_events(struct spdk_ftl_dev *dev); +int ftl_io_channel_poll(void *arg); +void ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry); +struct spdk_io_channel *ftl_get_io_channel(const struct spdk_ftl_dev *dev); +struct ftl_io_channel *ftl_io_channel_get_ctx(struct spdk_io_channel *ioch); + + +#define ftl_to_addr(address) \ + (struct ftl_addr) { .offset = (uint64_t)(address) } + +#define ftl_to_addr_packed(address) \ + (struct ftl_addr) { .pack.offset = (uint32_t)(address) } + +static inline struct spdk_thread * +ftl_get_core_thread(const struct spdk_ftl_dev *dev) +{ + return dev->core_thread; +} + +static inline size_t +ftl_get_num_bands(const struct spdk_ftl_dev *dev) +{ + return dev->num_bands; +} 
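/*
 * Illustrative worked example, not part of the patch: the geometry helpers
 * defined below treat a physical block offset as a flat index over bands, where
 * each band is striped across all parallel units (punits) and each punit
 * contributes one zone per band:
 *
 *	band        = offset / (num_punits * blocks_in_zone)
 *	punit       = (offset / blocks_in_zone) % num_punits
 *	zone_offset = offset % blocks_in_zone
 *
 * With a hypothetical geometry of 0x1000 blocks per zone and 8 punits
 * (0x8000 blocks per band), offset 0x9234 maps to band 1, punit 1 and
 * zone offset 0x234.
 */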
+ +static inline size_t +ftl_get_num_punits(const struct spdk_ftl_dev *dev) +{ + return spdk_bdev_get_optimal_open_zones(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); +} + +static inline size_t +ftl_get_num_zones(const struct spdk_ftl_dev *dev) +{ + return ftl_get_num_bands(dev) * ftl_get_num_punits(dev); +} + +static inline size_t +ftl_get_num_blocks_in_zone(const struct spdk_ftl_dev *dev) +{ + return spdk_bdev_get_zone_size(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); +} + +static inline uint64_t +ftl_get_num_blocks_in_band(const struct spdk_ftl_dev *dev) +{ + return ftl_get_num_punits(dev) * ftl_get_num_blocks_in_zone(dev); +} + +static inline uint64_t +ftl_addr_get_zone_slba(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return addr.offset -= (addr.offset % ftl_get_num_blocks_in_zone(dev)); +} + +static inline uint64_t +ftl_addr_get_band(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return addr.offset / ftl_get_num_blocks_in_band(dev); +} + +static inline uint64_t +ftl_addr_get_punit(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return (addr.offset / ftl_get_num_blocks_in_zone(dev)) % ftl_get_num_punits(dev); +} + +static inline uint64_t +ftl_addr_get_zone_offset(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return addr.offset % ftl_get_num_blocks_in_zone(dev); +} + +static inline size_t +ftl_vld_map_size(const struct spdk_ftl_dev *dev) +{ + return (size_t)spdk_divide_round_up(ftl_get_num_blocks_in_band(dev), CHAR_BIT); +} + +static inline int +ftl_addr_packed(const struct spdk_ftl_dev *dev) +{ + return dev->addr_len < 32; +} + +static inline void +ftl_l2p_lba_persist(const struct spdk_ftl_dev *dev, uint64_t lba) +{ +#ifdef SPDK_CONFIG_PMDK + size_t ftl_addr_size = ftl_addr_packed(dev) ? 4 : 8; + pmem_persist((char *)dev->l2p + (lba * ftl_addr_size), ftl_addr_size); +#else /* SPDK_CONFIG_PMDK */ + SPDK_ERRLOG("Libpmem not available, cannot flush l2p to pmem\n"); + assert(0); +#endif /* SPDK_CONFIG_PMDK */ +} + +static inline int +ftl_addr_invalid(struct ftl_addr addr) +{ + return addr.offset == ftl_to_addr(FTL_ADDR_INVALID).offset; +} + +static inline int +ftl_addr_cached(struct ftl_addr addr) +{ + return !ftl_addr_invalid(addr) && addr.cached; +} + +static inline struct ftl_addr +ftl_addr_to_packed(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_addr p = {}; + + if (ftl_addr_invalid(addr)) { + p = ftl_to_addr_packed(FTL_ADDR_INVALID); + } else if (ftl_addr_cached(addr)) { + p.pack.cached = 1; + p.pack.cache_offset = (uint32_t) addr.cache_offset; + } else { + p.pack.offset = (uint32_t) addr.offset; + } + + return p; +} + +static inline struct ftl_addr +ftl_addr_from_packed(const struct spdk_ftl_dev *dev, struct ftl_addr p) +{ + struct ftl_addr addr = {}; + + if (p.pack.offset == (uint32_t)FTL_ADDR_INVALID) { + addr = ftl_to_addr(FTL_ADDR_INVALID); + } else if (p.pack.cached) { + addr.cached = 1; + addr.cache_offset = p.pack.cache_offset; + } else { + addr = p; + } + + return addr; +} + +#define _ftl_l2p_set(l2p, off, val, bits) \ + __atomic_store_n(((uint##bits##_t *)(l2p)) + (off), val, __ATOMIC_SEQ_CST) + +#define _ftl_l2p_set32(l2p, off, val) \ + _ftl_l2p_set(l2p, off, val, 32) + +#define _ftl_l2p_set64(l2p, off, val) \ + _ftl_l2p_set(l2p, off, val, 64) + +#define _ftl_l2p_get(l2p, off, bits) \ + __atomic_load_n(((uint##bits##_t *)(l2p)) + (off), __ATOMIC_SEQ_CST) + +#define _ftl_l2p_get32(l2p, off) \ + _ftl_l2p_get(l2p, off, 32) + +#define _ftl_l2p_get64(l2p, off) \ + _ftl_l2p_get(l2p, off, 64) + +#define 
ftl_addr_cmp(p1, p2) \ + ((p1).offset == (p2).offset) + +static inline void +ftl_l2p_set(struct spdk_ftl_dev *dev, uint64_t lba, struct ftl_addr addr) +{ + assert(dev->num_lbas > lba); + + if (ftl_addr_packed(dev)) { + _ftl_l2p_set32(dev->l2p, lba, ftl_addr_to_packed(dev, addr).offset); + } else { + _ftl_l2p_set64(dev->l2p, lba, addr.offset); + } + + if (dev->l2p_pmem_len != 0) { + ftl_l2p_lba_persist(dev, lba); + } +} + +static inline struct ftl_addr +ftl_l2p_get(struct spdk_ftl_dev *dev, uint64_t lba) +{ + assert(dev->num_lbas > lba); + + if (ftl_addr_packed(dev)) { + return ftl_addr_from_packed(dev, ftl_to_addr_packed( + _ftl_l2p_get32(dev->l2p, lba))); + } else { + return ftl_to_addr(_ftl_l2p_get64(dev->l2p, lba)); + } +} + +static inline bool +ftl_dev_has_nv_cache(const struct spdk_ftl_dev *dev) +{ + return dev->nv_cache.bdev_desc != NULL; +} + +#define FTL_NV_CACHE_HEADER_VERSION (1) +#define FTL_NV_CACHE_DATA_OFFSET (1) +#define FTL_NV_CACHE_PHASE_OFFSET (62) +#define FTL_NV_CACHE_PHASE_COUNT (4) +#define FTL_NV_CACHE_PHASE_MASK (3ULL << FTL_NV_CACHE_PHASE_OFFSET) +#define FTL_NV_CACHE_LBA_INVALID (FTL_LBA_INVALID & ~FTL_NV_CACHE_PHASE_MASK) + +static inline bool +ftl_nv_cache_phase_is_valid(unsigned int phase) +{ + return phase > 0 && phase <= 3; +} + +static inline unsigned int +ftl_nv_cache_next_phase(unsigned int current) +{ + static const unsigned int phases[] = { 0, 2, 3, 1 }; + assert(ftl_nv_cache_phase_is_valid(current)); + return phases[current]; +} + +static inline unsigned int +ftl_nv_cache_prev_phase(unsigned int current) +{ + static const unsigned int phases[] = { 0, 3, 1, 2 }; + assert(ftl_nv_cache_phase_is_valid(current)); + return phases[current]; +} + +static inline uint64_t +ftl_nv_cache_pack_lba(uint64_t lba, unsigned int phase) +{ + assert(ftl_nv_cache_phase_is_valid(phase)); + return (lba & ~FTL_NV_CACHE_PHASE_MASK) | ((uint64_t)phase << FTL_NV_CACHE_PHASE_OFFSET); +} + +static inline void +ftl_nv_cache_unpack_lba(uint64_t in_lba, uint64_t *out_lba, unsigned int *phase) +{ + *out_lba = in_lba & ~FTL_NV_CACHE_PHASE_MASK; + *phase = (in_lba & FTL_NV_CACHE_PHASE_MASK) >> FTL_NV_CACHE_PHASE_OFFSET; + + /* If the phase is invalid the block wasn't written yet, so treat the LBA as invalid too */ + if (!ftl_nv_cache_phase_is_valid(*phase) || *out_lba == FTL_NV_CACHE_LBA_INVALID) { + *out_lba = FTL_LBA_INVALID; + } +} + +static inline bool +ftl_is_append_supported(const struct spdk_ftl_dev *dev) +{ + return dev->conf.use_append; +} + +#endif /* FTL_CORE_H */ diff --git a/src/spdk/lib/ftl/ftl_debug.c b/src/spdk/lib/ftl/ftl_debug.c new file mode 100644 index 000000000..9fbb43810 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_debug.c @@ -0,0 +1,169 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk_internal/log.h" +#include "spdk/ftl.h" +#include "ftl_debug.h" +#include "ftl_band.h" + +#if defined(DEBUG) +#if defined(FTL_META_DEBUG) + +static const char *ftl_band_state_str[] = { + "free", + "prep", + "opening", + "open", + "full", + "closing", + "closed", + "max" +}; + +bool +ftl_band_validate_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_addr addr_md, addr_l2p; + size_t i, size, seg_off; + bool valid = true; + + size = ftl_get_num_blocks_in_band(dev); + + pthread_spin_lock(&lba_map->lock); + for (i = 0; i < size; ++i) { + if (!spdk_bit_array_get(lba_map->vld, i)) { + continue; + } + + seg_off = i / FTL_NUM_LBA_IN_BLOCK; + if (lba_map->segments[seg_off] != FTL_LBA_MAP_SEG_CACHED) { + continue; + } + + addr_md = ftl_band_addr_from_block_offset(band, i); + addr_l2p = ftl_l2p_get(dev, lba_map->map[i]); + + if (addr_l2p.cached) { + continue; + } + + if (addr_l2p.offset != addr_md.offset) { + valid = false; + break; + } + + } + + pthread_spin_unlock(&lba_map->lock); + + return valid; +} + +void +ftl_dev_dump_bands(struct spdk_ftl_dev *dev) +{ + size_t i, total = 0; + + if (!dev->bands) { + return; + } + + ftl_debug("Bands validity:\n"); + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + if (dev->bands[i].state == FTL_BAND_STATE_FREE && + dev->bands[i].wr_cnt == 0) { + continue; + } + + if (!dev->bands[i].num_zones) { + ftl_debug(" Band %3zu: all zones are offline\n", i + 1); + continue; + } + + total += dev->bands[i].lba_map.num_vld; + ftl_debug(" Band %3zu: %8zu / %zu \tnum_zones: %zu \twr_cnt: %"PRIu64"\tmerit:" + "%10.3f\tstate: %s\n", + i + 1, dev->bands[i].lba_map.num_vld, + ftl_band_user_blocks(&dev->bands[i]), + dev->bands[i].num_zones, + dev->bands[i].wr_cnt, + dev->bands[i].merit, + ftl_band_state_str[dev->bands[i].state]); + } +} + +#endif /* defined(FTL_META_DEBUG) */ + +#if defined(FTL_DUMP_STATS) + +void +ftl_dev_dump_stats(const struct spdk_ftl_dev *dev) +{ + size_t i, total = 0; + char uuid[SPDK_UUID_STRING_LEN]; + double waf; + const char *limits[] = { + [SPDK_FTL_LIMIT_CRIT] = "crit", + [SPDK_FTL_LIMIT_HIGH] = "high", + [SPDK_FTL_LIMIT_LOW] = "low", + [SPDK_FTL_LIMIT_START] = "start" + }; + + if (!dev->bands) { + return; + } + + /* Count the number of valid LBAs */ + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + total += dev->bands[i].lba_map.num_vld; + } + + waf = (double)dev->stats.write_total / (double)dev->stats.write_user; + + spdk_uuid_fmt_lower(uuid, sizeof(uuid), &dev->uuid); + ftl_debug("\n"); + ftl_debug("device UUID: %s\n", uuid); + ftl_debug("total valid LBAs: %zu\n", total); + ftl_debug("total writes: %"PRIu64"\n", dev->stats.write_total); + ftl_debug("user 
writes: %"PRIu64"\n", dev->stats.write_user); + ftl_debug("WAF: %.4lf\n", waf); + ftl_debug("limits:\n"); + for (i = 0; i < SPDK_FTL_LIMIT_MAX; ++i) { + ftl_debug(" %5s: %"PRIu64"\n", limits[i], dev->stats.limits[i]); + } +} + +#endif /* defined(FTL_DUMP_STATS) */ +#endif /* defined(DEBUG) */ diff --git a/src/spdk/lib/ftl/ftl_debug.h b/src/spdk/lib/ftl/ftl_debug.h new file mode 100644 index 000000000..c90c92ef2 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_debug.h @@ -0,0 +1,73 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_DEBUG_H +#define FTL_DEBUG_H + +#include "ftl_addr.h" +#include "ftl_band.h" +#include "ftl_core.h" + +#if defined(DEBUG) +/* Debug flags - enabled when defined */ +#define FTL_META_DEBUG 1 +#define FTL_DUMP_STATS 1 + +#define ftl_debug(msg, ...) \ + SPDK_ERRLOG(msg, ## __VA_ARGS__) +#else +#define ftl_debug(msg, ...) +#endif + +static inline const char * +ftl_addr2str(struct ftl_addr addr, char *buf, size_t size) +{ + snprintf(buf, size, "(%"PRIu64")", addr.offset); + return buf; +} + +#if defined(FTL_META_DEBUG) +bool ftl_band_validate_md(struct ftl_band *band); +void ftl_dev_dump_bands(struct spdk_ftl_dev *dev); +#else +#define ftl_band_validate_md(band) +#define ftl_dev_dump_bands(dev) +#endif + +#if defined(FTL_DUMP_STATS) +void ftl_dev_dump_stats(const struct spdk_ftl_dev *dev); +#else +#define ftl_dev_dump_stats(dev) +#endif + +#endif /* FTL_DEBUG_H */ diff --git a/src/spdk/lib/ftl/ftl_init.c b/src/spdk/lib/ftl/ftl_init.c new file mode 100644 index 000000000..15a8c21c9 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_init.c @@ -0,0 +1,1688 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/nvme.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk/likely.h" +#include "spdk_internal/log.h" +#include "spdk/ftl.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/bdev_zone.h" +#include "spdk/bdev_module.h" +#include "spdk/config.h" + +#include "ftl_core.h" +#include "ftl_io.h" +#include "ftl_reloc.h" +#include "ftl_band.h" +#include "ftl_debug.h" + +#ifdef SPDK_CONFIG_PMDK +#include "libpmem.h" +#endif /* SPDK_CONFIG_PMDK */ + +#define FTL_CORE_RING_SIZE 4096 +#define FTL_INIT_TIMEOUT 30 +#define FTL_NSID 1 +#define FTL_ZONE_INFO_COUNT 64 + +/* Dummy bdev module used to to claim bdevs. 
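+ * Claiming prevents other bdev modules from opening these bdevs while FTL owns them.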
*/ +static struct spdk_bdev_module g_ftl_bdev_module = { + .name = "ftl_lib", +}; + +struct ftl_dev_init_ctx { + /* Owner */ + struct spdk_ftl_dev *dev; + /* Initial arguments */ + struct spdk_ftl_dev_init_opts opts; + /* IO channel for zone info retrieving */ + struct spdk_io_channel *ioch; + /* Buffer for reading zone info */ + struct spdk_bdev_zone_info info[FTL_ZONE_INFO_COUNT]; + /* Currently read zone */ + size_t zone_id; + /* User's callback */ + spdk_ftl_init_fn cb_fn; + /* Callback's argument */ + void *cb_arg; + /* Thread to call the callback on */ + struct spdk_thread *thread; + /* Poller to check if the device has been destroyed/initialized */ + struct spdk_poller *poller; + /* Status to return for halt completion callback */ + int halt_complete_status; +}; + +static STAILQ_HEAD(, spdk_ftl_dev) g_ftl_queue = STAILQ_HEAD_INITIALIZER(g_ftl_queue); +static pthread_mutex_t g_ftl_queue_lock = PTHREAD_MUTEX_INITIALIZER; +static const struct spdk_ftl_conf g_default_conf = { + .limits = { + /* 5 free bands / 0 % host writes */ + [SPDK_FTL_LIMIT_CRIT] = { .thld = 5, .limit = 0 }, + /* 10 free bands / 5 % host writes */ + [SPDK_FTL_LIMIT_HIGH] = { .thld = 10, .limit = 5 }, + /* 20 free bands / 40 % host writes */ + [SPDK_FTL_LIMIT_LOW] = { .thld = 20, .limit = 40 }, + /* 40 free bands / 100 % host writes - defrag starts running */ + [SPDK_FTL_LIMIT_START] = { .thld = 40, .limit = 100 }, + }, + /* 10 percent valid blocks */ + .invalid_thld = 10, + /* 20% spare blocks */ + .lba_rsvd = 20, + /* 6M write buffer per each IO channel */ + .write_buffer_size = 6 * 1024 * 1024, + /* 90% band fill threshold */ + .band_thld = 90, + /* Max 32 IO depth per band relocate */ + .max_reloc_qdepth = 32, + /* Max 3 active band relocates */ + .max_active_relocs = 3, + /* IO pool size per user thread (this should be adjusted to thread IO qdepth) */ + .user_io_pool_size = 2048, + /* + * If clear ftl will return error when restoring after a dirty shutdown + * If set, last band will be padded, ftl will restore based only on closed bands - this + * will result in lost data after recovery. 
+ */ + .allow_open_bands = false, + .max_io_channels = 128, + .nv_cache = { + /* Maximum number of concurrent requests */ + .max_request_cnt = 2048, + /* Maximum number of blocks per request */ + .max_request_size = 16, + } +}; + +static int +ftl_band_init_md(struct ftl_band *band) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + int rc; + + lba_map->vld = spdk_bit_array_create(ftl_get_num_blocks_in_band(band->dev)); + if (!lba_map->vld) { + return -ENOMEM; + } + + rc = pthread_spin_init(&lba_map->lock, PTHREAD_PROCESS_PRIVATE); + if (rc) { + spdk_bit_array_free(&lba_map->vld); + return rc; + } + ftl_band_md_clear(band); + return 0; +} + +static int +ftl_check_conf(const struct spdk_ftl_dev *dev, const struct spdk_ftl_conf *conf) +{ + size_t i; + + if (conf->invalid_thld >= 100) { + return -1; + } + if (conf->lba_rsvd >= 100) { + return -1; + } + if (conf->lba_rsvd == 0) { + return -1; + } + if (conf->write_buffer_size == 0) { + return -1; + } + if (conf->write_buffer_size % FTL_BLOCK_SIZE != 0) { + return -1; + } + + for (i = 0; i < SPDK_FTL_LIMIT_MAX; ++i) { + if (conf->limits[i].limit > 100) { + return -1; + } + } + + return 0; +} + +static int +ftl_dev_init_bands(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *pband; + unsigned int i; + int rc = 0; + + LIST_INIT(&dev->free_bands); + LIST_INIT(&dev->shut_bands); + + dev->num_free = 0; + dev->bands = calloc(ftl_get_num_bands(dev), sizeof(*dev->bands)); + if (!dev->bands) { + return -1; + } + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + band = &dev->bands[i]; + band->id = i; + band->dev = dev; + band->state = FTL_BAND_STATE_CLOSED; + + if (LIST_EMPTY(&dev->shut_bands)) { + LIST_INSERT_HEAD(&dev->shut_bands, band, list_entry); + } else { + LIST_INSERT_AFTER(pband, band, list_entry); + } + pband = band; + + CIRCLEQ_INIT(&band->zones); + band->zone_buf = calloc(ftl_get_num_punits(dev), sizeof(*band->zone_buf)); + if (!band->zone_buf) { + SPDK_ERRLOG("Failed to allocate block state table for band: [%u]\n", i); + rc = -1; + break; + } + + rc = ftl_band_init_md(band); + if (rc) { + SPDK_ERRLOG("Failed to initialize metadata structures for band [%u]\n", i); + break; + } + + band->reloc_bitmap = spdk_bit_array_create(ftl_get_num_bands(dev)); + if (!band->reloc_bitmap) { + SPDK_ERRLOG("Failed to allocate band relocation bitmap\n"); + break; + } + } + + return rc; +} + +static void +ftl_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx) +{ + struct spdk_ftl_dev *dev = event_ctx; + + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + assert(0); + break; + case SPDK_BDEV_EVENT_MEDIA_MANAGEMENT: + assert(bdev == spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); + ftl_get_media_events(dev); + default: + break; + } +} + +static int +ftl_dev_init_nv_cache(struct spdk_ftl_dev *dev, const char *bdev_name) +{ + struct spdk_bdev *bdev; + struct spdk_ftl_conf *conf = &dev->conf; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + char pool_name[128]; + int rc; + + if (!bdev_name) { + return 0; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("Unable to find bdev: %s\n", bdev_name); + return -1; + } + + if (spdk_bdev_open_ext(bdev_name, true, ftl_bdev_event_cb, + dev, &nv_cache->bdev_desc)) { + SPDK_ERRLOG("Unable to open bdev: %s\n", bdev_name); + return -1; + } + + if (spdk_bdev_module_claim_bdev(bdev, nv_cache->bdev_desc, &g_ftl_bdev_module)) { + spdk_bdev_close(nv_cache->bdev_desc); + nv_cache->bdev_desc = NULL; + SPDK_ERRLOG("Unable to claim bdev %s\n", bdev_name); + return -1; + 
} + + SPDK_INFOLOG(SPDK_LOG_FTL_INIT, "Using %s as write buffer cache\n", + spdk_bdev_get_name(bdev)); + + if (spdk_bdev_get_block_size(bdev) != FTL_BLOCK_SIZE) { + SPDK_ERRLOG("Unsupported block size (%d)\n", spdk_bdev_get_block_size(bdev)); + return -1; + } + + if (!spdk_bdev_is_md_separate(bdev)) { + SPDK_ERRLOG("Bdev %s doesn't support separate metadata buffer IO\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + if (spdk_bdev_get_md_size(bdev) < sizeof(uint64_t)) { + SPDK_ERRLOG("Bdev's %s metadata is too small (%"PRIu32")\n", + spdk_bdev_get_name(bdev), spdk_bdev_get_md_size(bdev)); + return -1; + } + + if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { + SPDK_ERRLOG("Unsupported DIF type used by bdev %s\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + /* The cache needs to be capable of storing at least two full bands. This requirement comes + * from the fact that cache works as a protection against power loss, so before the data + * inside the cache can be overwritten, the band it's stored on has to be closed. Plus one + * extra block is needed to store the header. + */ + if (spdk_bdev_get_num_blocks(bdev) < ftl_get_num_blocks_in_band(dev) * 2 + 1) { + SPDK_ERRLOG("Insufficient number of blocks for write buffer cache (available: %" + PRIu64", required: %"PRIu64")\n", spdk_bdev_get_num_blocks(bdev), + ftl_get_num_blocks_in_band(dev) * 2 + 1); + return -1; + } + + rc = snprintf(pool_name, sizeof(pool_name), "ftl-nvpool-%p", dev); + if (rc < 0 || rc >= 128) { + return -1; + } + + nv_cache->md_pool = spdk_mempool_create(pool_name, conf->nv_cache.max_request_cnt, + spdk_bdev_get_md_size(bdev) * + conf->nv_cache.max_request_size, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!nv_cache->md_pool) { + SPDK_ERRLOG("Failed to initialize non-volatile cache metadata pool\n"); + return -1; + } + + nv_cache->dma_buf = spdk_dma_zmalloc(FTL_BLOCK_SIZE, spdk_bdev_get_buf_align(bdev), NULL); + if (!nv_cache->dma_buf) { + SPDK_ERRLOG("Memory allocation failure\n"); + return -1; + } + + if (pthread_spin_init(&nv_cache->lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("Failed to initialize cache lock\n"); + return -1; + } + + nv_cache->current_addr = FTL_NV_CACHE_DATA_OFFSET; + nv_cache->num_data_blocks = spdk_bdev_get_num_blocks(bdev) - 1; + nv_cache->num_available = nv_cache->num_data_blocks; + nv_cache->ready = false; + + return 0; +} + +void +spdk_ftl_conf_init_defaults(struct spdk_ftl_conf *conf) +{ + *conf = g_default_conf; +} + +static void +ftl_lba_map_request_ctor(struct spdk_mempool *mp, void *opaque, void *obj, unsigned obj_idx) +{ + struct ftl_lba_map_request *request = obj; + struct spdk_ftl_dev *dev = opaque; + + request->segments = spdk_bit_array_create(spdk_divide_round_up( + ftl_get_num_blocks_in_band(dev), FTL_NUM_LBA_IN_BLOCK)); +} + +static int +ftl_init_media_events_pool(struct spdk_ftl_dev *dev) +{ + char pool_name[128]; + int rc; + + rc = snprintf(pool_name, sizeof(pool_name), "ftl-media-%p", dev); + if (rc < 0 || rc >= (int)sizeof(pool_name)) { + SPDK_ERRLOG("Failed to create media pool name\n"); + return -1; + } + + dev->media_events_pool = spdk_mempool_create(pool_name, 1024, + sizeof(struct ftl_media_event), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!dev->media_events_pool) { + SPDK_ERRLOG("Failed to create media events pool\n"); + return -1; + } + + return 0; +} + +static int +ftl_init_lba_map_pools(struct spdk_ftl_dev *dev) +{ +#define POOL_NAME_LEN 128 + char pool_name[POOL_NAME_LEN]; + int rc; + + rc = 
snprintf(pool_name, sizeof(pool_name), "%s-%s", dev->name, "ftl-lba-pool"); + if (rc < 0 || rc >= POOL_NAME_LEN) { + return -ENAMETOOLONG; + } + + /* We need to reserve at least 2 buffers for band close / open sequence + * alone, plus additional (8) buffers for handling write errors. + * TODO: This memory pool is utilized only by core thread - it introduce + * unnecessary overhead and should be replaced by different data structure. + */ + dev->lba_pool = spdk_mempool_create(pool_name, 2 + 8, + ftl_lba_map_pool_elem_size(dev), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!dev->lba_pool) { + return -ENOMEM; + } + + rc = snprintf(pool_name, sizeof(pool_name), "%s-%s", dev->name, "ftl-lbareq-pool"); + if (rc < 0 || rc >= POOL_NAME_LEN) { + return -ENAMETOOLONG; + } + + dev->lba_request_pool = spdk_mempool_create_ctor(pool_name, + dev->conf.max_reloc_qdepth * dev->conf.max_active_relocs, + sizeof(struct ftl_lba_map_request), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY, + ftl_lba_map_request_ctor, + dev); + if (!dev->lba_request_pool) { + return -ENOMEM; + } + + return 0; +} + +static void +ftl_init_wptr_list(struct spdk_ftl_dev *dev) +{ + LIST_INIT(&dev->wptr_list); + LIST_INIT(&dev->flush_list); + LIST_INIT(&dev->band_flush_list); +} + +static size_t +ftl_dev_band_max_seq(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + size_t seq = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + if (band->seq > seq) { + seq = band->seq; + } + } + + return seq; +} + +static void +_ftl_init_bands_state(void *ctx) +{ + struct ftl_band *band, *temp_band; + struct spdk_ftl_dev *dev = ctx; + + dev->seq = ftl_dev_band_max_seq(dev); + + LIST_FOREACH_SAFE(band, &dev->shut_bands, list_entry, temp_band) { + if (!band->lba_map.num_vld) { + ftl_band_set_state(band, FTL_BAND_STATE_FREE); + } + } + + ftl_reloc_resume(dev->reloc); + /* Clear the limit applications as they're incremented incorrectly by */ + /* the initialization code */ + memset(dev->stats.limits, 0, sizeof(dev->stats.limits)); +} + +static int +ftl_init_num_free_bands(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + int cnt = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + if (band->num_zones && !band->lba_map.num_vld) { + cnt++; + } + } + return cnt; +} + +static int +ftl_init_bands_state(struct spdk_ftl_dev *dev) +{ + /* TODO: Should we abort initialization or expose read only device */ + /* if there is no free bands? */ + /* If we abort initialization should we depend on condition that */ + /* we have no free bands or should we have some minimal number of */ + /* free bands? 
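For now, initialization simply fails when no usable free band is found.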
*/ + if (!ftl_init_num_free_bands(dev)) { + return -1; + } + + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_init_bands_state, dev); + return 0; +} + +static void +_ftl_dev_init_core_thread(void *ctx) +{ + struct spdk_ftl_dev *dev = ctx; + + dev->core_poller = SPDK_POLLER_REGISTER(ftl_task_core, dev, 0); + if (!dev->core_poller) { + SPDK_ERRLOG("Unable to register core poller\n"); + assert(0); + } + + dev->ioch = spdk_get_io_channel(dev); +} + +static int +ftl_dev_init_core_thread(struct spdk_ftl_dev *dev, const struct spdk_ftl_dev_init_opts *opts) +{ + if (!opts->core_thread) { + return -1; + } + + dev->core_thread = opts->core_thread; + + spdk_thread_send_msg(opts->core_thread, _ftl_dev_init_core_thread, dev); + return 0; +} + +static int +ftl_dev_l2p_alloc_pmem(struct spdk_ftl_dev *dev, size_t l2p_size, const char *l2p_path) +{ +#ifdef SPDK_CONFIG_PMDK + int is_pmem; + + if ((dev->l2p = pmem_map_file(l2p_path, 0, + 0, 0, &dev->l2p_pmem_len, &is_pmem)) == NULL) { + SPDK_ERRLOG("Failed to mmap l2p_path\n"); + return -1; + } + + if (!is_pmem) { + SPDK_NOTICELOG("l2p_path mapped on non-pmem device\n"); + } + + if (dev->l2p_pmem_len < l2p_size) { + SPDK_ERRLOG("l2p_path file is too small\n"); + return -1; + } + + pmem_memset_persist(dev->l2p, FTL_ADDR_INVALID, l2p_size); + + return 0; +#else /* SPDK_CONFIG_PMDK */ + SPDK_ERRLOG("Libpmem not available, cannot use pmem l2p_path\n"); + return -1; +#endif /* SPDK_CONFIG_PMDK */ +} + +static int +ftl_dev_l2p_alloc_dram(struct spdk_ftl_dev *dev, size_t l2p_size) +{ + dev->l2p = malloc(l2p_size); + if (!dev->l2p) { + SPDK_ERRLOG("Failed to allocate l2p table\n"); + return -1; + } + + memset(dev->l2p, FTL_ADDR_INVALID, l2p_size); + + return 0; +} + +static int +ftl_dev_l2p_alloc(struct spdk_ftl_dev *dev) +{ + size_t addr_size = dev->addr_len >= 32 ? 
8 : 4; + size_t l2p_size = dev->num_lbas * addr_size; + const char *l2p_path = dev->conf.l2p_path; + + if (dev->num_lbas == 0) { + SPDK_ERRLOG("Invalid l2p table size\n"); + return -1; + } + + if (dev->l2p) { + SPDK_ERRLOG("L2p table already allocated\n"); + return -1; + } + + dev->l2p_pmem_len = 0; + if (l2p_path) { + return ftl_dev_l2p_alloc_pmem(dev, l2p_size, l2p_path); + } else { + return ftl_dev_l2p_alloc_dram(dev, l2p_size); + } +} + +static void +ftl_dev_free_init_ctx(struct ftl_dev_init_ctx *init_ctx) +{ + if (!init_ctx) { + return; + } + + if (init_ctx->ioch) { + spdk_put_io_channel(init_ctx->ioch); + } + + free(init_ctx); +} + +static void +ftl_call_init_complete_cb(void *ctx) +{ + struct ftl_dev_init_ctx *init_ctx = ctx; + struct spdk_ftl_dev *dev = init_ctx->dev; + + if (init_ctx->cb_fn != NULL) { + init_ctx->cb_fn(dev, init_ctx->cb_arg, 0); + } + + ftl_dev_free_init_ctx(init_ctx); +} + +static void +ftl_init_complete(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + pthread_mutex_lock(&g_ftl_queue_lock); + STAILQ_INSERT_HEAD(&g_ftl_queue, dev, stailq); + pthread_mutex_unlock(&g_ftl_queue_lock); + + dev->initialized = 1; + + spdk_thread_send_msg(init_ctx->thread, ftl_call_init_complete_cb, init_ctx); +} + +static void +ftl_init_fail_cb(struct spdk_ftl_dev *dev, void *ctx, int status) +{ + struct ftl_dev_init_ctx *init_ctx = ctx; + + if (init_ctx->cb_fn != NULL) { + init_ctx->cb_fn(NULL, init_ctx->cb_arg, -ENODEV); + } + + ftl_dev_free_init_ctx(init_ctx); +} + +static int ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg, + struct spdk_thread *thread); + +static void +ftl_init_fail(struct ftl_dev_init_ctx *init_ctx) +{ + if (ftl_dev_free(init_ctx->dev, ftl_init_fail_cb, init_ctx, init_ctx->thread)) { + SPDK_ERRLOG("Unable to free the device\n"); + assert(0); + } +} + +static void +ftl_write_nv_cache_md_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Writing non-volatile cache's metadata header failed\n"); + ftl_init_fail(init_ctx); + return; + } + + dev->nv_cache.ready = true; + ftl_init_complete(init_ctx); +} + +static void +ftl_clear_nv_cache_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to clear the non-volatile cache bdev\n"); + ftl_init_fail(init_ctx); + return; + } + + nv_cache->phase = 1; + if (ftl_nv_cache_write_header(nv_cache, false, ftl_write_nv_cache_md_cb, init_ctx)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + ftl_init_fail(init_ctx); + } +} + +static void +_ftl_nv_cache_scrub(void *ctx) +{ + struct ftl_dev_init_ctx *init_ctx = ctx; + struct spdk_ftl_dev *dev = init_ctx->dev; + int rc; + + rc = ftl_nv_cache_scrub(&dev->nv_cache, ftl_clear_nv_cache_cb, init_ctx); + + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to clear the non-volatile cache bdev: %s\n", + spdk_strerror(-rc)); + ftl_init_fail(init_ctx); + } +} + +static int +ftl_setup_initial_state(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + struct spdk_ftl_conf *conf = &dev->conf; + size_t i; + + spdk_uuid_generate(&dev->uuid); + + dev->num_lbas = 0; + 
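+	/* Sum the usable blocks of every band, then expose only (100 - lba_rsvd)
+	 * percent of them as user LBAs; the remainder stays reserved as spare blocks.
+	 */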
for (i = 0; i < ftl_get_num_bands(dev); ++i) { + dev->num_lbas += ftl_band_num_usable_blocks(&dev->bands[i]); + } + + dev->num_lbas = (dev->num_lbas * (100 - conf->lba_rsvd)) / 100; + + if (ftl_dev_l2p_alloc(dev)) { + SPDK_ERRLOG("Unable to init l2p table\n"); + return -1; + } + + if (ftl_init_bands_state(dev)) { + SPDK_ERRLOG("Unable to finish the initialization\n"); + return -1; + } + + if (!ftl_dev_has_nv_cache(dev)) { + ftl_init_complete(init_ctx); + } else { + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_nv_cache_scrub, init_ctx); + } + + return 0; +} + +static void +ftl_restore_nv_cache_cb(struct ftl_restore *restore, int status, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + + if (spdk_unlikely(status != 0)) { + SPDK_ERRLOG("Failed to restore the non-volatile cache state\n"); + ftl_init_fail(init_ctx); + return; + } + + ftl_init_complete(init_ctx); +} + +static void +ftl_restore_device_cb(struct ftl_restore *restore, int status, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + + if (status) { + SPDK_ERRLOG("Failed to restore the device from the SSD\n"); + ftl_init_fail(init_ctx); + return; + } + + if (ftl_init_bands_state(dev)) { + SPDK_ERRLOG("Unable to finish the initialization\n"); + ftl_init_fail(init_ctx); + return; + } + + if (!ftl_dev_has_nv_cache(dev)) { + ftl_init_complete(init_ctx); + return; + } + + ftl_restore_nv_cache(restore, ftl_restore_nv_cache_cb, init_ctx); +} + +static void +ftl_restore_md_cb(struct ftl_restore *restore, int status, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + + if (status) { + SPDK_ERRLOG("Failed to restore the metadata from the SSD\n"); + goto error; + } + + /* After the metadata is read it should be possible to allocate the L2P */ + if (ftl_dev_l2p_alloc(init_ctx->dev)) { + SPDK_ERRLOG("Failed to allocate the L2P\n"); + goto error; + } + + if (ftl_restore_device(restore, ftl_restore_device_cb, init_ctx)) { + SPDK_ERRLOG("Failed to start device restoration from the SSD\n"); + goto error; + } + + return; +error: + ftl_init_fail(init_ctx); +} + +static int +ftl_restore_state(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + dev->uuid = init_ctx->opts.uuid; + + if (ftl_restore_md(dev, ftl_restore_md_cb, init_ctx)) { + SPDK_ERRLOG("Failed to start metadata restoration from the SSD\n"); + return -1; + } + + return 0; +} + +static void +ftl_dev_update_bands(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *temp_band; + size_t i; + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + band = &dev->bands[i]; + band->tail_md_addr = ftl_band_tail_md_addr(band); + } + + /* Remove band from shut_bands list to prevent further processing */ + /* if all blocks on this band are bad */ + LIST_FOREACH_SAFE(band, &dev->shut_bands, list_entry, temp_band) { + if (!band->num_zones) { + dev->num_bands--; + LIST_REMOVE(band, list_entry); + } + } +} + +static void +ftl_dev_init_state(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + ftl_dev_update_bands(dev); + + if (ftl_dev_init_core_thread(dev, &init_ctx->opts)) { + SPDK_ERRLOG("Unable to initialize device thread\n"); + ftl_init_fail(init_ctx); + return; + } + + if (init_ctx->opts.mode & SPDK_FTL_MODE_CREATE) { + if (ftl_setup_initial_state(init_ctx)) { + SPDK_ERRLOG("Failed to setup initial state of the device\n"); + ftl_init_fail(init_ctx); + return; + } + } else { + if (ftl_restore_state(init_ctx)) { + SPDK_ERRLOG("Unable to restore 
device's state from the SSD\n"); + ftl_init_fail(init_ctx); + return; + } + } +} + +static void ftl_dev_get_zone_info(struct ftl_dev_init_ctx *init_ctx); + +static void +ftl_dev_get_zone_info_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + struct ftl_band *band; + struct ftl_zone *zone; + struct ftl_addr addr; + size_t i, zones_left, num_zones; + + spdk_bdev_free_io(bdev_io); + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to read zone info for zone id: %"PRIu64"\n", init_ctx->zone_id); + ftl_init_fail(init_ctx); + return; + } + + zones_left = ftl_get_num_zones(dev) - (init_ctx->zone_id / ftl_get_num_blocks_in_zone(dev)); + num_zones = spdk_min(zones_left, FTL_ZONE_INFO_COUNT); + + for (i = 0; i < num_zones; ++i) { + addr.offset = init_ctx->info[i].zone_id; + band = &dev->bands[ftl_addr_get_band(dev, addr)]; + zone = &band->zone_buf[ftl_addr_get_punit(dev, addr)]; + zone->info = init_ctx->info[i]; + + /* TODO: add support for zone capacity less than zone size */ + if (zone->info.capacity != ftl_get_num_blocks_in_zone(dev)) { + zone->info.state = SPDK_BDEV_ZONE_STATE_OFFLINE; + SPDK_ERRLOG("Zone capacity is not equal zone size for " + "zone id: %"PRIu64"\n", init_ctx->zone_id); + } + + /* Set write pointer to the last block plus one for zone in full state */ + if (zone->info.state == SPDK_BDEV_ZONE_STATE_FULL) { + zone->info.write_pointer = zone->info.zone_id + zone->info.capacity; + } + + if (zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE) { + band->num_zones++; + CIRCLEQ_INSERT_TAIL(&band->zones, zone, circleq); + } + } + + init_ctx->zone_id = init_ctx->zone_id + num_zones * ftl_get_num_blocks_in_zone(dev); + + ftl_dev_get_zone_info(init_ctx); +} + +static void +ftl_dev_get_zone_info(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + size_t zones_left, num_zones; + int rc; + + zones_left = ftl_get_num_zones(dev) - (init_ctx->zone_id / ftl_get_num_blocks_in_zone(dev)); + if (zones_left == 0) { + ftl_dev_init_state(init_ctx); + return; + } + + num_zones = spdk_min(zones_left, FTL_ZONE_INFO_COUNT); + + rc = spdk_bdev_get_zone_info(dev->base_bdev_desc, init_ctx->ioch, + init_ctx->zone_id, num_zones, init_ctx->info, + ftl_dev_get_zone_info_cb, init_ctx); + + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to read zone info for zone id: %"PRIu64"\n", init_ctx->zone_id); + ftl_init_fail(init_ctx); + } +} + +static int +ftl_dev_init_zones(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + init_ctx->zone_id = 0; + init_ctx->ioch = spdk_bdev_get_io_channel(dev->base_bdev_desc); + if (!init_ctx->ioch) { + SPDK_ERRLOG("Failed to get base bdev IO channel\n"); + return -1; + } + + ftl_dev_get_zone_info(init_ctx); + + return 0; +} + +struct _ftl_io_channel { + struct ftl_io_channel *ioch; +}; + +struct ftl_io_channel * +ftl_io_channel_get_ctx(struct spdk_io_channel *ioch) +{ + struct _ftl_io_channel *_ioch = spdk_io_channel_get_ctx(ioch); + + return _ioch->ioch; +} + +static void +ftl_io_channel_register(void *ctx) +{ + struct ftl_io_channel *ioch = ctx; + struct spdk_ftl_dev *dev = ioch->dev; + uint32_t ioch_index; + + for (ioch_index = 0; ioch_index < dev->conf.max_io_channels; ++ioch_index) { + if (dev->ioch_array[ioch_index] == NULL) { + dev->ioch_array[ioch_index] = ioch; + ioch->index = ioch_index; + break; + } + } + + assert(ioch_index < dev->conf.max_io_channels); + TAILQ_INSERT_TAIL(&dev->ioch_queue, ioch, 
tailq); +} + +static int +ftl_io_channel_init_wbuf(struct ftl_io_channel *ioch) +{ + struct spdk_ftl_dev *dev = ioch->dev; + struct ftl_wbuf_entry *entry; + uint32_t i; + int rc; + + ioch->num_entries = dev->conf.write_buffer_size / FTL_BLOCK_SIZE; + ioch->wbuf_entries = calloc(ioch->num_entries, sizeof(*ioch->wbuf_entries)); + if (ioch->wbuf_entries == NULL) { + SPDK_ERRLOG("Failed to allocate write buffer entry array\n"); + return -1; + } + + ioch->qdepth_limit = ioch->num_entries; + ioch->wbuf_payload = spdk_zmalloc(dev->conf.write_buffer_size, FTL_BLOCK_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (ioch->wbuf_payload == NULL) { + SPDK_ERRLOG("Failed to allocate write buffer payload\n"); + goto error_entries; + } + + ioch->free_queue = spdk_ring_create(SPDK_RING_TYPE_SP_SC, + spdk_align32pow2(ioch->num_entries + 1), + SPDK_ENV_SOCKET_ID_ANY); + if (ioch->free_queue == NULL) { + SPDK_ERRLOG("Failed to allocate free queue\n"); + goto error_payload; + } + + ioch->submit_queue = spdk_ring_create(SPDK_RING_TYPE_SP_SC, + spdk_align32pow2(ioch->num_entries + 1), + SPDK_ENV_SOCKET_ID_ANY); + if (ioch->submit_queue == NULL) { + SPDK_ERRLOG("Failed to allocate submit queue\n"); + goto error_free_queue; + } + + for (i = 0; i < ioch->num_entries; ++i) { + entry = &ioch->wbuf_entries[i]; + entry->payload = (char *)ioch->wbuf_payload + i * FTL_BLOCK_SIZE; + entry->ioch = ioch; + entry->index = i; + entry->addr.offset = FTL_ADDR_INVALID; + + rc = pthread_spin_init(&entry->lock, PTHREAD_PROCESS_PRIVATE); + if (rc != 0) { + SPDK_ERRLOG("Failed to initialize spinlock\n"); + goto error_spinlock; + } + + spdk_ring_enqueue(ioch->free_queue, (void **)&entry, 1, NULL); + } + + return 0; +error_spinlock: + for (; i > 0; --i) { + pthread_spin_destroy(&ioch->wbuf_entries[i - 1].lock); + } + + spdk_ring_free(ioch->submit_queue); +error_free_queue: + spdk_ring_free(ioch->free_queue); +error_payload: + spdk_free(ioch->wbuf_payload); +error_entries: + free(ioch->wbuf_entries); + + return -1; +} + +static int +ftl_io_channel_create_cb(void *io_device, void *ctx) +{ + struct spdk_ftl_dev *dev = io_device; + struct _ftl_io_channel *_ioch = ctx; + struct ftl_io_channel *ioch; + uint32_t num_io_channels; + char mempool_name[32]; + int rc; + + num_io_channels = __atomic_fetch_add(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST); + if (num_io_channels >= dev->conf.max_io_channels) { + SPDK_ERRLOG("Reached maximum number of IO channels\n"); + __atomic_fetch_sub(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST); + return -1; + } + + ioch = calloc(1, sizeof(*ioch)); + if (ioch == NULL) { + SPDK_ERRLOG("Failed to allocate IO channel\n"); + return -1; + } + + rc = snprintf(mempool_name, sizeof(mempool_name), "ftl_io_%p", ioch); + if (rc < 0 || rc >= (int)sizeof(mempool_name)) { + SPDK_ERRLOG("Failed to create IO channel pool name\n"); + free(ioch); + return -1; + } + + ioch->cache_ioch = NULL; + ioch->index = FTL_IO_CHANNEL_INDEX_INVALID; + ioch->dev = dev; + ioch->elem_size = sizeof(struct ftl_md_io); + ioch->io_pool = spdk_mempool_create(mempool_name, + dev->conf.user_io_pool_size, + ioch->elem_size, + 0, + SPDK_ENV_SOCKET_ID_ANY); + if (!ioch->io_pool) { + SPDK_ERRLOG("Failed to create IO channel's IO pool\n"); + free(ioch); + return -1; + } + + ioch->base_ioch = spdk_bdev_get_io_channel(dev->base_bdev_desc); + if (!ioch->base_ioch) { + SPDK_ERRLOG("Failed to create base bdev IO channel\n"); + goto fail_ioch; + } + + if (ftl_dev_has_nv_cache(dev)) { + ioch->cache_ioch = spdk_bdev_get_io_channel(dev->nv_cache.bdev_desc); 
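+		/* The non-volatile cache sits on its own bdev, so the channel needs a
+		 * second bdev IO channel for it in addition to the base bdev channel
+		 * obtained above.
+		 */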
+ if (!ioch->cache_ioch) { + SPDK_ERRLOG("Failed to create cache IO channel\n"); + goto fail_cache; + } + } + + TAILQ_INIT(&ioch->write_cmpl_queue); + TAILQ_INIT(&ioch->retry_queue); + ioch->poller = SPDK_POLLER_REGISTER(ftl_io_channel_poll, ioch, 0); + if (!ioch->poller) { + SPDK_ERRLOG("Failed to register IO channel poller\n"); + goto fail_poller; + } + + if (ftl_io_channel_init_wbuf(ioch)) { + SPDK_ERRLOG("Failed to initialize IO channel's write buffer\n"); + goto fail_wbuf; + } + + _ioch->ioch = ioch; + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_io_channel_register, ioch); + + return 0; +fail_wbuf: + spdk_poller_unregister(&ioch->poller); +fail_poller: + if (ioch->cache_ioch) { + spdk_put_io_channel(ioch->cache_ioch); + } +fail_cache: + spdk_put_io_channel(ioch->base_ioch); +fail_ioch: + spdk_mempool_free(ioch->io_pool); + free(ioch); + + return -1; +} + +static void +ftl_io_channel_unregister(void *ctx) +{ + struct ftl_io_channel *ioch = ctx; + struct spdk_ftl_dev *dev = ioch->dev; + uint32_t i, num_io_channels __attribute__((unused)); + + assert(ioch->index < dev->conf.max_io_channels); + assert(dev->ioch_array[ioch->index] == ioch); + + dev->ioch_array[ioch->index] = NULL; + TAILQ_REMOVE(&dev->ioch_queue, ioch, tailq); + + num_io_channels = __atomic_fetch_sub(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST); + assert(num_io_channels > 0); + + for (i = 0; i < ioch->num_entries; ++i) { + pthread_spin_destroy(&ioch->wbuf_entries[i].lock); + } + + spdk_mempool_free(ioch->io_pool); + spdk_ring_free(ioch->free_queue); + spdk_ring_free(ioch->submit_queue); + spdk_free(ioch->wbuf_payload); + free(ioch->wbuf_entries); + free(ioch); +} + +static void +_ftl_io_channel_destroy_cb(void *ctx) +{ + struct ftl_io_channel *ioch = ctx; + struct spdk_ftl_dev *dev = ioch->dev; + uint32_t i; + + /* Do not destroy the channel if some of its entries are still in use */ + if (spdk_ring_count(ioch->free_queue) != ioch->num_entries) { + spdk_thread_send_msg(spdk_get_thread(), _ftl_io_channel_destroy_cb, ctx); + return; + } + + /* Evict all valid entries from cache */ + for (i = 0; i < ioch->num_entries; ++i) { + ftl_evict_cache_entry(dev, &ioch->wbuf_entries[i]); + } + + spdk_poller_unregister(&ioch->poller); + + spdk_put_io_channel(ioch->base_ioch); + if (ioch->cache_ioch) { + spdk_put_io_channel(ioch->cache_ioch); + } + + ioch->base_ioch = NULL; + ioch->cache_ioch = NULL; + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_io_channel_unregister, ioch); +} + +static void +ftl_io_channel_destroy_cb(void *io_device, void *ctx) +{ + struct _ftl_io_channel *_ioch = ctx; + struct ftl_io_channel *ioch = _ioch->ioch; + + /* Mark the IO channel as being flush to force out any unwritten entries */ + ioch->flush = true; + + _ftl_io_channel_destroy_cb(ioch); +} + +static int +ftl_dev_init_io_channel(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch; + uint32_t i; + + /* Align the IO channels to nearest power of 2 to allow for easy addr bit shift */ + dev->conf.max_io_channels = spdk_align32pow2(dev->conf.max_io_channels); + dev->ioch_shift = spdk_u32log2(dev->conf.max_io_channels); + + dev->ioch_array = calloc(dev->conf.max_io_channels, sizeof(*dev->ioch_array)); + if (!dev->ioch_array) { + SPDK_ERRLOG("Failed to allocate IO channel array\n"); + return -1; + } + + if (dev->md_size > 0) { + dev->md_buf = spdk_zmalloc(dev->md_size * dev->xfer_size * FTL_BATCH_COUNT, + dev->md_size, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + if (dev->md_buf == NULL) { + SPDK_ERRLOG("Failed to allocate metadata 
buffer\n"); + return -1; + } + } + + dev->iov_buf = calloc(FTL_BATCH_COUNT, dev->xfer_size * sizeof(struct iovec)); + if (!dev->iov_buf) { + SPDK_ERRLOG("Failed to allocate iovec buffer\n"); + return -1; + } + + TAILQ_INIT(&dev->free_batches); + TAILQ_INIT(&dev->pending_batches); + TAILQ_INIT(&dev->ioch_queue); + + for (i = 0; i < FTL_BATCH_COUNT; ++i) { + batch = &dev->batch_array[i]; + batch->iov = &dev->iov_buf[i * dev->xfer_size]; + batch->num_entries = 0; + batch->index = i; + TAILQ_INIT(&batch->entries); + if (dev->md_buf != NULL) { + batch->metadata = (char *)dev->md_buf + i * dev->xfer_size * dev->md_size; + } + + TAILQ_INSERT_TAIL(&dev->free_batches, batch, tailq); + } + + dev->num_io_channels = 0; + + spdk_io_device_register(dev, ftl_io_channel_create_cb, ftl_io_channel_destroy_cb, + sizeof(struct _ftl_io_channel), + NULL); + + return 0; +} + +static int +ftl_dev_init_base_bdev(struct spdk_ftl_dev *dev, const char *bdev_name) +{ + uint32_t block_size; + uint64_t num_blocks; + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("Unable to find bdev: %s\n", bdev_name); + return -1; + } + + if (!spdk_bdev_is_zoned(bdev)) { + SPDK_ERRLOG("Bdev dosen't support zone capabilities: %s\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + if (spdk_bdev_open_ext(bdev_name, true, ftl_bdev_event_cb, + dev, &dev->base_bdev_desc)) { + SPDK_ERRLOG("Unable to open bdev: %s\n", bdev_name); + return -1; + } + + if (spdk_bdev_module_claim_bdev(bdev, dev->base_bdev_desc, &g_ftl_bdev_module)) { + spdk_bdev_close(dev->base_bdev_desc); + dev->base_bdev_desc = NULL; + SPDK_ERRLOG("Unable to claim bdev %s\n", bdev_name); + return -1; + } + + dev->xfer_size = spdk_bdev_get_write_unit_size(bdev); + dev->md_size = spdk_bdev_get_md_size(bdev); + + block_size = spdk_bdev_get_block_size(bdev); + if (block_size != FTL_BLOCK_SIZE) { + SPDK_ERRLOG("Unsupported block size (%"PRIu32")\n", block_size); + return -1; + } + + num_blocks = spdk_bdev_get_num_blocks(bdev); + if (num_blocks % ftl_get_num_punits(dev)) { + SPDK_ERRLOG("Unsupported geometry. 
Base bdev block count must be multiple " + "of optimal number of zones.\n"); + return -1; + } + + if (ftl_is_append_supported(dev) && + !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZONE_APPEND)) { + SPDK_ERRLOG("Bdev dosen't support append: %s\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + dev->num_bands = num_blocks / (ftl_get_num_punits(dev) * ftl_get_num_blocks_in_zone(dev)); + dev->addr_len = spdk_u64log2(num_blocks) + 1; + + return 0; +} + +static void +ftl_lba_map_request_dtor(struct spdk_mempool *mp, void *opaque, void *obj, unsigned obj_idx) +{ + struct ftl_lba_map_request *request = obj; + + spdk_bit_array_free(&request->segments); +} + +static void +ftl_release_bdev(struct spdk_bdev_desc *bdev_desc) +{ + if (!bdev_desc) { + return; + } + + spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_desc)); + spdk_bdev_close(bdev_desc); +} + +static void +ftl_dev_free_sync(struct spdk_ftl_dev *dev) +{ + struct spdk_ftl_dev *iter; + size_t i; + + if (!dev) { + return; + } + + pthread_mutex_lock(&g_ftl_queue_lock); + STAILQ_FOREACH(iter, &g_ftl_queue, stailq) { + if (iter == dev) { + STAILQ_REMOVE(&g_ftl_queue, dev, spdk_ftl_dev, stailq); + break; + } + } + pthread_mutex_unlock(&g_ftl_queue_lock); + + assert(LIST_EMPTY(&dev->wptr_list)); + assert(dev->current_batch == NULL); + + ftl_dev_dump_bands(dev); + ftl_dev_dump_stats(dev); + + if (dev->bands) { + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + free(dev->bands[i].zone_buf); + spdk_bit_array_free(&dev->bands[i].lba_map.vld); + spdk_bit_array_free(&dev->bands[i].reloc_bitmap); + } + } + + spdk_dma_free(dev->nv_cache.dma_buf); + + spdk_mempool_free(dev->lba_pool); + spdk_mempool_free(dev->nv_cache.md_pool); + spdk_mempool_free(dev->media_events_pool); + if (dev->lba_request_pool) { + spdk_mempool_obj_iter(dev->lba_request_pool, ftl_lba_map_request_dtor, NULL); + } + spdk_mempool_free(dev->lba_request_pool); + + ftl_reloc_free(dev->reloc); + + ftl_release_bdev(dev->nv_cache.bdev_desc); + ftl_release_bdev(dev->base_bdev_desc); + + spdk_free(dev->md_buf); + + assert(dev->num_io_channels == 0); + free(dev->ioch_array); + free(dev->iov_buf); + free(dev->name); + free(dev->bands); + if (dev->l2p_pmem_len != 0) { +#ifdef SPDK_CONFIG_PMDK + pmem_unmap(dev->l2p, dev->l2p_pmem_len); +#endif /* SPDK_CONFIG_PMDK */ + } else { + free(dev->l2p); + } + free((char *)dev->conf.l2p_path); + free(dev); +} + +int +spdk_ftl_dev_init(const struct spdk_ftl_dev_init_opts *_opts, spdk_ftl_init_fn cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev; + struct spdk_ftl_dev_init_opts opts = *_opts; + struct ftl_dev_init_ctx *init_ctx = NULL; + int rc = -ENOMEM; + + dev = calloc(1, sizeof(*dev)); + if (!dev) { + return -ENOMEM; + } + + init_ctx = calloc(1, sizeof(*init_ctx)); + if (!init_ctx) { + goto fail_sync; + } + + init_ctx->dev = dev; + init_ctx->opts = *_opts; + init_ctx->cb_fn = cb_fn; + init_ctx->cb_arg = cb_arg; + init_ctx->thread = spdk_get_thread(); + + if (!opts.conf) { + opts.conf = &g_default_conf; + } + + if (!opts.base_bdev) { + SPDK_ERRLOG("Lack of underlying device in configuration\n"); + rc = -EINVAL; + goto fail_sync; + } + + dev->conf = *opts.conf; + dev->limit = SPDK_FTL_LIMIT_MAX; + + dev->name = strdup(opts.name); + if (!dev->name) { + SPDK_ERRLOG("Unable to set device name\n"); + goto fail_sync; + } + + if (ftl_dev_init_base_bdev(dev, opts.base_bdev)) { + SPDK_ERRLOG("Unsupported underlying device\n"); + goto fail_sync; + } + + if (opts.conf->l2p_path) { + dev->conf.l2p_path = strdup(opts.conf->l2p_path); + if 
(!dev->conf.l2p_path) { + rc = -ENOMEM; + goto fail_sync; + } + } + + /* In case of errors, we free all of the memory in ftl_dev_free_sync(), */ + /* so we don't have to clean up in each of the init functions. */ + if (ftl_check_conf(dev, opts.conf)) { + SPDK_ERRLOG("Invalid device configuration\n"); + goto fail_sync; + } + + if (ftl_init_lba_map_pools(dev)) { + SPDK_ERRLOG("Unable to init LBA map pools\n"); + goto fail_sync; + } + + if (ftl_init_media_events_pool(dev)) { + SPDK_ERRLOG("Unable to init media events pools\n"); + goto fail_sync; + } + + ftl_init_wptr_list(dev); + + if (ftl_dev_init_bands(dev)) { + SPDK_ERRLOG("Unable to initialize band array\n"); + goto fail_sync; + } + + if (ftl_dev_init_nv_cache(dev, opts.cache_bdev)) { + SPDK_ERRLOG("Unable to initialize persistent cache\n"); + goto fail_sync; + } + + dev->reloc = ftl_reloc_init(dev); + if (!dev->reloc) { + SPDK_ERRLOG("Unable to initialize reloc structures\n"); + goto fail_sync; + } + + if (ftl_dev_init_io_channel(dev)) { + SPDK_ERRLOG("Unable to initialize IO channels\n"); + goto fail_sync; + } + + if (ftl_dev_init_zones(init_ctx)) { + SPDK_ERRLOG("Failed to initialize zones\n"); + goto fail_async; + } + + return 0; +fail_sync: + ftl_dev_free_sync(dev); + ftl_dev_free_init_ctx(init_ctx); + return rc; +fail_async: + ftl_init_fail(init_ctx); + return 0; +} + +static void +_ftl_halt_defrag(void *arg) +{ + ftl_reloc_halt(((struct spdk_ftl_dev *)arg)->reloc); +} + +static void +ftl_halt_complete_cb(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + /* Make sure core IO channel has already been released */ + if (dev->num_io_channels > 0) { + spdk_thread_send_msg(spdk_get_thread(), ftl_halt_complete_cb, ctx); + return; + } + + spdk_io_device_unregister(fini_ctx->dev, NULL); + + ftl_dev_free_sync(fini_ctx->dev); + if (fini_ctx->cb_fn != NULL) { + fini_ctx->cb_fn(NULL, fini_ctx->cb_arg, fini_ctx->halt_complete_status); + } + + ftl_dev_free_init_ctx(fini_ctx); +} + +static void +ftl_put_io_channel_cb(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + spdk_put_io_channel(dev->ioch); + spdk_thread_send_msg(spdk_get_thread(), ftl_halt_complete_cb, ctx); +} + +static void +ftl_nv_cache_header_fini_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *fini_ctx = cb_arg; + int rc = 0; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Failed to write non-volatile cache metadata header\n"); + rc = -EIO; + } + + fini_ctx->halt_complete_status = rc; + spdk_thread_send_msg(fini_ctx->thread, ftl_put_io_channel_cb, fini_ctx); +} + +static int +ftl_halt_poller(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + if (!dev->core_poller) { + spdk_poller_unregister(&fini_ctx->poller); + + if (ftl_dev_has_nv_cache(dev)) { + ftl_nv_cache_write_header(&dev->nv_cache, true, + ftl_nv_cache_header_fini_cb, fini_ctx); + } else { + fini_ctx->halt_complete_status = 0; + spdk_thread_send_msg(fini_ctx->thread, ftl_put_io_channel_cb, fini_ctx); + } + } + + return SPDK_POLLER_BUSY; +} + +static void +ftl_add_halt_poller(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + dev->halt = 1; + + _ftl_halt_defrag(dev); + + assert(!fini_ctx->poller); + fini_ctx->poller = SPDK_POLLER_REGISTER(ftl_halt_poller, fini_ctx, 100); +} + +static int +ftl_dev_free(struct spdk_ftl_dev *dev, 
spdk_ftl_init_fn cb_fn, void *cb_arg, + struct spdk_thread *thread) +{ + struct ftl_dev_init_ctx *fini_ctx; + + if (dev->halt_started) { + dev->halt_started = true; + return -EBUSY; + } + + fini_ctx = calloc(1, sizeof(*fini_ctx)); + if (!fini_ctx) { + return -ENOMEM; + } + + fini_ctx->dev = dev; + fini_ctx->cb_fn = cb_fn; + fini_ctx->cb_arg = cb_arg; + fini_ctx->thread = thread; + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_add_halt_poller, fini_ctx); + return 0; +} + +int +spdk_ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg) +{ + return ftl_dev_free(dev, cb_fn, cb_arg, spdk_get_thread()); +} + +SPDK_LOG_REGISTER_COMPONENT("ftl_init", SPDK_LOG_FTL_INIT) diff --git a/src/spdk/lib/ftl/ftl_io.c b/src/spdk/lib/ftl/ftl_io.c new file mode 100644 index 000000000..39a845bae --- /dev/null +++ b/src/spdk/lib/ftl/ftl_io.c @@ -0,0 +1,563 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/ftl.h" +#include "spdk/likely.h" +#include "spdk/util.h" + +#include "ftl_io.h" +#include "ftl_core.h" +#include "ftl_band.h" +#include "ftl_debug.h" + +void +ftl_io_inc_req(struct ftl_io *io) +{ + struct ftl_band *band = io->band; + + if (!(io->flags & FTL_IO_CACHE) && io->type != FTL_IO_READ && io->type != FTL_IO_ERASE) { + ftl_band_acquire_lba_map(band); + } + + __atomic_fetch_add(&io->dev->num_inflight, 1, __ATOMIC_SEQ_CST); + + ++io->req_cnt; +} + +void +ftl_io_dec_req(struct ftl_io *io) +{ + struct ftl_band *band = io->band; + unsigned long num_inflight __attribute__((unused)); + + if (!(io->flags & FTL_IO_CACHE) && io->type != FTL_IO_READ && io->type != FTL_IO_ERASE) { + ftl_band_release_lba_map(band); + } + + num_inflight = __atomic_fetch_sub(&io->dev->num_inflight, 1, __ATOMIC_SEQ_CST); + + assert(num_inflight > 0); + assert(io->req_cnt > 0); + + --io->req_cnt; +} + +struct iovec * +ftl_io_iovec(struct ftl_io *io) +{ + return &io->iov[0]; +} + +uint64_t +ftl_io_get_lba(const struct ftl_io *io, size_t offset) +{ + assert(offset < io->num_blocks); + + if (io->flags & FTL_IO_VECTOR_LBA) { + return io->lba.vector[offset]; + } else { + return io->lba.single + offset; + } +} + +uint64_t +ftl_io_current_lba(const struct ftl_io *io) +{ + return ftl_io_get_lba(io, io->pos); +} + +void +ftl_io_advance(struct ftl_io *io, size_t num_blocks) +{ + struct iovec *iov = ftl_io_iovec(io); + size_t iov_blocks, block_left = num_blocks; + + io->pos += num_blocks; + + if (io->iov_cnt != 0) { + while (block_left > 0) { + assert(io->iov_pos < io->iov_cnt); + iov_blocks = iov[io->iov_pos].iov_len / FTL_BLOCK_SIZE; + + if (io->iov_off + block_left < iov_blocks) { + io->iov_off += block_left; + break; + } + + assert(iov_blocks > io->iov_off); + block_left -= (iov_blocks - io->iov_off); + io->iov_off = 0; + io->iov_pos++; + } + } + + if (io->parent) { + ftl_io_advance(io->parent, num_blocks); + } +} + +size_t +ftl_iovec_num_blocks(struct iovec *iov, size_t iov_cnt) +{ + size_t num_blocks = 0, i = 0; + + for (; i < iov_cnt; ++i) { + num_blocks += iov[i].iov_len / FTL_BLOCK_SIZE; + } + + return num_blocks; +} + +void * +ftl_io_iovec_addr(struct ftl_io *io) +{ + assert(io->iov_pos < io->iov_cnt); + assert(io->iov_off * FTL_BLOCK_SIZE < ftl_io_iovec(io)[io->iov_pos].iov_len); + + return (char *)ftl_io_iovec(io)[io->iov_pos].iov_base + + io->iov_off * FTL_BLOCK_SIZE; +} + +size_t +ftl_io_iovec_len_left(struct ftl_io *io) +{ + struct iovec *iov = ftl_io_iovec(io); + return iov[io->iov_pos].iov_len / FTL_BLOCK_SIZE - io->iov_off; +} + +static void +ftl_io_init_iovec(struct ftl_io *io, const struct iovec *iov, size_t iov_cnt, size_t iov_off, + size_t num_blocks) +{ + size_t offset = 0, num_left; + + io->iov_pos = 0; + io->iov_cnt = 0; + io->num_blocks = num_blocks; + + while (offset < num_blocks) { + assert(io->iov_cnt < FTL_IO_MAX_IOVEC && io->iov_cnt < iov_cnt); + + num_left = spdk_min(iov[io->iov_cnt].iov_len / FTL_BLOCK_SIZE - iov_off, + num_blocks); + io->iov[io->iov_cnt].iov_base = (char *)iov[io->iov_cnt].iov_base + + iov_off * FTL_BLOCK_SIZE; + io->iov[io->iov_cnt].iov_len = num_left * FTL_BLOCK_SIZE; + + offset += num_left; + io->iov_cnt++; + iov_off = 0; + } +} + +void +ftl_io_shrink_iovec(struct ftl_io *io, size_t num_blocks) +{ + size_t iov_off = 0, block_off = 0; + + assert(io->num_blocks >= num_blocks); + assert(io->pos == 0 && io->iov_pos == 0 && io->iov_off == 0); + + for (; iov_off < io->iov_cnt; ++iov_off) { + size_t num_iov = 
io->iov[iov_off].iov_len / FTL_BLOCK_SIZE; + size_t num_left = num_blocks - block_off; + + if (num_iov >= num_left) { + io->iov[iov_off].iov_len = num_left * FTL_BLOCK_SIZE; + io->iov_cnt = iov_off + 1; + io->num_blocks = num_blocks; + break; + } + + block_off += num_iov; + } +} + +static void +ftl_io_init(struct ftl_io *io, struct spdk_ftl_dev *dev, + ftl_io_fn fn, void *ctx, int flags, int type) +{ + io->flags |= flags | FTL_IO_INITIALIZED; + io->type = type; + io->dev = dev; + io->lba.single = FTL_LBA_INVALID; + io->addr.offset = FTL_ADDR_INVALID; + io->cb_fn = fn; + io->cb_ctx = ctx; + io->trace = ftl_trace_alloc_id(dev); +} + +struct ftl_io * +ftl_io_init_internal(const struct ftl_io_init_opts *opts) +{ + struct ftl_io *io = opts->io; + struct ftl_io *parent = opts->parent; + struct spdk_ftl_dev *dev = opts->dev; + const struct iovec *iov; + size_t iov_cnt, iov_off; + + if (!io) { + if (parent) { + io = ftl_io_alloc_child(parent); + } else { + io = ftl_io_alloc(ftl_get_io_channel(dev)); + } + + if (!io) { + return NULL; + } + } + + ftl_io_clear(io); + ftl_io_init(io, dev, opts->cb_fn, opts->cb_ctx, opts->flags | FTL_IO_INTERNAL, opts->type); + + io->batch = opts->batch; + io->band = opts->band; + io->md = opts->md; + io->iov = &io->iov_buf[0]; + + if (parent) { + if (parent->flags & FTL_IO_VECTOR_LBA) { + io->lba.vector = parent->lba.vector + parent->pos; + } else { + io->lba.single = parent->lba.single + parent->pos; + } + + iov = &parent->iov[parent->iov_pos]; + iov_cnt = parent->iov_cnt - parent->iov_pos; + iov_off = parent->iov_off; + } else { + iov = &opts->iovs[0]; + iov_cnt = opts->iovcnt; + iov_off = 0; + } + + /* Some requests (zone resets) do not use iovecs */ + if (iov_cnt > 0) { + ftl_io_init_iovec(io, iov, iov_cnt, iov_off, opts->num_blocks); + } + + if (opts->flags & FTL_IO_VECTOR_LBA) { + io->lba.vector = calloc(io->num_blocks, sizeof(uint64_t)); + if (!io->lba.vector) { + ftl_io_free(io); + return NULL; + } + } + + return io; +} + +struct ftl_io * +ftl_io_wbuf_init(struct spdk_ftl_dev *dev, struct ftl_addr addr, struct ftl_band *band, + struct ftl_batch *batch, ftl_io_fn cb) +{ + struct ftl_io *io; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .batch = batch, + .band = band, + .size = sizeof(struct ftl_io), + .flags = 0, + .type = FTL_IO_WRITE, + .num_blocks = dev->xfer_size, + .cb_fn = cb, + .iovcnt = dev->xfer_size, + .md = batch->metadata, + }; + + memcpy(opts.iovs, batch->iov, sizeof(struct iovec) * dev->xfer_size); + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->addr = addr; + + return io; +} + +struct ftl_io * +ftl_io_erase_init(struct ftl_band *band, size_t num_blocks, ftl_io_fn cb) +{ + struct ftl_io *io; + struct ftl_io_init_opts opts = { + .dev = band->dev, + .io = NULL, + .band = band, + .size = sizeof(struct ftl_io), + .flags = FTL_IO_PHYSICAL_MODE, + .type = FTL_IO_ERASE, + .num_blocks = 1, + .cb_fn = cb, + .iovcnt = 0, + .md = NULL, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->num_blocks = num_blocks; + + return io; +} + +static void +_ftl_user_cb(struct ftl_io *io, void *arg, int status) +{ + io->user_fn(arg, status); +} + +struct ftl_io * +ftl_io_user_init(struct spdk_io_channel *_ioch, uint64_t lba, size_t num_blocks, struct iovec *iov, + size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_ctx, int type) +{ + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(_ioch); + struct spdk_ftl_dev *dev = ioch->dev; + struct ftl_io *io; + + io = ftl_io_alloc(_ioch); + if 
(spdk_unlikely(!io)) { + return NULL; + } + + ftl_io_init(io, dev, _ftl_user_cb, cb_ctx, 0, type); + io->lba.single = lba; + io->user_fn = cb_fn; + io->iov = iov; + io->iov_cnt = iov_cnt; + io->num_blocks = num_blocks; + + ftl_trace_lba_io_init(io->dev, io); + return io; +} + +static void +_ftl_io_free(struct ftl_io *io) +{ + struct ftl_io_channel *ioch; + + assert(LIST_EMPTY(&io->children)); + + if (io->flags & FTL_IO_VECTOR_LBA) { + free(io->lba.vector); + } + + if (pthread_spin_destroy(&io->lock)) { + SPDK_ERRLOG("pthread_spin_destroy failed\n"); + } + + ioch = ftl_io_channel_get_ctx(io->ioch); + spdk_mempool_put(ioch->io_pool, io); +} + +static bool +ftl_io_remove_child(struct ftl_io *io) +{ + struct ftl_io *parent = io->parent; + bool parent_done; + + pthread_spin_lock(&parent->lock); + LIST_REMOVE(io, child_entry); + parent_done = parent->done && LIST_EMPTY(&parent->children); + parent->status = parent->status ? : io->status; + pthread_spin_unlock(&parent->lock); + + return parent_done; +} + +void +ftl_io_complete(struct ftl_io *io) +{ + struct ftl_io *parent = io->parent; + bool complete; + + io->flags &= ~FTL_IO_INITIALIZED; + + pthread_spin_lock(&io->lock); + complete = LIST_EMPTY(&io->children); + io->done = true; + pthread_spin_unlock(&io->lock); + + if (complete) { + if (io->cb_fn) { + io->cb_fn(io, io->cb_ctx, io->status); + } + + if (parent && ftl_io_remove_child(io)) { + ftl_io_complete(parent); + } + + _ftl_io_free(io); + } +} + +struct ftl_io * +ftl_io_alloc_child(struct ftl_io *parent) +{ + struct ftl_io *io; + + io = ftl_io_alloc(parent->ioch); + if (spdk_unlikely(!io)) { + return NULL; + } + + ftl_io_init(io, parent->dev, NULL, NULL, parent->flags, parent->type); + io->parent = parent; + + pthread_spin_lock(&parent->lock); + LIST_INSERT_HEAD(&parent->children, io, child_entry); + pthread_spin_unlock(&parent->lock); + + return io; +} + +void ftl_io_fail(struct ftl_io *io, int status) +{ + io->status = status; + ftl_io_advance(io, io->num_blocks - io->pos); +} + +void * +ftl_io_get_md(const struct ftl_io *io) +{ + if (!io->md) { + return NULL; + } + + return (char *)io->md + io->pos * io->dev->md_size; +} + +struct ftl_io * +ftl_io_alloc(struct spdk_io_channel *ch) +{ + struct ftl_io *io; + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(ch); + + io = spdk_mempool_get(ioch->io_pool); + if (!io) { + return NULL; + } + + memset(io, 0, ioch->elem_size); + io->ioch = ch; + + if (pthread_spin_init(&io->lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("pthread_spin_init failed\n"); + spdk_mempool_put(ioch->io_pool, io); + return NULL; + } + + return io; +} + +void +ftl_io_reinit(struct ftl_io *io, ftl_io_fn cb, void *ctx, int flags, int type) +{ + ftl_io_clear(io); + ftl_io_init(io, io->dev, cb, ctx, flags, type); +} + +void +ftl_io_clear(struct ftl_io *io) +{ + ftl_io_reset(io); + + io->flags = 0; + io->batch = NULL; + io->band = NULL; +} + +void +ftl_io_reset(struct ftl_io *io) +{ + io->req_cnt = io->pos = io->iov_pos = io->iov_off = 0; + io->done = false; +} + +void +ftl_io_free(struct ftl_io *io) +{ + struct ftl_io *parent; + + if (!io) { + return; + } + + parent = io->parent; + if (parent && ftl_io_remove_child(io)) { + ftl_io_complete(parent); + } + + _ftl_io_free(io); +} + +void +ftl_io_call_foreach_child(struct ftl_io *io, int (*callback)(struct ftl_io *)) +{ + struct ftl_io *child, *tmp; + + assert(!io->done); + + /* + * If the IO doesn't have any children, it means that it directly describes a request (i.e. + * all of the buffers, LBAs, etc. are filled). 
Otherwise the IO only groups together several + * requests and may be partially filled, so the callback needs to be called on all of its + * children instead. + */ + if (LIST_EMPTY(&io->children)) { + callback(io); + return; + } + + LIST_FOREACH_SAFE(child, &io->children, child_entry, tmp) { + int rc = callback(child); + if (rc) { + assert(rc != -EAGAIN); + ftl_io_fail(io, rc); + break; + } + } + + /* + * If all the callbacks were processed or an error occurred, treat this IO as completed. + * Multiple calls to ftl_io_call_foreach_child are not supported, resubmissions are supposed + * to be handled in the callback. + */ + ftl_io_complete(io); +} diff --git a/src/spdk/lib/ftl/ftl_io.h b/src/spdk/lib/ftl/ftl_io.h new file mode 100644 index 000000000..d49dc3de7 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_io.h @@ -0,0 +1,351 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_IO_H +#define FTL_IO_H + +#include "spdk/stdinc.h" +#include "spdk/nvme.h" +#include "spdk/ftl.h" + +#include "ftl_addr.h" +#include "ftl_trace.h" + +struct spdk_ftl_dev; +struct ftl_band; +struct ftl_batch; +struct ftl_io; + +typedef int (*ftl_md_pack_fn)(struct ftl_band *); +typedef void (*ftl_io_fn)(struct ftl_io *, void *, int); + +/* IO flags */ +enum ftl_io_flags { + /* Indicates whether IO is already initialized */ + FTL_IO_INITIALIZED = (1 << 0), + /* Internal based IO (defrag, metadata etc.) 
*/ + FTL_IO_INTERNAL = (1 << 1), + /* Indicates that the IO should not go through if there's */ + /* already another one scheduled to the same LBA */ + FTL_IO_WEAK = (1 << 2), + /* Indicates that the IO is used for padding */ + FTL_IO_PAD = (1 << 3), + /* The IO operates on metadata */ + FTL_IO_MD = (1 << 4), + /* Using physical instead of logical address */ + FTL_IO_PHYSICAL_MODE = (1 << 5), + /* Indicates that IO contains noncontiguous LBAs */ + FTL_IO_VECTOR_LBA = (1 << 6), + /* The IO is directed to non-volatile cache */ + FTL_IO_CACHE = (1 << 7), + /* Indicates that physical address should be taken from IO struct, */ + /* not assigned by wptr, only works if wptr is also in direct mode */ + FTL_IO_DIRECT_ACCESS = (1 << 8), + /* Bypass the non-volatile cache */ + FTL_IO_BYPASS_CACHE = (1 << 9), +}; + +enum ftl_io_type { + FTL_IO_READ, + FTL_IO_WRITE, + FTL_IO_ERASE, +}; + +#define FTL_IO_MAX_IOVEC 64 + +struct ftl_io_init_opts { + struct spdk_ftl_dev *dev; + + /* IO descriptor */ + struct ftl_io *io; + + /* Parent request */ + struct ftl_io *parent; + + /* Size of IO descriptor */ + size_t size; + + /* IO flags */ + int flags; + + /* IO type */ + enum ftl_io_type type; + + /* Transfer batch, set for IO going through the write buffer */ + struct ftl_batch *batch; + + /* Band to which the IO is directed */ + struct ftl_band *band; + + /* Number of logical blocks */ + size_t num_blocks; + + /* Data */ + struct iovec iovs[FTL_IO_MAX_IOVEC]; + int iovcnt; + + /* Metadata */ + void *md; + + /* Callback's function */ + ftl_io_fn cb_fn; + + /* Callback's context */ + void *cb_ctx; +}; + +struct ftl_io_channel; + +struct ftl_wbuf_entry { + /* IO channel that owns the write buffer entry */ + struct ftl_io_channel *ioch; + /* Data payload (single block) */ + void *payload; + /* Index within the IO channel's wbuf_entries array */ + uint32_t index; + uint32_t io_flags; + /* Points at the band the data is copied from. Only valid for internal + * requests coming from reloc. + */ + struct ftl_band *band; + /* Physical address of that particular block. Valid once the data has + * been written out. + */ + struct ftl_addr addr; + /* Logical block address */ + uint64_t lba; + + /* Trace ID of the requests the entry is part of */ + uint64_t trace; + + /* Indicates that the entry was written out and is still present in the + * L2P table. 
+ */ + bool valid; + /* Lock that protects the entry from being evicted from the L2P */ + pthread_spinlock_t lock; + TAILQ_ENTRY(ftl_wbuf_entry) tailq; +}; + +#define FTL_IO_CHANNEL_INDEX_INVALID ((uint64_t)-1) + +struct ftl_io_channel { + /* Device */ + struct spdk_ftl_dev *dev; + /* IO pool element size */ + size_t elem_size; + /* Index within the IO channel array */ + uint64_t index; + /* IO pool */ + struct spdk_mempool *io_pool; + /* Underlying device IO channel */ + struct spdk_io_channel *base_ioch; + /* Persistent cache IO channel */ + struct spdk_io_channel *cache_ioch; + /* Poller used for completing write requests and retrying IO */ + struct spdk_poller *poller; + /* Write completion queue */ + TAILQ_HEAD(, ftl_io) write_cmpl_queue; + TAILQ_HEAD(, ftl_io) retry_queue; + TAILQ_ENTRY(ftl_io_channel) tailq; + + /* Array of write buffer entries */ + struct ftl_wbuf_entry *wbuf_entries; + /* Write buffer data payload */ + void *wbuf_payload; + /* Number of write buffer entries */ + uint32_t num_entries; + /* Write buffer queues */ + struct spdk_ring *free_queue; + struct spdk_ring *submit_queue; + /* Maximum number of concurrent user writes */ + uint32_t qdepth_limit; + /* Current number of concurrent user writes */ + uint32_t qdepth_current; + /* Means that the IO channel is being flushed */ + bool flush; +}; + +/* General IO descriptor */ +struct ftl_io { + /* Device */ + struct spdk_ftl_dev *dev; + + /* IO channel */ + struct spdk_io_channel *ioch; + + union { + /* LBA table */ + uint64_t *vector; + + /* First LBA */ + uint64_t single; + } lba; + + /* First block address */ + struct ftl_addr addr; + + /* Number of processed blocks */ + size_t pos; + + /* Number of blocks */ + size_t num_blocks; + + /* IO vector pointer */ + struct iovec *iov; + + /* IO vector buffer for internal requests */ + struct iovec iov_buf[FTL_IO_MAX_IOVEC]; + + /* Metadata */ + void *md; + + /* Number of IO vectors */ + size_t iov_cnt; + + /* Position within the iovec */ + size_t iov_pos; + + /* Offset within the iovec (in blocks) */ + size_t iov_off; + + /* Transfer batch (valid only for writes going through the write buffer) */ + struct ftl_batch *batch; + + /* Band this IO is being written to */ + struct ftl_band *band; + + /* Request status */ + int status; + + /* Number of split requests */ + size_t req_cnt; + + /* Callback's function */ + ftl_io_fn cb_fn; + + /* Callback's context */ + void *cb_ctx; + + /* User callback function */ + spdk_ftl_fn user_fn; + + /* Flags */ + int flags; + + /* IO type */ + enum ftl_io_type type; + + /* Done flag */ + bool done; + + /* Parent request */ + struct ftl_io *parent; + /* Child requests list */ + LIST_HEAD(, ftl_io) children; + /* Child list link */ + LIST_ENTRY(ftl_io) child_entry; + /* Children lock */ + pthread_spinlock_t lock; + + /* Trace group id */ + uint64_t trace; + + /* Used by retry and write completion queues */ + TAILQ_ENTRY(ftl_io) ioch_entry; +}; + +/* Metadata IO */ +struct ftl_md_io { + /* Parent IO structure */ + struct ftl_io io; + + /* Serialization/deserialization callback */ + ftl_md_pack_fn pack_fn; + + /* Callback's function */ + ftl_io_fn cb_fn; + + /* Callback's context */ + void *cb_ctx; +}; + +static inline bool +ftl_io_mode_physical(const struct ftl_io *io) +{ + return io->flags & FTL_IO_PHYSICAL_MODE; +} + +static inline bool +ftl_io_mode_logical(const struct ftl_io *io) +{ + return !ftl_io_mode_physical(io); +} + +static inline bool +ftl_io_done(const struct ftl_io *io) +{ + return io->req_cnt == 0 && io->pos == io->num_blocks; +} 
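/*
 * Illustrative sketch (not part of the patched sources): how the declarations
 * below fit together on the user read path. A descriptor is built with
 * ftl_io_user_init() and handed to the core submission routine ftl_io_read()
 * (declared in ftl_core.h and used, e.g., by ftl_reloc.c); once every split
 * request has finished and ftl_io_done() holds, ftl_io_complete() invokes the
 * user callback and returns the descriptor to its channel's mempool. The
 * wrapper name example_submit_read() is hypothetical.
 */
static int
example_submit_read(struct spdk_io_channel *ioch, uint64_t lba, size_t num_blocks,
		    struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
{
	struct ftl_io *io;

	io = ftl_io_user_init(ioch, lba, num_blocks, iov, iov_cnt, cb_fn, cb_arg,
			      FTL_IO_READ);
	if (spdk_unlikely(!io)) {
		return -ENOMEM;
	}

	/* The read path walks the buffers via ftl_io_iovec_addr() and
	 * ftl_io_iovec_len_left(), advancing with ftl_io_advance() as blocks
	 * complete. */
	ftl_io_read(io);

	return 0;
}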
+ +struct ftl_io *ftl_io_alloc(struct spdk_io_channel *ch); +struct ftl_io *ftl_io_alloc_child(struct ftl_io *parent); +void ftl_io_fail(struct ftl_io *io, int status); +void ftl_io_free(struct ftl_io *io); +struct ftl_io *ftl_io_init_internal(const struct ftl_io_init_opts *opts); +void ftl_io_reinit(struct ftl_io *io, ftl_io_fn cb, + void *ctx, int flags, int type); +void ftl_io_clear(struct ftl_io *io); +void ftl_io_inc_req(struct ftl_io *io); +void ftl_io_dec_req(struct ftl_io *io); +struct iovec *ftl_io_iovec(struct ftl_io *io); +uint64_t ftl_io_current_lba(const struct ftl_io *io); +uint64_t ftl_io_get_lba(const struct ftl_io *io, size_t offset); +void ftl_io_advance(struct ftl_io *io, size_t num_blocks); +size_t ftl_iovec_num_blocks(struct iovec *iov, size_t iov_cnt); +void *ftl_io_iovec_addr(struct ftl_io *io); +size_t ftl_io_iovec_len_left(struct ftl_io *io); +struct ftl_io *ftl_io_wbuf_init(struct spdk_ftl_dev *dev, struct ftl_addr addr, + struct ftl_band *band, struct ftl_batch *batch, ftl_io_fn cb); +struct ftl_io *ftl_io_erase_init(struct ftl_band *band, size_t num_blocks, ftl_io_fn cb); +struct ftl_io *ftl_io_user_init(struct spdk_io_channel *ioch, uint64_t lba, size_t num_blocks, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, + void *cb_arg, int type); +void *ftl_io_get_md(const struct ftl_io *io); +void ftl_io_complete(struct ftl_io *io); +void ftl_io_shrink_iovec(struct ftl_io *io, size_t num_blocks); +void ftl_io_process_error(struct ftl_io *io, const struct spdk_nvme_cpl *status); +void ftl_io_reset(struct ftl_io *io); +void ftl_io_call_foreach_child(struct ftl_io *io, int (*callback)(struct ftl_io *)); + +#endif /* FTL_IO_H */ diff --git a/src/spdk/lib/ftl/ftl_reloc.c b/src/spdk/lib/ftl/ftl_reloc.c new file mode 100644 index 000000000..e59bf4d81 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_reloc.c @@ -0,0 +1,860 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/likely.h" +#include "spdk_internal/log.h" +#include "spdk/ftl.h" + +#include "ftl_reloc.h" +#include "ftl_core.h" +#include "ftl_io.h" +#include "ftl_band.h" +#include "ftl_debug.h" + +/* Maximum active reloc moves */ +#define FTL_RELOC_MAX_MOVES 256 + +struct ftl_reloc; +struct ftl_band_reloc; + +enum ftl_reloc_move_state { + FTL_RELOC_STATE_READ_LBA_MAP, + FTL_RELOC_STATE_READ, + FTL_RELOC_STATE_WRITE, +}; + +enum ftl_band_reloc_state { + FTL_BAND_RELOC_STATE_INACTIVE, + FTL_BAND_RELOC_STATE_PENDING, + FTL_BAND_RELOC_STATE_ACTIVE, + FTL_BAND_RELOC_STATE_HIGH_PRIO +}; + +struct ftl_reloc_move { + struct ftl_band_reloc *breloc; + + /* Start addr */ + struct ftl_addr addr; + + /* Number of logical blocks */ + size_t num_blocks; + + /* Data buffer */ + void *data; + + /* Move state (read lba_map, read, write) */ + enum ftl_reloc_move_state state; + + /* IO associated with move */ + struct ftl_io *io; + + STAILQ_ENTRY(ftl_reloc_move) entry; +}; + +struct ftl_band_reloc { + struct ftl_reloc *parent; + + /* Band being relocated */ + struct ftl_band *band; + + /* Number of logical blocks to be relocated */ + size_t num_blocks; + + /* Bitmap of logical blocks to be relocated */ + struct spdk_bit_array *reloc_map; + + /* State of the band reloc */ + enum ftl_band_reloc_state state; + + /* The band is being defragged */ + bool defrag; + + /* Reloc map iterator */ + struct { + /* Array of zone offsets */ + size_t *zone_offset; + + /* Current zone */ + size_t zone_current; + } iter; + + /* Number of outstanding moves */ + size_t num_outstanding; + + /* Pool of move objects */ + struct ftl_reloc_move *moves; + + /* Move queue */ + STAILQ_HEAD(, ftl_reloc_move) move_queue; + + TAILQ_ENTRY(ftl_band_reloc) entry; +}; + +struct ftl_reloc { + /* Device associated with relocate */ + struct spdk_ftl_dev *dev; + + /* Indicates relocate is about to halt */ + bool halt; + + /* Maximum number of IOs per band */ + size_t max_qdepth; + + /* Maximum number of active band relocates */ + size_t max_active; + + /* Maximum transfer size (in logical blocks) per single IO */ + size_t xfer_size; + /* Number of bands being defragged */ + size_t num_defrag_bands; + + /* Array of band relocates */ + struct ftl_band_reloc *brelocs; + + /* Number of active/priority band relocates */ + size_t num_active; + + /* Priority band relocates queue */ + TAILQ_HEAD(, ftl_band_reloc) prio_queue; + + /* Active band relocates queue */ + TAILQ_HEAD(, ftl_band_reloc) active_queue; + + /* Pending band relocates queue */ + TAILQ_HEAD(, ftl_band_reloc) pending_queue; +}; + +bool +ftl_reloc_is_defrag_active(const struct ftl_reloc *reloc) +{ + return reloc->num_defrag_bands > 0; +} + +static size_t +ftl_reloc_iter_zone_offset(struct ftl_band_reloc *breloc) +{ + size_t zone = breloc->iter.zone_current; + + return breloc->iter.zone_offset[zone]; +} + +static size_t +ftl_reloc_iter_zone_done(struct ftl_band_reloc *breloc) +{ + size_t num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev); + + return ftl_reloc_iter_zone_offset(breloc) == num_blocks; +} + +static void +ftl_reloc_clr_block(struct ftl_band_reloc *breloc, size_t block_off) +{ + if (!spdk_bit_array_get(breloc->reloc_map, block_off)) { + return; + } + + spdk_bit_array_clear(breloc->reloc_map, block_off); + assert(breloc->num_blocks); + breloc->num_blocks--; +} + +static void +ftl_reloc_read_lba_map_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_reloc_move *move = arg; + struct ftl_band_reloc *breloc = move->breloc; + + breloc->num_outstanding--; + 
assert(status == 0); + move->state = FTL_RELOC_STATE_WRITE; + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); +} + +static int +ftl_reloc_read_lba_map(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move) +{ + struct ftl_band *band = breloc->band; + + breloc->num_outstanding++; + return ftl_band_read_lba_map(band, ftl_band_block_offset_from_addr(band, move->addr), + move->num_blocks, ftl_reloc_read_lba_map_cb, move); +} + +static void +ftl_reloc_prep(struct ftl_band_reloc *breloc) +{ + struct ftl_band *band = breloc->band; + struct ftl_reloc *reloc = breloc->parent; + struct ftl_reloc_move *move; + size_t i; + + reloc->num_active++; + + if (!band->high_prio) { + if (ftl_band_alloc_lba_map(band)) { + SPDK_ERRLOG("Failed to allocate lba map\n"); + assert(false); + } + } else { + ftl_band_acquire_lba_map(band); + } + + for (i = 0; i < reloc->max_qdepth; ++i) { + move = &breloc->moves[i]; + move->state = FTL_RELOC_STATE_READ; + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); + } +} + +static void +ftl_reloc_free_move(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move) +{ + assert(move); + spdk_dma_free(move->data); + memset(move, 0, sizeof(*move)); + move->state = FTL_RELOC_STATE_READ; +} + +static void +ftl_reloc_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_reloc_move *move = arg; + struct ftl_addr addr = move->addr; + struct ftl_band_reloc *breloc = move->breloc; + size_t i; + + breloc->num_outstanding--; + + if (status) { + SPDK_ERRLOG("Reloc write failed with status: %d\n", status); + assert(false); + return; + } + + for (i = 0; i < move->num_blocks; ++i) { + addr.offset = move->addr.offset + i; + size_t block_off = ftl_band_block_offset_from_addr(breloc->band, addr); + ftl_reloc_clr_block(breloc, block_off); + } + + ftl_reloc_free_move(breloc, move); + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); +} + +static void +ftl_reloc_read_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_reloc_move *move = arg; + struct ftl_band_reloc *breloc = move->breloc; + + breloc->num_outstanding--; + + /* TODO: We should handle failures of the relocation read. We need to inform the */ + /* user that this group of blocks is bad (update the l2p with the bad block address and */ + /* put it into lba_map/sector_lba). Maybe we could also retry the read with smaller granularity? 
*/ + if (status) { + SPDK_ERRLOG("Reloc read failed with status: %d\n", status); + assert(false); + return; + } + + move->state = FTL_RELOC_STATE_READ_LBA_MAP; + move->io = NULL; + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); +} + +static void +ftl_reloc_iter_reset(struct ftl_band_reloc *breloc) +{ + memset(breloc->iter.zone_offset, 0, ftl_get_num_punits(breloc->band->dev) * + sizeof(*breloc->iter.zone_offset)); + breloc->iter.zone_current = 0; +} + +static size_t +ftl_reloc_iter_block_offset(struct ftl_band_reloc *breloc) +{ + size_t zone_offset = breloc->iter.zone_current * ftl_get_num_blocks_in_zone(breloc->parent->dev); + + return breloc->iter.zone_offset[breloc->iter.zone_current] + zone_offset; +} + +static void +ftl_reloc_iter_next_zone(struct ftl_band_reloc *breloc) +{ + size_t num_zones = ftl_get_num_punits(breloc->band->dev); + + breloc->iter.zone_current = (breloc->iter.zone_current + 1) % num_zones; +} + +static int +ftl_reloc_block_valid(struct ftl_band_reloc *breloc, size_t block_off) +{ + struct ftl_addr addr = ftl_band_addr_from_block_offset(breloc->band, block_off); + + return ftl_addr_is_written(breloc->band, addr) && + spdk_bit_array_get(breloc->reloc_map, block_off) && + ftl_band_block_offset_valid(breloc->band, block_off); +} + +static int +ftl_reloc_iter_next(struct ftl_band_reloc *breloc, size_t *block_off) +{ + size_t zone = breloc->iter.zone_current; + + *block_off = ftl_reloc_iter_block_offset(breloc); + + if (ftl_reloc_iter_zone_done(breloc)) { + return 0; + } + + breloc->iter.zone_offset[zone]++; + + if (!ftl_reloc_block_valid(breloc, *block_off)) { + ftl_reloc_clr_block(breloc, *block_off); + return 0; + } + + return 1; +} + +static int +ftl_reloc_first_valid_block(struct ftl_band_reloc *breloc, size_t *block_off) +{ + size_t i, num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev); + + for (i = ftl_reloc_iter_zone_offset(breloc); i < num_blocks; ++i) { + if (ftl_reloc_iter_next(breloc, block_off)) { + return 1; + } + } + + return 0; +} + +static int +ftl_reloc_iter_done(struct ftl_band_reloc *breloc) +{ + size_t i; + size_t num_zones = ftl_get_num_punits(breloc->band->dev); + size_t num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev); + + for (i = 0; i < num_zones; ++i) { + if (breloc->iter.zone_offset[i] != num_blocks) { + return 0; + } + } + + return 1; +} + +static size_t +ftl_reloc_find_valid_blocks(struct ftl_band_reloc *breloc, + size_t _num_blocks, struct ftl_addr *addr) +{ + size_t block_off, num_blocks = 0; + + if (!ftl_reloc_first_valid_block(breloc, &block_off)) { + return 0; + } + + *addr = ftl_band_addr_from_block_offset(breloc->band, block_off); + + for (num_blocks = 1; num_blocks < _num_blocks; num_blocks++) { + if (!ftl_reloc_iter_next(breloc, &block_off)) { + break; + } + } + + return num_blocks; +} + +static size_t +ftl_reloc_next_blocks(struct ftl_band_reloc *breloc, struct ftl_addr *addr) +{ + size_t i, num_blocks = 0; + struct spdk_ftl_dev *dev = breloc->parent->dev; + + for (i = 0; i < ftl_get_num_punits(dev); ++i) { + num_blocks = ftl_reloc_find_valid_blocks(breloc, breloc->parent->xfer_size, addr); + ftl_reloc_iter_next_zone(breloc); + + if (num_blocks || ftl_reloc_iter_done(breloc)) { + break; + } + } + + return num_blocks; +} + +static struct ftl_io * +ftl_reloc_io_init(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move, + ftl_io_fn fn, enum ftl_io_type io_type, int flags) +{ + size_t block_off, i; + struct ftl_addr addr = move->addr; + struct ftl_io *io = NULL; + struct ftl_io_init_opts opts = { + 
.dev = breloc->parent->dev, + .band = breloc->band, + .size = sizeof(*io), + .flags = flags | FTL_IO_INTERNAL | FTL_IO_PHYSICAL_MODE, + .type = io_type, + .num_blocks = move->num_blocks, + .iovs = { + { + .iov_base = move->data, + .iov_len = move->num_blocks * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + .cb_fn = fn, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->cb_ctx = move; + io->addr = move->addr; + + if (flags & FTL_IO_VECTOR_LBA) { + for (i = 0; i < io->num_blocks; ++i, ++addr.offset) { + block_off = ftl_band_block_offset_from_addr(breloc->band, addr); + + if (!ftl_band_block_offset_valid(breloc->band, block_off)) { + io->lba.vector[i] = FTL_LBA_INVALID; + continue; + } + + io->lba.vector[i] = breloc->band->lba_map.map[block_off]; + } + } + + ftl_trace_lba_io_init(io->dev, io); + + return io; +} + +static int +ftl_reloc_write(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move) +{ + int io_flags = FTL_IO_WEAK | FTL_IO_VECTOR_LBA | FTL_IO_BYPASS_CACHE; + + if (spdk_likely(!move->io)) { + move->io = ftl_reloc_io_init(breloc, move, ftl_reloc_write_cb, + FTL_IO_WRITE, io_flags); + if (!move->io) { + ftl_reloc_free_move(breloc, move); + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); + return -ENOMEM; + } + } + + breloc->num_outstanding++; + ftl_io_write(move->io); + return 0; +} + +static int +ftl_reloc_read(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move) +{ + struct ftl_addr addr = {}; + + move->num_blocks = ftl_reloc_next_blocks(breloc, &addr); + move->breloc = breloc; + move->addr = addr; + + if (!move->num_blocks) { + return 0; + } + + move->data = spdk_dma_malloc(FTL_BLOCK_SIZE * move->num_blocks, 4096, NULL); + if (!move->data) { + return -1; + } + + move->io = ftl_reloc_io_init(breloc, move, ftl_reloc_read_cb, FTL_IO_READ, 0); + if (!move->io) { + ftl_reloc_free_move(breloc, move); + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); + SPDK_ERRLOG("Failed to initialize io for relocation."); + return -1; + } + + breloc->num_outstanding++; + ftl_io_read(move->io); + return 0; +} + +static void +ftl_reloc_process_moves(struct ftl_band_reloc *breloc) +{ + struct ftl_reloc_move *move; + STAILQ_HEAD(, ftl_reloc_move) move_queue; + int rc = 0; + + /* + * When IO allocation fails, we do not want to retry immediately so keep moves on + * temporary queue + */ + STAILQ_INIT(&move_queue); + STAILQ_SWAP(&breloc->move_queue, &move_queue, ftl_reloc_move); + + while (!STAILQ_EMPTY(&move_queue)) { + move = STAILQ_FIRST(&move_queue); + STAILQ_REMOVE_HEAD(&move_queue, entry); + + switch (move->state) { + case FTL_RELOC_STATE_READ_LBA_MAP: + rc = ftl_reloc_read_lba_map(breloc, move); + break; + case FTL_RELOC_STATE_READ: + rc = ftl_reloc_read(breloc, move); + break; + case FTL_RELOC_STATE_WRITE: + rc = ftl_reloc_write(breloc, move); + break; + default: + assert(false); + break; + } + + if (rc) { + SPDK_ERRLOG("Move queue processing failed\n"); + assert(false); + } + } +} + +static bool +ftl_reloc_done(struct ftl_band_reloc *breloc) +{ + return !breloc->num_outstanding && STAILQ_EMPTY(&breloc->move_queue); +} + +static void +ftl_reloc_release(struct ftl_band_reloc *breloc) +{ + struct ftl_reloc *reloc = breloc->parent; + struct ftl_band *band = breloc->band; + + ftl_reloc_iter_reset(breloc); + ftl_band_release_lba_map(band); + reloc->num_active--; + + if (breloc->state == FTL_BAND_RELOC_STATE_HIGH_PRIO) { + /* High prio band must be relocated as a whole and ANM events will be ignored */ + assert(breloc->num_blocks == 0 && 
ftl_band_empty(band)); + TAILQ_REMOVE(&reloc->prio_queue, breloc, entry); + band->high_prio = 0; + breloc->state = FTL_BAND_RELOC_STATE_INACTIVE; + } else { + assert(breloc->state == FTL_BAND_RELOC_STATE_ACTIVE); + TAILQ_REMOVE(&reloc->active_queue, breloc, entry); + breloc->state = FTL_BAND_RELOC_STATE_INACTIVE; + + /* If we got ANM event during relocation put such band back to pending queue */ + if (breloc->num_blocks != 0) { + breloc->state = FTL_BAND_RELOC_STATE_PENDING; + TAILQ_INSERT_TAIL(&reloc->pending_queue, breloc, entry); + return; + } + } + + if (ftl_band_empty(band) && band->state == FTL_BAND_STATE_CLOSED) { + ftl_band_set_state(breloc->band, FTL_BAND_STATE_FREE); + + if (breloc->defrag) { + breloc->defrag = false; + assert(reloc->num_defrag_bands > 0); + reloc->num_defrag_bands--; + } + } +} + +static void +ftl_process_reloc(struct ftl_band_reloc *breloc) +{ + ftl_reloc_process_moves(breloc); + + if (ftl_reloc_done(breloc)) { + ftl_reloc_release(breloc); + } +} + +static int +ftl_band_reloc_init(struct ftl_reloc *reloc, struct ftl_band_reloc *breloc, + struct ftl_band *band) +{ + breloc->band = band; + breloc->parent = reloc; + + breloc->reloc_map = spdk_bit_array_create(ftl_get_num_blocks_in_band(reloc->dev)); + if (!breloc->reloc_map) { + SPDK_ERRLOG("Failed to initialize reloc map"); + return -1; + } + + breloc->iter.zone_offset = calloc(ftl_get_num_punits(band->dev), + sizeof(*breloc->iter.zone_offset)); + if (!breloc->iter.zone_offset) { + SPDK_ERRLOG("Failed to initialize reloc iterator"); + return -1; + } + + STAILQ_INIT(&breloc->move_queue); + + breloc->moves = calloc(reloc->max_qdepth, sizeof(*breloc->moves)); + if (!breloc->moves) { + return -1; + } + + return 0; +} + +static void +ftl_band_reloc_free(struct ftl_band_reloc *breloc) +{ + struct ftl_reloc_move *move; + + if (!breloc) { + return; + } + + assert(breloc->num_outstanding == 0); + + /* Drain write queue if there is active band relocation during shutdown */ + if (breloc->state == FTL_BAND_RELOC_STATE_ACTIVE || + breloc->state == FTL_BAND_RELOC_STATE_HIGH_PRIO) { + assert(breloc->parent->halt); + STAILQ_FOREACH(move, &breloc->move_queue, entry) { + ftl_reloc_free_move(breloc, move); + } + } + + spdk_bit_array_free(&breloc->reloc_map); + free(breloc->iter.zone_offset); + free(breloc->moves); +} + +struct ftl_reloc * +ftl_reloc_init(struct spdk_ftl_dev *dev) +{ + struct ftl_reloc *reloc; + size_t i; + + reloc = calloc(1, sizeof(*reloc)); + if (!reloc) { + return NULL; + } + + reloc->dev = dev; + reloc->halt = true; + reloc->max_qdepth = dev->conf.max_reloc_qdepth; + reloc->max_active = dev->conf.max_active_relocs; + reloc->xfer_size = dev->xfer_size; + reloc->num_defrag_bands = 0; + + if (reloc->max_qdepth > FTL_RELOC_MAX_MOVES) { + goto error; + } + + reloc->brelocs = calloc(ftl_get_num_bands(dev), sizeof(*reloc->brelocs)); + if (!reloc->brelocs) { + goto error; + } + + for (i = 0; i < ftl_get_num_bands(reloc->dev); ++i) { + if (ftl_band_reloc_init(reloc, &reloc->brelocs[i], &dev->bands[i])) { + goto error; + } + } + + TAILQ_INIT(&reloc->pending_queue); + TAILQ_INIT(&reloc->active_queue); + TAILQ_INIT(&reloc->prio_queue); + + return reloc; +error: + ftl_reloc_free(reloc); + return NULL; +} + +void +ftl_reloc_free(struct ftl_reloc *reloc) +{ + size_t i; + + if (!reloc) { + return; + } + + for (i = 0; i < ftl_get_num_bands(reloc->dev); ++i) { + ftl_band_reloc_free(&reloc->brelocs[i]); + } + + free(reloc->brelocs); + free(reloc); +} + +bool +ftl_reloc_is_halted(const struct ftl_reloc *reloc) +{ + return 
reloc->halt; +} + +void +ftl_reloc_halt(struct ftl_reloc *reloc) +{ + reloc->halt = true; +} + +void +ftl_reloc_resume(struct ftl_reloc *reloc) +{ + reloc->halt = false; +} + +void +ftl_reloc(struct ftl_reloc *reloc) +{ + struct ftl_band_reloc *breloc, *tbreloc; + + if (ftl_reloc_is_halted(reloc)) { + return; + } + + /* Process first band from priority queue and return */ + breloc = TAILQ_FIRST(&reloc->prio_queue); + if (breloc) { + ftl_process_reloc(breloc); + return; + } + + TAILQ_FOREACH_SAFE(breloc, &reloc->pending_queue, entry, tbreloc) { + if (reloc->num_active == reloc->max_active) { + break; + } + + /* Wait for band to close before relocating */ + if (breloc->band->state != FTL_BAND_STATE_CLOSED) { + continue; + } + + ftl_reloc_prep(breloc); + assert(breloc->state == FTL_BAND_RELOC_STATE_PENDING); + TAILQ_REMOVE(&reloc->pending_queue, breloc, entry); + breloc->state = FTL_BAND_RELOC_STATE_ACTIVE; + TAILQ_INSERT_HEAD(&reloc->active_queue, breloc, entry); + } + + TAILQ_FOREACH_SAFE(breloc, &reloc->active_queue, entry, tbreloc) { + assert(breloc->state == FTL_BAND_RELOC_STATE_ACTIVE); + ftl_process_reloc(breloc); + } +} + +void +ftl_reloc_add(struct ftl_reloc *reloc, struct ftl_band *band, size_t offset, + size_t num_blocks, int prio, bool is_defrag) +{ + struct ftl_band_reloc *breloc = &reloc->brelocs[band->id]; + size_t i; + + /* No need to add anything if already at high prio - whole band should be relocated */ + if (!prio && band->high_prio) { + return; + } + + pthread_spin_lock(&band->lba_map.lock); + if (band->lba_map.num_vld == 0) { + pthread_spin_unlock(&band->lba_map.lock); + + /* If the band is closed and has no valid blocks, free it */ + if (band->state == FTL_BAND_STATE_CLOSED) { + ftl_band_set_state(band, FTL_BAND_STATE_FREE); + } + + return; + } + pthread_spin_unlock(&band->lba_map.lock); + + for (i = offset; i < offset + num_blocks; ++i) { + if (spdk_bit_array_get(breloc->reloc_map, i)) { + continue; + } + spdk_bit_array_set(breloc->reloc_map, i); + breloc->num_blocks++; + } + + /* If the band is coming from the defrag process, mark it appropriately */ + if (is_defrag) { + assert(offset == 0 && num_blocks == ftl_get_num_blocks_in_band(band->dev)); + reloc->num_defrag_bands++; + breloc->defrag = true; + } + + if (!prio) { + if (breloc->state == FTL_BAND_RELOC_STATE_INACTIVE) { + breloc->state = FTL_BAND_RELOC_STATE_PENDING; + TAILQ_INSERT_HEAD(&reloc->pending_queue, breloc, entry); + } + } else { + bool active = false; + /* If priority band is already on pending or active queue, remove it from it */ + switch (breloc->state) { + case FTL_BAND_RELOC_STATE_PENDING: + TAILQ_REMOVE(&reloc->pending_queue, breloc, entry); + break; + case FTL_BAND_RELOC_STATE_ACTIVE: + active = true; + TAILQ_REMOVE(&reloc->active_queue, breloc, entry); + break; + default: + break; + } + + breloc->state = FTL_BAND_RELOC_STATE_HIGH_PRIO; + TAILQ_INSERT_TAIL(&reloc->prio_queue, breloc, entry); + + /* + * If band has been already on active queue it doesn't need any additional + * resources + */ + if (!active) { + ftl_reloc_prep(breloc); + } + } +} diff --git a/src/spdk/lib/ftl/ftl_reloc.h b/src/spdk/lib/ftl/ftl_reloc.h new file mode 100644 index 000000000..21f49a47d --- /dev/null +++ b/src/spdk/lib/ftl/ftl_reloc.h @@ -0,0 +1,53 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_RELOC_H +#define FTL_RELOC_H + +#include "spdk/stdinc.h" +#include "spdk/ftl.h" + +struct ftl_reloc; +struct ftl_band; + +struct ftl_reloc *ftl_reloc_init(struct spdk_ftl_dev *dev); +void ftl_reloc_free(struct ftl_reloc *reloc); +void ftl_reloc_add(struct ftl_reloc *reloc, struct ftl_band *band, + size_t offset, size_t num_blocks, int prio, bool is_defrag); +void ftl_reloc(struct ftl_reloc *reloc); +void ftl_reloc_halt(struct ftl_reloc *reloc); +void ftl_reloc_resume(struct ftl_reloc *reloc); +bool ftl_reloc_is_halted(const struct ftl_reloc *reloc); +bool ftl_reloc_is_defrag_active(const struct ftl_reloc *reloc); + +#endif /* FTL_RELOC_H */ diff --git a/src/spdk/lib/ftl/ftl_restore.c b/src/spdk/lib/ftl/ftl_restore.c new file mode 100644 index 000000000..6f626645d --- /dev/null +++ b/src/spdk/lib/ftl/ftl_restore.c @@ -0,0 +1,1350 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/ftl.h" +#include "spdk/util.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/crc32.h" + +#include "ftl_core.h" +#include "ftl_band.h" +#include "ftl_io.h" + +struct ftl_restore_band { + struct ftl_restore *parent; + /* Associated band */ + struct ftl_band *band; + /* Status of retrieving this band's metadata */ + enum ftl_md_status md_status; + /* Padded queue link */ + STAILQ_ENTRY(ftl_restore_band) stailq; +}; + +struct ftl_nv_cache_restore; + +/* Describes single phase to be restored from non-volatile cache */ +struct ftl_nv_cache_range { + struct ftl_nv_cache_restore *parent; + /* Start offset */ + uint64_t start_addr; + /* Last block's address */ + uint64_t last_addr; + /* + * Number of blocks (can be smaller than the difference between the last + * and the starting block due to range overlap) + */ + uint64_t num_blocks; + /* Number of blocks already recovered */ + uint64_t num_recovered; + /* Current address during recovery */ + uint64_t current_addr; + /* Phase of the range */ + unsigned int phase; + /* Indicates whether the data from this range needs to be recovered */ + bool recovery; +}; + +struct ftl_nv_cache_block { + struct ftl_nv_cache_restore *parent; + /* Data buffer */ + void *buf; + /* Metadata buffer */ + void *md_buf; + /* Block offset within the cache */ + uint64_t offset; +}; + +struct ftl_nv_cache_restore { + struct ftl_nv_cache *nv_cache; + /* IO channel to use */ + struct spdk_io_channel *ioch; + /* + * Non-volatile cache ranges. The ranges can overlap, as we have no + * control over the order of completions. The phase of the range is the + * index within the table. The range with index 0 marks blocks that were + * never written. 
+ */ + struct ftl_nv_cache_range range[FTL_NV_CACHE_PHASE_COUNT]; +#define FTL_NV_CACHE_RESTORE_DEPTH 128 + /* Non-volatile cache buffers */ + struct ftl_nv_cache_block block[FTL_NV_CACHE_RESTORE_DEPTH]; + /* Current address */ + uint64_t current_addr; + /* Number of outstanding requests */ + size_t num_outstanding; + /* Recovery/scan status */ + int status; + /* Current phase of the recovery */ + unsigned int phase; +}; + +struct ftl_restore { + struct spdk_ftl_dev *dev; + /* Completion callback (called for each phase of the restoration) */ + ftl_restore_fn cb; + /* Completion callback context */ + void *cb_arg; + /* Number of inflight IOs */ + unsigned int num_ios; + /* Current band number (index in the below bands array) */ + unsigned int current; + /* Array of bands */ + struct ftl_restore_band *bands; + /* Queue of bands to be padded (due to unsafe shutdown) */ + STAILQ_HEAD(, ftl_restore_band) pad_bands; + /* Status of the padding */ + int pad_status; + /* Metadata buffer */ + void *md_buf; + /* LBA map buffer */ + void *lba_map; + /* Indicates we're in the final phase of the restoration */ + bool final_phase; + /* Non-volatile cache recovery */ + struct ftl_nv_cache_restore nv_cache; +}; + +static int +ftl_restore_tail_md(struct ftl_restore_band *rband); +static void +ftl_pad_zone_cb(struct ftl_io *io, void *arg, int status); +static void +ftl_restore_pad_band(struct ftl_restore_band *rband); + +static void +ftl_restore_free(struct ftl_restore *restore) +{ + unsigned int i; + + if (!restore) { + return; + } + + for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) { + spdk_dma_free(restore->nv_cache.block[i].buf); + } + + spdk_dma_free(restore->md_buf); + free(restore->bands); + free(restore); +} + +static struct ftl_restore * +ftl_restore_init(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg) +{ + struct ftl_restore *restore; + struct ftl_restore_band *rband; + size_t i; + + restore = calloc(1, sizeof(*restore)); + if (!restore) { + goto error; + } + + restore->dev = dev; + restore->cb = cb; + restore->cb_arg = cb_arg; + restore->final_phase = false; + + restore->bands = calloc(ftl_get_num_bands(dev), sizeof(*restore->bands)); + if (!restore->bands) { + goto error; + } + + STAILQ_INIT(&restore->pad_bands); + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + rband = &restore->bands[i]; + rband->band = &dev->bands[i]; + rband->parent = restore; + rband->md_status = FTL_MD_NO_MD; + } + + /* Allocate buffer capable of holding head mds of all bands */ + restore->md_buf = spdk_dma_zmalloc(ftl_get_num_bands(dev) * ftl_head_md_num_blocks(dev) * + FTL_BLOCK_SIZE, 0, NULL); + if (!restore->md_buf) { + goto error; + } + + return restore; +error: + ftl_restore_free(restore); + return NULL; +} + +static void +ftl_restore_complete(struct ftl_restore *restore, int status) +{ + struct ftl_restore *ctx = status ? 
NULL : restore; + bool final_phase = restore->final_phase; + + restore->cb(ctx, status, restore->cb_arg); + if (status || final_phase) { + ftl_restore_free(restore); + } +} + +static int +ftl_band_cmp(const void *lband, const void *rband) +{ + uint64_t lseq = ((struct ftl_restore_band *)lband)->band->seq; + uint64_t rseq = ((struct ftl_restore_band *)rband)->band->seq; + + if (lseq < rseq) { + return -1; + } else { + return 1; + } +} + +static int +ftl_restore_check_seq(const struct ftl_restore *restore) +{ + const struct spdk_ftl_dev *dev = restore->dev; + const struct ftl_restore_band *rband; + const struct ftl_band *next_band; + size_t i; + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + rband = &restore->bands[i]; + if (rband->md_status != FTL_MD_SUCCESS) { + continue; + } + + next_band = LIST_NEXT(rband->band, list_entry); + if (next_band && rband->band->seq == next_band->seq) { + return -1; + } + } + + return 0; +} + +static bool +ftl_restore_head_valid(struct spdk_ftl_dev *dev, struct ftl_restore *restore, size_t *num_valid) +{ + struct ftl_restore_band *rband; + size_t i; + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + rband = &restore->bands[i]; + + if (rband->md_status != FTL_MD_SUCCESS && + rband->md_status != FTL_MD_NO_MD && + rband->md_status != FTL_MD_IO_FAILURE) { + SPDK_ERRLOG("Inconsistent head metadata found on band %u\n", + rband->band->id); + return false; + } + + if (rband->md_status == FTL_MD_SUCCESS) { + (*num_valid)++; + } + } + + return true; +} + +static void +ftl_restore_head_complete(struct ftl_restore *restore) +{ + struct spdk_ftl_dev *dev = restore->dev; + size_t num_valid = 0; + int status = -EIO; + + if (!ftl_restore_head_valid(dev, restore, &num_valid)) { + goto out; + } + + if (num_valid == 0) { + SPDK_ERRLOG("Couldn't find any valid bands\n"); + goto out; + } + + /* Sort bands in sequence number ascending order */ + qsort(restore->bands, ftl_get_num_bands(dev), sizeof(struct ftl_restore_band), + ftl_band_cmp); + + if (ftl_restore_check_seq(restore)) { + SPDK_ERRLOG("Band sequence consistency failed\n"); + goto out; + } + + dev->num_lbas = dev->global_md.num_lbas; + status = 0; +out: + ftl_restore_complete(restore, status); +} + +static void +ftl_restore_head_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_restore_band *rband = ctx; + struct ftl_restore *restore = rband->parent; + unsigned int num_ios; + + rband->md_status = status; + num_ios = __atomic_fetch_sub(&restore->num_ios, 1, __ATOMIC_SEQ_CST); + assert(num_ios > 0); + + if (num_ios == 1) { + ftl_restore_head_complete(restore); + } +} + +static void +ftl_restore_head_md(void *ctx) +{ + struct ftl_restore *restore = ctx; + struct spdk_ftl_dev *dev = restore->dev; + struct ftl_restore_band *rband; + struct ftl_lba_map *lba_map; + unsigned int num_failed = 0, num_ios; + size_t i; + + restore->num_ios = ftl_get_num_bands(dev); + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + rband = &restore->bands[i]; + lba_map = &rband->band->lba_map; + + lba_map->dma_buf = restore->md_buf + i * ftl_head_md_num_blocks(dev) * FTL_BLOCK_SIZE; + + if (ftl_band_read_head_md(rband->band, ftl_restore_head_cb, rband)) { + if (spdk_likely(rband->band->num_zones)) { + SPDK_ERRLOG("Failed to read metadata on band %zu\n", i); + + rband->md_status = FTL_MD_INVALID_CRC; + + /* If the first IO fails, don't bother sending anything else */ + if (i == 0) { + ftl_restore_complete(restore, -EIO); + } + } + + num_failed++; + } + } + + if (spdk_unlikely(num_failed > 0)) { + num_ios = 
__atomic_fetch_sub(&restore->num_ios, num_failed, __ATOMIC_SEQ_CST); + if (num_ios == num_failed) { + ftl_restore_complete(restore, -EIO); + } + } +} + +int +ftl_restore_md(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg) +{ + struct ftl_restore *restore; + + restore = ftl_restore_init(dev, cb, cb_arg); + if (!restore) { + return -ENOMEM; + } + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_restore_head_md, restore); + + return 0; +} + +static int +ftl_restore_l2p(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_addr addr; + uint64_t lba; + size_t i; + + for (i = 0; i < ftl_get_num_blocks_in_band(band->dev); ++i) { + if (!spdk_bit_array_get(band->lba_map.vld, i)) { + continue; + } + + lba = band->lba_map.map[i]; + if (lba >= dev->num_lbas) { + return -1; + } + + addr = ftl_l2p_get(dev, lba); + if (!ftl_addr_invalid(addr)) { + ftl_invalidate_addr(dev, addr); + } + + addr = ftl_band_addr_from_block_offset(band, i); + + ftl_band_set_addr(band, lba, addr); + ftl_l2p_set(dev, lba, addr); + } + + return 0; +} + +static struct ftl_restore_band * +ftl_restore_next_band(struct ftl_restore *restore) +{ + struct ftl_restore_band *rband; + + for (; restore->current < ftl_get_num_bands(restore->dev); ++restore->current) { + rband = &restore->bands[restore->current]; + + if (spdk_likely(rband->band->num_zones) && + rband->md_status == FTL_MD_SUCCESS) { + restore->current++; + return rband; + } + } + + return NULL; +} + +static void +ftl_nv_cache_restore_complete(struct ftl_nv_cache_restore *restore, int status) +{ + struct ftl_restore *ftl_restore = SPDK_CONTAINEROF(restore, struct ftl_restore, nv_cache); + + restore->status = restore->status ? : status; + if (restore->num_outstanding == 0) { + ftl_restore_complete(ftl_restore, restore->status); + } +} + +static void ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); + +static void +ftl_nv_cache_restore_done(struct ftl_nv_cache_restore *restore, uint64_t current_addr) +{ + struct ftl_nv_cache *nv_cache = restore->nv_cache; + + pthread_spin_lock(&nv_cache->lock); + nv_cache->current_addr = current_addr; + nv_cache->ready = true; + pthread_spin_unlock(&nv_cache->lock); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Enabling non-volatile cache (phase: %u, addr: %" + PRIu64")\n", nv_cache->phase, current_addr); + + ftl_nv_cache_restore_complete(restore, 0); +} + +static void +ftl_nv_cache_write_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + ftl_nv_cache_restore_done(restore, FTL_NV_CACHE_DATA_OFFSET); +} + +static void +ftl_nv_cache_scrub_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Scrubbing non-volatile cache failed\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + nv_cache->phase = 1; + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_write_header_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to write the non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, -EIO); + } +} + +static void 
+ftl_nv_cache_scrub_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n", spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_band_flush_cb(void *ctx, int status) +{ + struct ftl_nv_cache_restore *restore = ctx; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + if (spdk_unlikely(status != 0)) { + SPDK_ERRLOG("Flushing active bands failed: %s\n", spdk_strerror(-status)); + ftl_nv_cache_restore_complete(restore, status); + return; + } + + /* + * Use phase 0 to indicate that the cache is being scrubbed. If the power is lost during + * this process, we'll know it needs to be resumed. + */ + nv_cache->phase = 0; + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_scrub_header_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_wbuf_flush_cb(void *ctx, int status) +{ + struct ftl_nv_cache_restore *restore = ctx; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + int rc; + + if (spdk_unlikely(status != 0)) { + SPDK_ERRLOG("Flushing the write buffer failed: %s\n", spdk_strerror(-status)); + ftl_nv_cache_restore_complete(restore, status); + return; + } + + rc = ftl_flush_active_bands(dev, ftl_nv_cache_band_flush_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to flush active bands: %s\n", spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_recovery_done(struct ftl_nv_cache_restore *restore) +{ + struct ftl_nv_cache *nv_cache = restore->nv_cache; + struct ftl_nv_cache_range *range_prev, *range_current; + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct spdk_bdev *bdev; + uint64_t current_addr; + int rc; + + range_prev = &restore->range[ftl_nv_cache_prev_phase(nv_cache->phase)]; + range_current = &restore->range[nv_cache->phase]; + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + /* + * If there are more than two ranges or the ranges overlap, scrub the non-volatile cache to + * make sure that any subsequent power loss will find the cache in usable state + */ + if ((range_prev->num_blocks + range_current->num_blocks < nv_cache->num_data_blocks) || + (range_prev->start_addr < range_current->last_addr && + range_current->start_addr < range_prev->last_addr)) { + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache inconsistency detected\n"); + + rc = ftl_flush_wbuf(dev, ftl_nv_cache_wbuf_flush_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to flush the write buffer: %s\n", spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } + + return; + } + + /* The latest phase is the one written in the header (set in nvc_cache->phase) */ + current_addr = range_current->last_addr + 1; + + /* + * The first range might be empty (only the 
header was written) or the range might + * end at the last available address, in which case set current address to the + * beginning of the device. + */ + if (range_current->num_blocks == 0 || current_addr >= spdk_bdev_get_num_blocks(bdev)) { + current_addr = FTL_NV_CACHE_DATA_OFFSET; + } + + ftl_nv_cache_restore_done(restore, current_addr); +} + +static void +ftl_nv_cache_recover_block(struct ftl_nv_cache_block *block) +{ + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + struct ftl_nv_cache_range *range = &restore->range[restore->phase]; + int rc; + + assert(range->current_addr <= range->last_addr); + + restore->num_outstanding++; + block->offset = range->current_addr++; + rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch, + block->buf, block->md_buf, + block->offset, 1, ftl_nv_cache_block_read_cb, + block); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n", + block->offset, spdk_strerror(-rc)); + restore->num_outstanding--; + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_recover_range(struct ftl_nv_cache_restore *restore) +{ + struct ftl_nv_cache_range *range; + unsigned int phase = restore->phase; + + do { + /* Find first range with non-zero number of blocks that is marked for recovery */ + range = &restore->range[phase]; + if (range->recovery && range->num_recovered < range->num_blocks) { + break; + } + + phase = ftl_nv_cache_next_phase(phase); + } while (phase != restore->phase); + + /* There are no ranges to be recovered, we're done */ + if (range->num_recovered == range->num_blocks || !range->recovery) { + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache recovery done\n"); + ftl_nv_cache_recovery_done(restore); + return; + } + + range->current_addr = range->start_addr; + restore->phase = phase; + + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Recovering range %u %"PRIu64"-%"PRIu64" (%"PRIu64")\n", + phase, range->start_addr, range->last_addr, range->num_blocks); + + ftl_nv_cache_recover_block(&restore->block[0]); +} + +static void +ftl_nv_cache_write_cb(struct ftl_io *io, void *cb_arg, int status) +{ + struct ftl_nv_cache_block *block = cb_arg; + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache_range *range = &restore->range[restore->phase]; + + restore->num_outstanding--; + if (status != 0) { + SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n", + block->offset, spdk_strerror(-status)); + ftl_nv_cache_restore_complete(restore, -ENOMEM); + return; + } + + range->num_recovered++; + if (range->current_addr <= range->last_addr) { + ftl_nv_cache_recover_block(block); + } else if (restore->num_outstanding == 0) { + assert(range->num_recovered == range->num_blocks); + ftl_nv_cache_recover_range(restore); + } +} + +static struct ftl_io * +ftl_nv_cache_alloc_io(struct ftl_nv_cache_block *block, uint64_t lba) +{ + struct ftl_restore *restore = SPDK_CONTAINEROF(block->parent, struct ftl_restore, nv_cache); + struct ftl_io_init_opts opts = { + .dev = restore->dev, + .io = NULL, + .flags = FTL_IO_BYPASS_CACHE, + .type = FTL_IO_WRITE, + .num_blocks = 1, + .cb_fn = ftl_nv_cache_write_cb, + .cb_ctx = block, + .iovs = { + { + .iov_base = block->buf, + .iov_len = FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + }; + struct ftl_io *io; + + io = ftl_io_init_internal(&opts); + if (spdk_unlikely(!io)) { + return NULL; + } + + io->lba.single = lba; + return io; +} + +static void 
+ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_block *block = cb_arg; + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache_range *range = &restore->range[restore->phase]; + struct ftl_io *io; + unsigned int phase; + uint64_t lba; + + spdk_bdev_free_io(bdev_io); + restore->num_outstanding--; + + if (!success) { + SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64"\n", + block->offset); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase); + if (spdk_unlikely(phase != restore->phase)) { + if (range->current_addr < range->last_addr) { + ftl_nv_cache_recover_block(block); + } else if (restore->num_outstanding == 0) { + ftl_nv_cache_recover_range(restore); + } + + return; + } + + io = ftl_nv_cache_alloc_io(block, lba); + if (spdk_unlikely(!io)) { + SPDK_ERRLOG("Failed to allocate ftl_io during non-volatile cache recovery\n"); + ftl_nv_cache_restore_complete(restore, -ENOMEM); + return; + } + + restore->num_outstanding++; + ftl_io_write(io); +} + +/* + * Since we have no control over the order in which the requests complete in regards to their + * submission, the cache can be in either of the following states: + * - [1 1 1 1 1 1 1 1 1 1]: simplest case, whole cache contains single phase (although it should be + * very rare), + * - [1 1 1 1 3 3 3 3 3 3]: two phases, changing somewhere in the middle with no overlap. This is + * the state left by clean shutdown, + * - [1 1 1 1 3 1 3 3 3 3]: similar to the above, but this time the two ranges overlap. This + * happens when completions are reordered during unsafe shutdown, + * - [2 1 2 1 1 1 1 3 1 3]: three different phases, each one of which can overlap with + * previous/next one. The data from the oldest phase doesn't need to be + * recovered, as it was already being written to, which means it's + * already on the main storage. + */ +static void +ftl_nv_cache_scan_done(struct ftl_nv_cache_restore *restore) +{ + struct ftl_nv_cache *nv_cache = restore->nv_cache; +#if defined(DEBUG) + struct ftl_nv_cache_range *range; + uint64_t i, num_blocks = 0; + + for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) { + range = &restore->range[i]; + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Range %"PRIu64": %"PRIu64"-%"PRIu64" (%" PRIu64 + ")\n", i, range->start_addr, range->last_addr, range->num_blocks); + num_blocks += range->num_blocks; + } + assert(num_blocks == nv_cache->num_data_blocks); +#endif + restore->phase = ftl_nv_cache_prev_phase(nv_cache->phase); + + /* + * Only the latest two phases need to be recovered. The third one, even if present, + * already has to be stored on the main storage, as it's already started to be + * overwritten (only present here because of reordering of requests' completions). 
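 *
 * As an illustration of the selection below (added as a hedged aside, not
 * part of the original comment): assuming the data phases cycle
 * 1 -> 2 -> 3 -> 1 with phase 0 reserved for scrubbing, a header phase of 3
 * gives restore->phase == 2, so range[3] and range[2] are marked for
 * recovery while range[1] -- the oldest phase -- is left alone. One
 * plausible shape for the helper used above (ftl_nv_cache_prev_phase() is
 * defined elsewhere in the FTL code, so this is only a sketch):
 *
 *   static inline unsigned int example_prev_phase(unsigned int phase)
 *   {
 *           static const unsigned int prev[] = { 0, 3, 1, 2 };
 *
 *           return prev[phase];
 *   }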
+ */ + restore->range[nv_cache->phase].recovery = true; + restore->range[restore->phase].recovery = true; + + ftl_nv_cache_recover_range(restore); +} + +static int ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block); + +static void +ftl_nv_cache_scan_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_block *block = cb_arg; + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache_range *range; + struct spdk_bdev *bdev; + unsigned int phase; + uint64_t lba; + + restore->num_outstanding--; + bdev = spdk_bdev_desc_get_bdev(restore->nv_cache->bdev_desc); + spdk_bdev_free_io(bdev_io); + + if (!success) { + SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64"\n", + block->offset); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + /* If we've already hit an error, don't bother with scanning anything else */ + if (spdk_unlikely(restore->status != 0)) { + ftl_nv_cache_restore_complete(restore, restore->status); + return; + } + + ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase); + range = &restore->range[phase]; + range->num_blocks++; + + if (range->start_addr == FTL_LBA_INVALID || range->start_addr > block->offset) { + range->start_addr = block->offset; + } + + if (range->last_addr == FTL_LBA_INVALID || range->last_addr < block->offset) { + range->last_addr = block->offset; + } + + /* All the blocks were read, once they're all completed and we're finished */ + if (restore->current_addr == spdk_bdev_get_num_blocks(bdev)) { + if (restore->num_outstanding == 0) { + ftl_nv_cache_scan_done(restore); + } + + return; + } + + ftl_nv_cache_scan_block(block); +} + +static int +ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block) +{ + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + restore->num_outstanding++; + block->offset = restore->current_addr++; + rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch, + block->buf, block->md_buf, + block->offset, 1, ftl_nv_cache_scan_cb, + block); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64" (%s)\n", + block->offset, spdk_strerror(-rc)); + restore->num_outstanding--; + ftl_nv_cache_restore_complete(restore, rc); + return rc; + } + + return 0; +} + +static void +ftl_nv_cache_clean_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + ftl_nv_cache_restore_done(restore, restore->current_addr); +} + +static bool +ftl_nv_cache_header_valid(struct spdk_ftl_dev *dev, const struct ftl_nv_cache_header *hdr) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc); + uint32_t checksum; + + checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0); + if (checksum != hdr->checksum) { + SPDK_ERRLOG("Invalid header checksum (found: %"PRIu32", expected: %"PRIu32")\n", + checksum, hdr->checksum); + return false; + } + + if (hdr->version != FTL_NV_CACHE_HEADER_VERSION) { + SPDK_ERRLOG("Invalid header version (found: %"PRIu32", expected: %"PRIu32")\n", + hdr->version, FTL_NV_CACHE_HEADER_VERSION); + return false; + } + + if (hdr->size != spdk_bdev_get_num_blocks(bdev)) { + SPDK_ERRLOG("Unexpected size of the non-volatile cache 
bdev (%"PRIu64", expected: %" + PRIu64")\n", hdr->size, spdk_bdev_get_num_blocks(bdev)); + return false; + } + + if (spdk_uuid_compare(&hdr->uuid, &dev->uuid)) { + SPDK_ERRLOG("Invalid device UUID\n"); + return false; + } + + if (!ftl_nv_cache_phase_is_valid(hdr->phase) && hdr->phase != 0) { + return false; + } + + if ((hdr->current_addr >= spdk_bdev_get_num_blocks(bdev) || + hdr->current_addr < FTL_NV_CACHE_DATA_OFFSET) && + (hdr->current_addr != FTL_LBA_INVALID)) { + SPDK_ERRLOG("Unexpected value of non-volatile cache's current address: %"PRIu64"\n", + hdr->current_addr); + return false; + } + + return true; +} + +static void +ftl_nv_cache_read_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_restore *restore = cb_arg; + struct spdk_ftl_dev *dev = restore->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_nv_cache_header *hdr; + struct iovec *iov = NULL; + int iov_cnt = 0, i, rc; + + if (!success) { + SPDK_ERRLOG("Unable to read non-volatile cache metadata header\n"); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + goto out; + } + + spdk_bdev_io_get_iovec(bdev_io, &iov, &iov_cnt); + assert(iov != NULL); + hdr = iov[0].iov_base; + + if (!ftl_nv_cache_header_valid(dev, hdr)) { + ftl_restore_complete(restore, -ENOTRECOVERABLE); + goto out; + } + + /* Remember the latest phase */ + nv_cache->phase = hdr->phase; + + /* If the phase equals zero, we lost power during recovery. We need to finish it up + * by scrubbing the device once again. + */ + if (hdr->phase == 0) { + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Detected phase 0, restarting scrub\n"); + rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n", + spdk_strerror(-rc)); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + } + + goto out; + } + + /* Valid current_addr means that the shutdown was clean, so we just need to overwrite the + * header to make sure that any power loss occurring before the cache is wrapped won't be + * mistaken for a clean shutdown. 
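 *
 * Condensed view of the three header states handled in this callback
 * (added summary, not part of the original comment):
 *
 *   hdr->phase == 0                       -> power lost mid-scrub: scrub again
 *   hdr->current_addr != FTL_LBA_INVALID  -> clean shutdown: rewrite header only
 *   hdr->current_addr == FTL_LBA_INVALID  -> dirty shutdown: scan and recover
 *
 * Related aside on ftl_nv_cache_header_valid() above: the checksum covers
 * every field that precedes the checksum member, so the writer side
 * presumably seals the header the same way the validator recomputes it,
 * e.g. (hedged sketch):
 *
 *   hdr->checksum = spdk_crc32c_update(hdr,
 *                   offsetof(struct ftl_nv_cache_header, checksum), 0);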
+ */ + if (hdr->current_addr != FTL_LBA_INVALID) { + restore->nv_cache.current_addr = hdr->current_addr; + + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_clean_header_cb, + &restore->nv_cache); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to overwrite the non-volatile cache header: %s\n", + spdk_strerror(-rc)); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + } + + goto out; + } + + /* Otherwise the shutdown was unexpected, so we need to recover the data from the cache */ + restore->nv_cache.current_addr = FTL_NV_CACHE_DATA_OFFSET; + + for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) { + if (ftl_nv_cache_scan_block(&restore->nv_cache.block[i])) { + break; + } + } +out: + spdk_bdev_free_io(bdev_io); +} + +void +ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg) +{ + struct spdk_ftl_dev *dev = restore->dev; + struct spdk_bdev *bdev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_io_channel *ioch; + struct ftl_nv_cache_restore *nvc_restore = &restore->nv_cache; + struct ftl_nv_cache_block *block; + size_t alignment; + int rc, i; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + alignment = spdk_max(spdk_bdev_get_buf_align(bdev), sizeof(uint64_t)); + + nvc_restore->nv_cache = nv_cache; + nvc_restore->ioch = ioch->cache_ioch; + + restore->final_phase = true; + restore->cb = cb; + restore->cb_arg = cb_arg; + + for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) { + block = &nvc_restore->block[i]; + block->parent = nvc_restore; + block->buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev) + + spdk_bdev_get_md_size(bdev), + alignment, NULL); + if (!block->buf) { + /* The memory will be freed in ftl_restore_free */ + SPDK_ERRLOG("Unable to allocate memory\n"); + ftl_restore_complete(restore, -ENOMEM); + return; + } + + block->md_buf = (char *)block->buf + spdk_bdev_get_block_size(bdev); + } + + for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) { + nvc_restore->range[i].parent = nvc_restore; + nvc_restore->range[i].start_addr = FTL_LBA_INVALID; + nvc_restore->range[i].last_addr = FTL_LBA_INVALID; + nvc_restore->range[i].num_blocks = 0; + nvc_restore->range[i].recovery = false; + nvc_restore->range[i].phase = i; + } + + rc = spdk_bdev_read_blocks(nv_cache->bdev_desc, ioch->cache_ioch, nv_cache->dma_buf, + 0, FTL_NV_CACHE_DATA_OFFSET, ftl_nv_cache_read_header_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to read non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + ftl_restore_complete(restore, rc); + } +} + +static bool +ftl_pad_zone_pad_finish(struct ftl_restore_band *rband, bool direct_access) +{ + struct ftl_restore *restore = rband->parent; + struct ftl_restore_band *next_band; + size_t i, num_pad_zones = 0; + + if (spdk_unlikely(restore->pad_status && !restore->num_ios)) { + if (direct_access) { + /* In case of any errors found we want to clear direct access. */ + /* Direct access bands have their own allocated md, which would be lost */ + /* on restore complete otherwise. 
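 *
 * Added note on ftl_restore_nv_cache() above: each restore block is one DMA
 * allocation holding the data block followed by its per-block metadata, so
 * md_buf is simply buf advanced by spdk_bdev_get_block_size(bdev):
 *
 *   block->buf                          block->md_buf
 *   |<-- bdev block size -------------->|<-- bdev metadata size -->|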
*/ + rband->band->state = FTL_BAND_STATE_CLOSED; + ftl_band_set_direct_access(rband->band, false); + } + ftl_restore_complete(restore, restore->pad_status); + return true; + } + + for (i = 0; i < rband->band->num_zones; ++i) { + if (rband->band->zone_buf[i].info.state != SPDK_BDEV_ZONE_STATE_FULL) { + num_pad_zones++; + } + } + + /* Finished all zones in a band, check if all bands are done */ + if (num_pad_zones == 0) { + if (direct_access) { + rband->band->state = FTL_BAND_STATE_CLOSED; + ftl_band_set_direct_access(rband->band, false); + } + + next_band = STAILQ_NEXT(rband, stailq); + if (!next_band) { + ftl_restore_complete(restore, restore->pad_status); + return true; + } else { + /* Start off padding in the next band */ + ftl_restore_pad_band(next_band); + return true; + } + } + + return false; +} + +static struct ftl_io * +ftl_restore_init_pad_io(struct ftl_restore_band *rband, void *buffer, + struct ftl_addr addr) +{ + struct ftl_band *band = rband->band; + struct spdk_ftl_dev *dev = band->dev; + int flags = FTL_IO_PAD | FTL_IO_INTERNAL | FTL_IO_PHYSICAL_MODE | FTL_IO_MD | + FTL_IO_DIRECT_ACCESS; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .band = band, + .size = sizeof(struct ftl_io), + .flags = flags, + .type = FTL_IO_WRITE, + .num_blocks = dev->xfer_size, + .cb_fn = ftl_pad_zone_cb, + .cb_ctx = rband, + .iovs = { + { + .iov_base = buffer, + .iov_len = dev->xfer_size * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + .parent = NULL, + }; + struct ftl_io *io; + + io = ftl_io_init_internal(&opts); + if (spdk_unlikely(!io)) { + return NULL; + } + + io->addr = addr; + rband->parent->num_ios++; + + return io; +} + +static void +ftl_pad_zone_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_restore_band *rband = arg; + struct ftl_restore *restore = rband->parent; + struct ftl_band *band = io->band; + struct ftl_zone *zone; + struct ftl_io *new_io; + uint64_t offset; + + restore->num_ios--; + /* TODO check for next unit error vs early close error */ + if (status) { + restore->pad_status = status; + goto end; + } + + offset = io->addr.offset % ftl_get_num_blocks_in_zone(restore->dev); + if (offset + io->num_blocks == ftl_get_num_blocks_in_zone(restore->dev)) { + zone = ftl_band_zone_from_addr(band, io->addr); + zone->info.state = SPDK_BDEV_ZONE_STATE_FULL; + } else { + struct ftl_addr addr = io->addr; + addr.offset += io->num_blocks; + new_io = ftl_restore_init_pad_io(rband, io->iov[0].iov_base, addr); + if (spdk_unlikely(!new_io)) { + restore->pad_status = -ENOMEM; + goto end; + } + + ftl_io_write(new_io); + return; + } + +end: + spdk_dma_free(io->iov[0].iov_base); + ftl_pad_zone_pad_finish(rband, true); +} + +static void +ftl_restore_pad_band(struct ftl_restore_band *rband) +{ + struct ftl_restore *restore = rband->parent; + struct ftl_band *band = rband->band; + struct spdk_ftl_dev *dev = band->dev; + void *buffer = NULL; + struct ftl_io *io; + struct ftl_addr addr; + size_t i; + int rc = 0; + + /* Check if some zones are not closed */ + if (ftl_pad_zone_pad_finish(rband, false)) { + /* + * If we're here, end meta wasn't recognized, but the whole band is written + * Assume the band was padded and ignore it + */ + return; + } + + band->state = FTL_BAND_STATE_OPEN; + rc = ftl_band_set_direct_access(band, true); + if (rc) { + ftl_restore_complete(restore, rc); + return; + } + + for (i = 0; i < band->num_zones; ++i) { + if (band->zone_buf[i].info.state == SPDK_BDEV_ZONE_STATE_FULL) { + continue; + } + + addr.offset = band->zone_buf[i].info.write_pointer; + + buffer 
= spdk_dma_zmalloc(FTL_BLOCK_SIZE * dev->xfer_size, 0, NULL); + if (spdk_unlikely(!buffer)) { + rc = -ENOMEM; + goto error; + } + + io = ftl_restore_init_pad_io(rband, buffer, addr); + if (spdk_unlikely(!io)) { + rc = -ENOMEM; + spdk_dma_free(buffer); + goto error; + } + + ftl_io_write(io); + } + + return; + +error: + restore->pad_status = rc; + ftl_pad_zone_pad_finish(rband, true); +} + +static void +ftl_restore_pad_open_bands(void *ctx) +{ + struct ftl_restore *restore = ctx; + + ftl_restore_pad_band(STAILQ_FIRST(&restore->pad_bands)); +} + +static void +ftl_restore_tail_md_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_restore_band *rband = ctx; + struct ftl_restore *restore = rband->parent; + struct spdk_ftl_dev *dev = restore->dev; + + if (status) { + if (!dev->conf.allow_open_bands) { + SPDK_ERRLOG("%s while restoring tail md in band %u.\n", + spdk_strerror(-status), rband->band->id); + ftl_band_release_lba_map(rband->band); + ftl_restore_complete(restore, status); + return; + } else { + SPDK_ERRLOG("%s while restoring tail md. Will attempt to pad band %u.\n", + spdk_strerror(-status), rband->band->id); + STAILQ_INSERT_TAIL(&restore->pad_bands, rband, stailq); + } + } + + if (!status && ftl_restore_l2p(rband->band)) { + ftl_band_release_lba_map(rband->band); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + return; + } + ftl_band_release_lba_map(rband->band); + + rband = ftl_restore_next_band(restore); + if (!rband) { + if (!STAILQ_EMPTY(&restore->pad_bands)) { + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_restore_pad_open_bands, + restore); + } else { + ftl_restore_complete(restore, 0); + } + + return; + } + + ftl_restore_tail_md(rband); +} + +static int +ftl_restore_tail_md(struct ftl_restore_band *rband) +{ + struct ftl_restore *restore = rband->parent; + struct ftl_band *band = rband->band; + + if (ftl_band_alloc_lba_map(band)) { + SPDK_ERRLOG("Failed to allocate lba map\n"); + ftl_restore_complete(restore, -ENOMEM); + return -ENOMEM; + } + + if (ftl_band_read_tail_md(band, band->tail_md_addr, ftl_restore_tail_md_cb, rband)) { + SPDK_ERRLOG("Failed to send tail metadata read\n"); + ftl_restore_complete(restore, -EIO); + return -EIO; + } + + return 0; +} + +int +ftl_restore_device(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg) +{ + struct spdk_ftl_dev *dev = restore->dev; + struct ftl_restore_band *rband; + + restore->current = 0; + restore->cb = cb; + restore->cb_arg = cb_arg; + restore->final_phase = dev->nv_cache.bdev_desc == NULL; + + /* If restore_device is called, there must be at least one valid band */ + rband = ftl_restore_next_band(restore); + assert(rband); + return ftl_restore_tail_md(rband); +} diff --git a/src/spdk/lib/ftl/ftl_trace.c b/src/spdk/lib/ftl/ftl_trace.c new file mode 100644 index 000000000..ba66323ad --- /dev/null +++ b/src/spdk/lib/ftl/ftl_trace.c @@ -0,0 +1,361 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/trace.h" + +#include "ftl_core.h" +#include "ftl_trace.h" +#include "ftl_io.h" +#include "ftl_band.h" + +#if defined(DEBUG) + +#define OWNER_FTL 0x20 +#define TRACE_GROUP_FTL 0x6 + +enum ftl_trace_source { + FTL_TRACE_SOURCE_INTERNAL, + FTL_TRACE_SOURCE_USER, + FTL_TRACE_SOURCE_MAX, +}; + +#define FTL_TPOINT_ID(id, src) SPDK_TPOINT_ID(TRACE_GROUP_FTL, (((id) << 1) | (!!(src)))) + +#define FTL_TRACE_BAND_DEFRAG(src) FTL_TPOINT_ID(0, src) +#define FTL_TRACE_BAND_WRITE(src) FTL_TPOINT_ID(1, src) +#define FTL_TRACE_LIMITS(src) FTL_TPOINT_ID(2, src) +#define FTL_TRACE_WBUF_POP(src) FTL_TPOINT_ID(3, src) + +#define FTL_TRACE_READ_SCHEDULE(src) FTL_TPOINT_ID(4, src) +#define FTL_TRACE_READ_SUBMISSION(src) FTL_TPOINT_ID(5, src) +#define FTL_TRACE_READ_COMPLETION_INVALID(src) FTL_TPOINT_ID(6, src) +#define FTL_TRACE_READ_COMPLETION_CACHE(src) FTL_TPOINT_ID(7, src) +#define FTL_TRACE_READ_COMPLETION_DISK(src) FTL_TPOINT_ID(8, src) + +#define FTL_TRACE_MD_READ_SCHEDULE(src) FTL_TPOINT_ID(9, src) +#define FTL_TRACE_MD_READ_SUBMISSION(src) FTL_TPOINT_ID(10, src) +#define FTL_TRACE_MD_READ_COMPLETION(src) FTL_TPOINT_ID(11, src) + +#define FTL_TRACE_WRITE_SCHEDULE(src) FTL_TPOINT_ID(12, src) +#define FTL_TRACE_WRITE_WBUF_FILL(src) FTL_TPOINT_ID(13, src) +#define FTL_TRACE_WRITE_SUBMISSION(src) FTL_TPOINT_ID(14, src) +#define FTL_TRACE_WRITE_COMPLETION(src) FTL_TPOINT_ID(15, src) + +#define FTL_TRACE_MD_WRITE_SCHEDULE(src) FTL_TPOINT_ID(16, src) +#define FTL_TRACE_MD_WRITE_SUBMISSION(src) FTL_TPOINT_ID(17, src) +#define FTL_TRACE_MD_WRITE_COMPLETION(src) FTL_TPOINT_ID(18, src) + +#define FTL_TRACE_ERASE_SUBMISSION(src) FTL_TPOINT_ID(19, src) +#define FTL_TRACE_ERASE_COMPLETION(src) FTL_TPOINT_ID(20, src) + +SPDK_TRACE_REGISTER_FN(ftl_trace_func, "ftl", TRACE_GROUP_FTL) +{ + const char source[] = { 'i', 'u' }; + char descbuf[128]; + int i; + + spdk_trace_register_owner(OWNER_FTL, 'f'); + + for (i = 0; i < FTL_TRACE_SOURCE_MAX; ++i) { + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "band_defrag"); + spdk_trace_register_description(descbuf, FTL_TRACE_BAND_DEFRAG(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "band: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "band_write"); + spdk_trace_register_description(descbuf, FTL_TRACE_BAND_WRITE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "band: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "limits"); + spdk_trace_register_description(descbuf, FTL_TRACE_LIMITS(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "limits: "); + snprintf(descbuf, 
sizeof(descbuf), "%c %s", source[i], "rwb_pop"); + spdk_trace_register_description(descbuf, FTL_TRACE_WBUF_POP(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_sched"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_SCHEDULE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_cmpl"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_COMPLETION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_sched"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_SCHEDULE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_cmpl"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_COMPLETION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_sched"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_SCHEDULE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_invld"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_INVALID(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_cache"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_CACHE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_ssd"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_DISK(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_sched"); + spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_SCHEDULE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "rwb_fill"); + spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_WBUF_FILL(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_cmpl"); + spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_COMPLETION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "erase_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_ERASE_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "erase_cmpl"); + spdk_trace_register_description(descbuf, FTL_TRACE_ERASE_COMPLETION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + } +} + +static uint16_t +ftl_trace_io_source(const struct ftl_io *io) +{ + if (io->flags & FTL_IO_INTERNAL) { + return 
FTL_TRACE_SOURCE_INTERNAL; + } else { + return FTL_TRACE_SOURCE_USER; + } +} + +static uint64_t +ftl_trace_next_id(struct ftl_trace *trace) +{ + assert(trace->id != FTL_TRACE_INVALID_ID); + return __atomic_fetch_add(&trace->id, 1, __ATOMIC_SEQ_CST); +} + +void +ftl_trace_defrag_band(struct spdk_ftl_dev *dev, const struct ftl_band *band) +{ + struct ftl_trace *trace = &dev->stats.trace; + + spdk_trace_record(FTL_TRACE_BAND_DEFRAG(FTL_TRACE_SOURCE_INTERNAL), + ftl_trace_next_id(trace), 0, band->lba_map.num_vld, band->id); +} + +void +ftl_trace_write_band(struct spdk_ftl_dev *dev, const struct ftl_band *band) +{ + struct ftl_trace *trace = &dev->stats.trace; + + spdk_trace_record(FTL_TRACE_BAND_WRITE(FTL_TRACE_SOURCE_INTERNAL), + ftl_trace_next_id(trace), 0, 0, band->id); +} + +void +ftl_trace_lba_io_init(struct spdk_ftl_dev *dev, const struct ftl_io *io) +{ + uint16_t tpoint_id = 0, source; + + assert(io->trace != FTL_TRACE_INVALID_ID); + source = ftl_trace_io_source(io); + + if (io->flags & FTL_IO_MD) { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_MD_READ_SCHEDULE(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_MD_WRITE_SCHEDULE(source); + break; + default: + assert(0); + } + } else { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_READ_SCHEDULE(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_WRITE_SCHEDULE(source); + break; + default: + assert(0); + } + } + + spdk_trace_record(tpoint_id, io->trace, io->num_blocks, 0, ftl_io_get_lba(io, 0)); +} + +void +ftl_trace_wbuf_fill(struct spdk_ftl_dev *dev, const struct ftl_io *io) +{ + assert(io->trace != FTL_TRACE_INVALID_ID); + + spdk_trace_record(FTL_TRACE_WRITE_WBUF_FILL(ftl_trace_io_source(io)), io->trace, + 0, 0, ftl_io_current_lba(io)); +} + +void +ftl_trace_wbuf_pop(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry) +{ + uint16_t tpoint_id; + + assert(entry->trace != FTL_TRACE_INVALID_ID); + + if (entry->io_flags & FTL_IO_INTERNAL) { + tpoint_id = FTL_TRACE_WBUF_POP(FTL_TRACE_SOURCE_INTERNAL); + } else { + tpoint_id = FTL_TRACE_WBUF_POP(FTL_TRACE_SOURCE_USER); + } + + spdk_trace_record(tpoint_id, entry->trace, 0, entry->addr.offset, entry->lba); +} + +void +ftl_trace_completion(struct spdk_ftl_dev *dev, const struct ftl_io *io, + enum ftl_trace_completion completion) +{ + uint16_t tpoint_id = 0, source; + + assert(io->trace != FTL_TRACE_INVALID_ID); + source = ftl_trace_io_source(io); + + if (io->flags & FTL_IO_MD) { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_MD_READ_COMPLETION(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_MD_WRITE_COMPLETION(source); + break; + default: + assert(0); + } + } else { + switch (io->type) { + case FTL_IO_READ: + switch (completion) { + case FTL_TRACE_COMPLETION_INVALID: + tpoint_id = FTL_TRACE_READ_COMPLETION_INVALID(source); + break; + case FTL_TRACE_COMPLETION_CACHE: + tpoint_id = FTL_TRACE_READ_COMPLETION_CACHE(source); + break; + case FTL_TRACE_COMPLETION_DISK: + tpoint_id = FTL_TRACE_READ_COMPLETION_DISK(source); + break; + } + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_WRITE_COMPLETION(source); + break; + case FTL_IO_ERASE: + tpoint_id = FTL_TRACE_ERASE_COMPLETION(source); + break; + default: + assert(0); + } + } + + spdk_trace_record(tpoint_id, io->trace, 0, 0, ftl_io_get_lba(io, io->pos - 1)); +} + +void +ftl_trace_submission(struct spdk_ftl_dev *dev, const struct ftl_io *io, struct ftl_addr addr, + size_t addr_cnt) +{ + uint16_t tpoint_id = 0, source; + + assert(io->trace != 
FTL_TRACE_INVALID_ID); + source = ftl_trace_io_source(io); + + if (io->flags & FTL_IO_MD) { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_MD_READ_SUBMISSION(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_MD_WRITE_SUBMISSION(source); + break; + default: + assert(0); + } + } else { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_READ_SUBMISSION(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_WRITE_SUBMISSION(source); + break; + case FTL_IO_ERASE: + tpoint_id = FTL_TRACE_ERASE_SUBMISSION(source); + break; + default: + assert(0); + } + } + + spdk_trace_record(tpoint_id, io->trace, addr_cnt, 0, addr.offset); +} + +void +ftl_trace_limits(struct spdk_ftl_dev *dev, int limit, size_t num_free) +{ + struct ftl_trace *trace = &dev->stats.trace; + + spdk_trace_record(FTL_TRACE_LIMITS(FTL_TRACE_SOURCE_INTERNAL), ftl_trace_next_id(trace), + num_free, limit, 0); +} + +uint64_t +ftl_trace_alloc_id(struct spdk_ftl_dev *dev) +{ + struct ftl_trace *trace = &dev->stats.trace; + + return ftl_trace_next_id(trace); +} + +#endif /* defined(DEBUG) */ diff --git a/src/spdk/lib/ftl/ftl_trace.h b/src/spdk/lib/ftl/ftl_trace.h new file mode 100644 index 000000000..52988cff6 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_trace.h @@ -0,0 +1,84 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FTL_TRACE_H +#define FTL_TRACE_H + +#include "ftl_addr.h" + +#define FTL_TRACE_INVALID_ID ((uint64_t) -1) + +enum ftl_trace_completion { + FTL_TRACE_COMPLETION_INVALID, + FTL_TRACE_COMPLETION_CACHE, + FTL_TRACE_COMPLETION_DISK, +}; + +struct ftl_trace { + /* Monotonically incrementing event id */ + uint64_t id; +}; + +struct spdk_ftl_dev; +struct ftl_trace; +struct ftl_io; +struct ftl_wbuf_entry; +struct ftl_band; + +#if defined(DEBUG) +uint64_t ftl_trace_alloc_id(struct spdk_ftl_dev *dev); +void ftl_trace_defrag_band(struct spdk_ftl_dev *dev, const struct ftl_band *band); +void ftl_trace_write_band(struct spdk_ftl_dev *dev, const struct ftl_band *band); +void ftl_trace_lba_io_init(struct spdk_ftl_dev *dev, const struct ftl_io *io); +void ftl_trace_wbuf_fill(struct spdk_ftl_dev *dev, const struct ftl_io *io); +void ftl_trace_wbuf_pop(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry); +void ftl_trace_submission(struct spdk_ftl_dev *dev, + const struct ftl_io *io, + struct ftl_addr addr, size_t addr_cnt); +void ftl_trace_completion(struct spdk_ftl_dev *dev, + const struct ftl_io *io, + enum ftl_trace_completion type); +void ftl_trace_limits(struct spdk_ftl_dev *dev, int limit, size_t num_free); +#else /* defined(DEBUG) */ +#define ftl_trace_alloc_id(dev) FTL_TRACE_INVALID_ID +#define ftl_trace_defrag_band(dev, band) +#define ftl_trace_write_band(dev, band) +#define ftl_trace_lba_io_init(dev, io) +#define ftl_trace_wbuf_fill(dev, io) +#define ftl_trace_wbuf_pop(dev, entry) +#define ftl_trace_submission(dev, io, addr, addr_cnt) +#define ftl_trace_completion(dev, io, type) +#define ftl_trace_limits(dev, limits, num_free) +#endif + +#endif /* FTL_TRACE_H */ diff --git a/src/spdk/lib/ftl/spdk_ftl.map b/src/spdk/lib/ftl/spdk_ftl.map new file mode 100644 index 000000000..141fd01e0 --- /dev/null +++ b/src/spdk/lib/ftl/spdk_ftl.map @@ -0,0 +1,14 @@ +{ + global: + + # public functions + spdk_ftl_dev_init; + spdk_ftl_dev_free; + spdk_ftl_conf_init_defaults; + spdk_ftl_dev_get_attrs; + spdk_ftl_read; + spdk_ftl_write; + spdk_ftl_flush; + + local: *; +}; diff --git a/src/spdk/lib/idxd/Makefile b/src/spdk/lib/idxd/Makefile new file mode 100644 index 000000000..ed66aeb15 --- /dev/null +++ b/src/spdk/lib/idxd/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = idxd.c +LIBNAME = idxd + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_idxd.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/idxd/idxd.c b/src/spdk/lib/idxd/idxd.c new file mode 100644 index 000000000..992d96211 --- /dev/null +++ b/src/spdk/lib/idxd/idxd.c @@ -0,0 +1,1292 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk/memory.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/idxd.h" + +#include "idxd.h" + +#define ALIGN_4K 0x1000 + +pthread_mutex_t g_driver_lock = PTHREAD_MUTEX_INITIALIZER; + +/* + * g_dev_cfg gives us 2 pre-set configurations of DSA to choose from + * via RPC. + */ +struct device_config *g_dev_cfg = NULL; + +/* + * Pre-built configurations. Variations depend on various factors + * including how many different types of target latency profiles there + * are, how many different QOS requirements there might be, etc. 
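 *
 * For orientation, the two instances defined just below work out to the
 * following (added reading of the fields, see the structs that follow):
 *
 *   g_dev_cfg0: 4 groups, each with 1 work queue and 1 engine   (4 WQs, 4 engines total)
 *   g_dev_cfg1: 2 groups, each with 2 work queues and 2 engines (4 WQs, 4 engines total)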
+ */ +struct device_config g_dev_cfg0 = { + .config_num = 0, + .num_groups = 4, + .num_wqs_per_group = 1, + .num_engines_per_group = 1, + .total_wqs = 4, + .total_engines = 4, +}; + +struct device_config g_dev_cfg1 = { + .config_num = 1, + .num_groups = 2, + .num_wqs_per_group = 2, + .num_engines_per_group = 2, + .total_wqs = 4, + .total_engines = 4, +}; + +static uint32_t +_idxd_read_4(struct spdk_idxd_device *idxd, uint32_t offset) +{ + return spdk_mmio_read_4((uint32_t *)(idxd->reg_base + offset)); +} + +static void +_idxd_write_4(struct spdk_idxd_device *idxd, uint32_t offset, uint32_t value) +{ + spdk_mmio_write_4((uint32_t *)(idxd->reg_base + offset), value); +} + +static uint64_t +_idxd_read_8(struct spdk_idxd_device *idxd, uint32_t offset) +{ + return spdk_mmio_read_8((uint64_t *)(idxd->reg_base + offset)); +} + +static void +_idxd_write_8(struct spdk_idxd_device *idxd, uint32_t offset, uint64_t value) +{ + spdk_mmio_write_8((uint64_t *)(idxd->reg_base + offset), value); +} + +struct spdk_idxd_io_channel * +spdk_idxd_get_channel(struct spdk_idxd_device *idxd) +{ + struct spdk_idxd_io_channel *chan; + struct idxd_batch *batch; + int i; + + chan = calloc(1, sizeof(struct spdk_idxd_io_channel)); + if (chan == NULL) { + SPDK_ERRLOG("Failed to allocate idxd chan\n"); + return NULL; + } + chan->idxd = idxd; + + TAILQ_INIT(&chan->batches); + + TAILQ_INIT(&chan->batch_pool); + for (i = 0 ; i < NUM_BATCHES ; i++) { + batch = calloc(1, sizeof(struct idxd_batch)); + if (batch == NULL) { + SPDK_ERRLOG("Failed to allocate batch\n"); + while ((batch = TAILQ_FIRST(&chan->batch_pool))) { + TAILQ_REMOVE(&chan->batch_pool, batch, link); + free(batch); + } + return NULL; + } + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + } + + return chan; +} + +void +spdk_idxd_put_channel(struct spdk_idxd_io_channel *chan) +{ + free(chan); +} + +int +spdk_idxd_configure_chan(struct spdk_idxd_io_channel *chan) +{ + uint32_t num_ring_slots; + int rc; + + /* Round robin the WQ selection for the chan on this IDXD device. */ + chan->idxd->wq_id++; + if (chan->idxd->wq_id == g_dev_cfg->total_wqs) { + chan->idxd->wq_id = 0; + } + + num_ring_slots = chan->idxd->queues[chan->idxd->wq_id].wqcfg.wq_size; + + chan->ring_ctrl.ring_slots = spdk_bit_array_create(num_ring_slots); + if (chan->ring_ctrl.ring_slots == NULL) { + SPDK_ERRLOG("Failed to allocate bit array for ring\n"); + return -ENOMEM; + } + + /* + * max ring slots can change as channels come and go but we + * start off getting all of the slots for this work queue. + */ + chan->ring_ctrl.max_ring_slots = num_ring_slots; + + /* Store the original size of the ring. 
*/ + chan->ring_ctrl.ring_size = num_ring_slots; + + chan->ring_ctrl.desc = spdk_zmalloc(num_ring_slots * sizeof(struct idxd_hw_desc), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.desc == NULL) { + SPDK_ERRLOG("Failed to allocate descriptor memory\n"); + rc = -ENOMEM; + goto err_desc; + } + + chan->ring_ctrl.completions = spdk_zmalloc(num_ring_slots * sizeof(struct idxd_comp), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.completions == NULL) { + SPDK_ERRLOG("Failed to allocate completion memory\n"); + rc = -ENOMEM; + goto err_comp; + } + + chan->ring_ctrl.user_desc = spdk_zmalloc(TOTAL_USER_DESC * sizeof(struct idxd_hw_desc), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.user_desc == NULL) { + SPDK_ERRLOG("Failed to allocate batch descriptor memory\n"); + rc = -ENOMEM; + goto err_user_desc; + } + + /* Each slot on the ring reserves DESC_PER_BATCH elemnts in user_desc. */ + chan->ring_ctrl.user_ring_slots = spdk_bit_array_create(NUM_BATCHES); + if (chan->ring_ctrl.user_ring_slots == NULL) { + SPDK_ERRLOG("Failed to allocate bit array for user ring\n"); + rc = -ENOMEM; + goto err_user_ring; + } + + chan->ring_ctrl.user_completions = spdk_zmalloc(TOTAL_USER_DESC * sizeof(struct idxd_comp), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.user_completions == NULL) { + SPDK_ERRLOG("Failed to allocate user completion memory\n"); + rc = -ENOMEM; + goto err_user_comp; + } + + chan->ring_ctrl.portal = (char *)chan->idxd->portals + chan->idxd->wq_id * PORTAL_SIZE; + + return 0; + +err_user_comp: + spdk_bit_array_free(&chan->ring_ctrl.user_ring_slots); +err_user_ring: + spdk_free(chan->ring_ctrl.user_desc); +err_user_desc: + spdk_free(chan->ring_ctrl.completions); +err_comp: + spdk_free(chan->ring_ctrl.desc); +err_desc: + spdk_bit_array_free(&chan->ring_ctrl.ring_slots); + + return rc; +} + +/* Used for control commands, not for descriptor submission. 
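 * (Added recap of spdk_idxd_configure_chan() above, before moving on: each
 *  channel ends up with a descriptor ring and completion array sized to its
 *  work queue for regular operations, a separate TOTAL_USER_DESC-sized
 *  descriptor/completion area plus bit array for batches, and a portal
 *  pointer offset by the work queue it was assigned round-robin.)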
*/ +static int +idxd_wait_cmd(struct spdk_idxd_device *idxd, int _timeout) +{ + uint32_t timeout = _timeout; + union idxd_cmdsts_reg cmd_status = {}; + + cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET); + while (cmd_status.active && --timeout) { + usleep(1); + cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET); + } + + /* Check for timeout */ + if (timeout == 0 && cmd_status.active) { + SPDK_ERRLOG("Command timeout, waited %u\n", _timeout); + return -EBUSY; + } + + /* Check for error */ + if (cmd_status.err) { + SPDK_ERRLOG("Command status reg reports error 0x%x\n", cmd_status.err); + return -EINVAL; + } + + return 0; +} + +static void +_idxd_drain(struct spdk_idxd_io_channel *chan) +{ + uint32_t index; + int set = 0; + + do { + spdk_idxd_process_events(chan); + set = 0; + for (index = 0; index < chan->ring_ctrl.max_ring_slots; index++) { + set |= spdk_bit_array_get(chan->ring_ctrl.ring_slots, index); + } + } while (set); +} + +int +spdk_idxd_reconfigure_chan(struct spdk_idxd_io_channel *chan, uint32_t num_channels) +{ + uint32_t num_ring_slots; + int rc; + struct idxd_batch *batch; + + _idxd_drain(chan); + + assert(spdk_bit_array_count_set(chan->ring_ctrl.ring_slots) == 0); + + if (num_channels == 0) { + spdk_free(chan->ring_ctrl.completions); + spdk_free(chan->ring_ctrl.desc); + spdk_bit_array_free(&chan->ring_ctrl.ring_slots); + spdk_free(chan->ring_ctrl.user_completions); + spdk_free(chan->ring_ctrl.user_desc); + spdk_bit_array_free(&chan->ring_ctrl.user_ring_slots); + while ((batch = TAILQ_FIRST(&chan->batch_pool))) { + TAILQ_REMOVE(&chan->batch_pool, batch, link); + free(batch); + } + return 0; + } + + num_ring_slots = chan->ring_ctrl.ring_size / num_channels; + + /* re-allocate our descriptor ring for hw flow control. */ + rc = spdk_bit_array_resize(&chan->ring_ctrl.ring_slots, num_ring_slots); + if (rc < 0) { + SPDK_ERRLOG("Unable to resize channel bit array\n"); + return -ENOMEM; + } + + chan->ring_ctrl.max_ring_slots = num_ring_slots; + + /* + * Note: The batch descriptor ring does not change with the + * number of channels as descriptors on this ring do not + * "count" for flow control. + */ + + return rc; +} + +/* Called via RPC to select a pre-defined configuration. 
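
A minimal usage sketch for the public entry points in this file (illustration only: the example_* names are invented, and the callback signatures are inferred from how idxd_enum_cb() invokes them further down, with the probe callback assumed to return true to request an attach):

static bool
example_probe_cb(void *cb_ctx, struct spdk_pci_device *pci_dev)
{
	/* Claim every IDXD device the enumeration offers us. */
	return true;
}

static void
example_attach_cb(void *cb_ctx, struct spdk_pci_device *pci_dev,
		  struct spdk_idxd_device *idxd)
{
	/* Hand the attached device back to the caller through cb_ctx. */
	*(struct spdk_idxd_device **)cb_ctx = idxd;
}

static int
example_bring_up(struct spdk_idxd_device **idxd)
{
	/* Select pre-built configuration 0, then enumerate and attach. */
	spdk_idxd_set_config(0);
	return spdk_idxd_probe(idxd, example_probe_cb, example_attach_cb);
}
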
*/ +void +spdk_idxd_set_config(uint32_t config_num) +{ + switch (config_num) { + case 0: + g_dev_cfg = &g_dev_cfg0; + break; + case 1: + g_dev_cfg = &g_dev_cfg1; + break; + default: + g_dev_cfg = &g_dev_cfg0; + SPDK_ERRLOG("Invalid config, using default\n"); + break; + } +} + +static int +idxd_unmap_pci_bar(struct spdk_idxd_device *idxd, int bar) +{ + int rc = 0; + void *addr = NULL; + + if (bar == IDXD_MMIO_BAR) { + addr = (void *)idxd->reg_base; + } else if (bar == IDXD_WQ_BAR) { + addr = (void *)idxd->portals; + } + + if (addr) { + rc = spdk_pci_device_unmap_bar(idxd->device, 0, addr); + } + return rc; +} + +static int +idxd_map_pci_bars(struct spdk_idxd_device *idxd) +{ + int rc; + void *addr; + uint64_t phys_addr, size; + + rc = spdk_pci_device_map_bar(idxd->device, IDXD_MMIO_BAR, &addr, &phys_addr, &size); + if (rc != 0 || addr == NULL) { + SPDK_ERRLOG("pci_device_map_range failed with error code %d\n", rc); + return -1; + } + idxd->reg_base = addr; + + rc = spdk_pci_device_map_bar(idxd->device, IDXD_WQ_BAR, &addr, &phys_addr, &size); + if (rc != 0 || addr == NULL) { + SPDK_ERRLOG("pci_device_map_range failed with error code %d\n", rc); + rc = idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR); + if (rc) { + SPDK_ERRLOG("unable to unmap MMIO bar\n"); + } + return -EINVAL; + } + idxd->portals = addr; + + return 0; +} + +static int +idxd_reset_dev(struct spdk_idxd_device *idxd) +{ + int rc; + + _idxd_write_4(idxd, IDXD_CMD_OFFSET, IDXD_RESET_DEVICE << IDXD_CMD_SHIFT); + rc = idxd_wait_cmd(idxd, IDXD_REGISTER_TIMEOUT_US); + if (rc < 0) { + SPDK_ERRLOG("Error resetting device %u\n", rc); + } + + return rc; +} + +/* + * Build group config based on getting info from the device combined + * with the defined configuration. Once built, it is written to the + * device. + */ +static int +idxd_group_config(struct spdk_idxd_device *idxd) +{ + int i; + uint64_t base_offset; + + assert(g_dev_cfg->num_groups <= idxd->registers.groupcap.num_groups); + idxd->groups = calloc(idxd->registers.groupcap.num_groups, sizeof(struct idxd_group)); + if (idxd->groups == NULL) { + SPDK_ERRLOG("Failed to allocate group memory\n"); + return -ENOMEM; + } + + assert(g_dev_cfg->total_engines <= idxd->registers.enginecap.num_engines); + for (i = 0; i < g_dev_cfg->total_engines; i++) { + idxd->groups[i % g_dev_cfg->num_groups].grpcfg.engines |= (1 << i); + } + + assert(g_dev_cfg->total_wqs <= idxd->registers.wqcap.num_wqs); + for (i = 0; i < g_dev_cfg->total_wqs; i++) { + idxd->groups[i % g_dev_cfg->num_groups].grpcfg.wqs[0] |= (1 << i); + } + + for (i = 0; i < g_dev_cfg->num_groups; i++) { + idxd->groups[i].idxd = idxd; + idxd->groups[i].id = i; + + /* Divide BW tokens evenly */ + idxd->groups[i].grpcfg.flags.tokens_allowed = + idxd->registers.groupcap.total_tokens / g_dev_cfg->num_groups; + } + + /* + * Now write the group config to the device for all groups. We write + * to the max number of groups in order to 0 out the ones we didn't + * configure. 
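 *
 * As a concrete illustration of the i % num_groups assignment above (added
 * aside, using the two pre-built configurations from the top of this file):
 *
 *   config 0 (4 groups): engine i and WQ i land in group i % 4, i.e. one
 *                        engine and one WQ per group.
 *   config 1 (2 groups): engines/WQs 0 and 2 land in group 0, engines/WQs
 *                        1 and 3 land in group 1.
 *
 * In both cases groupcap.total_tokens is split evenly across num_groups.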
+ */ + for (i = 0 ; i < idxd->registers.groupcap.num_groups; i++) { + + base_offset = idxd->grpcfg_offset + i * 64; + + /* GRPWQCFG, work queues config */ + _idxd_write_8(idxd, base_offset, idxd->groups[i].grpcfg.wqs[0]); + + /* GRPENGCFG, engine config */ + _idxd_write_8(idxd, base_offset + CFG_ENGINE_OFFSET, idxd->groups[i].grpcfg.engines); + + /* GRPFLAGS, flags config */ + _idxd_write_8(idxd, base_offset + CFG_FLAG_OFFSET, idxd->groups[i].grpcfg.flags.raw); + } + + return 0; +} + +/* + * Build work queue (WQ) config based on getting info from the device combined + * with the defined configuration. Once built, it is written to the device. + */ +static int +idxd_wq_config(struct spdk_idxd_device *idxd) +{ + int i, j; + struct idxd_wq *queue; + u_int32_t wq_size = idxd->registers.wqcap.total_wq_size / g_dev_cfg->total_wqs; + + SPDK_NOTICELOG("Total ring slots available space 0x%x, so per work queue is 0x%x\n", + idxd->registers.wqcap.total_wq_size, wq_size); + assert(g_dev_cfg->total_wqs <= IDXD_MAX_QUEUES); + assert(g_dev_cfg->total_wqs <= idxd->registers.wqcap.num_wqs); + assert(LOG2_WQ_MAX_BATCH <= idxd->registers.gencap.max_batch_shift); + assert(LOG2_WQ_MAX_XFER <= idxd->registers.gencap.max_xfer_shift); + + idxd->queues = calloc(1, idxd->registers.wqcap.num_wqs * sizeof(struct idxd_wq)); + if (idxd->queues == NULL) { + SPDK_ERRLOG("Failed to allocate queue memory\n"); + return -ENOMEM; + } + + for (i = 0; i < g_dev_cfg->total_wqs; i++) { + queue = &idxd->queues[i]; + queue->wqcfg.wq_size = wq_size; + queue->wqcfg.mode = WQ_MODE_DEDICATED; + queue->wqcfg.max_batch_shift = LOG2_WQ_MAX_BATCH; + queue->wqcfg.max_xfer_shift = LOG2_WQ_MAX_XFER; + queue->wqcfg.wq_state = WQ_ENABLED; + queue->wqcfg.priority = WQ_PRIORITY_1; + + /* Not part of the config struct */ + queue->idxd = idxd; + queue->group = &idxd->groups[i % g_dev_cfg->num_groups]; + } + + /* + * Now write the work queue config to the device for all wq space + */ + for (i = 0 ; i < idxd->registers.wqcap.num_wqs; i++) { + queue = &idxd->queues[i]; + for (j = 0 ; j < WQCFG_NUM_DWORDS; j++) { + _idxd_write_4(idxd, idxd->wqcfg_offset + i * 32 + j * 4, + queue->wqcfg.raw[j]); + } + } + + return 0; +} + +static int +idxd_device_configure(struct spdk_idxd_device *idxd) +{ + int i, rc = 0; + union idxd_offsets_register offsets_reg; + union idxd_genstatus_register genstatus_reg; + + /* + * Map BAR0 and BAR2 + */ + rc = idxd_map_pci_bars(idxd); + if (rc) { + return rc; + } + + /* + * Reset the device + */ + rc = idxd_reset_dev(idxd); + if (rc) { + goto err_reset; + } + + /* + * Read in config registers + */ + idxd->registers.version = _idxd_read_4(idxd, IDXD_VERSION_OFFSET); + idxd->registers.gencap.raw = _idxd_read_8(idxd, IDXD_GENCAP_OFFSET); + idxd->registers.wqcap.raw = _idxd_read_8(idxd, IDXD_WQCAP_OFFSET); + idxd->registers.groupcap.raw = _idxd_read_8(idxd, IDXD_GRPCAP_OFFSET); + idxd->registers.enginecap.raw = _idxd_read_8(idxd, IDXD_ENGCAP_OFFSET); + for (i = 0; i < IDXD_OPCAP_WORDS; i++) { + idxd->registers.opcap.raw[i] = + _idxd_read_8(idxd, i * sizeof(uint64_t) + IDXD_OPCAP_OFFSET); + } + offsets_reg.raw[0] = _idxd_read_8(idxd, IDXD_TABLE_OFFSET); + offsets_reg.raw[1] = _idxd_read_8(idxd, IDXD_TABLE_OFFSET + sizeof(uint64_t)); + idxd->grpcfg_offset = offsets_reg.grpcfg * IDXD_TABLE_OFFSET_MULT; + idxd->wqcfg_offset = offsets_reg.wqcfg * IDXD_TABLE_OFFSET_MULT; + idxd->ims_offset = offsets_reg.ims * IDXD_TABLE_OFFSET_MULT; + idxd->msix_perm_offset = offsets_reg.msix_perm * IDXD_TABLE_OFFSET_MULT; + idxd->perfmon_offset = 
offsets_reg.perfmon * IDXD_TABLE_OFFSET_MULT; + + /* + * Configure groups and work queues. + */ + rc = idxd_group_config(idxd); + if (rc) { + goto err_group_cfg; + } + + rc = idxd_wq_config(idxd); + if (rc) { + goto err_wq_cfg; + } + + /* + * Enable the device + */ + genstatus_reg.raw = _idxd_read_4(idxd, IDXD_GENSTATUS_OFFSET); + assert(genstatus_reg.state == IDXD_DEVICE_STATE_DISABLED); + + _idxd_write_4(idxd, IDXD_CMD_OFFSET, IDXD_ENABLE_DEV << IDXD_CMD_SHIFT); + rc = idxd_wait_cmd(idxd, IDXD_REGISTER_TIMEOUT_US); + genstatus_reg.raw = _idxd_read_4(idxd, IDXD_GENSTATUS_OFFSET); + if ((rc < 0) || (genstatus_reg.state != IDXD_DEVICE_STATE_ENABLED)) { + rc = -EINVAL; + SPDK_ERRLOG("Error enabling device %u\n", rc); + goto err_device_enable; + } + + genstatus_reg.raw = spdk_mmio_read_4((uint32_t *)(idxd->reg_base + IDXD_GENSTATUS_OFFSET)); + assert(genstatus_reg.state == IDXD_DEVICE_STATE_ENABLED); + + /* + * Enable the work queues that we've configured + */ + for (i = 0; i < g_dev_cfg->total_wqs; i++) { + _idxd_write_4(idxd, IDXD_CMD_OFFSET, + (IDXD_ENABLE_WQ << IDXD_CMD_SHIFT) | i); + rc = idxd_wait_cmd(idxd, IDXD_REGISTER_TIMEOUT_US); + if (rc < 0) { + SPDK_ERRLOG("Error enabling work queues 0x%x\n", rc); + goto err_wq_enable; + } + } + + if ((rc == 0) && (genstatus_reg.state == IDXD_DEVICE_STATE_ENABLED)) { + SPDK_NOTICELOG("Device enabled, version 0x%x gencap: 0x%lx\n", + idxd->registers.version, + idxd->registers.gencap.raw); + + } + + return rc; +err_wq_enable: +err_device_enable: + free(idxd->queues); +err_wq_cfg: + free(idxd->groups); +err_group_cfg: +err_reset: + idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR); + idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR); + + return rc; +} + +static void +idxd_device_destruct(struct spdk_idxd_device *idxd) +{ + idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR); + idxd_unmap_pci_bar(idxd, IDXD_WQ_BAR); + free(idxd->groups); + free(idxd->queues); + free(idxd); +} + +/* Caller must hold g_driver_lock */ +static struct spdk_idxd_device * +idxd_attach(struct spdk_pci_device *device) +{ + struct spdk_idxd_device *idxd; + uint32_t cmd_reg; + int rc; + + idxd = calloc(1, sizeof(struct spdk_idxd_device)); + if (idxd == NULL) { + SPDK_ERRLOG("Failed to allocate memory for idxd device.\n"); + return NULL; + } + + idxd->device = device; + + /* Enable PCI busmaster. 
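 * (Added note: 0x4 is the Bus Master Enable bit of the PCI command register,
 *  which sits at configuration offset 4 -- hence the read-modify-write of
 *  offset 4 below. Without it the device cannot issue DMA.)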
*/ + spdk_pci_device_cfg_read32(device, &cmd_reg, 4); + cmd_reg |= 0x4; + spdk_pci_device_cfg_write32(device, cmd_reg, 4); + + rc = idxd_device_configure(idxd); + if (rc) { + goto err; + } + + return idxd; +err: + idxd_device_destruct(idxd); + return NULL; +} + +struct idxd_enum_ctx { + spdk_idxd_probe_cb probe_cb; + spdk_idxd_attach_cb attach_cb; + void *cb_ctx; +}; + +/* This function must only be called while holding g_driver_lock */ +static int +idxd_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) +{ + struct idxd_enum_ctx *enum_ctx = ctx; + struct spdk_idxd_device *idxd; + + if (enum_ctx->probe_cb(enum_ctx->cb_ctx, pci_dev)) { + idxd = idxd_attach(pci_dev); + if (idxd == NULL) { + SPDK_ERRLOG("idxd_attach() failed\n"); + return -EINVAL; + } + + enum_ctx->attach_cb(enum_ctx->cb_ctx, pci_dev, idxd); + } + + return 0; +} + +int +spdk_idxd_probe(void *cb_ctx, spdk_idxd_probe_cb probe_cb, spdk_idxd_attach_cb attach_cb) +{ + int rc; + struct idxd_enum_ctx enum_ctx; + + enum_ctx.probe_cb = probe_cb; + enum_ctx.attach_cb = attach_cb; + enum_ctx.cb_ctx = cb_ctx; + + pthread_mutex_lock(&g_driver_lock); + rc = spdk_pci_enumerate(spdk_pci_idxd_get_driver(), idxd_enum_cb, &enum_ctx); + pthread_mutex_unlock(&g_driver_lock); + + return rc; +} + +void +spdk_idxd_detach(struct spdk_idxd_device *idxd) +{ + idxd_device_destruct(idxd); +} + +static struct idxd_hw_desc * +_idxd_prep_command(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn, + void *cb_arg, struct idxd_batch *batch) +{ + uint32_t index; + struct idxd_hw_desc *desc; + struct idxd_comp *comp; + + index = spdk_bit_array_find_first_clear(chan->ring_ctrl.ring_slots, 0); + if (index == UINT32_MAX) { + /* ran out of ring slots */ + return NULL; + } + + spdk_bit_array_set(chan->ring_ctrl.ring_slots, index); + + desc = &chan->ring_ctrl.desc[index]; + comp = &chan->ring_ctrl.completions[index]; + + desc->flags = IDXD_FLAG_COMPLETION_ADDR_VALID | IDXD_FLAG_REQUEST_COMPLETION; + desc->completion_addr = (uintptr_t)&comp->hw; + comp->cb_arg = cb_arg; + comp->cb_fn = cb_fn; + if (batch) { + comp->batch = batch; + batch->batch_desc_index = index; + } + + return desc; +} + +int +spdk_idxd_submit_copy(struct spdk_idxd_io_channel *chan, void *dst, const void *src, + uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_MEMMOVE; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +/* Dual-cast copies the same source to two separate destination buffers. */ +int +spdk_idxd_submit_dualcast(struct spdk_idxd_io_channel *chan, void *dst1, void *dst2, + const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_DUALCAST; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst1; + desc->dest2 = (uintptr_t)dst2; + desc->xfer_size = nbytes; + + /* Submit operation. 
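
For context, movdir64b() just below (and in the other submit paths) issues a single 64-byte store of the descriptor to the work-queue portal, which is how work reaches a dedicated work queue. A hedged usage sketch for the single-shot API around here, assuming spdk_idxd_req_cb is a completion callback of the form void (*)(void *cb_arg, int status), which matches how the completion fields are consumed in this file, and that spdk_idxd_reconfigure_chan(chan, 0) is the intended way to release a channel's rings, as its num_channels == 0 branch above suggests:

struct example_copy_ctx {
	bool done;
	int status;
};

static void
example_copy_done(void *cb_arg, int status)
{
	struct example_copy_ctx *ctx = cb_arg;

	ctx->status = status;
	ctx->done = true;
}

static int
example_copy(struct spdk_idxd_device *idxd, void *dst, const void *src, uint64_t nbytes)
{
	struct spdk_idxd_io_channel *chan;
	struct example_copy_ctx ctx = {};
	int rc;

	chan = spdk_idxd_get_channel(idxd);
	if (chan == NULL) {
		return -ENOMEM;
	}

	rc = spdk_idxd_configure_chan(chan);
	if (rc != 0) {
		spdk_idxd_put_channel(chan);
		return rc;
	}

	rc = spdk_idxd_submit_copy(chan, dst, src, nbytes, example_copy_done, &ctx);
	if (rc == 0) {
		/* Poll-mode completion, the same routine _idxd_drain() uses. */
		while (!ctx.done) {
			spdk_idxd_process_events(chan);
		}
		rc = ctx.status;
	}

	spdk_idxd_reconfigure_chan(chan, 0);	/* release the channel's rings */
	spdk_idxd_put_channel(chan);
	return rc;
}
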
*/ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +int +spdk_idxd_submit_compare(struct spdk_idxd_io_channel *chan, void *src1, const void *src2, + uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_COMPARE; + desc->src_addr = (uintptr_t)src1; + desc->src2_addr = (uintptr_t)src2; + desc->xfer_size = nbytes; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +int +spdk_idxd_submit_fill(struct spdk_idxd_io_channel *chan, void *dst, uint64_t fill_pattern, + uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_MEMFILL; + desc->pattern = fill_pattern; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +int +spdk_idxd_submit_crc32c(struct spdk_idxd_io_channel *chan, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_CRC32C_GEN; + desc->dst_addr = (uintptr_t)dst; + desc->src_addr = (uintptr_t)src; + desc->flags &= IDXD_CLEAR_CRC_FLAGS; + desc->crc32c.seed = seed; + desc->xfer_size = nbytes; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +uint32_t +spdk_idxd_batch_get_max(void) +{ + return DESC_PER_BATCH; /* TODO maybe add startup RPC to set this */ +} + +struct idxd_batch * +spdk_idxd_batch_create(struct spdk_idxd_io_channel *chan) +{ + struct idxd_batch *batch = NULL; + + if (!TAILQ_EMPTY(&chan->batch_pool)) { + batch = TAILQ_FIRST(&chan->batch_pool); + TAILQ_REMOVE(&chan->batch_pool, batch, link); + } else { + /* The application needs to handle this. */ + return NULL; + } + + batch->batch_num = spdk_bit_array_find_first_clear(chan->ring_ctrl.user_ring_slots, 0); + if (batch->batch_num == UINT32_MAX) { + /* ran out of ring slots, the application needs to handle this. */ + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + return NULL; + } + + spdk_bit_array_set(chan->ring_ctrl.user_ring_slots, batch->batch_num); + + /* + * Find the first descriptor address for the given batch. The + * descriptor ring used for user desctipors is allocated in + * units of DESC_PER_BATCH. The actual index is in units of + * one descriptor. 
+ */ + batch->start_index = batch->cur_index = batch->batch_num * DESC_PER_BATCH; + + TAILQ_INSERT_TAIL(&chan->batches, batch, link); + SPDK_DEBUGLOG(SPDK_LOG_IDXD, "New batch %p num %u\n", batch, batch->batch_num); + + return batch; +} + +static bool +_does_batch_exist(struct idxd_batch *batch, struct spdk_idxd_io_channel *chan) +{ + bool found = false; + struct idxd_batch *cur_batch; + + TAILQ_FOREACH(cur_batch, &chan->batches, link) { + if (cur_batch == batch) { + found = true; + break; + } + } + + return found; +} + +int +spdk_idxd_batch_cancel(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch) +{ + if (_does_batch_exist(batch, chan) == false) { + SPDK_ERRLOG("Attempt to cancel a batch that doesn't exist\n."); + return -EINVAL; + } + + if (batch->remaining > 0) { + SPDK_ERRLOG("Cannot cancel batch, already submitted to HW\n."); + return -EINVAL; + } + + TAILQ_REMOVE(&chan->batches, batch, link); + spdk_bit_array_clear(chan->ring_ctrl.user_ring_slots, batch->batch_num); + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + + return 0; +} + +int +spdk_idxd_batch_submit(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + if (_does_batch_exist(batch, chan) == false) { + SPDK_ERRLOG("Attempt to submit a batch that doesn't exist\n."); + return -EINVAL; + } + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_IDXD, "Can't submit batch %p busy batch num %u\n", batch, batch->batch_num); + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_BATCH; + desc->desc_list_addr = (uintptr_t)&chan->ring_ctrl.user_desc[batch->start_index]; + desc->desc_count = batch->cur_index - batch->start_index; + assert(desc->desc_count <= DESC_PER_BATCH); + + if (desc->desc_count < MIN_USER_DESC_COUNT) { + SPDK_ERRLOG("Attempt to submit a batch without at least %u operations.\n", + MIN_USER_DESC_COUNT); + return -EINVAL; + } + + /* Total completions for the batch = num desc plus 1 for the batch desc itself. */ + batch->remaining = desc->desc_count + 1; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +static struct idxd_hw_desc * +_idxd_prep_batch_cmd(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn, + void *cb_arg, struct idxd_batch *batch) +{ + struct idxd_hw_desc *desc; + struct idxd_comp *comp; + + if (_does_batch_exist(batch, chan) == false) { + SPDK_ERRLOG("Attempt to add to a batch that doesn't exist\n."); + return NULL; + } + + if ((batch->cur_index - batch->start_index) == DESC_PER_BATCH) { + SPDK_ERRLOG("Attempt to add to a batch that is already full\n."); + return NULL; + } + + desc = &chan->ring_ctrl.user_desc[batch->cur_index]; + comp = &chan->ring_ctrl.user_completions[batch->cur_index]; + SPDK_DEBUGLOG(SPDK_LOG_IDXD, "Prep batch %p index %u\n", batch, batch->cur_index); + + batch->cur_index++; + assert(batch->cur_index > batch->start_index); + + desc->flags = IDXD_FLAG_COMPLETION_ADDR_VALID | IDXD_FLAG_REQUEST_COMPLETION; + desc->completion_addr = (uintptr_t)&comp->hw; + comp->cb_arg = cb_arg; + comp->cb_fn = cb_fn; + comp->batch = batch; + + return desc; +} + +int +spdk_idxd_batch_prep_copy(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. 
*/ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_MEMMOVE; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_fill(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst, uint64_t fill_pattern, uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_MEMFILL; + desc->pattern = fill_pattern; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_dualcast(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst1, void *dst2, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + desc->opcode = IDXD_OPCODE_DUALCAST; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst1; + desc->dest2 = (uintptr_t)dst2; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_crc32c(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_CRC32C_GEN; + desc->dst_addr = (uintptr_t)dst; + desc->src_addr = (uintptr_t)src; + desc->flags &= IDXD_CLEAR_CRC_FLAGS; + desc->crc32c.seed = seed; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_compare(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *src1, void *src2, uint64_t nbytes, spdk_idxd_req_cb cb_fn, + void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. 
*/ + desc->opcode = IDXD_OPCODE_COMPARE; + desc->src_addr = (uintptr_t)src1; + desc->src2_addr = (uintptr_t)src2; + desc->xfer_size = nbytes; + + return 0; +} + +static void +_dump_error_reg(struct spdk_idxd_io_channel *chan) +{ + uint64_t sw_error_0; + uint16_t i; + + sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET); + + SPDK_NOTICELOG("SW Error bits set:"); + for (i = 0; i < CHAR_BIT; i++) { + if ((1ULL << i) & sw_error_0) { + SPDK_NOTICELOG(" %d\n", i); + } + } + SPDK_NOTICELOG("SW Error error code: %#x\n", (uint8_t)(sw_error_0 >> 8)); + SPDK_NOTICELOG("SW Error WQ index: %u\n", (uint8_t)(sw_error_0 >> 16)); + SPDK_NOTICELOG("SW Error Operation: %u\n", (uint8_t)(sw_error_0 >> 32)); +} + +static void +_free_batch(struct idxd_batch *batch, struct spdk_idxd_io_channel *chan, + struct idxd_comp *comp) +{ + TAILQ_REMOVE(&chan->batches, batch, link); + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + comp->batch = NULL; + spdk_bit_array_clear(chan->ring_ctrl.user_ring_slots, batch->batch_num); + spdk_bit_array_clear(chan->ring_ctrl.ring_slots, batch->batch_desc_index); +} + +static void +_spdk_idxd_process_batch_events(struct spdk_idxd_io_channel *chan) +{ + uint16_t index; + struct idxd_comp *comp; + uint64_t sw_error_0; + int status = 0; + struct idxd_batch *batch; + + /* + * We don't check the bit array for user completions as there's only + * one bit per per batch. + */ + for (index = 0; index < TOTAL_USER_DESC; index++) { + comp = &chan->ring_ctrl.user_completions[index]; + if (comp->hw.status == 1) { + struct idxd_hw_desc *desc; + + sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET); + if (sw_error_0 & 0x1) { + _dump_error_reg(chan); + status = -EINVAL; + } + + desc = &chan->ring_ctrl.user_desc[index]; + switch (desc->opcode) { + case IDXD_OPCODE_CRC32C_GEN: + *(uint32_t *)desc->dst_addr = comp->hw.crc32c_val; + *(uint32_t *)desc->dst_addr ^= ~0; + break; + case IDXD_OPCODE_COMPARE: + if (status == 0) { + status = comp->hw.result; + } + break; + case IDXD_OPCODE_MEMFILL: + case IDXD_OPCODE_DUALCAST: + case IDXD_OPCODE_MEMMOVE: + break; + default: + assert(false); + break; + } + + /* The hw will complete all user desc first before the batch + * desc (see spec for configuration exceptions) however + * because of the order that we check for comps in the poller + * we may "see" them in a different order than they actually + * completed in. + */ + batch = comp->batch; + assert(batch->remaining > 0); + if (--batch->remaining == 0) { + _free_batch(batch, chan, comp); + } + + comp->cb_fn((void *)comp->cb_arg, status); + comp->hw.status = status = 0; + } + } +} + +/* + * TODO: Experiment with different methods of reaping completions for performance + * once we have real silicon. 
+ */ +void +spdk_idxd_process_events(struct spdk_idxd_io_channel *chan) +{ + uint16_t index; + struct idxd_comp *comp; + uint64_t sw_error_0; + int status = 0; + struct idxd_batch *batch; + + if (!TAILQ_EMPTY(&chan->batches)) { + _spdk_idxd_process_batch_events(chan); + } + + for (index = 0; index < chan->ring_ctrl.max_ring_slots; index++) { + if (spdk_bit_array_get(chan->ring_ctrl.ring_slots, index)) { + comp = &chan->ring_ctrl.completions[index]; + if (comp->hw.status == 1) { + struct idxd_hw_desc *desc; + + sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET); + if (sw_error_0 & 0x1) { + _dump_error_reg(chan); + status = -EINVAL; + } + + desc = &chan->ring_ctrl.desc[index]; + switch (desc->opcode) { + case IDXD_OPCODE_BATCH: + /* The hw will complete all user desc first before the batch + * desc (see spec for configuration exceptions) however + * because of the order that we check for comps in the poller + * we may "see" them in a different order than they actually + * completed in. + */ + batch = comp->batch; + assert(batch->remaining > 0); + if (--batch->remaining == 0) { + _free_batch(batch, chan, comp); + } + break; + case IDXD_OPCODE_CRC32C_GEN: + *(uint32_t *)desc->dst_addr = comp->hw.crc32c_val; + *(uint32_t *)desc->dst_addr ^= ~0; + break; + case IDXD_OPCODE_COMPARE: + if (status == 0) { + status = comp->hw.result; + } + break; + } + + comp->cb_fn(comp->cb_arg, status); + comp->hw.status = status = 0; + if (desc->opcode != IDXD_OPCODE_BATCH) { + spdk_bit_array_clear(chan->ring_ctrl.ring_slots, index); + } + } + } + } +} + +SPDK_LOG_REGISTER_COMPONENT("idxd", SPDK_LOG_IDXD) diff --git a/src/spdk/lib/idxd/idxd.h b/src/spdk/lib/idxd/idxd.h new file mode 100644 index 000000000..09d021152 --- /dev/null +++ b/src/spdk/lib/idxd/idxd.h @@ -0,0 +1,188 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __IDXD_H__ +#define __IDXD_H__ + +#include "spdk/stdinc.h" + +#include "spdk/idxd.h" +#include "spdk/queue.h" +#include "spdk/mmio.h" +#include "spdk/bit_array.h" + +#include "idxd_spec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* TODO: get the gcc intrinsic to work. */ +#define nop() asm volatile ("nop") +static inline void movdir64b(void *dst, const void *src) +{ + asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" + : "=m"(*(char *)dst) + : "d"(src), "a"(dst)); +} + +#define IDXD_REGISTER_TIMEOUT_US 50 +#define IDXD_DRAIN_TIMEOUT_US 500000 + +/* TODO: make some of these RPC selectable */ +#define WQ_MODE_DEDICATED 1 +#define LOG2_WQ_MAX_BATCH 8 /* 2^8 = 256 */ +#define LOG2_WQ_MAX_XFER 30 /* 2^30 = 1073741824 */ +#define WQCFG_NUM_DWORDS 8 +#define WQ_PRIORITY_1 1 +#define IDXD_MAX_QUEUES 64 + +#define TOTAL_USER_DESC (1 << LOG2_WQ_MAX_BATCH) +#define DESC_PER_BATCH 16 /* TODO maybe make this a startup RPC */ +#define NUM_BATCHES (TOTAL_USER_DESC / DESC_PER_BATCH) +#define MIN_USER_DESC_COUNT 2 + +struct idxd_batch { + uint32_t batch_desc_index; + uint32_t batch_num; + uint32_t cur_index; + uint32_t start_index; + uint32_t remaining; + TAILQ_ENTRY(idxd_batch) link; +}; + +struct device_config { + uint8_t config_num; + uint8_t num_wqs_per_group; + uint8_t num_engines_per_group; + uint8_t num_groups; + uint16_t total_wqs; + uint16_t total_engines; +}; + +struct idxd_ring_control { + void *portal; + + uint16_t ring_size; + + /* + * Rings for this channel, one for descriptors and one + * for completions, share the same index. Batch descriptors + * are managed independently from data descriptors. + */ + struct idxd_hw_desc *desc; + struct idxd_comp *completions; + struct idxd_hw_desc *user_desc; + struct idxd_comp *user_completions; + + /* + * We use one bit array to track ring slots for both + * desc and completions. + */ + struct spdk_bit_array *ring_slots; + uint32_t max_ring_slots; + + /* + * We use a separate bit array to track ring slots for + * descriptors submitted via the user in a batch. + */ + struct spdk_bit_array *user_ring_slots; +}; + +struct spdk_idxd_io_channel { + struct spdk_idxd_device *idxd; + struct idxd_ring_control ring_ctrl; + TAILQ_HEAD(, idxd_batch) batch_pool; /* free batches */ + TAILQ_HEAD(, idxd_batch) batches; /* in use batches */ +}; + +struct pci_dev_id { + int vendor_id; + int device_id; +}; + +struct idxd_group { + struct spdk_idxd_device *idxd; + struct idxd_grpcfg grpcfg; + struct pci_dev_id pcidev; + int num_engines; + int num_wqs; + int id; + uint8_t tokens_allowed; + bool use_token_limit; + uint8_t tokens_reserved; + int tc_a; + int tc_b; +}; + +/* + * This struct wraps the hardware completion record which is 32 bytes in + * size and must be 32 byte aligned. 
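+ * The wrapper below pads that record out to a full 64-byte cache line,
+ * which is what its SPDK_STATIC_ASSERT checks: 32 (hw record) + 8 (cb_arg)
+ * + 8 (cb_fn) + 8 (batch) + 8 (pad2) = 64 bytes.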
+ */ +struct idxd_comp { + struct idxd_hw_comp_record hw; + void *cb_arg; + spdk_idxd_req_cb cb_fn; + struct idxd_batch *batch; + uint64_t pad2; +} __attribute__((packed)); +SPDK_STATIC_ASSERT(sizeof(struct idxd_comp) == 64, "size mismatch"); + +struct idxd_wq { + struct spdk_idxd_device *idxd; + struct idxd_group *group; + union idxd_wqcfg wqcfg; +}; + +struct spdk_idxd_device { + struct spdk_pci_device *device; + void *reg_base; + void *portals; + int socket_id; + int wq_id; + + struct idxd_registers registers; + uint32_t ims_offset; + uint32_t msix_perm_offset; + uint32_t wqcfg_offset; + uint32_t grpcfg_offset; + uint32_t perfmon_offset; + struct idxd_group *groups; + struct idxd_wq *queues; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* __IDXD_H__ */ diff --git a/src/spdk/lib/idxd/idxd_spec.h b/src/spdk/lib/idxd/idxd_spec.h new file mode 100644 index 000000000..51d52cdcc --- /dev/null +++ b/src/spdk/lib/idxd/idxd_spec.h @@ -0,0 +1,503 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file + * IDXD specification definitions + */ + +#ifndef SPDK_IDXD_SPEC_H +#define SPDK_IDXD_SPEC_H + +#include "spdk/stdinc.h" +#include "spdk/assert.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define IDXD_MMIO_BAR 0 +#define IDXD_WQ_BAR 2 +#define PORTAL_SIZE (4096 * 4) + +#define CFG_ENGINE_OFFSET 0x20 +#define CFG_FLAG_OFFSET 0x28 + +#define IDXD_CMD_SHIFT 20 + +#define IDXD_VERSION_OFFSET 0x00 +#define IDXD_GENCAP_OFFSET 0x10 +#define IDXD_WQCAP_OFFSET 0x20 +#define IDXD_GRPCAP_OFFSET 0x30 +#define IDXD_OPCAP_OFFSET 0x40 +#define IDXD_ENGCAP_OFFSET 0x38 +#define IDXD_OPCAP_OFFSET 0x40 +#define IDXD_TABLE_OFFSET 0x60 +#define IDXD_GENCFG_OFFSET 0x80 +#define IDXD_GENCTRL_OFFSET 0x88 +#define IDXD_GENSTATUS_OFFSET 0x90 +#define IDXD_INTCAUSE_OFFSET 0x98 +#define IDXD_CMD_OFFSET 0xa0 +#define IDXD_CMDSTS_OFFSET 0xa8 +#define IDXD_SWERR_OFFSET 0xc0 +#define IDXD_TABLE_OFFSET_MULT 0x100 + +#define IDXD_OPCAP_WORDS 0x4 + +#define IDXD_CLEAR_CRC_FLAGS 0xFFFFu + +#define IDXD_FLAG_FENCE (1 << 0) +#define IDXD_FLAG_COMPLETION_ADDR_VALID (1 << 2) +#define IDXD_FLAG_REQUEST_COMPLETION (1 << 3) +#define IDXD_FLAG_CACHE_CONTROL (1 << 8) + +/* + * IDXD is a family of devices, DSA is the only currently + * supported one. + */ +enum dsa_completion_status { + IDXD_COMP_NONE = 0, + IDXD_COMP_SUCCESS = 1, + IDXD_COMP_SUCCESS_PRED = 2, + IDXD_COMP_PAGE_FAULT_NOBOF = 3, + IDXD_COMP_PAGE_FAULT_IR = 4, + IDXD_COMP_BATCH_FAIL = 5, + IDXD_COMP_BATCH_PAGE_FAULT = 6, + IDXD_COMP_DR_OFFSET_NOINC = 7, + IDXD_COMP_DR_OFFSET_ERANGE = 8, + IDXD_COMP_DIF_ERR = 9, + IDXD_COMP_BAD_OPCODE = 16, + IDXD_COMP_INVALID_FLAGS = 17, + IDXD_COMP_NOZERO_RESERVE = 18, + IDXD_COMP_XFER_ERANGE = 19, + IDXD_COMP_DESC_CNT_ERANGE = 20, + IDXD_COMP_DR_ERANGE = 21, + IDXD_COMP_OVERLAP_BUFFERS = 22, + IDXD_COMP_DCAST_ERR = 23, + IDXD_COMP_DESCLIST_ALIGN = 24, + IDXD_COMP_INT_HANDLE_INVAL = 25, + IDXD_COMP_CRA_XLAT = 26, + IDXD_COMP_CRA_ALIGN = 27, + IDXD_COMP_ADDR_ALIGN = 28, + IDXD_COMP_PRIV_BAD = 29, + IDXD_COMP_TRAFFIC_CLASS_CONF = 30, + IDXD_COMP_PFAULT_RDBA = 31, + IDXD_COMP_HW_ERR1 = 32, + IDXD_COMP_HW_ERR_DRB = 33, + IDXD_COMP_TRANSLATION_FAIL = 34, +}; + +enum idxd_wq_state { + WQ_DISABLED = 0, + WQ_ENABLED = 1, +}; + +enum idxd_wq_flag { + WQ_FLAG_DEDICATED = 0, + WQ_FLAG_BOF = 1, +}; + +enum idxd_wq_type { + WQT_NONE = 0, + WQT_KERNEL = 1, + WQT_USER = 2, + WQT_MDEV = 3, +}; + +enum idxd_dev_state { + IDXD_DEVICE_STATE_DISABLED = 0, + IDXD_DEVICE_STATE_ENABLED = 1, + IDXD_DEVICE_STATE_DRAIN = 2, + IDXD_DEVICE_STATE_HALT = 3, +}; + +enum idxd_device_reset_type { + IDXD_DEVICE_RESET_SOFTWARE = 0, + IDXD_DEVICE_RESET_FLR = 1, + IDXD_DEVICE_RESET_WARM = 2, + IDXD_DEVICE_RESET_COLD = 3, +}; + +enum idxd_cmds { + IDXD_ENABLE_DEV = 1, + IDXD_DISABLE_DEV = 2, + IDXD_DRAIN_ALL = 3, + IDXD_ABORT_ALL = 4, + IDXD_RESET_DEVICE = 5, + IDXD_ENABLE_WQ = 6, + IDXD_DISABLE_WQ = 7, + IDXD_DRAIN_WQ = 8, + IDXD_ABORT_WQ = 9, + IDXD_RESET_WQ = 10, +}; + +enum idxd_cmdsts_err { + IDXD_CMDSTS_SUCCESS = 0, + IDXD_CMDSTS_INVAL_CMD = 1, + IDXD_CMDSTS_INVAL_WQIDX = 2, + IDXD_CMDSTS_HW_ERR = 3, + IDXD_CMDSTS_ERR_DEV_ENABLED = 16, + IDXD_CMDSTS_ERR_CONFIG = 17, + IDXD_CMDSTS_ERR_BUSMASTER_EN = 18, + IDXD_CMDSTS_ERR_PASID_INVAL = 19, + IDXD_CMDSTS_ERR_WQ_SIZE_ERANGE = 20, + IDXD_CMDSTS_ERR_GRP_CONFIG = 21, + IDXD_CMDSTS_ERR_GRP_CONFIG2 = 22, + IDXD_CMDSTS_ERR_GRP_CONFIG3 = 23, + IDXD_CMDSTS_ERR_GRP_CONFIG4 = 24, + IDXD_CMDSTS_ERR_DEV_NOTEN = 32, + IDXD_CMDSTS_ERR_WQ_ENABLED = 33, + IDXD_CMDSTS_ERR_WQ_SIZE = 34, + 
IDXD_CMDSTS_ERR_WQ_PRIOR = 35, + IDXD_CMDSTS_ERR_WQ_MODE = 36, + IDXD_CMDSTS_ERR_BOF_EN = 37, + IDXD_CMDSTS_ERR_PASID_EN = 38, + IDXD_CMDSTS_ERR_MAX_BATCH_SIZE = 39, + IDXD_CMDSTS_ERR_MAX_XFER_SIZE = 40, + IDXD_CMDSTS_ERR_DIS_DEV_EN = 49, + IDXD_CMDSTS_ERR_DEV_NOT_EN = 50, + IDXD_CMDSTS_ERR_INVAL_INT_IDX = 65, + IDXD_CMDSTS_ERR_NO_HANDLE = 66, +}; + +enum idxd_wq_hw_state { + IDXD_WQ_DEV_DISABLED = 0, + IDXD_WQ_DEV_ENABLED = 1, + IDXD_WQ_DEV_BUSY = 2, +}; + +struct idxd_hw_desc { + uint32_t pasid: 20; + uint32_t rsvd: 11; + uint32_t priv: 1; + uint32_t flags: 24; + uint32_t opcode: 8; + uint64_t completion_addr; + union { + uint64_t src_addr; + uint64_t readback_addr; + uint64_t pattern; + uint64_t desc_list_addr; + }; + union { + uint64_t dst_addr; + uint64_t readback_addr2; + uint64_t src2_addr; + uint64_t comp_pattern; + }; + union { + uint32_t xfer_size; + uint32_t desc_count; + }; + uint16_t int_handle; + uint16_t rsvd1; + union { + uint8_t expected_res; + struct delta { + uint64_t addr; + uint32_t max_size; + } delta; + uint32_t delta_rec_size; + uint64_t dest2; + struct crc32c { + uint32_t seed; + uint32_t rsvd; + uint64_t addr; + } crc32c; + struct dif_chk { + uint8_t src_flags; + uint8_t rsvd1; + uint8_t flags; + uint8_t rsvd2[5]; + uint32_t ref_tag_seed; + uint16_t app_tag_mask; + uint16_t app_tag_seed; + } dif_chk; + struct dif_ins { + uint8_t rsvd1; + uint8_t dest_flag; + uint8_t flags; + uint8_t rsvd2[13]; + uint32_t ref_tag_seed; + uint16_t app_tag_mask; + uint16_t app_tag_seed; + } dif_ins; + struct dif_upd { + uint8_t src_flags; + uint8_t dest_flags; + uint8_t flags; + uint8_t rsvd[5]; + uint32_t src_ref_tag_seed; + uint16_t src_app_tag_mask; + uint16_t src_app_tag_seed; + uint32_t dest_ref_tag_seed; + uint16_t dest_app_tag_mask; + uint16_t dest_app_tag_seed; + } dif_upd; + uint8_t op_specific[24]; + }; +} __attribute__((packed)); +SPDK_STATIC_ASSERT(sizeof(struct idxd_hw_desc) == 64, "size mismatch"); + +struct idxd_hw_comp_record { + volatile uint8_t status; + union { + uint8_t result; + uint8_t dif_status; + }; + uint16_t rsvd; + uint32_t bytes_completed; + uint64_t fault_addr; + union { + uint32_t delta_rec_size; + uint32_t crc32c_val; + struct { + uint32_t dif_chk_ref_tag; + uint16_t dif_chk_app_tag_mask; + uint16_t dif_chk_app_tag; + }; + struct dif_ins_comp { + uint64_t rsvd; + uint32_t ref_tag; + uint16_t app_tag_mask; + uint16_t app_tag; + } dif_ins_comp; + struct dif_upd_comp { + uint32_t src_ref_tag; + uint16_t src_app_tag_mask; + uint16_t src_app_tag; + uint32_t dest_ref_tag; + uint16_t dest_app_tag_mask; + uint16_t dest_app_tag; + } dif_upd_comp; + uint8_t op_specific[16]; + }; +} __attribute__((packed)); +SPDK_STATIC_ASSERT(sizeof(struct idxd_hw_comp_record) == 32, "size mismatch"); + +union idxd_gencap_register { + struct { + uint64_t block_on_fault: 1; + uint64_t overlap_copy: 1; + uint64_t cache_control_mem: 1; + uint64_t cache_control_cache: 1; + uint64_t rsvd: 3; + uint64_t int_handle_req: 1; + uint64_t dest_readback: 1; + uint64_t drain_readback: 1; + uint64_t rsvd2: 6; + uint64_t max_xfer_shift: 5; + uint64_t max_batch_shift: 4; + uint64_t max_ims_mult: 6; + uint64_t config_en: 1; + uint64_t max_descs_per_engine: 8; + uint64_t rsvd3: 24; + } __attribute__((packed)); + uint64_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_gencap_register) == 8, "size mismatch"); + +union idxd_wqcap_register { + struct { + uint64_t total_wq_size: 16; + uint64_t num_wqs: 8; + uint64_t rsvd: 24; + uint64_t shared_mode: 1; + uint64_t dedicated_mode: 1; + uint64_t rsvd2: 1; 
+ uint64_t priority: 1; + uint64_t occupancy: 1; + uint64_t occupancy_int: 1; + uint64_t rsvd3: 10; + } __attribute__((packed)); + uint64_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_wqcap_register) == 8, "size mismatch"); + +union idxd_groupcap_register { + struct { + uint64_t num_groups: 8; + uint64_t total_tokens: 8; + uint64_t token_en: 1; + uint64_t token_limit: 1; + uint64_t rsvd: 46; + } __attribute__((packed)); + uint64_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_groupcap_register) == 8, "size mismatch"); + +union idxd_enginecap_register { + struct { + uint64_t num_engines: 8; + uint64_t rsvd: 56; + } __attribute__((packed)); + uint64_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_enginecap_register) == 8, "size mismatch"); + +struct idxd_opcap_register { + uint64_t raw[4]; +}; +SPDK_STATIC_ASSERT(sizeof(struct idxd_opcap_register) == 32, "size mismatch"); + +struct idxd_registers { + uint32_t version; + union idxd_gencap_register gencap; + union idxd_wqcap_register wqcap; + union idxd_groupcap_register groupcap; + union idxd_enginecap_register enginecap; + struct idxd_opcap_register opcap; +}; +SPDK_STATIC_ASSERT(sizeof(struct idxd_registers) == 72, "size mismatch"); + +union idxd_offsets_register { + struct { + uint64_t grpcfg: 16; + uint64_t wqcfg: 16; + uint64_t msix_perm: 16; + uint64_t ims: 16; + uint64_t perfmon: 16; + uint64_t rsvd: 48; + } __attribute__((packed)); + uint64_t raw[2]; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_offsets_register) == 16, "size mismatch"); + +union idxd_genstatus_register { + struct { + uint32_t state: 2; + uint32_t reset_type: 2; + uint32_t rsvd: 28; + } __attribute__((packed)); + uint32_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_genstatus_register) == 4, "size mismatch"); + +union idxd_cmdsts_reg { + struct { + uint8_t err; + uint16_t result; + uint8_t rsvd: 7; + uint8_t active: 1; + } __attribute__((packed)); + uint32_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_cmdsts_reg) == 4, "size mismatch"); + +union idxd_swerr_register { + struct { + uint64_t valid: 1; + uint64_t overflow: 1; + uint64_t desc_valid: 1; + uint64_t wq_idx_valid: 1; + uint64_t batch: 1; + uint64_t fault_rw: 1; + uint64_t priv: 1; + uint64_t rsvd: 1; + uint64_t error: 8; + uint64_t wq_idx: 8; + uint64_t rsvd2: 8; + uint64_t operation: 8; + uint64_t pasid: 20; + uint64_t rsvd3: 4; + uint64_t batch_idx: 16; + uint64_t rsvd4: 16; + uint64_t invalid_flags: 32; + uint64_t fault_addr; + uint64_t rsvd5; + } __attribute__((packed)); + uint64_t raw[4]; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_swerr_register) == 32, "size mismatch"); + +union idxd_group_flags { + struct { + uint32_t tc_a: 3; + uint32_t tc_b: 3; + uint32_t rsvd: 1; + uint32_t use_token_limit: 1; + uint32_t tokens_reserved: 8; + uint32_t rsvd2: 4; + uint32_t tokens_allowed: 8; + uint32_t rsvd3: 4; + } __attribute__((packed)); + uint32_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_group_flags) == 4, "size mismatch"); + +struct idxd_grpcfg { + uint64_t wqs[4]; + uint64_t engines; + union idxd_group_flags flags; +}; +SPDK_STATIC_ASSERT(sizeof(struct idxd_grpcfg) == 48, "size mismatch"); + +union idxd_wqcfg { + struct { + uint16_t wq_size; + uint16_t rsvd; + uint16_t wq_thresh; + uint16_t rsvd1; + uint32_t mode: 1; + uint32_t bof: 1; + uint32_t rsvd2: 2; + uint32_t priority: 4; + uint32_t pasid: 20; + uint32_t pasid_en: 1; + uint32_t priv: 1; + uint32_t rsvd3: 2; + uint32_t max_xfer_shift: 5; + uint32_t max_batch_shift: 4; + uint32_t rsvd4: 23; + uint16_t occupancy_inth; + uint16_t 
occupancy_table_sel: 1; + uint16_t rsvd5: 15; + uint16_t occupancy_limit; + uint16_t occupancy_int_en: 1; + uint16_t rsvd6: 15; + uint16_t occupancy; + uint16_t occupancy_int: 1; + uint16_t rsvd7: 12; + uint16_t mode_support: 1; + uint16_t wq_state: 2; + uint32_t rsvd8; + } __attribute__((packed)); + uint32_t raw[8]; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_wqcfg) == 32, "size mismatch"); + +#ifdef __cplusplus +} +#endif + +#endif /* SPDK_IDXD_SPEC_H */ diff --git a/src/spdk/lib/idxd/spdk_idxd.map b/src/spdk/lib/idxd/spdk_idxd.map new file mode 100644 index 000000000..4bffdf209 --- /dev/null +++ b/src/spdk/lib/idxd/spdk_idxd.map @@ -0,0 +1,29 @@ +{ + global: + + # public functions + spdk_idxd_configure_chan; + spdk_idxd_reconfigure_chan; + spdk_idxd_probe; + spdk_idxd_detach; + spdk_idxd_batch_prep_copy; + spdk_idxd_batch_prep_dualcast; + spdk_idxd_batch_prep_fill; + spdk_idxd_batch_prep_crc32c; + spdk_idxd_batch_prep_compare; + spdk_idxd_batch_submit; + spdk_idxd_batch_create; + spdk_idxd_batch_cancel; + spdk_idxd_batch_get_max; + spdk_idxd_set_config; + spdk_idxd_submit_compare; + spdk_idxd_submit_crc32c; + spdk_idxd_submit_copy; + spdk_idxd_submit_dualcast; + spdk_idxd_submit_fill; + spdk_idxd_process_events; + spdk_idxd_get_channel; + spdk_idxd_put_channel; + + local: *; +}; diff --git a/src/spdk/lib/ioat/Makefile b/src/spdk/lib/ioat/Makefile new file mode 100644 index 000000000..4cada5685 --- /dev/null +++ b/src/spdk/lib/ioat/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = ioat.c +LIBNAME = ioat + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_ioat.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/ioat/ioat.c b/src/spdk/lib/ioat/ioat.c new file mode 100644 index 000000000..516fa545c --- /dev/null +++ b/src/spdk/lib/ioat/ioat.c @@ -0,0 +1,775 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "ioat_internal.h" + +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk/memory.h" + +#include "spdk_internal/log.h" + +struct ioat_driver { + pthread_mutex_t lock; + TAILQ_HEAD(, spdk_ioat_chan) attached_chans; +}; + +static struct ioat_driver g_ioat_driver = { + .lock = PTHREAD_MUTEX_INITIALIZER, + .attached_chans = TAILQ_HEAD_INITIALIZER(g_ioat_driver.attached_chans), +}; + +static uint64_t +ioat_get_chansts(struct spdk_ioat_chan *ioat) +{ + return spdk_mmio_read_8(&ioat->regs->chansts); +} + +static void +ioat_write_chancmp(struct spdk_ioat_chan *ioat, uint64_t addr) +{ + spdk_mmio_write_8(&ioat->regs->chancmp, addr); +} + +static void +ioat_write_chainaddr(struct spdk_ioat_chan *ioat, uint64_t addr) +{ + spdk_mmio_write_8(&ioat->regs->chainaddr, addr); +} + +static inline void +ioat_suspend(struct spdk_ioat_chan *ioat) +{ + ioat->regs->chancmd = SPDK_IOAT_CHANCMD_SUSPEND; +} + +static inline void +ioat_reset(struct spdk_ioat_chan *ioat) +{ + ioat->regs->chancmd = SPDK_IOAT_CHANCMD_RESET; +} + +static inline uint32_t +ioat_reset_pending(struct spdk_ioat_chan *ioat) +{ + uint8_t cmd; + + cmd = ioat->regs->chancmd; + return (cmd & SPDK_IOAT_CHANCMD_RESET) == SPDK_IOAT_CHANCMD_RESET; +} + +static int +ioat_map_pci_bar(struct spdk_ioat_chan *ioat) +{ + int regs_bar, rc; + void *addr; + uint64_t phys_addr, size; + + regs_bar = 0; + rc = spdk_pci_device_map_bar(ioat->device, regs_bar, &addr, &phys_addr, &size); + if (rc != 0 || addr == NULL) { + SPDK_ERRLOG("pci_device_map_range failed with error code %d\n", + rc); + return -1; + } + + ioat->regs = (volatile struct spdk_ioat_registers *)addr; + + return 0; +} + +static int +ioat_unmap_pci_bar(struct spdk_ioat_chan *ioat) +{ + int rc = 0; + void *addr = (void *)ioat->regs; + + if (addr) { + rc = spdk_pci_device_unmap_bar(ioat->device, 0, addr); + } + return rc; +} + + +static inline uint32_t +ioat_get_active(struct spdk_ioat_chan *ioat) +{ + return (ioat->head - 
ioat->tail) & ((1 << ioat->ring_size_order) - 1); +} + +static inline uint32_t +ioat_get_ring_space(struct spdk_ioat_chan *ioat) +{ + return (1 << ioat->ring_size_order) - ioat_get_active(ioat) - 1; +} + +static uint32_t +ioat_get_ring_index(struct spdk_ioat_chan *ioat, uint32_t index) +{ + return index & ((1 << ioat->ring_size_order) - 1); +} + +static void +ioat_get_ring_entry(struct spdk_ioat_chan *ioat, uint32_t index, + struct ioat_descriptor **desc, + union spdk_ioat_hw_desc **hw_desc) +{ + uint32_t i = ioat_get_ring_index(ioat, index); + + *desc = &ioat->ring[i]; + *hw_desc = &ioat->hw_ring[i]; +} + +static void +ioat_submit_single(struct spdk_ioat_chan *ioat) +{ + ioat->head++; +} + +void +spdk_ioat_flush(struct spdk_ioat_chan *ioat) +{ + uint32_t index = ioat_get_ring_index(ioat, ioat->head - 1); + union spdk_ioat_hw_desc *hw_desc; + + hw_desc = &ioat->hw_ring[index]; + hw_desc->dma.u.control.completion_update = 1; + ioat->regs->dmacount = (uint16_t)ioat->head; +} + +static struct ioat_descriptor * +ioat_prep_null(struct spdk_ioat_chan *ioat) +{ + struct ioat_descriptor *desc; + union spdk_ioat_hw_desc *hw_desc; + + if (ioat_get_ring_space(ioat) < 1) { + return NULL; + } + + ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc); + + hw_desc->dma.u.control_raw = 0; + hw_desc->dma.u.control.op = SPDK_IOAT_OP_COPY; + hw_desc->dma.u.control.null = 1; + + hw_desc->dma.size = 8; + hw_desc->dma.src_addr = 0; + hw_desc->dma.dest_addr = 0; + + desc->callback_fn = NULL; + desc->callback_arg = NULL; + + ioat_submit_single(ioat); + + return desc; +} + +static struct ioat_descriptor * +ioat_prep_copy(struct spdk_ioat_chan *ioat, uint64_t dst, + uint64_t src, uint32_t len) +{ + struct ioat_descriptor *desc; + union spdk_ioat_hw_desc *hw_desc; + + assert(len <= ioat->max_xfer_size); + + if (ioat_get_ring_space(ioat) < 1) { + return NULL; + } + + ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc); + + hw_desc->dma.u.control_raw = 0; + hw_desc->dma.u.control.op = SPDK_IOAT_OP_COPY; + + hw_desc->dma.size = len; + hw_desc->dma.src_addr = src; + hw_desc->dma.dest_addr = dst; + + desc->callback_fn = NULL; + desc->callback_arg = NULL; + + ioat_submit_single(ioat); + + return desc; +} + +static struct ioat_descriptor * +ioat_prep_fill(struct spdk_ioat_chan *ioat, uint64_t dst, + uint64_t fill_pattern, uint32_t len) +{ + struct ioat_descriptor *desc; + union spdk_ioat_hw_desc *hw_desc; + + assert(len <= ioat->max_xfer_size); + + if (ioat_get_ring_space(ioat) < 1) { + return NULL; + } + + ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc); + + hw_desc->fill.u.control_raw = 0; + hw_desc->fill.u.control.op = SPDK_IOAT_OP_FILL; + + hw_desc->fill.size = len; + hw_desc->fill.src_data = fill_pattern; + hw_desc->fill.dest_addr = dst; + + desc->callback_fn = NULL; + desc->callback_arg = NULL; + + ioat_submit_single(ioat); + + return desc; +} + +static int ioat_reset_hw(struct spdk_ioat_chan *ioat) +{ + int timeout; + uint64_t status; + uint32_t chanerr; + int rc; + + status = ioat_get_chansts(ioat); + if (is_ioat_active(status) || is_ioat_idle(status)) { + ioat_suspend(ioat); + } + + timeout = 20; /* in milliseconds */ + while (is_ioat_active(status) || is_ioat_idle(status)) { + spdk_delay_us(1000); + timeout--; + if (timeout == 0) { + SPDK_ERRLOG("timed out waiting for suspend\n"); + return -1; + } + status = ioat_get_chansts(ioat); + } + + /* + * Clear any outstanding errors. + * CHANERR is write-1-to-clear, so write the current CHANERR bits back to reset everything. 
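+ * Write-1-to-clear means writing back exactly the bits that were read
+ * clears them: e.g. if CHANERR reads 0x11 (two error bits set), writing
+ * 0x11 clears both, while writing 0 would clear nothing.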
+ */ + chanerr = ioat->regs->chanerr; + ioat->regs->chanerr = chanerr; + + if (ioat->regs->cbver < SPDK_IOAT_VER_3_3) { + rc = spdk_pci_device_cfg_read32(ioat->device, &chanerr, + SPDK_IOAT_PCI_CHANERR_INT_OFFSET); + if (rc) { + SPDK_ERRLOG("failed to read the internal channel error register\n"); + return -1; + } + + spdk_pci_device_cfg_write32(ioat->device, chanerr, + SPDK_IOAT_PCI_CHANERR_INT_OFFSET); + } + + ioat_reset(ioat); + + timeout = 20; + while (ioat_reset_pending(ioat)) { + spdk_delay_us(1000); + timeout--; + if (timeout == 0) { + SPDK_ERRLOG("timed out waiting for reset\n"); + return -1; + } + } + + return 0; +} + +static int +ioat_process_channel_events(struct spdk_ioat_chan *ioat) +{ + struct ioat_descriptor *desc; + uint64_t status, completed_descriptor, hw_desc_phys_addr, events_count = 0; + uint32_t tail; + + if (ioat->head == ioat->tail) { + return 0; + } + + status = *ioat->comp_update; + completed_descriptor = status & SPDK_IOAT_CHANSTS_COMPLETED_DESCRIPTOR_MASK; + + if (is_ioat_halted(status)) { + SPDK_ERRLOG("Channel halted (%x)\n", ioat->regs->chanerr); + return -1; + } + + if (completed_descriptor == ioat->last_seen) { + return 0; + } + + do { + tail = ioat_get_ring_index(ioat, ioat->tail); + desc = &ioat->ring[tail]; + + if (desc->callback_fn) { + desc->callback_fn(desc->callback_arg); + } + + hw_desc_phys_addr = desc->phys_addr; + ioat->tail++; + events_count++; + } while (hw_desc_phys_addr != completed_descriptor); + + ioat->last_seen = hw_desc_phys_addr; + + return events_count; +} + +static void +ioat_channel_destruct(struct spdk_ioat_chan *ioat) +{ + ioat_unmap_pci_bar(ioat); + + if (ioat->ring) { + free(ioat->ring); + } + + if (ioat->hw_ring) { + spdk_free(ioat->hw_ring); + } + + if (ioat->comp_update) { + spdk_free((void *)ioat->comp_update); + ioat->comp_update = NULL; + } +} + +uint32_t +spdk_ioat_get_max_descriptors(struct spdk_ioat_chan *ioat) +{ + return 1 << ioat->ring_size_order; +} + +static int +ioat_channel_start(struct spdk_ioat_chan *ioat) +{ + uint8_t xfercap, version; + uint64_t status; + int i, num_descriptors; + uint64_t comp_update_bus_addr = 0; + uint64_t phys_addr; + + if (ioat_map_pci_bar(ioat) != 0) { + SPDK_ERRLOG("ioat_map_pci_bar() failed\n"); + return -1; + } + + version = ioat->regs->cbver; + if (version < SPDK_IOAT_VER_3_0) { + SPDK_ERRLOG(" unsupported IOAT version %u.%u\n", + version >> 4, version & 0xF); + return -1; + } + + /* Always support DMA copy */ + ioat->dma_capabilities = SPDK_IOAT_ENGINE_COPY_SUPPORTED; + if (ioat->regs->dmacapability & SPDK_IOAT_DMACAP_BFILL) { + ioat->dma_capabilities |= SPDK_IOAT_ENGINE_FILL_SUPPORTED; + } + xfercap = ioat->regs->xfercap; + + /* Only bits [4:0] are valid. */ + xfercap &= 0x1f; + if (xfercap == 0) { + /* 0 means 4 GB max transfer size. */ + ioat->max_xfer_size = 1ULL << 32; + } else if (xfercap < 12) { + /* XFERCAP must be at least 12 (4 KB) according to the spec. 
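+ * XFERCAP encodes the maximum transfer size as a power of two: a value
+ * of 12 means 1 << 12 = 4 KiB, 20 means 1 MiB, and the special value 0
+ * (handled above) means a 4 GiB maximum.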
*/ + SPDK_ERRLOG("invalid XFERCAP value %u\n", xfercap); + return -1; + } else { + ioat->max_xfer_size = 1U << xfercap; + } + + ioat->comp_update = spdk_zmalloc(sizeof(*ioat->comp_update), SPDK_IOAT_CHANCMP_ALIGN, + NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (ioat->comp_update == NULL) { + return -1; + } + + comp_update_bus_addr = spdk_vtophys((void *)ioat->comp_update, NULL); + if (comp_update_bus_addr == SPDK_VTOPHYS_ERROR) { + spdk_free((void *)ioat->comp_update); + return -1; + } + + ioat->ring_size_order = IOAT_DEFAULT_ORDER; + + num_descriptors = 1 << ioat->ring_size_order; + + ioat->ring = calloc(num_descriptors, sizeof(struct ioat_descriptor)); + if (!ioat->ring) { + return -1; + } + + ioat->hw_ring = spdk_zmalloc(num_descriptors * sizeof(union spdk_ioat_hw_desc), 64, + NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!ioat->hw_ring) { + return -1; + } + + for (i = 0; i < num_descriptors; i++) { + phys_addr = spdk_vtophys(&ioat->hw_ring[i], NULL); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + SPDK_ERRLOG("Failed to translate descriptor %u to physical address\n", i); + return -1; + } + + ioat->ring[i].phys_addr = phys_addr; + ioat->hw_ring[ioat_get_ring_index(ioat, i - 1)].generic.next = phys_addr; + } + + ioat->head = 0; + ioat->tail = 0; + ioat->last_seen = 0; + + ioat_reset_hw(ioat); + + ioat->regs->chanctrl = SPDK_IOAT_CHANCTRL_ANY_ERR_ABORT_EN; + ioat_write_chancmp(ioat, comp_update_bus_addr); + ioat_write_chainaddr(ioat, ioat->ring[0].phys_addr); + + ioat_prep_null(ioat); + spdk_ioat_flush(ioat); + + i = 100; + while (i-- > 0) { + spdk_delay_us(100); + status = ioat_get_chansts(ioat); + if (is_ioat_idle(status)) { + break; + } + } + + if (is_ioat_idle(status)) { + ioat_process_channel_events(ioat); + } else { + SPDK_ERRLOG("could not start channel: status = %p\n error = %#x\n", + (void *)status, ioat->regs->chanerr); + return -1; + } + + return 0; +} + +/* Caller must hold g_ioat_driver.lock */ +static struct spdk_ioat_chan * +ioat_attach(struct spdk_pci_device *device) +{ + struct spdk_ioat_chan *ioat; + uint32_t cmd_reg; + + ioat = calloc(1, sizeof(struct spdk_ioat_chan)); + if (ioat == NULL) { + return NULL; + } + + /* Enable PCI busmaster. */ + spdk_pci_device_cfg_read32(device, &cmd_reg, 4); + cmd_reg |= 0x4; + spdk_pci_device_cfg_write32(device, cmd_reg, 4); + + ioat->device = device; + + if (ioat_channel_start(ioat) != 0) { + ioat_channel_destruct(ioat); + free(ioat); + return NULL; + } + + return ioat; +} + +struct ioat_enum_ctx { + spdk_ioat_probe_cb probe_cb; + spdk_ioat_attach_cb attach_cb; + void *cb_ctx; +}; + +/* This function must only be called while holding g_ioat_driver.lock */ +static int +ioat_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) +{ + struct ioat_enum_ctx *enum_ctx = ctx; + struct spdk_ioat_chan *ioat; + + /* Verify that this device is not already attached */ + TAILQ_FOREACH(ioat, &g_ioat_driver.attached_chans, tailq) { + /* + * NOTE: This assumes that the PCI abstraction layer will use the same device handle + * across enumerations; we could compare by BDF instead if this is not true. + */ + if (pci_dev == ioat->device) { + return 0; + } + } + + if (enum_ctx->probe_cb(enum_ctx->cb_ctx, pci_dev)) { + /* + * Since I/OAT init is relatively quick, just perform the full init during probing. + * If this turns out to be a bottleneck later, this can be changed to work like + * NVMe with a list of devices to initialize in parallel. 
+ */ + ioat = ioat_attach(pci_dev); + if (ioat == NULL) { + SPDK_ERRLOG("ioat_attach() failed\n"); + return -1; + } + + TAILQ_INSERT_TAIL(&g_ioat_driver.attached_chans, ioat, tailq); + + enum_ctx->attach_cb(enum_ctx->cb_ctx, pci_dev, ioat); + } + + return 0; +} + +int +spdk_ioat_probe(void *cb_ctx, spdk_ioat_probe_cb probe_cb, spdk_ioat_attach_cb attach_cb) +{ + int rc; + struct ioat_enum_ctx enum_ctx; + + pthread_mutex_lock(&g_ioat_driver.lock); + + enum_ctx.probe_cb = probe_cb; + enum_ctx.attach_cb = attach_cb; + enum_ctx.cb_ctx = cb_ctx; + + rc = spdk_pci_enumerate(spdk_pci_ioat_get_driver(), ioat_enum_cb, &enum_ctx); + + pthread_mutex_unlock(&g_ioat_driver.lock); + + return rc; +} + +void +spdk_ioat_detach(struct spdk_ioat_chan *ioat) +{ + struct ioat_driver *driver = &g_ioat_driver; + + /* ioat should be in the free list (not registered to a thread) + * when calling ioat_detach(). + */ + pthread_mutex_lock(&driver->lock); + TAILQ_REMOVE(&driver->attached_chans, ioat, tailq); + pthread_mutex_unlock(&driver->lock); + + ioat_channel_destruct(ioat); + free(ioat); +} + +int +spdk_ioat_build_copy(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, const void *src, uint64_t nbytes) +{ + struct ioat_descriptor *last_desc; + uint64_t remaining, op_size; + uint64_t vdst, vsrc; + uint64_t vdst_page, vsrc_page; + uint64_t pdst_page, psrc_page; + uint32_t orig_head; + + if (!ioat) { + return -EINVAL; + } + + orig_head = ioat->head; + + vdst = (uint64_t)dst; + vsrc = (uint64_t)src; + vdst_page = vsrc_page = 0; + pdst_page = psrc_page = SPDK_VTOPHYS_ERROR; + + remaining = nbytes; + while (remaining) { + if (_2MB_PAGE(vsrc) != vsrc_page) { + vsrc_page = _2MB_PAGE(vsrc); + psrc_page = spdk_vtophys((void *)vsrc_page, NULL); + } + + if (_2MB_PAGE(vdst) != vdst_page) { + vdst_page = _2MB_PAGE(vdst); + pdst_page = spdk_vtophys((void *)vdst_page, NULL); + } + op_size = remaining; + op_size = spdk_min(op_size, (VALUE_2MB - _2MB_OFFSET(vsrc))); + op_size = spdk_min(op_size, (VALUE_2MB - _2MB_OFFSET(vdst))); + op_size = spdk_min(op_size, ioat->max_xfer_size); + remaining -= op_size; + + last_desc = ioat_prep_copy(ioat, + pdst_page + _2MB_OFFSET(vdst), + psrc_page + _2MB_OFFSET(vsrc), + op_size); + + if (remaining == 0 || last_desc == NULL) { + break; + } + + vsrc += op_size; + vdst += op_size; + + } + /* Issue null descriptor for null transfer */ + if (nbytes == 0) { + last_desc = ioat_prep_null(ioat); + } + + if (last_desc) { + last_desc->callback_fn = cb_fn; + last_desc->callback_arg = cb_arg; + } else { + /* + * Ran out of descriptors in the ring - reset head to leave things as they were + * in case we managed to fill out any descriptors. 
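+ * Callers that want to queue several copies can call
+ * spdk_ioat_build_copy() repeatedly and ring the doorbell once with
+ * spdk_ioat_flush(); spdk_ioat_submit_copy() below is the build-then-flush
+ * convenience for a single operation. Hypothetical batching sketch:
+ *
+ *   for (i = 0; i < n; i++) {
+ *           spdk_ioat_build_copy(chan, ctx[i], done_cb, dst[i], src[i], len[i]);
+ *   }
+ *   spdk_ioat_flush(chan);
+ *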
+ */ + ioat->head = orig_head; + return -ENOMEM; + } + + return 0; +} + +int +spdk_ioat_submit_copy(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, const void *src, uint64_t nbytes) +{ + int rc; + + rc = spdk_ioat_build_copy(ioat, cb_arg, cb_fn, dst, src, nbytes); + if (rc != 0) { + return rc; + } + + spdk_ioat_flush(ioat); + return 0; +} + +int +spdk_ioat_build_fill(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, uint64_t fill_pattern, uint64_t nbytes) +{ + struct ioat_descriptor *last_desc = NULL; + uint64_t remaining, op_size; + uint64_t vdst; + uint32_t orig_head; + + if (!ioat) { + return -EINVAL; + } + + if (!(ioat->dma_capabilities & SPDK_IOAT_ENGINE_FILL_SUPPORTED)) { + SPDK_ERRLOG("Channel does not support memory fill\n"); + return -1; + } + + orig_head = ioat->head; + + vdst = (uint64_t)dst; + remaining = nbytes; + + while (remaining) { + op_size = remaining; + op_size = spdk_min(op_size, (VALUE_2MB - _2MB_OFFSET(vdst))); + op_size = spdk_min(op_size, ioat->max_xfer_size); + remaining -= op_size; + + last_desc = ioat_prep_fill(ioat, + spdk_vtophys((void *)vdst, NULL), + fill_pattern, + op_size); + + if (remaining == 0 || last_desc == NULL) { + break; + } + + vdst += op_size; + } + + if (last_desc) { + last_desc->callback_fn = cb_fn; + last_desc->callback_arg = cb_arg; + } else { + /* + * Ran out of descriptors in the ring - reset head to leave things as they were + * in case we managed to fill out any descriptors. + */ + ioat->head = orig_head; + return -ENOMEM; + } + + return 0; +} + +int +spdk_ioat_submit_fill(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, uint64_t fill_pattern, uint64_t nbytes) +{ + int rc; + + rc = spdk_ioat_build_fill(ioat, cb_arg, cb_fn, dst, fill_pattern, nbytes); + if (rc != 0) { + return rc; + } + + spdk_ioat_flush(ioat); + return 0; +} + +uint32_t +spdk_ioat_get_dma_capabilities(struct spdk_ioat_chan *ioat) +{ + if (!ioat) { + return 0; + } + return ioat->dma_capabilities; +} + +int +spdk_ioat_process_events(struct spdk_ioat_chan *ioat) +{ + return ioat_process_channel_events(ioat); +} + +SPDK_LOG_REGISTER_COMPONENT("ioat", SPDK_LOG_IOAT) diff --git a/src/spdk/lib/ioat/ioat_internal.h b/src/spdk/lib/ioat/ioat_internal.h new file mode 100644 index 000000000..19593bb00 --- /dev/null +++ b/src/spdk/lib/ioat/ioat_internal.h @@ -0,0 +1,100 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __IOAT_INTERNAL_H__ +#define __IOAT_INTERNAL_H__ + +#include "spdk/stdinc.h" + +#include "spdk/ioat.h" +#include "spdk/ioat_spec.h" +#include "spdk/queue.h" +#include "spdk/mmio.h" + +/* Allocate 1 << 15 (32K) descriptors per channel by default. */ +#define IOAT_DEFAULT_ORDER 15 + +struct ioat_descriptor { + uint64_t phys_addr; + spdk_ioat_req_cb callback_fn; + void *callback_arg; +}; + +/* One of these per allocated PCI device. */ +struct spdk_ioat_chan { + /* Opaque handle to upper layer */ + struct spdk_pci_device *device; + uint64_t max_xfer_size; + volatile struct spdk_ioat_registers *regs; + + volatile uint64_t *comp_update; + + uint32_t head; + uint32_t tail; + + uint32_t ring_size_order; + uint64_t last_seen; + + struct ioat_descriptor *ring; + union spdk_ioat_hw_desc *hw_ring; + uint32_t dma_capabilities; + + /* tailq entry for attached_chans */ + TAILQ_ENTRY(spdk_ioat_chan) tailq; +}; + +static inline uint32_t +is_ioat_active(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_ACTIVE; +} + +static inline uint32_t +is_ioat_idle(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_IDLE; +} + +static inline uint32_t +is_ioat_halted(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_HALTED; +} + +static inline uint32_t +is_ioat_suspended(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_SUSPENDED; +} + +#endif /* __IOAT_INTERNAL_H__ */ diff --git a/src/spdk/lib/ioat/spdk_ioat.map b/src/spdk/lib/ioat/spdk_ioat.map new file mode 100644 index 000000000..f467da817 --- /dev/null +++ b/src/spdk/lib/ioat/spdk_ioat.map @@ -0,0 +1,17 @@ +{ + global: + + # public functions + spdk_ioat_probe; + spdk_ioat_detach; + spdk_ioat_build_copy; + spdk_ioat_submit_copy; + spdk_ioat_build_fill; + spdk_ioat_submit_fill; + spdk_ioat_flush; + spdk_ioat_process_events; + spdk_ioat_get_dma_capabilities; + spdk_ioat_get_max_descriptors; + + local: *; +}; diff --git a/src/spdk/lib/iscsi/Makefile b/src/spdk/lib/iscsi/Makefile new file mode 100644 index 000000000..2c663d880 --- /dev/null +++ b/src/spdk/lib/iscsi/Makefile @@ -0,0 +1,50 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib +C_SRCS = conn.c \ + init_grp.c iscsi.c md5.c param.c portal_grp.c \ + tgt_node.c iscsi_subsystem.c \ + iscsi_rpc.c task.c +LIBNAME = iscsi +LOCAL_SYS_LIBS = -lcrypto + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_iscsi.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/iscsi/conn.c b/src/spdk/lib/iscsi/conn.c new file mode 100644 index 000000000..4c7a54fcf --- /dev/null +++ b/src/spdk/lib/iscsi/conn.c @@ -0,0 +1,1714 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/event.h" +#include "spdk/likely.h" +#include "spdk/thread.h" +#include "spdk/queue.h" +#include "spdk/trace.h" +#include "spdk/net.h" +#include "spdk/sock.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "iscsi/task.h" +#include "iscsi/conn.h" +#include "iscsi/tgt_node.h" +#include "iscsi/portal_grp.h" + +#define MAKE_DIGEST_WORD(BUF, CRC32C) \ + ( ((*((uint8_t *)(BUF)+0)) = (uint8_t)((uint32_t)(CRC32C) >> 0)), \ + ((*((uint8_t *)(BUF)+1)) = (uint8_t)((uint32_t)(CRC32C) >> 8)), \ + ((*((uint8_t *)(BUF)+2)) = (uint8_t)((uint32_t)(CRC32C) >> 16)), \ + ((*((uint8_t *)(BUF)+3)) = (uint8_t)((uint32_t)(CRC32C) >> 24))) + +#define SPDK_ISCSI_CONNECTION_MEMSET(conn) \ + memset(&(conn)->portal, 0, sizeof(*(conn)) - \ + offsetof(struct spdk_iscsi_conn, portal)); + +struct spdk_iscsi_conn *g_conns_array = MAP_FAILED; +static int g_conns_array_fd = -1; +static char g_shm_name[64]; + +static TAILQ_HEAD(, spdk_iscsi_conn) g_free_conns = TAILQ_HEAD_INITIALIZER(g_free_conns); +static TAILQ_HEAD(, spdk_iscsi_conn) g_active_conns = TAILQ_HEAD_INITIALIZER(g_active_conns); + +static pthread_mutex_t g_conns_mutex = PTHREAD_MUTEX_INITIALIZER; + +static struct spdk_poller *g_shutdown_timer = NULL; + +static void iscsi_conn_sock_cb(void *arg, struct spdk_sock_group *group, + struct spdk_sock *sock); + +static struct spdk_iscsi_conn * +allocate_conn(void) +{ + struct spdk_iscsi_conn *conn; + + pthread_mutex_lock(&g_conns_mutex); + conn = TAILQ_FIRST(&g_free_conns); + if (conn != NULL) { + assert(!conn->is_valid); + TAILQ_REMOVE(&g_free_conns, conn, conn_link); + SPDK_ISCSI_CONNECTION_MEMSET(conn); + conn->is_valid = 1; + + TAILQ_INSERT_TAIL(&g_active_conns, conn, conn_link); + } + pthread_mutex_unlock(&g_conns_mutex); + + return conn; +} + +static void +_free_conn(struct spdk_iscsi_conn *conn) +{ + TAILQ_REMOVE(&g_active_conns, conn, conn_link); + + memset(conn->portal_host, 0, sizeof(conn->portal_host)); + memset(conn->portal_port, 0, sizeof(conn->portal_port)); + conn->is_valid = 0; + + TAILQ_INSERT_TAIL(&g_free_conns, conn, conn_link); +} + +static void +free_conn(struct spdk_iscsi_conn *conn) +{ + pthread_mutex_lock(&g_conns_mutex); + _free_conn(conn); + pthread_mutex_unlock(&g_conns_mutex); +} + +static void +_iscsi_conns_cleanup(void) +{ + if (g_conns_array != MAP_FAILED) { + munmap(g_conns_array, sizeof(struct spdk_iscsi_conn) * + MAX_ISCSI_CONNECTIONS); + g_conns_array = MAP_FAILED; + } + + if (g_conns_array_fd >= 0) { + close(g_conns_array_fd); + g_conns_array_fd = -1; + shm_unlink(g_shm_name); + } +} + +int initialize_iscsi_conns(void) +{ + size_t conns_size = sizeof(struct spdk_iscsi_conn) * MAX_ISCSI_CONNECTIONS; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_init\n"); + + snprintf(g_shm_name, sizeof(g_shm_name), "/spdk_iscsi_conns.%d", spdk_app_get_shm_id()); + g_conns_array_fd = shm_open(g_shm_name, O_RDWR | O_CREAT, 0600); + if (g_conns_array_fd < 0) { + SPDK_ERRLOG("could not shm_open %s\n", g_shm_name); + goto err; + } + + if (ftruncate(g_conns_array_fd, conns_size) != 0) { + SPDK_ERRLOG("could not ftruncate\n"); + goto err; + } + g_conns_array = mmap(0, conns_size, PROT_READ | PROT_WRITE, MAP_SHARED, + g_conns_array_fd, 0); + + if (g_conns_array == MAP_FAILED) { + SPDK_ERRLOG("could not mmap cons array file %s (%d)\n", g_shm_name, errno); + goto err; + } + + memset(g_conns_array, 0, conns_size); + + for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { + 
g_conns_array[i].id = i; + TAILQ_INSERT_TAIL(&g_free_conns, &g_conns_array[i], conn_link); + } + + return 0; + +err: + _iscsi_conns_cleanup(); + + return -1; +} + +static void +iscsi_poll_group_add_conn(struct spdk_iscsi_poll_group *pg, struct spdk_iscsi_conn *conn) +{ + int rc; + + rc = spdk_sock_group_add_sock(pg->sock_group, conn->sock, iscsi_conn_sock_cb, conn); + if (rc < 0) { + SPDK_ERRLOG("Failed to add sock=%p of conn=%p\n", conn->sock, conn); + return; + } + + conn->is_stopped = false; + STAILQ_INSERT_TAIL(&pg->connections, conn, pg_link); +} + +static void +iscsi_poll_group_remove_conn(struct spdk_iscsi_poll_group *pg, struct spdk_iscsi_conn *conn) +{ + int rc; + + assert(conn->sock != NULL); + rc = spdk_sock_group_remove_sock(pg->sock_group, conn->sock); + if (rc < 0) { + SPDK_ERRLOG("Failed to remove sock=%p of conn=%p\n", conn->sock, conn); + } + + conn->is_stopped = true; + STAILQ_REMOVE(&pg->connections, conn, spdk_iscsi_conn, pg_link); +} + +static void +iscsi_conn_start(void *ctx) +{ + struct spdk_iscsi_conn *conn = ctx; + + iscsi_poll_group_add_conn(conn->pg, conn); +} + +int +iscsi_conn_construct(struct spdk_iscsi_portal *portal, + struct spdk_sock *sock) +{ + struct spdk_iscsi_poll_group *pg; + struct spdk_iscsi_conn *conn; + int i, rc; + + conn = allocate_conn(); + if (conn == NULL) { + SPDK_ERRLOG("Could not allocate connection.\n"); + return -1; + } + + pthread_mutex_lock(&g_iscsi.mutex); + conn->timeout = g_iscsi.timeout * spdk_get_ticks_hz(); /* seconds to TSC */ + conn->nopininterval = g_iscsi.nopininterval; + conn->nopininterval *= spdk_get_ticks_hz(); /* seconds to TSC */ + conn->nop_outstanding = false; + conn->data_out_cnt = 0; + conn->data_in_cnt = 0; + conn->disable_chap = portal->group->disable_chap; + conn->require_chap = portal->group->require_chap; + conn->mutual_chap = portal->group->mutual_chap; + conn->chap_group = portal->group->chap_group; + pthread_mutex_unlock(&g_iscsi.mutex); + conn->MaxRecvDataSegmentLength = 8192; /* RFC3720(12.12) */ + + conn->portal = portal; + conn->pg_tag = portal->group->tag; + memcpy(conn->portal_host, portal->host, strlen(portal->host)); + memcpy(conn->portal_port, portal->port, strlen(portal->port)); + conn->sock = sock; + + conn->state = ISCSI_CONN_STATE_INVALID; + conn->login_phase = ISCSI_SECURITY_NEGOTIATION_PHASE; + conn->ttt = 0; + + conn->partial_text_parameter = NULL; + + for (i = 0; i < MAX_CONNECTION_PARAMS; i++) { + conn->conn_param_state_negotiated[i] = false; + } + + for (i = 0; i < MAX_SESSION_PARAMS; i++) { + conn->sess_param_state_negotiated[i] = false; + } + + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY; + + TAILQ_INIT(&conn->write_pdu_list); + TAILQ_INIT(&conn->snack_pdu_list); + TAILQ_INIT(&conn->queued_r2t_tasks); + TAILQ_INIT(&conn->active_r2t_tasks); + TAILQ_INIT(&conn->queued_datain_tasks); + memset(&conn->luns, 0, sizeof(conn->luns)); + + rc = spdk_sock_getaddr(sock, conn->target_addr, sizeof conn->target_addr, NULL, + conn->initiator_addr, sizeof conn->initiator_addr, NULL); + if (rc < 0) { + SPDK_ERRLOG("spdk_sock_getaddr() failed\n"); + goto error_return; + } + + /* set low water mark */ + rc = spdk_sock_set_recvlowat(conn->sock, 1); + if (rc != 0) { + SPDK_ERRLOG("spdk_sock_set_recvlowat() failed\n"); + goto error_return; + } + + /* set default params */ + rc = iscsi_conn_params_init(&conn->params); + if (rc < 0) { + SPDK_ERRLOG("iscsi_conn_params_init() failed\n"); + goto error_return; + } + conn->logout_request_timer = NULL; + conn->logout_timer = NULL; + 
conn->shutdown_timer = NULL; + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Launching connection on acceptor thread\n"); + conn->pending_task_cnt = 0; + + /* Get the first poll group. */ + pg = TAILQ_FIRST(&g_iscsi.poll_group_head); + if (pg == NULL) { + SPDK_ERRLOG("There is no poll group.\n"); + assert(false); + goto error_return; + } + + conn->pg = pg; + spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(pg)), + iscsi_conn_start, conn); + return 0; + +error_return: + iscsi_param_free(conn->params); + free_conn(conn); + return -1; +} + +void +iscsi_conn_free_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + iscsi_conn_xfer_complete_cb cb_fn; + void *cb_arg; + + cb_fn = pdu->cb_fn; + cb_arg = pdu->cb_arg; + + assert(cb_fn != NULL); + pdu->cb_fn = NULL; + + if (pdu->task) { + iscsi_task_put(pdu->task); + } + iscsi_put_pdu(pdu); + + cb_fn(cb_arg); +} + +static int +iscsi_conn_free_tasks(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *pdu, *tmp_pdu; + struct spdk_iscsi_task *iscsi_task, *tmp_iscsi_task; + + TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, tmp_pdu) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + iscsi_conn_free_pdu(conn, pdu); + } + + TAILQ_FOREACH_SAFE(iscsi_task, &conn->queued_datain_tasks, link, tmp_iscsi_task) { + if (!iscsi_task->is_queued) { + TAILQ_REMOVE(&conn->queued_datain_tasks, iscsi_task, link); + iscsi_task_put(iscsi_task); + } + } + + /* We have to parse conn->write_pdu_list in the end. In iscsi_conn_free_pdu(), + * iscsi_conn_handle_queued_datain_tasks() may be called, and + * iscsi_conn_handle_queued_datain_tasks() will parse conn->queued_datain_tasks + * and may stack some PDUs to conn->write_pdu_list. Hence when we come here, we + * have to ensure there is no associated task in conn->queued_datain_tasks. 
+ */ + TAILQ_FOREACH_SAFE(pdu, &conn->write_pdu_list, tailq, tmp_pdu) { + TAILQ_REMOVE(&conn->write_pdu_list, pdu, tailq); + iscsi_conn_free_pdu(conn, pdu); + } + + if (conn->pending_task_cnt) { + return -1; + } + + return 0; +} + +static void +iscsi_conn_cleanup_backend(struct spdk_iscsi_conn *conn) +{ + int rc; + struct spdk_iscsi_tgt_node *target; + + if (conn->sess->connections > 1) { + /* connection specific cleanup */ + } else if (!g_iscsi.AllowDuplicateIsid) { + /* clean up all tasks to all LUNs for session */ + target = conn->sess->target; + if (target != NULL) { + rc = iscsi_tgt_node_cleanup_luns(conn, target); + if (rc < 0) { + SPDK_ERRLOG("target abort failed\n"); + } + } + } +} + +static void +iscsi_conn_free(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_sess *sess; + int idx; + uint32_t i; + + pthread_mutex_lock(&g_conns_mutex); + + if (conn->sess == NULL) { + goto end; + } + + idx = -1; + sess = conn->sess; + conn->sess = NULL; + + for (i = 0; i < sess->connections; i++) { + if (sess->conns[i] == conn) { + idx = i; + break; + } + } + + if (idx < 0) { + SPDK_ERRLOG("remove conn not found\n"); + } else { + for (i = idx; i < sess->connections - 1; i++) { + sess->conns[i] = sess->conns[i + 1]; + } + sess->conns[sess->connections - 1] = NULL; + sess->connections--; + + if (sess->connections == 0) { + /* cleanup last connection */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "cleanup last conn free sess\n"); + iscsi_free_sess(sess); + } + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Terminating connections(tsih %d): %d\n", + sess->tsih, sess->connections); + +end: + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "cleanup free conn\n"); + iscsi_param_free(conn->params); + _free_conn(conn); + + pthread_mutex_unlock(&g_conns_mutex); +} + +static void +iscsi_conn_close_lun(struct spdk_iscsi_conn *conn, int lun_id) +{ + struct spdk_iscsi_lun *iscsi_lun; + + iscsi_lun = conn->luns[lun_id]; + if (iscsi_lun == NULL) { + return; + } + + spdk_scsi_lun_free_io_channel(iscsi_lun->desc); + spdk_scsi_lun_close(iscsi_lun->desc); + spdk_poller_unregister(&iscsi_lun->remove_poller); + free(iscsi_lun); + + conn->luns[lun_id] = NULL; +} + +static void +iscsi_conn_close_luns(struct spdk_iscsi_conn *conn) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + iscsi_conn_close_lun(conn, i); + } +} + +static bool +iscsi_conn_check_tasks_for_lun(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun) +{ + struct spdk_iscsi_pdu *pdu, *tmp_pdu; + struct spdk_iscsi_task *task; + + assert(lun != NULL); + + /* We can remove deferred PDUs safely because they are already flushed. */ + TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, tmp_pdu) { + if (lun == pdu->task->scsi.lun) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + iscsi_conn_free_pdu(conn, pdu); + } + } + + TAILQ_FOREACH(task, &conn->queued_datain_tasks, link) { + if (lun == task->scsi.lun) { + return false; + } + } + + /* This check loop works even when connection exits in the middle of LUN hotplug + * because all PDUs in write_pdu_list are removed in iscsi_conn_free_tasks(). 
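+	 * A PDU that still references this LUN means its I/O has not completed
+	 * yet, so the LUN cannot be closed on this poll; the remove poller will
+	 * check again.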
+ */ + TAILQ_FOREACH(pdu, &conn->write_pdu_list, tailq) { + if (pdu->task && lun == pdu->task->scsi.lun) { + return false; + } + } + + return true; +} + +static int +iscsi_conn_remove_lun(void *ctx) +{ + struct spdk_iscsi_lun *iscsi_lun = ctx; + struct spdk_iscsi_conn *conn = iscsi_lun->conn; + struct spdk_scsi_lun *lun = iscsi_lun->lun; + int lun_id = spdk_scsi_lun_get_id(lun); + + if (!iscsi_conn_check_tasks_for_lun(conn, lun)) { + return SPDK_POLLER_BUSY; + } + iscsi_conn_close_lun(conn, lun_id); + return SPDK_POLLER_BUSY; +} + +static void +_iscsi_conn_hotremove_lun(void *ctx) +{ + struct spdk_iscsi_lun *iscsi_lun = ctx; + struct spdk_iscsi_conn *conn = iscsi_lun->conn; + struct spdk_scsi_lun *lun = iscsi_lun->lun; + + assert(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)) == + spdk_get_thread()); + + /* If a connection is already in stating status, just return */ + if (conn->state >= ISCSI_CONN_STATE_EXITING) { + return; + } + + iscsi_clear_all_transfer_task(conn, lun, NULL); + + iscsi_lun->remove_poller = SPDK_POLLER_REGISTER(iscsi_conn_remove_lun, iscsi_lun, + 1000); +} + +static void +iscsi_conn_hotremove_lun(struct spdk_scsi_lun *lun, void *remove_ctx) +{ + struct spdk_iscsi_conn *conn = remove_ctx; + int lun_id = spdk_scsi_lun_get_id(lun); + struct spdk_iscsi_lun *iscsi_lun; + + iscsi_lun = conn->luns[lun_id]; + if (iscsi_lun == NULL) { + SPDK_ERRLOG("LUN hotplug was notified to the unallocated LUN %d.\n", lun_id); + return; + } + + spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)), + _iscsi_conn_hotremove_lun, iscsi_lun); +} + +static int +iscsi_conn_open_lun(struct spdk_iscsi_conn *conn, int lun_id, + struct spdk_scsi_lun *lun) +{ + int rc; + struct spdk_iscsi_lun *iscsi_lun; + + iscsi_lun = calloc(1, sizeof(*iscsi_lun)); + if (iscsi_lun == NULL) { + return -ENOMEM; + } + + iscsi_lun->conn = conn; + iscsi_lun->lun = lun; + + rc = spdk_scsi_lun_open(lun, iscsi_conn_hotremove_lun, conn, &iscsi_lun->desc); + if (rc != 0) { + free(iscsi_lun); + return rc; + } + + rc = spdk_scsi_lun_allocate_io_channel(iscsi_lun->desc); + if (rc != 0) { + spdk_scsi_lun_close(iscsi_lun->desc); + free(iscsi_lun); + return rc; + } + + conn->luns[lun_id] = iscsi_lun; + + return 0; +} + +static void +iscsi_conn_open_luns(struct spdk_iscsi_conn *conn) +{ + int i, rc; + struct spdk_scsi_lun *lun; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + lun = spdk_scsi_dev_get_lun(conn->dev, i); + if (lun == NULL) { + continue; + } + + rc = iscsi_conn_open_lun(conn, i, lun); + if (rc != 0) { + goto error; + } + } + + return; + +error: + iscsi_conn_close_luns(conn); +} + +/** + * This function will stop executing the specified connection. 
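+ * It must be called on the connection's poll group thread, and only after
+ * the connection has reached the EXITED state with no data-in or data-out
+ * operations outstanding (see the asserts below).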
+ */ +static void +iscsi_conn_stop(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_tgt_node *target; + + assert(conn->state == ISCSI_CONN_STATE_EXITED); + assert(conn->data_in_cnt == 0); + assert(conn->data_out_cnt == 0); + + if (conn->sess != NULL && + conn->sess->session_type == SESSION_TYPE_NORMAL && + conn->full_feature) { + target = conn->sess->target; + pthread_mutex_lock(&target->mutex); + target->num_active_conns--; + pthread_mutex_unlock(&target->mutex); + + iscsi_conn_close_luns(conn); + } + + assert(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)) == + spdk_get_thread()); +} + +static int +_iscsi_conn_check_shutdown(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + int rc; + + rc = iscsi_conn_free_tasks(conn); + if (rc < 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&conn->shutdown_timer); + + iscsi_conn_stop(conn); + iscsi_conn_free(conn); + + return SPDK_POLLER_BUSY; +} + +static void +_iscsi_conn_destruct(struct spdk_iscsi_conn *conn) +{ + int rc; + + iscsi_poll_group_remove_conn(conn->pg, conn); + spdk_sock_close(&conn->sock); + iscsi_clear_all_transfer_task(conn, NULL, NULL); + spdk_poller_unregister(&conn->logout_request_timer); + spdk_poller_unregister(&conn->logout_timer); + + rc = iscsi_conn_free_tasks(conn); + if (rc < 0) { + /* The connection cannot be freed yet. Check back later. */ + conn->shutdown_timer = SPDK_POLLER_REGISTER(_iscsi_conn_check_shutdown, conn, 1000); + } else { + iscsi_conn_stop(conn); + iscsi_conn_free(conn); + } +} + +static int +_iscsi_conn_check_pending_tasks(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->dev != NULL && + spdk_scsi_dev_has_pending_tasks(conn->dev, conn->initiator_port)) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&conn->shutdown_timer); + + _iscsi_conn_destruct(conn); + + return SPDK_POLLER_BUSY; +} + +void +iscsi_conn_destruct(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *pdu; + struct spdk_iscsi_task *task; + int opcode; + + /* If a connection is already in exited status, just return */ + if (conn->state >= ISCSI_CONN_STATE_EXITED) { + return; + } + + conn->state = ISCSI_CONN_STATE_EXITED; + + /* + * Each connection pre-allocates its next PDU - make sure these get + * freed here. + */ + pdu = conn->pdu_in_progress; + if (pdu) { + /* remove the task left in the PDU too. 
*/ + task = pdu->task; + if (task) { + opcode = pdu->bhs.opcode; + switch (opcode) { + case ISCSI_OP_SCSI: + case ISCSI_OP_SCSI_DATAOUT: + spdk_scsi_task_process_abort(&task->scsi); + iscsi_task_cpl(&task->scsi); + break; + default: + SPDK_ERRLOG("unexpected opcode %x\n", opcode); + iscsi_task_put(task); + break; + } + } + iscsi_put_pdu(pdu); + conn->pdu_in_progress = NULL; + } + + if (conn->sess != NULL && conn->pending_task_cnt > 0) { + iscsi_conn_cleanup_backend(conn); + } + + if (conn->dev != NULL && + spdk_scsi_dev_has_pending_tasks(conn->dev, conn->initiator_port)) { + conn->shutdown_timer = SPDK_POLLER_REGISTER(_iscsi_conn_check_pending_tasks, conn, 1000); + } else { + _iscsi_conn_destruct(conn); + } +} + +int +iscsi_get_active_conns(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_conn *conn; + int num = 0; + + if (g_conns_array == MAP_FAILED) { + return 0; + } + + pthread_mutex_lock(&g_conns_mutex); + TAILQ_FOREACH(conn, &g_active_conns, conn_link) { + if (target == NULL || conn->target == target) { + num++; + } + } + pthread_mutex_unlock(&g_conns_mutex); + return num; +} + +static void +iscsi_conn_check_shutdown_cb(void *arg1) +{ + _iscsi_conns_cleanup(); + shutdown_iscsi_conns_done(); +} + +static int +iscsi_conn_check_shutdown(void *arg) +{ + if (iscsi_get_active_conns(NULL) != 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&g_shutdown_timer); + + spdk_thread_send_msg(spdk_get_thread(), iscsi_conn_check_shutdown_cb, NULL); + + return SPDK_POLLER_BUSY; +} + +static void +iscsi_send_logout_request(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_async *rsph; + + rsp_pdu = iscsi_get_pdu(conn); + assert(rsp_pdu != NULL); + + rsph = (struct iscsi_bhs_async *)&rsp_pdu->bhs; + rsp_pdu->data = NULL; + + rsph->opcode = ISCSI_OP_ASYNC; + to_be32(&rsph->ffffffff, 0xFFFFFFFF); + rsph->async_event = 1; + to_be16(&rsph->param3, ISCSI_LOGOUT_REQUEST_TIMEOUT); + + to_be32(&rsph->stat_sn, conn->StatSN); + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); +} + +static int +logout_request_timeout(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } + + return SPDK_POLLER_BUSY; +} + +/* If the connection is running and logout is not requested yet, request logout + * to initiator and wait for the logout process to start. + */ +static void +_iscsi_conn_request_logout(void *ctx) +{ + struct spdk_iscsi_conn *conn = ctx; + + if (conn->state > ISCSI_CONN_STATE_RUNNING || + conn->logout_request_timer != NULL) { + return; + } + + iscsi_send_logout_request(conn); + + conn->logout_request_timer = SPDK_POLLER_REGISTER(logout_request_timeout, + conn, ISCSI_LOGOUT_REQUEST_TIMEOUT * 1000000); +} + +static void +iscsi_conn_request_logout(struct spdk_iscsi_conn *conn) +{ + struct spdk_thread *thread; + + if (conn->state == ISCSI_CONN_STATE_INVALID) { + /* Move it to EXITING state if the connection is in login. 
*/ + conn->state = ISCSI_CONN_STATE_EXITING; + } else if (conn->state == ISCSI_CONN_STATE_RUNNING && + conn->logout_request_timer == NULL) { + thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)); + spdk_thread_send_msg(thread, _iscsi_conn_request_logout, conn); + } +} + +void +iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_conn *conn; + + if (g_conns_array == MAP_FAILED) { + return; + } + + pthread_mutex_lock(&g_conns_mutex); + TAILQ_FOREACH(conn, &g_active_conns, conn_link) { + if (target == NULL || conn->target == target) { + iscsi_conn_request_logout(conn); + } + } + pthread_mutex_unlock(&g_conns_mutex); +} + +void +shutdown_iscsi_conns(void) +{ + iscsi_conns_request_logout(NULL); + + g_shutdown_timer = SPDK_POLLER_REGISTER(iscsi_conn_check_shutdown, NULL, 1000); +} + +/* Do not set conn->state if the connection has already started exiting. + * This ensures we do not move a connection from EXITED state back to EXITING. + */ +static void +_iscsi_conn_drop(void *ctx) +{ + struct spdk_iscsi_conn *conn = ctx; + + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } +} + +int +iscsi_drop_conns(struct spdk_iscsi_conn *conn, const char *conn_match, + int drop_all) +{ + struct spdk_iscsi_conn *xconn; + const char *xconn_match; + struct spdk_thread *thread; + int num; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_drop_conns\n"); + + num = 0; + pthread_mutex_lock(&g_conns_mutex); + if (g_conns_array == MAP_FAILED) { + goto exit; + } + + TAILQ_FOREACH(xconn, &g_active_conns, conn_link) { + if (xconn == conn) { + continue; + } + + if (!drop_all && xconn->initiator_port == NULL) { + continue; + } + + xconn_match = + drop_all ? xconn->initiator_name : spdk_scsi_port_get_name(xconn->initiator_port); + + if (!strcasecmp(conn_match, xconn_match) && + conn->target == xconn->target) { + + if (num == 0) { + /* + * Only print this message before we report the + * first dropped connection. + */ + SPDK_ERRLOG("drop old connections %s by %s\n", + conn->target->name, conn_match); + } + + SPDK_ERRLOG("exiting conn by %s (%s)\n", + xconn_match, xconn->initiator_addr); + if (xconn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "TSIH=%u\n", xconn->sess->tsih); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "TSIH=xx\n"); + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CID=%u\n", xconn->cid); + + thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(xconn->pg)); + spdk_thread_send_msg(thread, _iscsi_conn_drop, xconn); + + num++; + } + } + +exit: + pthread_mutex_unlock(&g_conns_mutex); + + if (num != 0) { + SPDK_ERRLOG("exiting %d conns\n", num); + } + + return 0; +} + +static int +_iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_task *subtask; + uint32_t remaining_size; + + if (conn->data_in_cnt >= MAX_LARGE_DATAIN_PER_CONNECTION) { + return -1; + } + + assert(task->current_datain_offset <= task->scsi.transfer_len); + /* Stop split and abort read I/O for remaining data. 
*/ + if (task->current_datain_offset < task->scsi.transfer_len) { + remaining_size = task->scsi.transfer_len - task->current_datain_offset; + subtask = iscsi_task_get(conn, task, iscsi_task_cpl); + assert(subtask != NULL); + subtask->scsi.offset = task->current_datain_offset; + subtask->scsi.length = remaining_size; + spdk_scsi_task_set_data(&subtask->scsi, NULL, 0); + task->current_datain_offset += subtask->scsi.length; + + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_abort(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + } + + /* Remove the primary task from the list because all subtasks are submitted + * or aborted. + */ + assert(task->current_datain_offset == task->scsi.transfer_len); + TAILQ_REMOVE(&conn->queued_datain_tasks, task, link); + return 0; +} + +int +iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn, + uint32_t ref_task_tag) +{ + struct spdk_iscsi_task *task; + + TAILQ_FOREACH(task, &conn->queued_datain_tasks, link) { + if (task->tag == ref_task_tag) { + return _iscsi_conn_abort_queued_datain_task(conn, task); + } + } + + return 0; +} + +int +iscsi_conn_abort_queued_datain_tasks(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun, + struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task, *task_tmp; + struct spdk_iscsi_pdu *pdu_tmp; + int rc; + + TAILQ_FOREACH_SAFE(task, &conn->queued_datain_tasks, link, task_tmp) { + pdu_tmp = iscsi_task_get_pdu(task); + if ((lun == NULL || lun == task->scsi.lun) && + (pdu == NULL || (spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn)))) { + rc = _iscsi_conn_abort_queued_datain_task(conn, task); + if (rc != 0) { + return rc; + } + } + } + + return 0; +} + +int +iscsi_conn_handle_queued_datain_tasks(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_task *task; + + while (!TAILQ_EMPTY(&conn->queued_datain_tasks) && + conn->data_in_cnt < MAX_LARGE_DATAIN_PER_CONNECTION) { + task = TAILQ_FIRST(&conn->queued_datain_tasks); + assert(task->current_datain_offset <= task->scsi.transfer_len); + if (task->current_datain_offset < task->scsi.transfer_len) { + struct spdk_iscsi_task *subtask; + uint32_t remaining_size = 0; + + remaining_size = task->scsi.transfer_len - task->current_datain_offset; + subtask = iscsi_task_get(conn, task, iscsi_task_cpl); + assert(subtask != NULL); + subtask->scsi.offset = task->current_datain_offset; + spdk_scsi_task_set_data(&subtask->scsi, NULL, 0); + + if (spdk_scsi_dev_get_lun(conn->dev, task->lun_id) == NULL) { + /* Stop submitting split read I/Os for remaining data. 
*/ + TAILQ_REMOVE(&conn->queued_datain_tasks, task, link); + task->current_datain_offset += remaining_size; + assert(task->current_datain_offset == task->scsi.transfer_len); + subtask->scsi.transfer_len = remaining_size; + spdk_scsi_task_process_null_lun(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + return 0; + } + + subtask->scsi.length = spdk_min(SPDK_BDEV_LARGE_BUF_MAX_SIZE, remaining_size); + task->current_datain_offset += subtask->scsi.length; + iscsi_queue_task(conn, subtask); + } + if (task->current_datain_offset == task->scsi.transfer_len) { + TAILQ_REMOVE(&conn->queued_datain_tasks, task, link); + } + } + return 0; +} + +void +iscsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_iscsi_task *task = iscsi_task_from_scsi_task(scsi_task); + + iscsi_task_mgmt_response(task->conn, task); + iscsi_task_put(task); +} + +static void +iscsi_task_copy_to_rsp_scsi_status(struct spdk_iscsi_task *primary, + struct spdk_scsi_task *task) +{ + memcpy(primary->rsp_sense_data, task->sense_data, task->sense_data_len); + primary->rsp_sense_data_len = task->sense_data_len; + primary->rsp_scsi_status = task->status; +} + +static void +iscsi_task_copy_from_rsp_scsi_status(struct spdk_scsi_task *task, + struct spdk_iscsi_task *primary) +{ + memcpy(task->sense_data, primary->rsp_sense_data, + primary->rsp_sense_data_len); + task->sense_data_len = primary->rsp_sense_data_len; + task->status = primary->rsp_scsi_status; +} + +static void +process_completed_read_subtask_list(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *primary) +{ + struct spdk_iscsi_task *subtask, *tmp; + + TAILQ_FOREACH_SAFE(subtask, &primary->subtask_list, subtask_link, tmp) { + if (subtask->scsi.offset == primary->bytes_completed) { + TAILQ_REMOVE(&primary->subtask_list, subtask, subtask_link); + primary->bytes_completed += subtask->scsi.length; + iscsi_task_response(conn, subtask); + iscsi_task_put(subtask); + } else { + break; + } + } + + if (primary->bytes_completed == primary->scsi.transfer_len) { + iscsi_task_put(primary); + } +} + +static void +process_read_task_completion(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_task *primary) +{ + struct spdk_iscsi_task *tmp; + + /* If the status of the completed subtask is the first failure, + * copy it to out-of-order subtasks and remember it as the status + * of the command, + * + * Even if the status of the completed task is success, + * there are any failed subtask ever, copy the first failed status + * to it. 
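+	 *
+	 * In other words, the first non-GOOD status seen for the command is
+	 * recorded in the primary task and propagated to every subtask, whether
+	 * that subtask completed before or after the failure.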
+ */ + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + if (primary->rsp_scsi_status == SPDK_SCSI_STATUS_GOOD) { + TAILQ_FOREACH(tmp, &primary->subtask_list, subtask_link) { + spdk_scsi_task_copy_status(&tmp->scsi, &task->scsi); + } + iscsi_task_copy_to_rsp_scsi_status(primary, &task->scsi); + } + } else if (primary->rsp_scsi_status != SPDK_SCSI_STATUS_GOOD) { + iscsi_task_copy_from_rsp_scsi_status(&task->scsi, primary); + } + + if (task == primary) { + primary->bytes_completed = task->scsi.length; + /* For non split read I/O */ + assert(primary->bytes_completed == task->scsi.transfer_len); + iscsi_task_response(conn, task); + iscsi_task_put(task); + } else { + if (task->scsi.offset != primary->bytes_completed) { + TAILQ_FOREACH(tmp, &primary->subtask_list, subtask_link) { + if (task->scsi.offset < tmp->scsi.offset) { + TAILQ_INSERT_BEFORE(tmp, task, subtask_link); + return; + } + } + + TAILQ_INSERT_TAIL(&primary->subtask_list, task, subtask_link); + } else { + TAILQ_INSERT_HEAD(&primary->subtask_list, task, subtask_link); + process_completed_read_subtask_list(conn, primary); + } + } +} + +static void +process_non_read_task_completion(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_task *primary) +{ + primary->bytes_completed += task->scsi.length; + + /* If the status of the subtask is the first failure, remember it as + * the status of the command and set it to the status of the primary + * task later. + * + * If the first failed task is the primary, two copies can be avoided + * but code simplicity is prioritized. + */ + if (task->scsi.status == SPDK_SCSI_STATUS_GOOD) { + if (task != primary) { + primary->scsi.data_transferred += task->scsi.data_transferred; + } + } else if (primary->rsp_scsi_status == SPDK_SCSI_STATUS_GOOD) { + iscsi_task_copy_to_rsp_scsi_status(primary, &task->scsi); + } + + if (primary->bytes_completed == primary->scsi.transfer_len) { + /* + * Check if this is the last task completed for an iSCSI write + * that required child subtasks. If task != primary, we know + * for sure that it was part of an iSCSI write with child subtasks. + * The trickier case is when the last task completed was the initial + * task - in this case the task will have a smaller length than + * the overall transfer length. + */ + if (task != primary || task->scsi.length != task->scsi.transfer_len) { + /* If LUN is removed in the middle of the iSCSI write sequence, + * primary might complete the write to the initiator because it is not + * ensured that the initiator will send all data requested by R2Ts. + * + * We check it and skip the following if primary is completed. (see + * iscsi_clear_all_transfer_task() in iscsi.c.) 
+ */ + if (primary->is_r2t_active) { + if (primary->rsp_scsi_status != SPDK_SCSI_STATUS_GOOD) { + iscsi_task_copy_from_rsp_scsi_status(&primary->scsi, primary); + } + iscsi_task_response(conn, primary); + iscsi_del_transfer_task(conn, primary->tag); + } + } else { + iscsi_task_response(conn, task); + } + } + iscsi_task_put(task); +} + +void +iscsi_task_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_iscsi_task *primary; + struct spdk_iscsi_task *task = iscsi_task_from_scsi_task(scsi_task); + struct spdk_iscsi_conn *conn = task->conn; + struct spdk_iscsi_pdu *pdu = task->pdu; + + spdk_trace_record(TRACE_ISCSI_TASK_DONE, conn->id, 0, (uintptr_t)task, 0); + + task->is_queued = false; + primary = iscsi_task_get_primary(task); + + if (iscsi_task_is_read(primary)) { + process_read_task_completion(conn, task, primary); + } else { + process_non_read_task_completion(conn, task, primary); + } + if (!task->parent) { + spdk_trace_record(TRACE_ISCSI_PDU_COMPLETED, 0, 0, (uintptr_t)pdu, 0); + } +} + +static void +iscsi_conn_send_nopin(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_nop_in *rsp; + /* Only send nopin if we have logged in and are in a normal session. */ + if (conn->sess == NULL || + !conn->full_feature || + !iscsi_param_eq_val(conn->sess->params, "SessionType", "Normal")) { + return; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "send NOPIN isid=%"PRIx64", tsih=%u, cid=%u\n", + conn->sess->isid, conn->sess->tsih, conn->cid); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + rsp_pdu = iscsi_get_pdu(conn); + rsp = (struct iscsi_bhs_nop_in *) &rsp_pdu->bhs; + rsp_pdu->data = NULL; + /* + * iscsi_get_pdu() memset's the PDU for us, so only fill out the needed + * fields. + */ + rsp->opcode = ISCSI_OP_NOPIN; + rsp->flags = 0x80; + /* + * Technically the to_be32() is not needed here, since + * to_be32(0xFFFFFFFU) returns 0xFFFFFFFFU. + */ + to_be32(&rsp->itt, 0xFFFFFFFFU); + to_be32(&rsp->ttt, conn->id); + to_be32(&rsp->stat_sn, conn->StatSN); + to_be32(&rsp->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsp->max_cmd_sn, conn->sess->MaxCmdSN); + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + conn->last_nopin = spdk_get_ticks(); + conn->nop_outstanding = true; +} + +void +iscsi_conn_handle_nop(struct spdk_iscsi_conn *conn) +{ + uint64_t tsc; + + /** + * This function will be executed by nop_poller of iSCSI polling group, so + * we need to check the connection state first, then do the nop interval + * expiration check work. + */ + if ((conn->state == ISCSI_CONN_STATE_EXITED) || + (conn->state == ISCSI_CONN_STATE_EXITING)) { + return; + } + + /* Check for nop interval expiration */ + tsc = spdk_get_ticks(); + if (conn->nop_outstanding) { + if ((tsc - conn->last_nopin) > conn->timeout) { + SPDK_ERRLOG("Timed out waiting for NOP-Out response from initiator\n"); + SPDK_ERRLOG(" tsc=0x%lx, last_nopin=0x%lx\n", tsc, conn->last_nopin); + SPDK_ERRLOG(" initiator=%s, target=%s\n", conn->initiator_name, + conn->target_short_name); + conn->state = ISCSI_CONN_STATE_EXITING; + } + } else if (tsc - conn->last_nopin > conn->nopininterval) { + iscsi_conn_send_nopin(conn); + } +} + +/** + * \brief Reads data for the specified iSCSI connection from its TCP socket. + * + * The TCP socket is marked as non-blocking, so this function may not read + * all data requested. 
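+ * A return value of 0 means no data was available (for example EAGAIN);
+ * the caller should simply retry once the socket becomes readable again.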
+ * + * Returns SPDK_ISCSI_CONNECTION_FATAL if the recv() operation indicates a fatal + * error with the TCP connection (including if the TCP connection was closed + * unexpectedly. + * + * Otherwise returns the number of bytes successfully read. + */ +int +iscsi_conn_read_data(struct spdk_iscsi_conn *conn, int bytes, + void *buf) +{ + int ret; + + if (bytes == 0) { + return 0; + } + + ret = spdk_sock_recv(conn->sock, buf, bytes); + + if (ret > 0) { + spdk_trace_record(TRACE_ISCSI_READ_FROM_SOCKET_DONE, conn->id, ret, 0, 0); + return ret; + } + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } + + /* For connect reset issue, do not output error log */ + if (errno == ECONNRESET) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_sock_recv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } else { + SPDK_ERRLOG("spdk_sock_recv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } + } + + /* connection closed */ + return SPDK_ISCSI_CONNECTION_FATAL; +} + +int +iscsi_conn_readv_data(struct spdk_iscsi_conn *conn, + struct iovec *iov, int iovcnt) +{ + int ret; + + if (iov == NULL || iovcnt == 0) { + return 0; + } + + if (iovcnt == 1) { + return iscsi_conn_read_data(conn, iov[0].iov_len, + iov[0].iov_base); + } + + ret = spdk_sock_readv(conn->sock, iov, iovcnt); + + if (ret > 0) { + spdk_trace_record(TRACE_ISCSI_READ_FROM_SOCKET_DONE, conn->id, ret, 0, 0); + return ret; + } + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } + + /* For connect reset issue, do not output error log */ + if (errno == ECONNRESET) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_sock_readv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } else { + SPDK_ERRLOG("spdk_sock_readv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } + } + + /* connection closed */ + return SPDK_ISCSI_CONNECTION_FATAL; +} + +static bool +iscsi_is_free_pdu_deferred(struct spdk_iscsi_pdu *pdu) +{ + if (pdu == NULL) { + return false; + } + + if (pdu->bhs.opcode == ISCSI_OP_R2T || + pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + return true; + } + + return false; +} + +static int +iscsi_dif_verify(struct spdk_iscsi_pdu *pdu, struct spdk_dif_ctx *dif_ctx) +{ + struct iovec iov; + struct spdk_dif_error err_blk = {}; + uint32_t num_blocks; + int rc; + + iov.iov_base = pdu->data; + iov.iov_len = pdu->data_buf_len; + num_blocks = pdu->data_buf_len / dif_ctx->block_size; + + rc = spdk_dif_verify(&iov, 1, num_blocks, dif_ctx, &err_blk); + if (rc != 0) { + SPDK_ERRLOG("DIF error detected. 
type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + } + + return rc; +} + +static void +_iscsi_conn_pdu_write_done(void *cb_arg, int err) +{ + struct spdk_iscsi_pdu *pdu = cb_arg; + struct spdk_iscsi_conn *conn = pdu->conn; + + assert(conn != NULL); + + if (spdk_unlikely(conn->state >= ISCSI_CONN_STATE_EXITING)) { + /* The other policy will recycle the resource */ + return; + } + + TAILQ_REMOVE(&conn->write_pdu_list, pdu, tailq); + + if (err != 0) { + conn->state = ISCSI_CONN_STATE_EXITING; + } else { + spdk_trace_record(TRACE_ISCSI_FLUSH_WRITEBUF_DONE, conn->id, pdu->mapped_length, (uintptr_t)pdu, 0); + } + + if ((conn->full_feature) && + (conn->sess->ErrorRecoveryLevel >= 1) && + iscsi_is_free_pdu_deferred(pdu)) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "stat_sn=%d\n", + from_be32(&pdu->bhs.stat_sn)); + TAILQ_INSERT_TAIL(&conn->snack_pdu_list, pdu, + tailq); + } else { + iscsi_conn_free_pdu(conn, pdu); + } +} + +void +iscsi_conn_pdu_generic_complete(void *cb_arg) +{ +} + +void +iscsi_conn_write_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu, + iscsi_conn_xfer_complete_cb cb_fn, + void *cb_arg) +{ + uint32_t crc32c; + ssize_t rc; + + if (spdk_unlikely(pdu->dif_insert_or_strip)) { + rc = iscsi_dif_verify(pdu, &pdu->dif_ctx); + if (rc != 0) { + iscsi_conn_free_pdu(conn, pdu); + conn->state = ISCSI_CONN_STATE_EXITING; + return; + } + } + + if (pdu->bhs.opcode != ISCSI_OP_LOGIN_RSP) { + /* Header Digest */ + if (conn->header_digest) { + crc32c = iscsi_pdu_calc_header_digest(pdu); + MAKE_DIGEST_WORD(pdu->header_digest, crc32c); + } + + /* Data Digest */ + if (conn->data_digest && DGET24(pdu->bhs.data_segment_len) != 0) { + crc32c = iscsi_pdu_calc_data_digest(pdu); + MAKE_DIGEST_WORD(pdu->data_digest, crc32c); + } + } + + pdu->cb_fn = cb_fn; + pdu->cb_arg = cb_arg; + TAILQ_INSERT_TAIL(&conn->write_pdu_list, pdu, tailq); + + if (spdk_unlikely(conn->state >= ISCSI_CONN_STATE_EXITING)) { + return; + } + pdu->sock_req.iovcnt = iscsi_build_iovs(conn, pdu->iov, SPDK_COUNTOF(pdu->iov), pdu, + &pdu->mapped_length); + pdu->sock_req.cb_fn = _iscsi_conn_pdu_write_done; + pdu->sock_req.cb_arg = pdu; + + spdk_trace_record(TRACE_ISCSI_FLUSH_WRITEBUF_START, conn->id, pdu->mapped_length, (uintptr_t)pdu, + pdu->sock_req.iovcnt); + spdk_sock_writev_async(conn->sock, &pdu->sock_req); +} + +static void +iscsi_conn_sock_cb(void *arg, struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_iscsi_conn *conn = arg; + int rc; + + assert(conn != NULL); + + if ((conn->state == ISCSI_CONN_STATE_EXITED) || + (conn->state == ISCSI_CONN_STATE_EXITING)) { + return; + } + + /* Handle incoming PDUs */ + rc = iscsi_handle_incoming_pdus(conn); + if (rc < 0) { + conn->state = ISCSI_CONN_STATE_EXITING; + } +} + +static void +iscsi_conn_full_feature_migrate(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->state >= ISCSI_CONN_STATE_EXITING) { + /* Connection is being exited before this callback is executed. */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connection is already exited.\n"); + return; + } + + if (conn->sess->session_type == SESSION_TYPE_NORMAL) { + iscsi_conn_open_luns(conn); + } + + /* Add this connection to the assigned poll group. 
*/ + iscsi_poll_group_add_conn(conn->pg, conn); +} + +static struct spdk_iscsi_poll_group *g_next_pg = NULL; + +void +iscsi_conn_schedule(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_poll_group *pg; + struct spdk_iscsi_tgt_node *target; + + if (conn->sess->session_type != SESSION_TYPE_NORMAL) { + /* Leave all non-normal sessions on the acceptor + * thread. */ + return; + } + pthread_mutex_lock(&g_iscsi.mutex); + + target = conn->sess->target; + pthread_mutex_lock(&target->mutex); + target->num_active_conns++; + if (target->num_active_conns == 1) { + /** + * This is the only active connection for this target node. + * Pick a poll group using round-robin. + */ + if (g_next_pg == NULL) { + g_next_pg = TAILQ_FIRST(&g_iscsi.poll_group_head); + assert(g_next_pg != NULL); + } + + pg = g_next_pg; + g_next_pg = TAILQ_NEXT(g_next_pg, link); + + /* Save the pg in the target node so it can be used for any other connections to this target node. */ + target->pg = pg; + } else { + /** + * There are other active connections for this target node. + */ + pg = target->pg; + } + + pthread_mutex_unlock(&target->mutex); + pthread_mutex_unlock(&g_iscsi.mutex); + + assert(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)) == + spdk_get_thread()); + + /* Remove this connection from the previous poll group */ + iscsi_poll_group_remove_conn(conn->pg, conn); + + conn->last_nopin = spdk_get_ticks(); + conn->pg = pg; + + spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(pg)), + iscsi_conn_full_feature_migrate, conn); +} + +static int +logout_timeout(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } + + return SPDK_POLLER_BUSY; +} + +void +iscsi_conn_logout(struct spdk_iscsi_conn *conn) +{ + conn->is_logged_out = true; + conn->logout_timer = SPDK_POLLER_REGISTER(logout_timeout, conn, ISCSI_LOGOUT_TIMEOUT * 1000000); +} + +SPDK_TRACE_REGISTER_FN(iscsi_conn_trace, "iscsi_conn", TRACE_GROUP_ISCSI) +{ + spdk_trace_register_owner(OWNER_ISCSI_CONN, 'c'); + spdk_trace_register_object(OBJECT_ISCSI_PDU, 'p'); + spdk_trace_register_description("ISCSI_READ_DONE", TRACE_ISCSI_READ_FROM_SOCKET_DONE, + OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("ISCSI_WRITE_START", TRACE_ISCSI_FLUSH_WRITEBUF_START, + OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, "iovec: "); + spdk_trace_register_description("ISCSI_WRITE_DONE", TRACE_ISCSI_FLUSH_WRITEBUF_DONE, + OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("ISCSI_READ_PDU", TRACE_ISCSI_READ_PDU, + OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 1, 0, "opc: "); + spdk_trace_register_description("ISCSI_TASK_DONE", TRACE_ISCSI_TASK_DONE, + OWNER_ISCSI_CONN, OBJECT_SCSI_TASK, 0, 0, ""); + spdk_trace_register_description("ISCSI_TASK_QUEUE", TRACE_ISCSI_TASK_QUEUE, + OWNER_ISCSI_CONN, OBJECT_SCSI_TASK, 1, 1, "pdu: "); + spdk_trace_register_description("ISCSI_TASK_EXECUTED", TRACE_ISCSI_TASK_EXECUTED, + OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 0, 0, ""); + spdk_trace_register_description("ISCSI_PDU_COMPLETED", TRACE_ISCSI_PDU_COMPLETED, + OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 0, 0, ""); +} + +void +iscsi_conn_info_json(struct spdk_json_write_ctx *w, struct spdk_iscsi_conn *conn) +{ + uint16_t tsih; + + if (!conn->is_valid) { + return; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "id", conn->id); + + spdk_json_write_named_int32(w, "cid", conn->cid); + + /* + * If we try to return data for a connection 
that has not + * logged in yet, the session will not be set. So in this + * case, return -1 for the tsih rather than segfaulting + * on the null conn->sess. + */ + if (conn->sess == NULL) { + tsih = -1; + } else { + tsih = conn->sess->tsih; + } + spdk_json_write_named_int32(w, "tsih", tsih); + + spdk_json_write_named_string(w, "initiator_addr", conn->initiator_addr); + + spdk_json_write_named_string(w, "target_addr", conn->target_addr); + + spdk_json_write_named_string(w, "target_node_name", conn->target_short_name); + + spdk_json_write_named_string(w, "thread_name", + spdk_thread_get_name(spdk_get_thread())); + + spdk_json_write_object_end(w); +} diff --git a/src/spdk/lib/iscsi/conn.h b/src/spdk/lib/iscsi/conn.h new file mode 100644 index 000000000..a85d2ddeb --- /dev/null +++ b/src/spdk/lib/iscsi/conn.h @@ -0,0 +1,237 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_ISCSI_CONN_H +#define SPDK_ISCSI_CONN_H + +#include "spdk/stdinc.h" + +#include "iscsi/iscsi.h" +#include "spdk/queue.h" +#include "spdk/cpuset.h" +#include "spdk/scsi.h" + +/* + * MAX_CONNECTION_PARAMS: The numbers of the params in conn_param_table + * MAX_SESSION_PARAMS: The numbers of the params in sess_param_table + */ +#define MAX_CONNECTION_PARAMS 14 +#define MAX_SESSION_PARAMS 19 + +#define MAX_ADDRBUF 64 +#define MAX_INITIATOR_ADDR (MAX_ADDRBUF) +#define MAX_TARGET_ADDR (MAX_ADDRBUF) + +#define OWNER_ISCSI_CONN 0x1 + +#define OBJECT_ISCSI_PDU 0x1 + +#define TRACE_GROUP_ISCSI 0x1 +#define TRACE_ISCSI_READ_FROM_SOCKET_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x0) +#define TRACE_ISCSI_FLUSH_WRITEBUF_START SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x1) +#define TRACE_ISCSI_FLUSH_WRITEBUF_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x2) +#define TRACE_ISCSI_READ_PDU SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x3) +#define TRACE_ISCSI_TASK_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x4) +#define TRACE_ISCSI_TASK_QUEUE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x5) +#define TRACE_ISCSI_TASK_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x6) +#define TRACE_ISCSI_PDU_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x7) + +enum iscsi_pdu_recv_state { + /* Ready to wait for PDU */ + ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY, + + /* Active connection waiting for any PDU header */ + ISCSI_PDU_RECV_STATE_AWAIT_PDU_HDR, + + /* Active connection waiting for payload */ + ISCSI_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD, + + /* Active connection does not wait for payload */ + ISCSI_PDU_RECV_STATE_ERROR, +}; + +struct spdk_poller; +struct spdk_iscsi_conn; + +struct spdk_iscsi_lun { + struct spdk_iscsi_conn *conn; + struct spdk_scsi_lun *lun; + struct spdk_scsi_lun_desc *desc; + struct spdk_poller *remove_poller; +}; + +struct spdk_iscsi_conn { + int id; + int is_valid; + /* + * All fields below this point are reinitialized each time the + * connection object is allocated. Make sure to update the + * SPDK_ISCSI_CONNECTION_MEMSET() macro if changing which fields + * are initialized when allocated. + */ + struct spdk_iscsi_portal *portal; + int pg_tag; + char portal_host[MAX_PORTAL_ADDR + 1]; + char portal_port[MAX_PORTAL_ADDR + 1]; + struct spdk_iscsi_poll_group *pg; + struct spdk_sock *sock; + struct spdk_iscsi_sess *sess; + + enum iscsi_connection_state state; + int login_phase; + bool is_logged_out; + struct spdk_iscsi_pdu *login_rsp_pdu; + + uint64_t last_flush; + uint64_t last_fill; + uint64_t last_nopin; + + /* Timer used to destroy connection after requesting logout if + * initiator does not send logout request. + */ + struct spdk_poller *logout_request_timer; + + /* Timer used to destroy connection after logout if initiator does + * not close the connection. 
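+	 * Registered in iscsi_conn_logout(); logout_timeout() forces the
+	 * connection into the EXITING state after ISCSI_LOGOUT_TIMEOUT seconds.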
+ */ + struct spdk_poller *logout_timer; + + /* Timer used to wait for connection to close + */ + struct spdk_poller *shutdown_timer; + + struct spdk_iscsi_pdu *pdu_in_progress; + enum iscsi_pdu_recv_state pdu_recv_state; + + TAILQ_HEAD(, spdk_iscsi_pdu) write_pdu_list; + TAILQ_HEAD(, spdk_iscsi_pdu) snack_pdu_list; + + int pending_r2t; + + uint16_t cid; + + /* IP address */ + char initiator_addr[MAX_INITIATOR_ADDR]; + char target_addr[MAX_TARGET_ADDR]; + + /* Initiator/Target port binds */ + char initiator_name[MAX_INITIATOR_NAME]; + struct spdk_scsi_port *initiator_port; + char target_short_name[MAX_TARGET_NAME]; + struct spdk_scsi_port *target_port; + struct spdk_iscsi_tgt_node *target; + struct spdk_scsi_dev *dev; + + /* for fast access */ + int header_digest; + int data_digest; + int full_feature; + + struct iscsi_param *params; + bool sess_param_state_negotiated[MAX_SESSION_PARAMS]; + bool conn_param_state_negotiated[MAX_CONNECTION_PARAMS]; + struct iscsi_chap_auth auth; + bool authenticated; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + uint32_t pending_task_cnt; + uint32_t data_out_cnt; + uint32_t data_in_cnt; + + uint64_t timeout; + uint64_t nopininterval; + bool nop_outstanding; + + /* + * This is the maximum data segment length that iscsi target can send + * to the initiator on this connection. Not to be confused with the + * maximum data segment length that initiators can send to iscsi target, which + * is statically defined as SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH. + */ + int MaxRecvDataSegmentLength; + + uint32_t StatSN; + uint32_t exp_statsn; + uint32_t ttt; /* target transfer tag */ + char *partial_text_parameter; + + STAILQ_ENTRY(spdk_iscsi_conn) pg_link; + bool is_stopped; /* Set true when connection is stopped for migration */ + TAILQ_HEAD(queued_r2t_tasks, spdk_iscsi_task) queued_r2t_tasks; + TAILQ_HEAD(active_r2t_tasks, spdk_iscsi_task) active_r2t_tasks; + TAILQ_HEAD(queued_datain_tasks, spdk_iscsi_task) queued_datain_tasks; + + struct spdk_iscsi_lun *luns[SPDK_SCSI_DEV_MAX_LUN]; + + TAILQ_ENTRY(spdk_iscsi_conn) conn_link; +}; + +extern struct spdk_iscsi_conn *g_conns_array; + +void iscsi_task_cpl(struct spdk_scsi_task *scsi_task); +void iscsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task); + +int initialize_iscsi_conns(void); +void shutdown_iscsi_conns(void); +void iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target); +int iscsi_get_active_conns(struct spdk_iscsi_tgt_node *target); + +int iscsi_conn_construct(struct spdk_iscsi_portal *portal, struct spdk_sock *sock); +void iscsi_conn_destruct(struct spdk_iscsi_conn *conn); +void iscsi_conn_handle_nop(struct spdk_iscsi_conn *conn); +void iscsi_conn_schedule(struct spdk_iscsi_conn *conn); +void iscsi_conn_logout(struct spdk_iscsi_conn *conn); +int iscsi_drop_conns(struct spdk_iscsi_conn *conn, + const char *conn_match, int drop_all); +int iscsi_conn_handle_queued_datain_tasks(struct spdk_iscsi_conn *conn); +int iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn, + uint32_t ref_task_tag); +int iscsi_conn_abort_queued_datain_tasks(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun, + struct spdk_iscsi_pdu *pdu); + +int iscsi_conn_read_data(struct spdk_iscsi_conn *conn, int len, void *buf); +int iscsi_conn_readv_data(struct spdk_iscsi_conn *conn, + struct iovec *iov, int iovcnt); +void iscsi_conn_write_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu, + iscsi_conn_xfer_complete_cb cb_fn, + void *cb_arg); + +void iscsi_conn_free_pdu(struct 
spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu); + +void iscsi_conn_info_json(struct spdk_json_write_ctx *w, struct spdk_iscsi_conn *conn); +void iscsi_conn_pdu_generic_complete(void *cb_arg); +#endif /* SPDK_ISCSI_CONN_H */ diff --git a/src/spdk/lib/iscsi/init_grp.c b/src/spdk/lib/iscsi/init_grp.c new file mode 100644 index 000000000..49e78d89d --- /dev/null +++ b/src/spdk/lib/iscsi/init_grp.c @@ -0,0 +1,787 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "iscsi/iscsi.h" +#include "iscsi/init_grp.h" + +static struct spdk_iscsi_init_grp * +iscsi_init_grp_create(int tag) +{ + struct spdk_iscsi_init_grp *ig; + + ig = calloc(1, sizeof(*ig)); + if (ig == NULL) { + SPDK_ERRLOG("calloc() failed for initiator group\n"); + return NULL; + } + + ig->tag = tag; + TAILQ_INIT(&ig->initiator_head); + TAILQ_INIT(&ig->netmask_head); + return ig; +} + +static struct spdk_iscsi_initiator_name * +iscsi_init_grp_find_initiator(struct spdk_iscsi_init_grp *ig, char *name) +{ + struct spdk_iscsi_initiator_name *iname; + + TAILQ_FOREACH(iname, &ig->initiator_head, tailq) { + if (!strcmp(iname->name, name)) { + return iname; + } + } + return NULL; +} + +static int +iscsi_init_grp_add_initiator(struct spdk_iscsi_init_grp *ig, char *name) +{ + struct spdk_iscsi_initiator_name *iname; + char *p; + size_t len; + + if (ig->ninitiators >= MAX_INITIATOR) { + SPDK_ERRLOG("> MAX_INITIATOR(=%d) is not allowed\n", MAX_INITIATOR); + return -EPERM; + } + + len = strlen(name); + if (len > MAX_INITIATOR_NAME) { + SPDK_ERRLOG("Initiator Name is larger than 223 bytes\n"); + return -EINVAL; + } + + iname = iscsi_init_grp_find_initiator(ig, name); + if (iname != NULL) { + return -EEXIST; + } + + iname = calloc(1, sizeof(*iname)); + if (iname == NULL) { + SPDK_ERRLOG("malloc() failed for initiator name str\n"); + return -ENOMEM; + } + + memcpy(iname->name, name, len); + + /* Replace "ALL" by "ANY" if set */ + p = strstr(iname->name, "ALL"); + if (p != NULL) { + SPDK_WARNLOG("Please use \"%s\" instead of \"%s\"\n", "ANY", "ALL"); + SPDK_WARNLOG("Converting \"%s\" to \"%s\" automatically\n", "ALL", "ANY"); + memcpy(p, "ANY", 3); + } + + TAILQ_INSERT_TAIL(&ig->initiator_head, iname, tailq); + ig->ninitiators++; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "InitiatorName %s\n", name); + return 0; +} + +static int +iscsi_init_grp_delete_initiator(struct spdk_iscsi_init_grp *ig, char *name) +{ + struct spdk_iscsi_initiator_name *iname; + + iname = iscsi_init_grp_find_initiator(ig, name); + if (iname == NULL) { + return -ENOENT; + } + + TAILQ_REMOVE(&ig->initiator_head, iname, tailq); + ig->ninitiators--; + free(iname); + return 0; +} + +static int +iscsi_init_grp_add_initiators(struct spdk_iscsi_init_grp *ig, int num_inames, + char **inames) +{ + int i; + int rc; + + for (i = 0; i < num_inames; i++) { + rc = iscsi_init_grp_add_initiator(ig, inames[i]); + if (rc < 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + iscsi_init_grp_delete_initiator(ig, inames[i - 1]); + } + return rc; +} + +static void +iscsi_init_grp_delete_all_initiators(struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_initiator_name *iname, *tmp; + + TAILQ_FOREACH_SAFE(iname, &ig->initiator_head, tailq, tmp) { + TAILQ_REMOVE(&ig->initiator_head, iname, tailq); + ig->ninitiators--; + free(iname); + } +} + +static int +iscsi_init_grp_delete_initiators(struct spdk_iscsi_init_grp *ig, int num_inames, char **inames) +{ + int i; + int rc; + + for (i = 0; i < num_inames; i++) { + rc = iscsi_init_grp_delete_initiator(ig, inames[i]); + if (rc < 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + rc = iscsi_init_grp_add_initiator(ig, inames[i - 1]); + if (rc != 0) { + iscsi_init_grp_delete_all_initiators(ig); + break; + } + } + return -1; +} + +static struct spdk_iscsi_initiator_netmask * +iscsi_init_grp_find_netmask(struct spdk_iscsi_init_grp *ig, const char 
*mask) +{ + struct spdk_iscsi_initiator_netmask *netmask; + + TAILQ_FOREACH(netmask, &ig->netmask_head, tailq) { + if (!strcmp(netmask->mask, mask)) { + return netmask; + } + } + return NULL; +} + +static int +iscsi_init_grp_add_netmask(struct spdk_iscsi_init_grp *ig, char *mask) +{ + struct spdk_iscsi_initiator_netmask *imask; + char *p; + size_t len; + + if (ig->nnetmasks >= MAX_NETMASK) { + SPDK_ERRLOG("> MAX_NETMASK(=%d) is not allowed\n", MAX_NETMASK); + return -EPERM; + } + + len = strlen(mask); + if (len > MAX_INITIATOR_ADDR) { + SPDK_ERRLOG("Initiator Name is larger than %d bytes\n", MAX_INITIATOR_ADDR); + return -EINVAL; + } + + imask = iscsi_init_grp_find_netmask(ig, mask); + if (imask != NULL) { + return -EEXIST; + } + + imask = calloc(1, sizeof(*imask)); + if (imask == NULL) { + SPDK_ERRLOG("malloc() failed for inititator mask str\n"); + return -ENOMEM; + } + + memcpy(imask->mask, mask, len); + + /* Replace "ALL" by "ANY" if set */ + p = strstr(imask->mask, "ALL"); + if (p != NULL) { + SPDK_WARNLOG("Please use \"%s\" instead of \"%s\"\n", "ANY", "ALL"); + SPDK_WARNLOG("Converting \"%s\" to \"%s\" automatically\n", "ALL", "ANY"); + memcpy(p, "ANY", 3); + } + + TAILQ_INSERT_TAIL(&ig->netmask_head, imask, tailq); + ig->nnetmasks++; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Netmask %s\n", mask); + return 0; +} + +static int +iscsi_init_grp_delete_netmask(struct spdk_iscsi_init_grp *ig, char *mask) +{ + struct spdk_iscsi_initiator_netmask *imask; + + imask = iscsi_init_grp_find_netmask(ig, mask); + if (imask == NULL) { + return -ENOENT; + } + + TAILQ_REMOVE(&ig->netmask_head, imask, tailq); + ig->nnetmasks--; + free(imask); + return 0; +} + +static int +iscsi_init_grp_add_netmasks(struct spdk_iscsi_init_grp *ig, int num_imasks, char **imasks) +{ + int i; + int rc; + + for (i = 0; i < num_imasks; i++) { + rc = iscsi_init_grp_add_netmask(ig, imasks[i]); + if (rc != 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + iscsi_init_grp_delete_netmask(ig, imasks[i - 1]); + } + return rc; +} + +static void +iscsi_init_grp_delete_all_netmasks(struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_initiator_netmask *imask, *tmp; + + TAILQ_FOREACH_SAFE(imask, &ig->netmask_head, tailq, tmp) { + TAILQ_REMOVE(&ig->netmask_head, imask, tailq); + ig->nnetmasks--; + free(imask); + } +} + +static int +iscsi_init_grp_delete_netmasks(struct spdk_iscsi_init_grp *ig, int num_imasks, char **imasks) +{ + int i; + int rc; + + for (i = 0; i < num_imasks; i++) { + rc = iscsi_init_grp_delete_netmask(ig, imasks[i]); + if (rc != 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + rc = iscsi_init_grp_add_netmask(ig, imasks[i - 1]); + if (rc != 0) { + iscsi_init_grp_delete_all_netmasks(ig); + break; + } + } + return -1; +} + +/* Read spdk iscsi target's config file and create initiator group */ +static int +iscsi_parse_init_grp(struct spdk_conf_section *sp) +{ + int i, rc = 0; + const char *val = NULL; + int num_initiator_names; + int num_initiator_masks; + char **initiators = NULL, **netmasks = NULL; + int tag = spdk_conf_section_get_num(sp); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add initiator group %d\n", tag); + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + /* counts number of definitions */ + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "InitiatorName", i); + if (val == NULL) { + break; + } + } + if (i == 0) { + SPDK_ERRLOG("num_initiator_names = 0\n"); + return 
-EINVAL; + } + num_initiator_names = i; + if (num_initiator_names > MAX_INITIATOR) { + SPDK_ERRLOG("%d > MAX_INITIATOR\n", num_initiator_names); + return -E2BIG; + } + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "Netmask", i); + if (val == NULL) { + break; + } + } + if (i == 0) { + SPDK_ERRLOG("num_initiator_mask = 0\n"); + return -EINVAL; + } + num_initiator_masks = i; + if (num_initiator_masks > MAX_NETMASK) { + SPDK_ERRLOG("%d > MAX_NETMASK\n", num_initiator_masks); + return -E2BIG; + } + + initiators = calloc(num_initiator_names, sizeof(char *)); + if (!initiators) { + SPDK_ERRLOG("calloc() failed for temp initiator name array\n"); + return -ENOMEM; + } + for (i = 0; i < num_initiator_names; i++) { + val = spdk_conf_section_get_nval(sp, "InitiatorName", i); + if (!val) { + SPDK_ERRLOG("InitiatorName %d not found\n", i); + rc = -EINVAL; + goto cleanup; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "InitiatorName %s\n", val); + initiators[i] = strdup(val); + if (!initiators[i]) { + SPDK_ERRLOG("strdup() failed for temp initiator name\n"); + rc = -ENOMEM; + goto cleanup; + } + } + netmasks = calloc(num_initiator_masks, sizeof(char *)); + if (!netmasks) { + SPDK_ERRLOG("malloc() failed for portal group\n"); + rc = -ENOMEM; + goto cleanup; + } + for (i = 0; i < num_initiator_masks; i++) { + val = spdk_conf_section_get_nval(sp, "Netmask", i); + if (!val) { + SPDK_ERRLOG("Netmask %d not found\n", i); + rc = -EINVAL; + goto cleanup; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Netmask %s\n", val); + netmasks[i] = strdup(val); + if (!netmasks[i]) { + SPDK_ERRLOG("strdup() failed for temp initiator mask\n"); + rc = -ENOMEM; + goto cleanup; + } + } + + rc = iscsi_init_grp_create_from_initiator_list(tag, + num_initiator_names, initiators, num_initiator_masks, netmasks); + +cleanup: + if (initiators) { + for (i = 0; i < num_initiator_names; i++) { + if (initiators[i]) { + free(initiators[i]); + } + } + free(initiators); + } + if (netmasks) { + for (i = 0; i < num_initiator_masks; i++) { + if (netmasks[i]) { + free(netmasks[i]); + } + } + free(netmasks); + } + return rc; +} + +int +iscsi_init_grp_register(struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_init_grp *tmp; + int rc = -1; + + assert(ig != NULL); + + pthread_mutex_lock(&g_iscsi.mutex); + tmp = iscsi_init_grp_find_by_tag(ig->tag); + if (tmp == NULL) { + TAILQ_INSERT_TAIL(&g_iscsi.ig_head, ig, tailq); + rc = 0; + } + pthread_mutex_unlock(&g_iscsi.mutex); + + return rc; +} + +/* + * Create initiator group from list of initiator ip/hostnames and netmasks + * The initiator hostname/netmask lists are allocated by the caller on the + * heap. 
Freed later by common initiator_group_destroy() code + */ +int +iscsi_init_grp_create_from_initiator_list(int tag, + int num_initiator_names, + char **initiator_names, + int num_initiator_masks, + char **initiator_masks) +{ + int rc = -1; + struct spdk_iscsi_init_grp *ig = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "add initiator group (from initiator list) tag=%d, #initiators=%d, #masks=%d\n", + tag, num_initiator_names, num_initiator_masks); + + ig = iscsi_init_grp_create(tag); + if (!ig) { + SPDK_ERRLOG("initiator group create error (%d)\n", tag); + return rc; + } + + rc = iscsi_init_grp_add_initiators(ig, num_initiator_names, + initiator_names); + if (rc < 0) { + SPDK_ERRLOG("add initiator name error\n"); + goto cleanup; + } + + rc = iscsi_init_grp_add_netmasks(ig, num_initiator_masks, + initiator_masks); + if (rc < 0) { + SPDK_ERRLOG("add initiator netmask error\n"); + goto cleanup; + } + + rc = iscsi_init_grp_register(ig); + if (rc < 0) { + SPDK_ERRLOG("initiator group register error (%d)\n", tag); + goto cleanup; + } + return 0; + +cleanup: + iscsi_init_grp_destroy(ig); + return rc; +} + +int +iscsi_init_grp_add_initiators_from_initiator_list(int tag, + int num_initiator_names, + char **initiator_names, + int num_initiator_masks, + char **initiator_masks) +{ + int rc = -1; + struct spdk_iscsi_init_grp *ig; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "add initiator to initiator group: tag=%d, #initiators=%d, #masks=%d\n", + tag, num_initiator_names, num_initiator_masks); + + pthread_mutex_lock(&g_iscsi.mutex); + ig = iscsi_init_grp_find_by_tag(tag); + if (!ig) { + pthread_mutex_unlock(&g_iscsi.mutex); + SPDK_ERRLOG("initiator group (%d) is not found\n", tag); + return rc; + } + + rc = iscsi_init_grp_add_initiators(ig, num_initiator_names, + initiator_names); + if (rc < 0) { + SPDK_ERRLOG("add initiator name error\n"); + goto error; + } + + rc = iscsi_init_grp_add_netmasks(ig, num_initiator_masks, + initiator_masks); + if (rc < 0) { + SPDK_ERRLOG("add initiator netmask error\n"); + iscsi_init_grp_delete_initiators(ig, num_initiator_names, + initiator_names); + } + +error: + pthread_mutex_unlock(&g_iscsi.mutex); + return rc; +} + +int +iscsi_init_grp_delete_initiators_from_initiator_list(int tag, + int num_initiator_names, + char **initiator_names, + int num_initiator_masks, + char **initiator_masks) +{ + int rc = -1; + struct spdk_iscsi_init_grp *ig; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "delete initiator from initiator group: tag=%d, #initiators=%d, #masks=%d\n", + tag, num_initiator_names, num_initiator_masks); + + pthread_mutex_lock(&g_iscsi.mutex); + ig = iscsi_init_grp_find_by_tag(tag); + if (!ig) { + pthread_mutex_unlock(&g_iscsi.mutex); + SPDK_ERRLOG("initiator group (%d) is not found\n", tag); + return rc; + } + + rc = iscsi_init_grp_delete_initiators(ig, num_initiator_names, + initiator_names); + if (rc < 0) { + SPDK_ERRLOG("delete initiator name error\n"); + goto error; + } + + rc = iscsi_init_grp_delete_netmasks(ig, num_initiator_masks, + initiator_masks); + if (rc < 0) { + SPDK_ERRLOG("delete initiator netmask error\n"); + iscsi_init_grp_add_initiators(ig, num_initiator_names, + initiator_names); + goto error; + } + +error: + pthread_mutex_unlock(&g_iscsi.mutex); + return rc; +} + +void +iscsi_init_grp_destroy(struct spdk_iscsi_init_grp *ig) +{ + if (!ig) { + return; + } + + iscsi_init_grp_delete_all_initiators(ig); + iscsi_init_grp_delete_all_netmasks(ig); + free(ig); +}; + +struct spdk_iscsi_init_grp * +iscsi_init_grp_find_by_tag(int tag) +{ + struct spdk_iscsi_init_grp *ig; + + 
TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + if (ig->tag == tag) { + return ig; + } + } + + return NULL; +} + +int +iscsi_parse_init_grps(void) +{ + struct spdk_conf_section *sp; + int rc; + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "InitiatorGroup")) { + if (spdk_conf_section_get_num(sp) == 0) { + SPDK_ERRLOG("Group 0 is invalid\n"); + return -1; + } + rc = iscsi_parse_init_grp(sp); + if (rc < 0) { + SPDK_ERRLOG("parse_init_group() failed\n"); + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +void +iscsi_init_grps_destroy(void) +{ + struct spdk_iscsi_init_grp *ig, *tmp; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_init_grp_array_destroy\n"); + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH_SAFE(ig, &g_iscsi.ig_head, tailq, tmp) { + TAILQ_REMOVE(&g_iscsi.ig_head, ig, tailq); + iscsi_init_grp_destroy(ig); + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +struct spdk_iscsi_init_grp * +iscsi_init_grp_unregister(int tag) +{ + struct spdk_iscsi_init_grp *ig; + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + if (ig->tag == tag) { + TAILQ_REMOVE(&g_iscsi.ig_head, ig, tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + return ig; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return NULL; +} + +static const char *initiator_group_section = \ + "\n" + "# Users must change the InitiatorGroup section(s) to match the IP\n" + "# addresses and initiator configuration in their environment.\n" + "# Netmask can be used to specify a single IP address or a range of IP addresses\n" + "# Netmask 192.168.1.20 <== single IP address\n" + "# Netmask 192.168.1.0/24 <== IP range 192.168.1.*\n"; + +#define INITIATOR_GROUP_TMPL \ +"[InitiatorGroup%d]\n" \ +" Comment \"Initiator Group%d\"\n" + +#define INITIATOR_TMPL \ +" InitiatorName " + +#define NETMASK_TMPL \ +" Netmask " + +void +iscsi_init_grps_config_text(FILE *fp) +{ + struct spdk_iscsi_init_grp *ig; + struct spdk_iscsi_initiator_name *iname; + struct spdk_iscsi_initiator_netmask *imask; + + /* Create initiator group section */ + fprintf(fp, "%s", initiator_group_section); + + /* Dump initiator groups */ + TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + if (NULL == ig) { continue; } + fprintf(fp, INITIATOR_GROUP_TMPL, ig->tag, ig->tag); + + /* Dump initiators */ + fprintf(fp, INITIATOR_TMPL); + TAILQ_FOREACH(iname, &ig->initiator_head, tailq) { + fprintf(fp, "%s ", iname->name); + } + fprintf(fp, "\n"); + + /* Dump netmasks */ + fprintf(fp, NETMASK_TMPL); + TAILQ_FOREACH(imask, &ig->netmask_head, tailq) { + fprintf(fp, "%s ", imask->mask); + } + fprintf(fp, "\n"); + } +} + +static void +iscsi_init_grp_info_json(struct spdk_iscsi_init_grp *ig, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_initiator_name *iname; + struct spdk_iscsi_initiator_netmask *imask; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "tag", ig->tag); + + spdk_json_write_named_array_begin(w, "initiators"); + TAILQ_FOREACH(iname, &ig->initiator_head, tailq) { + spdk_json_write_string(w, iname->name); + } + spdk_json_write_array_end(w); + + spdk_json_write_named_array_begin(w, "netmasks"); + TAILQ_FOREACH(imask, &ig->netmask_head, tailq) { + spdk_json_write_string(w, imask->mask); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); +} + +static void +iscsi_init_grp_config_json(struct spdk_iscsi_init_grp *ig, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, 
"method", "iscsi_create_initiator_group"); + + spdk_json_write_name(w, "params"); + iscsi_init_grp_info_json(ig, w); + + spdk_json_write_object_end(w); +} + +void +iscsi_init_grps_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_init_grp *ig; + + TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + iscsi_init_grp_info_json(ig, w); + } +} + +void +iscsi_init_grps_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_init_grp *ig; + + TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + iscsi_init_grp_config_json(ig, w); + } +} diff --git a/src/spdk/lib/iscsi/init_grp.h b/src/spdk/lib/iscsi/init_grp.h new file mode 100644 index 000000000..8913c98cd --- /dev/null +++ b/src/spdk/lib/iscsi/init_grp.h @@ -0,0 +1,81 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_INIT_GRP_H +#define SPDK_INIT_GRP_H + +#include "spdk/conf.h" +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" + +struct spdk_iscsi_initiator_name { + char name[MAX_INITIATOR_NAME + 1]; + TAILQ_ENTRY(spdk_iscsi_initiator_name) tailq; +}; + +struct spdk_iscsi_initiator_netmask { + char mask[MAX_INITIATOR_ADDR + 1]; + TAILQ_ENTRY(spdk_iscsi_initiator_netmask) tailq; +}; + +struct spdk_iscsi_init_grp { + int ninitiators; + TAILQ_HEAD(, spdk_iscsi_initiator_name) initiator_head; + int nnetmasks; + TAILQ_HEAD(, spdk_iscsi_initiator_netmask) netmask_head; + int ref; + int tag; + TAILQ_ENTRY(spdk_iscsi_init_grp) tailq; +}; + +/* SPDK iSCSI Initiator Group management API */ +int iscsi_init_grp_create_from_initiator_list(int tag, + int num_initiator_names, char **initiator_names, + int num_initiator_masks, char **initiator_masks); +int iscsi_init_grp_add_initiators_from_initiator_list(int tag, + int num_initiator_names, char **initiator_names, + int num_initiator_masks, char **initiator_masks); +int iscsi_init_grp_delete_initiators_from_initiator_list(int tag, + int num_initiator_names, char **initiator_names, + int num_initiator_masks, char **initiator_masks); +int iscsi_init_grp_register(struct spdk_iscsi_init_grp *ig); +struct spdk_iscsi_init_grp *iscsi_init_grp_unregister(int tag); +struct spdk_iscsi_init_grp *iscsi_init_grp_find_by_tag(int tag); +void iscsi_init_grp_destroy(struct spdk_iscsi_init_grp *ig); +int iscsi_parse_init_grps(void); +void iscsi_init_grps_destroy(void); +void iscsi_init_grps_config_text(FILE *fp); +void iscsi_init_grps_info_json(struct spdk_json_write_ctx *w); +void iscsi_init_grps_config_json(struct spdk_json_write_ctx *w); +#endif /* SPDK_INIT_GRP_H */ diff --git a/src/spdk/lib/iscsi/iscsi.c b/src/spdk/lib/iscsi/iscsi.c new file mode 100644 index 000000000..febf4cac4 --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi.c @@ -0,0 +1,4797 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/base64.h" +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/trace.h" +#include "spdk/sock.h" +#include "spdk/string.h" +#include "spdk/queue.h" +#include "spdk/net.h" + +#include "iscsi/md5.h" +#include "iscsi/iscsi.h" +#include "iscsi/param.h" +#include "iscsi/tgt_node.h" +#include "iscsi/task.h" +#include "iscsi/conn.h" +#include "spdk/scsi.h" +#include "spdk/bdev.h" +#include "iscsi/portal_grp.h" + +#include "spdk_internal/log.h" + +#define MAX_TMPBUF 1024 + +#define SPDK_CRC32C_INITIAL 0xffffffffUL +#define SPDK_CRC32C_XOR 0xffffffffUL + +#ifdef __FreeBSD__ +#define HAVE_SRANDOMDEV 1 +#define HAVE_ARC4RANDOM 1 +#endif + +struct spdk_iscsi_globals g_iscsi = { + .mutex = PTHREAD_MUTEX_INITIALIZER, + .portal_head = TAILQ_HEAD_INITIALIZER(g_iscsi.portal_head), + .pg_head = TAILQ_HEAD_INITIALIZER(g_iscsi.pg_head), + .ig_head = TAILQ_HEAD_INITIALIZER(g_iscsi.ig_head), + .target_head = TAILQ_HEAD_INITIALIZER(g_iscsi.target_head), + .auth_group_head = TAILQ_HEAD_INITIALIZER(g_iscsi.auth_group_head), + .poll_group_head = TAILQ_HEAD_INITIALIZER(g_iscsi.poll_group_head), +}; + +#define MATCH_DIGEST_WORD(BUF, CRC32C) \ + ( ((((uint32_t) *((uint8_t *)(BUF)+0)) << 0) \ + | (((uint32_t) *((uint8_t *)(BUF)+1)) << 8) \ + | (((uint32_t) *((uint8_t *)(BUF)+2)) << 16) \ + | (((uint32_t) *((uint8_t *)(BUF)+3)) << 24)) \ + == (CRC32C)) + +#ifndef HAVE_SRANDOMDEV +static void +srandomdev(void) +{ + unsigned long seed; + time_t now; + pid_t pid; + + pid = getpid(); + now = time(NULL); + seed = pid ^ now; + srandom(seed); +} +#endif /* HAVE_SRANDOMDEV */ + +#ifndef HAVE_ARC4RANDOM +static int g_arc4random_initialized = 0; + +static uint32_t +arc4random(void) +{ + uint32_t r; + uint32_t r1, r2; + + if (!g_arc4random_initialized) { + srandomdev(); + g_arc4random_initialized = 1; + } + r1 = (uint32_t)(random() & 0xffff); + r2 = (uint32_t)(random() & 0xffff); + r = (r1 << 16) | r2; + return r; +} +#endif /* HAVE_ARC4RANDOM */ + +static void +gen_random(uint8_t *buf, size_t len) +{ + uint32_t r; + size_t idx; + + for (idx = 0; idx < len; idx++) { + r = arc4random(); + buf[idx] = (uint8_t) r; + } +} + +static uint64_t +iscsi_get_isid(const uint8_t isid[6]) +{ + return (uint64_t)isid[0] << 40 | + (uint64_t)isid[1] << 32 | + (uint64_t)isid[2] << 24 | + (uint64_t)isid[3] << 16 | + (uint64_t)isid[4] << 8 | + (uint64_t)isid[5]; +} + +static int +bin2hex(char *buf, size_t len, const uint8_t *data, size_t data_len) +{ + const char *digits = "0123456789ABCDEF"; + size_t total = 0; + size_t idx; + + if (len < 3) { + return -1; + } + buf[total] = '0'; + total++; + buf[total] = 'x'; + total++; + buf[total] = '\0'; + + for (idx = 0; idx < data_len; idx++) { + if (total + 3 > len) { + buf[total] = '\0'; + return - 1; + } + buf[total] = digits[(data[idx] >> 4) & 0x0fU]; + total++; + buf[total] = digits[data[idx] & 0x0fU]; + total++; + } + buf[total] = '\0'; + return total; +} + 
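
For reference, iscsi_get_isid() above packs the 6-byte ISID carried in the login PDU into a single 64-bit value, which the login path further down formats into the initiator port name with "0x%12.12" PRIx64. The following minimal, self-contained sketch (separate from the diff itself) illustrates that packing and formatting; the helper name pack_isid(), the sample ISID bytes, and the example IQN are assumptions of the sketch, not SPDK identifiers.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for iscsi_get_isid(): shift the six ISID bytes into a
 * uint64_t in big-endian order, so isid[0] lands in bits 40-47. */
static uint64_t
pack_isid(const uint8_t isid[6])
{
	return (uint64_t)isid[0] << 40 |
	       (uint64_t)isid[1] << 32 |
	       (uint64_t)isid[2] << 24 |
	       (uint64_t)isid[3] << 16 |
	       (uint64_t)isid[4] << 8 |
	       (uint64_t)isid[5];
}

int
main(void)
{
	/* Arbitrary sample ISID bytes, purely for illustration. */
	const uint8_t isid[6] = {0x80, 0x12, 0x34, 0x56, 0x78, 0x9a};

	/* Same "name,i,0x<isid>" shape the login code builds for the initiator port. */
	printf("iqn.1991-05.com.example:host1,i,0x%12.12" PRIx64 "\n", pack_isid(isid));
	return 0;
}
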
+static int +hex2bin(uint8_t *data, size_t data_len, const char *str) +{ + const char *digits = "0123456789ABCDEF"; + const char *dp; + const char *p; + size_t total = 0; + int n0, n1; + + p = str; + if (p[0] != '0' && (p[1] != 'x' && p[1] != 'X')) { + return -1; + } + p += 2; + + while (p[0] != '\0' && p[1] != '\0') { + if (total >= data_len) { + return -1; + } + dp = strchr(digits, toupper((int) p[0])); + if (dp == NULL) { + return -1; + } + n0 = (int)(dp - digits); + dp = strchr(digits, toupper((int) p[1])); + if (dp == NULL) { + return -1; + } + n1 = (int)(dp - digits); + + data[total] = (uint8_t)(((n0 & 0x0fU) << 4) | (n1 & 0x0fU)); + total++; + p += 2; + } + return total; +} + +static int +iscsi_reject(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu, + int reason) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_reject *rsph; + uint8_t *data; + int total_ahs_len; + int data_len; + int alloc_len; + + pdu->is_rejected = true; + + total_ahs_len = pdu->bhs.total_ahs_len; + data_len = 0; + alloc_len = ISCSI_BHS_LEN + (4 * total_ahs_len); + + if (conn->header_digest) { + alloc_len += ISCSI_DIGEST_LEN; + } + + data = calloc(1, alloc_len); + if (!data) { + SPDK_ERRLOG("calloc() failed for data segment\n"); + return -ENOMEM; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Reject PDU reason=%d\n", reason); + + if (conn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u\n", conn->StatSN); + } + + memcpy(data, &pdu->bhs, ISCSI_BHS_LEN); + data_len += ISCSI_BHS_LEN; + + if (total_ahs_len != 0) { + total_ahs_len = spdk_min((4 * total_ahs_len), ISCSI_AHS_LEN); + memcpy(data + data_len, pdu->ahs, total_ahs_len); + data_len += total_ahs_len; + } + + if (conn->header_digest) { + memcpy(data + data_len, pdu->header_digest, ISCSI_DIGEST_LEN); + data_len += ISCSI_DIGEST_LEN; + } + + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + free(data); + return -ENOMEM; + } + + rsph = (struct iscsi_bhs_reject *)&rsp_pdu->bhs; + rsp_pdu->data = data; + rsph->opcode = ISCSI_OP_REJECT; + rsph->flags |= 0x80; /* bit 0 is default to 1 */ + rsph->reason = reason; + DSET24(rsph->data_segment_len, data_len); + + rsph->ffffffff = 0xffffffffU; + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (conn->sess != NULL) { + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + } else { + to_be32(&rsph->exp_cmd_sn, 1); + to_be32(&rsph->max_cmd_sn, 1); + } + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "PDU", (void *)&rsp_pdu->bhs, ISCSI_BHS_LEN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + + return 0; +} + +uint32_t +iscsi_pdu_calc_header_digest(struct spdk_iscsi_pdu *pdu) +{ + uint32_t crc32c; + uint32_t ahs_len_bytes = pdu->bhs.total_ahs_len * 4; + + crc32c = SPDK_CRC32C_INITIAL; + crc32c = spdk_crc32c_update(&pdu->bhs, ISCSI_BHS_LEN, crc32c); + + if (ahs_len_bytes) { + crc32c = spdk_crc32c_update(pdu->ahs, ahs_len_bytes, crc32c); + } + + /* BHS and AHS are always 4-byte multiples in length, so no padding is necessary. 
*/ + crc32c = crc32c ^ SPDK_CRC32C_XOR; + return crc32c; +} + +uint32_t +iscsi_pdu_calc_data_digest(struct spdk_iscsi_pdu *pdu) +{ + uint32_t data_len = DGET24(pdu->bhs.data_segment_len); + uint32_t crc32c; + uint32_t mod; + struct iovec iov; + uint32_t num_blocks; + + crc32c = SPDK_CRC32C_INITIAL; + if (spdk_likely(!pdu->dif_insert_or_strip)) { + crc32c = spdk_crc32c_update(pdu->data, data_len, crc32c); + } else { + iov.iov_base = pdu->data_buf; + iov.iov_len = pdu->data_buf_len; + num_blocks = pdu->data_buf_len / pdu->dif_ctx.block_size; + + spdk_dif_update_crc32c(&iov, 1, num_blocks, &crc32c, &pdu->dif_ctx); + } + + mod = data_len % ISCSI_ALIGNMENT; + if (mod != 0) { + uint32_t pad_length = ISCSI_ALIGNMENT - mod; + uint8_t pad[3] = {0, 0, 0}; + + assert(pad_length > 0); + assert(pad_length <= sizeof(pad)); + crc32c = spdk_crc32c_update(pad, pad_length, crc32c); + } + + crc32c = crc32c ^ SPDK_CRC32C_XOR; + return crc32c; +} + +static int +iscsi_conn_read_data_segment(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu, + uint32_t segment_len) +{ + struct iovec buf_iov, iovs[32]; + int rc, _rc; + + if (spdk_likely(!pdu->dif_insert_or_strip)) { + return iscsi_conn_read_data(conn, + segment_len - pdu->data_valid_bytes, + pdu->data_buf + pdu->data_valid_bytes); + } else { + buf_iov.iov_base = pdu->data_buf; + buf_iov.iov_len = pdu->data_buf_len; + rc = spdk_dif_set_md_interleave_iovs(iovs, 32, &buf_iov, 1, + pdu->data_valid_bytes, + segment_len - pdu->data_valid_bytes, NULL, + &pdu->dif_ctx); + if (rc > 0) { + rc = iscsi_conn_readv_data(conn, iovs, rc); + if (rc > 0) { + _rc = spdk_dif_generate_stream(&buf_iov, 1, + pdu->data_valid_bytes, rc, + &pdu->dif_ctx); + if (_rc != 0) { + SPDK_ERRLOG("DIF generate failed\n"); + rc = _rc; + } + } + } else { + SPDK_ERRLOG("Setup iovs for interleaved metadata failed\n"); + } + return rc; + } +} + +struct _iscsi_sgl { + struct iovec *iov; + int iovcnt; + uint32_t iov_offset; + uint32_t total_size; +}; + +static inline void +_iscsi_sgl_init(struct _iscsi_sgl *s, struct iovec *iovs, int iovcnt, + uint32_t iov_offset) +{ + s->iov = iovs; + s->iovcnt = iovcnt; + s->iov_offset = iov_offset; + s->total_size = 0; +} + +static inline bool +_iscsi_sgl_append(struct _iscsi_sgl *s, uint8_t *data, uint32_t data_len) +{ + if (s->iov_offset >= data_len) { + s->iov_offset -= data_len; + } else { + assert(s->iovcnt > 0); + s->iov->iov_base = data + s->iov_offset; + s->iov->iov_len = data_len - s->iov_offset; + s->total_size += data_len - s->iov_offset; + s->iov_offset = 0; + s->iov++; + s->iovcnt--; + if (s->iovcnt == 0) { + return false; + } + } + + return true; +} + +/* Build iovec array to leave metadata space for every data block + * when reading data segment from socket. 
+ */ +static inline bool +_iscsi_sgl_append_with_md(struct _iscsi_sgl *s, + void *buf, uint32_t buf_len, uint32_t data_len, + struct spdk_dif_ctx *dif_ctx) +{ + int rc; + uint32_t total_size = 0; + struct iovec buf_iov; + + if (s->iov_offset >= data_len) { + s->iov_offset -= data_len; + } else { + buf_iov.iov_base = buf; + buf_iov.iov_len = buf_len; + rc = spdk_dif_set_md_interleave_iovs(s->iov, s->iovcnt, &buf_iov, 1, + s->iov_offset, data_len - s->iov_offset, + &total_size, dif_ctx); + if (rc < 0) { + SPDK_ERRLOG("Failed to setup iovs for DIF strip\n"); + return false; + } + + s->total_size += total_size; + s->iov_offset = 0; + assert(s->iovcnt >= rc); + s->iovcnt -= rc; + s->iov += rc; + + if (s->iovcnt == 0) { + return false; + } + } + + return true; +} + +int +iscsi_build_iovs(struct spdk_iscsi_conn *conn, struct iovec *iovs, int iovcnt, + struct spdk_iscsi_pdu *pdu, uint32_t *_mapped_length) +{ + struct _iscsi_sgl sgl; + int enable_digest; + uint32_t total_ahs_len; + uint32_t data_len; + + if (iovcnt == 0) { + return 0; + } + + total_ahs_len = pdu->bhs.total_ahs_len; + data_len = DGET24(pdu->bhs.data_segment_len); + data_len = ISCSI_ALIGN(data_len); + + enable_digest = 1; + if (pdu->bhs.opcode == ISCSI_OP_LOGIN_RSP) { + /* this PDU should be sent without digest */ + enable_digest = 0; + } + + _iscsi_sgl_init(&sgl, iovs, iovcnt, pdu->writev_offset); + + /* BHS */ + if (!_iscsi_sgl_append(&sgl, (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN)) { + goto end; + } + /* AHS */ + if (total_ahs_len > 0) { + if (!_iscsi_sgl_append(&sgl, pdu->ahs, 4 * total_ahs_len)) { + goto end; + } + } + + /* Header Digest */ + if (enable_digest && conn->header_digest) { + if (!_iscsi_sgl_append(&sgl, pdu->header_digest, ISCSI_DIGEST_LEN)) { + goto end; + } + } + + /* Data Segment */ + if (data_len > 0) { + if (!pdu->dif_insert_or_strip) { + if (!_iscsi_sgl_append(&sgl, pdu->data, data_len)) { + goto end; + } + } else { + if (!_iscsi_sgl_append_with_md(&sgl, pdu->data, pdu->data_buf_len, + data_len, &pdu->dif_ctx)) { + goto end; + } + } + } + + /* Data Digest */ + if (enable_digest && conn->data_digest && data_len != 0) { + _iscsi_sgl_append(&sgl, pdu->data_digest, ISCSI_DIGEST_LEN); + } + +end: + if (_mapped_length != NULL) { + *_mapped_length = sgl.total_size; + } + + return iovcnt - sgl.iovcnt; +} + +void iscsi_free_sess(struct spdk_iscsi_sess *sess) +{ + if (sess == NULL) { + return; + } + + sess->tag = 0; + sess->target = NULL; + sess->session_type = SESSION_TYPE_INVALID; + iscsi_param_free(sess->params); + free(sess->conns); + spdk_scsi_port_free(&sess->initiator_port); + spdk_mempool_put(g_iscsi.session_pool, (void *)sess); +} + +static int +create_iscsi_sess(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, + enum session_type session_type) +{ + struct spdk_iscsi_sess *sess; + int rc; + + sess = spdk_mempool_get(g_iscsi.session_pool); + if (!sess) { + SPDK_ERRLOG("Unable to get session object\n"); + SPDK_ERRLOG("MaxSessions set to %d\n", g_iscsi.MaxSessions); + return -ENOMEM; + } + + /* configuration values */ + pthread_mutex_lock(&g_iscsi.mutex); + + sess->MaxConnections = g_iscsi.MaxConnectionsPerSession; + sess->MaxOutstandingR2T = DEFAULT_MAXOUTSTANDINGR2T; + + sess->DefaultTime2Wait = g_iscsi.DefaultTime2Wait; + sess->DefaultTime2Retain = g_iscsi.DefaultTime2Retain; + sess->FirstBurstLength = g_iscsi.FirstBurstLength; + sess->MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; + sess->InitialR2T = DEFAULT_INITIALR2T; + sess->ImmediateData = g_iscsi.ImmediateData; + sess->DataPDUInOrder = 
DEFAULT_DATAPDUINORDER; + sess->DataSequenceInOrder = DEFAULT_DATASEQUENCEINORDER; + sess->ErrorRecoveryLevel = g_iscsi.ErrorRecoveryLevel; + + pthread_mutex_unlock(&g_iscsi.mutex); + + sess->tag = conn->pg_tag; + + sess->conns = calloc(sess->MaxConnections, sizeof(*sess->conns)); + if (!sess->conns) { + SPDK_ERRLOG("calloc() failed for connection array\n"); + return -ENOMEM; + } + + sess->connections = 0; + + sess->conns[sess->connections] = conn; + sess->connections++; + + sess->params = NULL; + sess->target = target; + sess->isid = 0; + sess->session_type = session_type; + sess->current_text_itt = 0xffffffffU; + + /* set default params */ + rc = iscsi_sess_params_init(&sess->params); + if (rc < 0) { + SPDK_ERRLOG("iscsi_sess_params_init() failed\n"); + goto error_return; + } + /* replace with config value */ + rc = iscsi_param_set_int(sess->params, "MaxConnections", + sess->MaxConnections); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "MaxOutstandingR2T", + sess->MaxOutstandingR2T); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "DefaultTime2Wait", + sess->DefaultTime2Wait); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "DefaultTime2Retain", + sess->DefaultTime2Retain); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "FirstBurstLength", + sess->FirstBurstLength); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "MaxBurstLength", + sess->MaxBurstLength); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set(sess->params, "InitialR2T", + sess->InitialR2T ? "Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = iscsi_param_set(sess->params, "ImmediateData", + sess->ImmediateData ? "Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = iscsi_param_set(sess->params, "DataPDUInOrder", + sess->DataPDUInOrder ? "Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = iscsi_param_set(sess->params, "DataSequenceInOrder", + sess->DataSequenceInOrder ? 
"Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "ErrorRecoveryLevel", + sess->ErrorRecoveryLevel); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + /* realloc buffer */ + rc = iscsi_param_set_int(conn->params, "MaxRecvDataSegmentLength", + conn->MaxRecvDataSegmentLength); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + /* sess for first connection of session */ + conn->sess = sess; + return 0; + +error_return: + iscsi_free_sess(sess); + conn->sess = NULL; + return -1; +} + +static struct spdk_iscsi_sess * +get_iscsi_sess_by_tsih(uint16_t tsih) +{ + struct spdk_iscsi_sess *session; + + if (tsih == 0 || tsih > g_iscsi.MaxSessions) { + return NULL; + } + + session = g_iscsi.session[tsih - 1]; + assert(tsih == session->tsih); + + return session; +} + +static uint8_t +append_iscsi_sess(struct spdk_iscsi_conn *conn, + const char *initiator_port_name, uint16_t tsih, uint16_t cid) +{ + struct spdk_iscsi_sess *sess; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "append session: init port name=%s, tsih=%u, cid=%u\n", + initiator_port_name, tsih, cid); + + sess = get_iscsi_sess_by_tsih(tsih); + if (sess == NULL) { + SPDK_ERRLOG("spdk_get_iscsi_sess_by_tsih failed\n"); + return ISCSI_LOGIN_CONN_ADD_FAIL; + } + if ((conn->pg_tag != sess->tag) || + (strcasecmp(initiator_port_name, spdk_scsi_port_get_name(sess->initiator_port)) != 0) || + (conn->target != sess->target)) { + /* no match */ + SPDK_ERRLOG("no MCS session for init port name=%s, tsih=%d, cid=%d\n", + initiator_port_name, tsih, cid); + return ISCSI_LOGIN_CONN_ADD_FAIL; + } + + if (sess->connections >= sess->MaxConnections) { + /* no slot for connection */ + SPDK_ERRLOG("too many connections for init port name=%s, tsih=%d, cid=%d\n", + initiator_port_name, tsih, cid); + return ISCSI_LOGIN_TOO_MANY_CONNECTIONS; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connections (tsih %d): %d\n", sess->tsih, sess->connections); + conn->sess = sess; + + /* + * TODO: need a mutex or other sync mechanism to protect the session's + * connection list. 
+ */ + sess->conns[sess->connections] = conn; + sess->connections++; + + return 0; +} + +static int +iscsi_append_text(struct spdk_iscsi_conn *conn __attribute__((__unused__)), + const char *key, const char *val, uint8_t *data, + int alloc_len, int data_len) +{ + int total; + int len; + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + return total; + } + len = snprintf((char *) data + total, alloc_len - total, "%s=%s", key, val); + total += len + 1; + + return total; +} + +static int +iscsi_append_param(struct spdk_iscsi_conn *conn, const char *key, + uint8_t *data, int alloc_len, int data_len) +{ + struct iscsi_param *param; + int rc; + + param = iscsi_param_find(conn->params, key); + if (param == NULL) { + param = iscsi_param_find(conn->sess->params, key); + if (param == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "no key %.64s\n", key); + return data_len; + } + } + rc = iscsi_append_text(conn, param->key, param->val, data, + alloc_len, data_len); + return rc; +} + +static int +iscsi_auth_params(struct spdk_iscsi_conn *conn, + struct iscsi_param *params, const char *method, uint8_t *data, + int alloc_len, int data_len) +{ + char *in_val; + char *in_next; + char *new_val; + const char *algorithm; + const char *name; + const char *response; + const char *identifier; + const char *challenge; + int total; + int rc; + + if (conn == NULL || params == NULL || method == NULL) { + return -1; + } + if (strcasecmp(method, "CHAP") == 0) { + /* method OK */ + } else { + SPDK_ERRLOG("unsupported AuthMethod %.64s\n", method); + return -1; + } + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + /* for temporary store */ + in_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!in_val) { + SPDK_ERRLOG("malloc() failed for temporary store\n"); + return -ENOMEM; + } + + /* CHAP method (RFC1994) */ + if ((algorithm = iscsi_param_get_val(params, "CHAP_A")) != NULL) { + if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_WAIT_A) { + SPDK_ERRLOG("CHAP sequence error\n"); + goto error_return; + } + + /* CHAP_A is LIST type */ + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", algorithm); + in_next = in_val; + while ((new_val = spdk_strsepq(&in_next, ",")) != NULL) { + if (strcasecmp(new_val, "5") == 0) { + /* CHAP with MD5 */ + break; + } + } + if (new_val == NULL) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Reject"); + new_val = in_val; + iscsi_append_text(conn, "CHAP_A", new_val, + data, alloc_len, total); + goto error_return; + } + /* selected algorithm is 5 (MD5) */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_A=%s\n", new_val); + total = iscsi_append_text(conn, "CHAP_A", new_val, + data, alloc_len, total); + + /* Identifier is one octet */ + gen_random(conn->auth.chap_id, 1); + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", + (int) conn->auth.chap_id[0]); + total = iscsi_append_text(conn, "CHAP_I", in_val, + data, alloc_len, total); + + /* Challenge Value is a variable stream of octets */ + /* (binary length MUST not exceed 1024 bytes) */ + conn->auth.chap_challenge_len = ISCSI_CHAP_CHALLENGE_LEN; + gen_random(conn->auth.chap_challenge, conn->auth.chap_challenge_len); + bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, + conn->auth.chap_challenge, conn->auth.chap_challenge_len); + total = iscsi_append_text(conn, "CHAP_C", 
in_val, + data, alloc_len, total); + + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_NR; + } else if ((name = iscsi_param_get_val(params, "CHAP_N")) != NULL) { + uint8_t resmd5[SPDK_MD5DIGEST_LEN]; + uint8_t tgtmd5[SPDK_MD5DIGEST_LEN]; + struct spdk_md5ctx md5ctx; + size_t decoded_len = 0; + + if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_WAIT_NR) { + SPDK_ERRLOG("CHAP sequence error\n"); + goto error_return; + } + + response = iscsi_param_get_val(params, "CHAP_R"); + if (response == NULL) { + SPDK_ERRLOG("no response\n"); + goto error_return; + } + if (response[0] == '0' && + (response[1] == 'x' || response[1] == 'X')) { + rc = hex2bin(resmd5, SPDK_MD5DIGEST_LEN, response); + if (rc < 0 || rc != SPDK_MD5DIGEST_LEN) { + SPDK_ERRLOG("response format error\n"); + goto error_return; + } + } else if (response[0] == '0' && + (response[1] == 'b' || response[1] == 'B')) { + response += 2; + rc = spdk_base64_decode(resmd5, &decoded_len, response); + if (rc < 0 || decoded_len != SPDK_MD5DIGEST_LEN) { + SPDK_ERRLOG("response format error\n"); + goto error_return; + } + } else { + SPDK_ERRLOG("response format error\n"); + goto error_return; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_N/CHAP_R\n"); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ag_tag=%d\n", conn->chap_group); + + rc = iscsi_chap_get_authinfo(&conn->auth, name, conn->chap_group); + if (rc < 0) { + /* SPDK_ERRLOG("auth user or secret is missing\n"); */ + SPDK_ERRLOG("iscsi_chap_get_authinfo() failed\n"); + goto error_return; + } + if (conn->auth.user[0] == '\0' || conn->auth.secret[0] == '\0') { + /* SPDK_ERRLOG("auth user or secret is missing\n"); */ + SPDK_ERRLOG("auth failed (name %.64s)\n", name); + goto error_return; + } + + md5init(&md5ctx); + /* Identifier */ + md5update(&md5ctx, conn->auth.chap_id, 1); + /* followed by secret */ + md5update(&md5ctx, conn->auth.secret, + strlen(conn->auth.secret)); + /* followed by Challenge Value */ + md5update(&md5ctx, conn->auth.chap_challenge, + conn->auth.chap_challenge_len); + /* tgtmd5 is expecting Response Value */ + md5final(tgtmd5, &md5ctx); + + bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, tgtmd5, SPDK_MD5DIGEST_LEN); + +#if 0 + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "tgtmd5=%s, resmd5=%s\n", in_val, response); + spdk_dump("tgtmd5", tgtmd5, SPDK_MD5DIGEST_LEN); + spdk_dump("resmd5", resmd5, SPDK_MD5DIGEST_LEN); +#endif + + /* compare MD5 digest */ + if (memcmp(tgtmd5, resmd5, SPDK_MD5DIGEST_LEN) != 0) { + /* not match */ + /* SPDK_ERRLOG("auth user or secret is missing\n"); */ + SPDK_ERRLOG("auth failed (name %.64s)\n", name); + goto error_return; + } + /* OK initiator's secret */ + conn->authenticated = true; + + /* mutual CHAP? 
*/ + identifier = iscsi_param_get_val(params, "CHAP_I"); + if (identifier != NULL) { + conn->auth.chap_mid[0] = (uint8_t) strtol(identifier, NULL, 10); + challenge = iscsi_param_get_val(params, "CHAP_C"); + if (challenge == NULL) { + SPDK_ERRLOG("CHAP sequence error\n"); + goto error_return; + } + if (challenge[0] == '0' && + (challenge[1] == 'x' || challenge[1] == 'X')) { + rc = hex2bin(conn->auth.chap_mchallenge, + ISCSI_CHAP_CHALLENGE_LEN, challenge); + if (rc < 0) { + SPDK_ERRLOG("challenge format error\n"); + goto error_return; + } + conn->auth.chap_mchallenge_len = rc; + } else if (challenge[0] == '0' && + (challenge[1] == 'b' || challenge[1] == 'B')) { + challenge += 2; + rc = spdk_base64_decode(conn->auth.chap_mchallenge, + &decoded_len, challenge); + if (rc < 0) { + SPDK_ERRLOG("challenge format error\n"); + goto error_return; + } + conn->auth.chap_mchallenge_len = decoded_len; + } else { + SPDK_ERRLOG("challenge format error\n"); + goto error_return; + } +#if 0 + spdk_dump("MChallenge", conn->auth.chap_mchallenge, + conn->auth.chap_mchallenge_len); +#endif + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_I/CHAP_C\n"); + + if (conn->auth.muser[0] == '\0' || conn->auth.msecret[0] == '\0') { + /* SPDK_ERRLOG("mutual auth user or secret is missing\n"); */ + SPDK_ERRLOG("auth failed (name %.64s)\n", name); + goto error_return; + } + + md5init(&md5ctx); + /* Identifier */ + md5update(&md5ctx, conn->auth.chap_mid, 1); + /* followed by secret */ + md5update(&md5ctx, conn->auth.msecret, + strlen(conn->auth.msecret)); + /* followed by Challenge Value */ + md5update(&md5ctx, conn->auth.chap_mchallenge, + conn->auth.chap_mchallenge_len); + /* tgtmd5 is Response Value */ + md5final(tgtmd5, &md5ctx); + + bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, tgtmd5, SPDK_MD5DIGEST_LEN); + + total = iscsi_append_text(conn, "CHAP_N", + conn->auth.muser, data, alloc_len, total); + total = iscsi_append_text(conn, "CHAP_R", + in_val, data, alloc_len, total); + } else { + /* not mutual */ + if (conn->mutual_chap) { + SPDK_ERRLOG("required mutual CHAP\n"); + goto error_return; + } + } + + conn->auth.chap_phase = ISCSI_CHAP_PHASE_END; + } else { + /* not found CHAP keys */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "start CHAP\n"); + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A; + } + + free(in_val); + return total; + +error_return: + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A; + free(in_val); + return -1; +} + +static int +iscsi_check_values(struct spdk_iscsi_conn *conn) +{ + if (conn->sess->FirstBurstLength > conn->sess->MaxBurstLength) { + SPDK_ERRLOG("FirstBurstLength(%d) > MaxBurstLength(%d)\n", + conn->sess->FirstBurstLength, + conn->sess->MaxBurstLength); + return -1; + } + if (conn->sess->FirstBurstLength > g_iscsi.FirstBurstLength) { + SPDK_ERRLOG("FirstBurstLength(%d) > iSCSI target restriction(%d)\n", + conn->sess->FirstBurstLength, g_iscsi.FirstBurstLength); + return -1; + } + if (conn->sess->MaxBurstLength > 0x00ffffff) { + SPDK_ERRLOG("MaxBurstLength(%d) > 0x00ffffff\n", + conn->sess->MaxBurstLength); + return -1; + } + + if (conn->MaxRecvDataSegmentLength < 512) { + SPDK_ERRLOG("MaxRecvDataSegmentLength(%d) < 512\n", + conn->MaxRecvDataSegmentLength); + return -1; + } + if (conn->MaxRecvDataSegmentLength > 0x00ffffff) { + SPDK_ERRLOG("MaxRecvDataSegmentLength(%d) > 0x00ffffff\n", + conn->MaxRecvDataSegmentLength); + return -1; + } + return 0; +} + +static int +iscsi_conn_params_update(struct spdk_iscsi_conn *conn) +{ + int rc; + uint32_t recv_buf_size; + + /* update internal variables */ + rc = 
iscsi_copy_param2var(conn); + if (rc < 0) { + SPDK_ERRLOG("iscsi_copy_param2var() failed\n"); + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } + return rc; + } + + /* check value */ + rc = iscsi_check_values(conn); + if (rc < 0) { + SPDK_ERRLOG("iscsi_check_values() failed\n"); + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } + } + + /* The socket receive buffer may need to be adjusted based on the new parameters */ + + /* Don't allow the recv buffer to be 0 or very large. */ + recv_buf_size = spdk_max(0x1000, spdk_min(0x2000, conn->sess->FirstBurstLength)); + + /* Add in extra space for the PDU */ + recv_buf_size += ISCSI_BHS_LEN + ISCSI_AHS_LEN; + + if (conn->header_digest) { + recv_buf_size += ISCSI_DIGEST_LEN; + } + + if (conn->data_digest) { + recv_buf_size += ISCSI_DIGEST_LEN; + } + + /* Set up to buffer up to 4 commands with immediate data at once */ + if (spdk_sock_set_recvbuf(conn->sock, recv_buf_size * 4) < 0) { + /* Not fatal. */ + } + + return rc; +} + +static void +iscsi_conn_login_pdu_err_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->full_feature) { + iscsi_conn_params_update(conn); + } +} + +static void +iscsi_conn_login_pdu_success_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->state >= ISCSI_CONN_STATE_EXITING) { + /* Connection is being exited before this callback is executed. */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connection is already exited.\n"); + return; + } + if (conn->full_feature) { + if (iscsi_conn_params_update(conn) != 0) { + return; + } + } + conn->state = ISCSI_CONN_STATE_RUNNING; + if (conn->full_feature != 0) { + iscsi_conn_schedule(conn); + } +} + +/* + * The response function of spdk_iscsi_op_login + */ +static void +iscsi_op_login_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, struct iscsi_param *params, + iscsi_conn_xfer_complete_cb cb_fn) +{ + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + rsph->version_max = ISCSI_VERSION; + rsph->version_act = ISCSI_VERSION; + DSET24(rsph->data_segment_len, rsp_pdu->data_segment_len); + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (conn->sess != NULL) { + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + } else { + to_be32(&rsph->exp_cmd_sn, rsp_pdu->cmd_sn); + to_be32(&rsph->max_cmd_sn, rsp_pdu->cmd_sn); + } + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "PDU", (uint8_t *)rsph, ISCSI_BHS_LEN); + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "DATA", rsp_pdu->data, rsp_pdu->data_segment_len); + + /* Set T/CSG/NSG to reserved if login error. */ + if (rsph->status_class != 0) { + rsph->flags &= ~ISCSI_LOGIN_TRANSIT; + rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK; + rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK; + } + iscsi_param_free(params); + iscsi_conn_write_pdu(conn, rsp_pdu, cb_fn, conn); +} + +/* + * The function which is used to initialize the internal response data + * structure of iscsi login function. 
+ * return: + * 0, success; + * otherwise, error; + */ +static int +iscsi_op_login_rsp_init(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu) +{ + struct iscsi_bhs_login_req *reqh; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + rsph->opcode = ISCSI_OP_LOGIN_RSP; + rsph->status_class = ISCSI_CLASS_SUCCESS; + rsph->status_detail = ISCSI_LOGIN_ACCEPT; + rsp_pdu->data_segment_len = 0; + + /* The default MaxRecvDataSegmentLength 8192 is used during login. - RFC3720 */ + rsp_pdu->data = calloc(1, 8192); + if (!rsp_pdu->data) { + SPDK_ERRLOG("calloc() failed for data segment\n"); + rsph->status_class = ISCSI_CLASS_TARGET_ERROR; + rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + rsp_pdu->data_buf_len = 8192; + + reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; + rsph->flags |= (reqh->flags & ISCSI_LOGIN_TRANSIT); + rsph->flags |= (reqh->flags & ISCSI_LOGIN_CONTINUE); + rsph->flags |= (reqh->flags & ISCSI_LOGIN_CURRENT_STAGE_MASK); + if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { + rsph->flags |= (reqh->flags & ISCSI_LOGIN_NEXT_STAGE_MASK); + } + + /* We don't need to convert from network byte order. Just store it */ + memcpy(&rsph->isid, reqh->isid, 6); + rsph->tsih = reqh->tsih; + rsph->itt = reqh->itt; + rsp_pdu->cmd_sn = from_be32(&reqh->cmd_sn); + + if (rsph->tsih) { + rsph->stat_sn = reqh->exp_stat_sn; + } + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "PDU", (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "T=%d, C=%d, CSG=%d, NSG=%d, Min=%d, Max=%d, ITT=%x\n", + ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags), + ISCSI_BHS_LOGIN_GET_CBIT(rsph->flags), + ISCSI_BHS_LOGIN_GET_CSG(rsph->flags), + ISCSI_BHS_LOGIN_GET_NSG(rsph->flags), + reqh->version_min, reqh->version_max, from_be32(&rsph->itt)); + + if (conn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u," + "MaxCmdSN=%u\n", rsp_pdu->cmd_sn, + from_be32(&rsph->stat_sn), conn->StatSN, + conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u\n", + rsp_pdu->cmd_sn, from_be32(&rsph->stat_sn), + conn->StatSN); + } + + if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags) && + ISCSI_BHS_LOGIN_GET_CBIT(rsph->flags)) { + SPDK_ERRLOG("transit error\n"); + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + /* make sure reqh->version_max < ISCSI_VERSION */ + if (reqh->version_min > ISCSI_VERSION) { + SPDK_ERRLOG("unsupported version min %d/max %d, expecting %d\n", reqh->version_min, + reqh->version_max, ISCSI_VERSION); + /* Unsupported version */ + /* set all reserved flag to zero */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_UNSUPPORTED_VERSION; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + if ((ISCSI_BHS_LOGIN_GET_NSG(rsph->flags) == ISCSI_NSG_RESERVED_CODE) && + ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { + /* set NSG to zero */ + rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK; + /* also set other bits to zero */ + rsph->flags &= ~ISCSI_LOGIN_TRANSIT; + rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK; + SPDK_ERRLOG("Received reserved NSG code: %d\n", ISCSI_NSG_RESERVED_CODE); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} 
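
iscsi_op_login_rsp_init() above mirrors the T/C/CSG/NSG bits from the login request's flags byte into the response and rejects a few illegal combinations (T and C both set, a reserved NSG code, an unsupported version). A small standalone sketch of where those fields sit in the flags byte, following the RFC 3720 Login PDU layout, is given below; the mask names and the sample value are local to this sketch and are not SPDK's ISCSI_BHS_LOGIN_GET_* macros.

#include <stdint.h>
#include <stdio.h>

/* Login request/response flags byte, per RFC 3720 (sketch-local names). */
#define LOGIN_FLAG_TRANSIT   0x80u              /* T bit: ask to move to the next stage */
#define LOGIN_FLAG_CONTINUE  0x40u              /* C bit: text keys continue in the next PDU */
#define LOGIN_CSG(flags)     (((flags) >> 2) & 0x3u)  /* current stage */
#define LOGIN_NSG(flags)     ((flags) & 0x3u)         /* next stage, meaningful only when T=1 */

int
main(void)
{
	/* Example: T=1, C=0, CSG=1 (operational negotiation), NSG=3 (full feature). */
	uint8_t flags = 0x80 | (1 << 2) | 3;

	printf("T=%d C=%d CSG=%u NSG=%u\n",
	       !!(flags & LOGIN_FLAG_TRANSIT),
	       !!(flags & LOGIN_FLAG_CONTINUE),
	       LOGIN_CSG(flags),
	       LOGIN_NSG(flags));
	return 0;
}
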
+ +static int +iscsi_op_login_store_incoming_params(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu, + struct iscsi_param **params) +{ + struct iscsi_bhs_login_req *reqh; + struct iscsi_bhs_login_rsp *rsph; + int rc; + + reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + rc = iscsi_parse_params(params, pdu->data, + pdu->data_segment_len, ISCSI_BHS_LOGIN_GET_CBIT(reqh->flags), + &conn->partial_text_parameter); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_params() failed\n"); + iscsi_param_free(*params); + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + return 0; +} + +/* + * This function is used to initialize the port info + * return + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_initialize_port(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, + uint32_t name_length, + struct iscsi_param *params) +{ + const char *val; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + /* Initiator Name and Port */ + val = iscsi_param_get_val(params, "InitiatorName"); + if (val == NULL) { + SPDK_ERRLOG("InitiatorName is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + snprintf(conn->initiator_name, sizeof(conn->initiator_name), "%s", val); + snprintf(initiator_port_name, name_length, + "%s,i,0x%12.12" PRIx64, val, iscsi_get_isid(rsph->isid)); + spdk_strlwr(conn->initiator_name); + spdk_strlwr(initiator_port_name); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Initiator name: %s\n", conn->initiator_name); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Initiator port: %s\n", initiator_port_name); + + return 0; +} + +/* + * This function is used to judge the session type + * return + * 0: success + * Other value: error + */ +static int +iscsi_op_login_session_type(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + enum session_type *session_type, + struct iscsi_param *params) +{ + const char *session_type_str; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + session_type_str = iscsi_param_get_val(params, "SessionType"); + if (session_type_str == NULL) { + if (rsph->tsih != 0) { + *session_type = SESSION_TYPE_NORMAL; + } else { + SPDK_ERRLOG("SessionType is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + } else { + if (strcasecmp(session_type_str, "Discovery") == 0) { + *session_type = SESSION_TYPE_DISCOVERY; + } else if (strcasecmp(session_type_str, "Normal") == 0) { + *session_type = SESSION_TYPE_NORMAL; + } else { + *session_type = SESSION_TYPE_INVALID; + SPDK_ERRLOG("SessionType is invalid\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Session Type: %s\n", session_type_str); + + return 0; +} + +/* + * This function is used to check the target info + * return: + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_check_target(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + const char *target_name, 
+ struct spdk_iscsi_tgt_node **target) +{ + bool result; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + *target = iscsi_find_tgt_node(target_name); + if (*target == NULL) { + SPDK_WARNLOG("target %s not found\n", target_name); + /* Not found */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_TARGET_NOT_FOUND; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + if (iscsi_tgt_node_is_destructed(*target)) { + SPDK_ERRLOG("target %s is removed\n", target_name); + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_TARGET_REMOVED; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + result = iscsi_tgt_node_access(conn, *target, + conn->initiator_name, + conn->initiator_addr); + if (!result) { + SPDK_ERRLOG("access denied\n"); + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHORIZATION_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* + * This function use to check the session + * return: + * 0, success + * otherwise: error + */ +static int +iscsi_op_login_check_session(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, int cid) + +{ + int rc = 0; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + /* check existing session */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "isid=%"PRIx64", tsih=%u, cid=%u\n", + iscsi_get_isid(rsph->isid), from_be16(&rsph->tsih), cid); + if (rsph->tsih != 0) { + /* multiple connections */ + rc = append_iscsi_sess(conn, initiator_port_name, + from_be16(&rsph->tsih), cid); + if (rc != 0) { + SPDK_ERRLOG("isid=%"PRIx64", tsih=%u, cid=%u:" + "spdk_append_iscsi_sess() failed\n", + iscsi_get_isid(rsph->isid), from_be16(&rsph->tsih), + cid); + /* Can't include in session */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = rc; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + } else if (!g_iscsi.AllowDuplicateIsid) { + /* new session, drop old sess by the initiator */ + iscsi_drop_conns(conn, initiator_port_name, 0 /* drop old */); + } + + return rc; +} + +/* + * This function is used to del the original param and update it with new + * value + * return: + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_update_param(struct spdk_iscsi_conn *conn, + const char *key, const char *value, + const char *list) +{ + int rc = 0; + struct iscsi_param *new_param, *orig_param; + int index; + + orig_param = iscsi_param_find(conn->params, key); + if (orig_param == NULL) { + SPDK_ERRLOG("orig_param %s not found\n", key); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + index = orig_param->state_index; + rc = iscsi_param_del(&conn->params, key); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_del(%s) failed\n", key); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + rc = iscsi_param_add(&conn->params, key, value, list, ISPT_LIST); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_add() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + new_param = iscsi_param_find(conn->params, key); + if (new_param == NULL) { + SPDK_ERRLOG("iscsi_param_find() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + new_param->state_index = index; + return rc; +} + +static int +iscsi_negotiate_chap_param(struct spdk_iscsi_conn *conn) +{ + int rc = 0; + + if (conn->disable_chap) { + rc = iscsi_op_login_update_param(conn, "AuthMethod", "None", "None"); + } else if (conn->require_chap) { + rc = 
iscsi_op_login_update_param(conn, "AuthMethod", "CHAP", "CHAP"); + } + + return rc; +} + +/* + * The function which is used to handle the part of session discovery + * return: + * 0, success; + * otherwise: error; + */ +static int +iscsi_op_login_session_discovery_chap(struct spdk_iscsi_conn *conn) +{ + return iscsi_negotiate_chap_param(conn); +} + +/* + * This function is used to update the param related with chap + * return: + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_negotiate_chap_param(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target) +{ + conn->disable_chap = target->disable_chap; + conn->require_chap = target->require_chap; + conn->mutual_chap = target->mutual_chap; + conn->chap_group = target->chap_group; + + return iscsi_negotiate_chap_param(conn); +} + +static int +iscsi_op_login_negotiate_digest_param(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target) +{ + int rc; + + if (target->header_digest) { + /* + * User specified header digests, so update the list of + * HeaderDigest values to remove "None" so that only + * initiators who support CRC32C can connect. + */ + rc = iscsi_op_login_update_param(conn, "HeaderDigest", "CRC32C", "CRC32C"); + if (rc < 0) { + return rc; + } + } + + if (target->data_digest) { + /* + * User specified data digests, so update the list of + * DataDigest values to remove "None" so that only + * initiators who support CRC32C can connect. + */ + rc = iscsi_op_login_update_param(conn, "DataDigest", "CRC32C", "CRC32C"); + if (rc < 0) { + return rc; + } + } + + return 0; +} + +/* + * The function which is used to handle the part of normal login session + * return: + * 0, success; + * SPDK_ISCSI_LOGIN_ERROR_PARAMETER, parameter error; + */ +static int +iscsi_op_login_session_normal(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, + struct iscsi_param *params, + int cid) +{ + struct spdk_iscsi_tgt_node *target = NULL; + const char *target_name; + const char *target_short_name; + struct iscsi_bhs_login_rsp *rsph; + int rc = 0; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + target_name = iscsi_param_get_val(params, "TargetName"); + + if (target_name == NULL) { + SPDK_ERRLOG("TargetName is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + memset(conn->target_short_name, 0, MAX_TARGET_NAME); + target_short_name = strstr(target_name, ":"); + if (target_short_name != NULL) { + target_short_name++; /* Advance past the ':' */ + if (strlen(target_short_name) >= MAX_TARGET_NAME) { + SPDK_ERRLOG("Target Short Name (%s) is more than %u characters\n", + target_short_name, MAX_TARGET_NAME); + /* Invalid request */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + snprintf(conn->target_short_name, MAX_TARGET_NAME, "%s", + target_short_name); + } + + pthread_mutex_lock(&g_iscsi.mutex); + rc = iscsi_op_login_check_target(conn, rsp_pdu, target_name, &target); + pthread_mutex_unlock(&g_iscsi.mutex); + + if (rc < 0) { + return rc; + } + + conn->target = target; + conn->dev = target->dev; + conn->target_port = spdk_scsi_dev_find_port_by_id(target->dev, + conn->pg_tag); + + rc = iscsi_op_login_check_session(conn, rsp_pdu, + initiator_port_name, cid); + if (rc < 0) { + return rc; + } + + /* force target flags */ + 
pthread_mutex_lock(&target->mutex); + rc = iscsi_op_login_negotiate_chap_param(conn, target); + pthread_mutex_unlock(&target->mutex); + + if (rc == 0) { + rc = iscsi_op_login_negotiate_digest_param(conn, target); + } + + if (rc != 0) { + /* Invalid request */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST; + } + + return rc; +} + +/* + * This function is used to set the info in the connection data structure + * return + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_set_conn_info(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, + enum session_type session_type, int cid) +{ + int rc = 0; + struct spdk_iscsi_tgt_node *target; + struct iscsi_bhs_login_rsp *rsph; + struct spdk_scsi_port *initiator_port; + + target = conn->target; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + conn->authenticated = false; + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A; + conn->cid = cid; + + if (conn->sess == NULL) { + /* create initiator port */ + initiator_port = spdk_scsi_port_create(iscsi_get_isid(rsph->isid), 0, initiator_port_name); + if (initiator_port == NULL) { + SPDK_ERRLOG("create_port() failed\n"); + rsph->status_class = ISCSI_CLASS_TARGET_ERROR; + rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + /* new session */ + rc = create_iscsi_sess(conn, target, session_type); + if (rc < 0) { + spdk_scsi_port_free(&initiator_port); + SPDK_ERRLOG("create_sess() failed\n"); + rsph->status_class = ISCSI_CLASS_TARGET_ERROR; + rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + /* initialize parameters */ + conn->sess->initiator_port = initiator_port; + conn->StatSN = from_be32(&rsph->stat_sn); + conn->sess->isid = iscsi_get_isid(rsph->isid); + + /* Initiator port TransportID */ + spdk_scsi_port_set_iscsi_transport_id(conn->sess->initiator_port, + conn->initiator_name, + conn->sess->isid); + + /* Discovery sessions will not have a target. */ + if (target != NULL) { + conn->sess->queue_depth = target->queue_depth; + } else { + /* + * Assume discovery sessions have an effective command + * windows size of 1. 
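+			 * With queue_depth == 1, the MaxCmdSN computed below is
+			 * equal to ExpCmdSN, so the initiator can have at most
+			 * one command outstanding on the discovery session.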
+ */ + conn->sess->queue_depth = 1; + } + conn->sess->ExpCmdSN = rsp_pdu->cmd_sn; + conn->sess->MaxCmdSN = rsp_pdu->cmd_sn + conn->sess->queue_depth - 1; + } + + conn->initiator_port = conn->sess->initiator_port; + + return 0; +} + +/* + * This function is used to set the target info + * return + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_set_target_info(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + enum session_type session_type) +{ + char buf[MAX_TMPBUF]; + const char *val; + int rc = 0; + struct spdk_iscsi_tgt_node *target = conn->target; + + /* declarative parameters */ + if (target != NULL) { + pthread_mutex_lock(&target->mutex); + if (target->alias[0] != '\0') { + snprintf(buf, sizeof buf, "%s", target->alias); + } else { + snprintf(buf, sizeof buf, "%s", ""); + } + pthread_mutex_unlock(&target->mutex); + rc = iscsi_param_set(conn->sess->params, "TargetAlias", buf); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + } + snprintf(buf, sizeof buf, "%s:%s,%d", conn->portal_host, conn->portal_port, + conn->pg_tag); + rc = iscsi_param_set(conn->sess->params, "TargetAddress", buf); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + snprintf(buf, sizeof buf, "%d", conn->pg_tag); + rc = iscsi_param_set(conn->sess->params, "TargetPortalGroupTag", buf); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + /* write in response */ + if (target != NULL) { + val = iscsi_param_get_val(conn->sess->params, "TargetAlias"); + if (val != NULL && strlen(val) != 0) { + rsp_pdu->data_segment_len = iscsi_append_param(conn, + "TargetAlias", + rsp_pdu->data, + rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + } + if (session_type == SESSION_TYPE_DISCOVERY) { + rsp_pdu->data_segment_len = iscsi_append_param(conn, + "TargetAddress", + rsp_pdu->data, + rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + } + rsp_pdu->data_segment_len = iscsi_append_param(conn, + "TargetPortalGroupTag", + rsp_pdu->data, + rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + } + + return rc; +} + +/* + * This function is used to handle the login of iscsi initiator when there is + * no session + * return: + * 0, success; + * SPDK_ISCSI_LOGIN_ERROR_PARAMETER, parameter error; + * SPDK_ISCSI_LOGIN_ERROR_RESPONSE, used to notify the login fail. 
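+ * On SPDK_ISCSI_LOGIN_ERROR_RESPONSE, the status_class and status_detail
+ * fields of rsp_pdu's BHS have already been filled in by whichever helper
+ * detected the failure, so the caller only needs to send rsp_pdu back.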
+ */ +static int +iscsi_op_login_phase_none(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + struct iscsi_param *params, int cid) +{ + enum session_type session_type; + char initiator_port_name[MAX_INITIATOR_PORT_NAME]; + struct iscsi_bhs_login_rsp *rsph; + int rc = 0; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + conn->target = NULL; + conn->dev = NULL; + + rc = iscsi_op_login_initialize_port(conn, rsp_pdu, initiator_port_name, + MAX_INITIATOR_PORT_NAME, params); + if (rc < 0) { + return rc; + } + + rc = iscsi_op_login_session_type(conn, rsp_pdu, &session_type, params); + if (rc < 0) { + return rc; + } + + /* Target Name and Port */ + if (session_type == SESSION_TYPE_NORMAL) { + rc = iscsi_op_login_session_normal(conn, rsp_pdu, + initiator_port_name, + params, cid); + if (rc < 0) { + return rc; + } + + } else if (session_type == SESSION_TYPE_DISCOVERY) { + rsph->tsih = 0; + + /* force target flags */ + pthread_mutex_lock(&g_iscsi.mutex); + rc = iscsi_op_login_session_discovery_chap(conn); + pthread_mutex_unlock(&g_iscsi.mutex); + if (rc < 0) { + return rc; + } + } else { + SPDK_ERRLOG("unknown session type\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + rc = iscsi_op_login_set_conn_info(conn, rsp_pdu, initiator_port_name, + session_type, cid); + if (rc < 0) { + return rc; + } + + /* limit conns on discovery session */ + if (session_type == SESSION_TYPE_DISCOVERY) { + conn->sess->MaxConnections = 1; + rc = iscsi_param_set_int(conn->sess->params, + "MaxConnections", + conn->sess->MaxConnections); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + } + + return iscsi_op_login_set_target_info(conn, rsp_pdu, session_type); +} + +/* + * This function is used to set the csg bit case in rsp + * return: + * 0, success + * otherwise: error + */ +static int +iscsi_op_login_rsp_handle_csg_bit(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + struct iscsi_param *params) +{ + const char *auth_method; + int rc; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + switch (ISCSI_BHS_LOGIN_GET_CSG(rsph->flags)) { + case ISCSI_SECURITY_NEGOTIATION_PHASE: + /* SecurityNegotiation */ + auth_method = iscsi_param_get_val(conn->params, "AuthMethod"); + if (auth_method == NULL) { + SPDK_ERRLOG("AuthMethod is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + if (strcasecmp(auth_method, "None") == 0) { + conn->authenticated = true; + } else { + rc = iscsi_auth_params(conn, params, auth_method, + rsp_pdu->data, rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + if (rc < 0) { + SPDK_ERRLOG("iscsi_auth_params() failed\n"); + /* Authentication failure */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + rsp_pdu->data_segment_len = rc; + if (!conn->authenticated) { + /* not complete */ + rsph->flags &= ~ISCSI_LOGIN_TRANSIT; + } else { + if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_END) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CHAP phase not complete"); + } + } + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "Negotiated Auth Params", + rsp_pdu->data, rsp_pdu->data_segment_len); + } + break; + + case 
ISCSI_OPERATIONAL_NEGOTIATION_PHASE: + /* LoginOperationalNegotiation */ + if (conn->state == ISCSI_CONN_STATE_INVALID) { + if (conn->require_chap) { + /* Authentication failure */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } else { + /* AuthMethod=None */ + conn->authenticated = true; + } + } + if (!conn->authenticated) { + SPDK_ERRLOG("authentication error\n"); + /* Authentication failure */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + break; + + case ISCSI_FULL_FEATURE_PHASE: + /* FullFeaturePhase */ + SPDK_ERRLOG("XXX Login in FullFeaturePhase\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + + default: + SPDK_ERRLOG("unknown stage\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* This function is used to notify the session info + * return + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_notify_session_info(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu) +{ + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + if (conn->sess->session_type == SESSION_TYPE_NORMAL) { + /* normal session */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Login from %s (%s) on %s tgt_node%d" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->target->name, conn->target->num, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } else if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + /* discovery session */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Login(discovery) from %s (%s) on" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? 
"on" : "off")); + } else { + SPDK_ERRLOG("unknown session type\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* + * This function is to handle the tbit cases + * return + * 0: success + * otherwise error + */ +static int +iscsi_op_login_rsp_handle_t_bit(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu) +{ + int rc; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + switch (ISCSI_BHS_LOGIN_GET_NSG(rsph->flags)) { + case ISCSI_SECURITY_NEGOTIATION_PHASE: + /* SecurityNegotiation */ + conn->login_phase = ISCSI_SECURITY_NEGOTIATION_PHASE; + break; + + case ISCSI_OPERATIONAL_NEGOTIATION_PHASE: + /* LoginOperationalNegotiation */ + conn->login_phase = ISCSI_OPERATIONAL_NEGOTIATION_PHASE; + break; + + case ISCSI_FULL_FEATURE_PHASE: + /* FullFeaturePhase */ + conn->login_phase = ISCSI_FULL_FEATURE_PHASE; + to_be16(&rsph->tsih, conn->sess->tsih); + + rc = iscsi_op_login_notify_session_info(conn, rsp_pdu); + if (rc < 0) { + return rc; + } + + conn->full_feature = 1; + break; + + default: + SPDK_ERRLOG("unknown stage\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* + * This function is used to set the values of the internal data structure used + * by spdk_iscsi_op_login function + * return: + * 0, used to notify the a successful login + * SPDK_ISCSI_LOGIN_ERROR_RESPONSE, used to notify a failure login. + */ +static int +iscsi_op_login_rsp_handle(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, struct iscsi_param **params) +{ + int rc; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + /* negotiate parameters */ + rc = iscsi_negotiate_params(conn, params, rsp_pdu->data, + rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + if (rc < 0) { + /* + * iscsi_negotiate_params just returns -1 on failure, + * so translate this into meaningful response codes and + * return values. + */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + rsp_pdu->data_segment_len = rc; + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "Negotiated Params", rsp_pdu->data, rc); + + /* handle the CSG bit case */ + rc = iscsi_op_login_rsp_handle_csg_bit(conn, rsp_pdu, *params); + if (rc < 0) { + return rc; + } + + /* handle the T bit case */ + if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { + rc = iscsi_op_login_rsp_handle_t_bit(conn, rsp_pdu); + } + + return rc; +} + +static int +iscsi_pdu_hdr_op_login(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int rc; + struct iscsi_bhs_login_req *reqh; + struct spdk_iscsi_pdu *rsp_pdu; + + if (conn->full_feature && conn->sess != NULL && + conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; + pdu->cmd_sn = from_be32(&reqh->cmd_sn); + + /* During login processing, use the 8KB default FirstBurstLength as + * our maximum data segment length value. 
+ */ + if (pdu->data_segment_len > SPDK_ISCSI_FIRST_BURST_LENGTH) { + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + rc = iscsi_op_login_rsp_init(conn, pdu, rsp_pdu); + if (rc < 0) { + iscsi_op_login_response(conn, rsp_pdu, NULL, iscsi_conn_login_pdu_err_complete); + return 0; + } + + conn->login_rsp_pdu = rsp_pdu; + return 0; +} + +static int +iscsi_pdu_payload_op_login(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int rc; + struct iscsi_bhs_login_req *reqh; + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_param *params = NULL; + int cid; + + if (conn->login_rsp_pdu == NULL) { + return 0; + } + + rsp_pdu = conn->login_rsp_pdu; + + reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; + cid = from_be16(&reqh->cid); + + rc = iscsi_op_login_store_incoming_params(conn, pdu, rsp_pdu, ¶ms); + if (rc < 0) { + iscsi_op_login_response(conn, rsp_pdu, NULL, iscsi_conn_login_pdu_err_complete); + return 0; + } + + if (conn->state == ISCSI_CONN_STATE_INVALID) { + rc = iscsi_op_login_phase_none(conn, rsp_pdu, params, cid); + if (rc == SPDK_ISCSI_LOGIN_ERROR_RESPONSE || rc == SPDK_ISCSI_LOGIN_ERROR_PARAMETER) { + iscsi_op_login_response(conn, rsp_pdu, params, iscsi_conn_login_pdu_err_complete); + return 0; + } + } + + rc = iscsi_op_login_rsp_handle(conn, rsp_pdu, ¶ms); + if (rc == SPDK_ISCSI_LOGIN_ERROR_RESPONSE) { + iscsi_op_login_response(conn, rsp_pdu, params, iscsi_conn_login_pdu_err_complete); + return 0; + } + + iscsi_op_login_response(conn, rsp_pdu, params, iscsi_conn_login_pdu_success_complete); + return 0; +} + +static int +iscsi_pdu_hdr_op_text(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + uint32_t task_tag; + uint32_t ExpStatSN; + int F_bit, C_bit; + struct iscsi_bhs_text_req *reqh; + + if (pdu->data_segment_len > iscsi_get_max_immediate_data_size()) { + SPDK_ERRLOG("data segment len(=%zu) > immediate data len(=%"PRIu32")\n", + pdu->data_segment_len, iscsi_get_max_immediate_data_size()); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + reqh = (struct iscsi_bhs_text_req *)&pdu->bhs; + + F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL); + C_bit = !!(reqh->flags & ISCSI_TEXT_CONTINUE); + task_tag = from_be32(&reqh->itt); + ExpStatSN = from_be32(&reqh->exp_stat_sn); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, F=%d, C=%d, ITT=%x, TTT=%x\n", + reqh->immediate, F_bit, C_bit, task_tag, from_be32(&reqh->ttt)); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + pdu->cmd_sn, ExpStatSN, conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + + if (ExpStatSN != conn->StatSN) { +#if 0 + SPDK_ERRLOG("StatSN(%u) error\n", ExpStatSN); + return -1; +#else + /* StarPort have a bug */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u) rewound\n", ExpStatSN); + conn->StatSN = ExpStatSN; +#endif + } + + if (F_bit && C_bit) { + SPDK_ERRLOG("final and continue\n"); + return -1; + } + + /* + * If this is the first text op in a sequence, save the ITT so we can + * compare it against the ITT for subsequent ops in the same sequence. + * If a subsequent text op in same sequence has a different ITT, reject + * that PDU. 
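+	 * The reserved value 0xffffffffU means that no text sequence is in
+	 * progress; it is restored once the final (F bit) text response has
+	 * been built in iscsi_pdu_payload_op_text().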
+ */ + if (conn->sess->current_text_itt == 0xffffffffU) { + conn->sess->current_text_itt = task_tag; + } else if (conn->sess->current_text_itt != task_tag) { + SPDK_ERRLOG("The correct itt is %u, and the current itt is %u...\n", + conn->sess->current_text_itt, task_tag); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + return 0; +} + +static void +iscsi_conn_text_pdu_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + iscsi_conn_params_update(conn); +} + +static int +iscsi_pdu_payload_op_text(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_param *params = NULL; + struct spdk_iscsi_pdu *rsp_pdu; + uint8_t *data; + uint64_t lun; + uint32_t task_tag; + const char *val; + int F_bit, C_bit; + int data_len; + int alloc_len; + int rc; + struct iscsi_bhs_text_req *reqh; + struct iscsi_bhs_text_resp *rsph; + + data_len = 0; + alloc_len = conn->MaxRecvDataSegmentLength; + + reqh = (struct iscsi_bhs_text_req *)&pdu->bhs; + + F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL); + C_bit = !!(reqh->flags & ISCSI_TEXT_CONTINUE); + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + + /* store incoming parameters */ + rc = iscsi_parse_params(¶ms, pdu->data, pdu->data_segment_len, + C_bit, &conn->partial_text_parameter); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_params() failed\n"); + iscsi_param_free(params); + return -1; + } + + data = calloc(1, alloc_len); + if (!data) { + SPDK_ERRLOG("calloc() failed for data segment\n"); + iscsi_param_free(params); + return -ENOMEM; + } + + /* negotiate parameters */ + data_len = iscsi_negotiate_params(conn, ¶ms, + data, alloc_len, data_len); + if (data_len < 0) { + SPDK_ERRLOG("iscsi_negotiate_params() failed\n"); + iscsi_param_free(params); + free(data); + return -1; + } + + /* sendtargets is special case */ + val = iscsi_param_get_val(params, "SendTargets"); + if (val != NULL) { + if (iscsi_param_eq_val(conn->sess->params, + "SessionType", "Discovery")) { + if (strcasecmp(val, "") == 0) { + val = "ALL"; + } + + data_len = iscsi_send_tgts(conn, + conn->initiator_name, + conn->initiator_addr, + val, data, alloc_len, + data_len); + } else { + if (strcasecmp(val, "") == 0) { + val = conn->target->name; + } + + if (strcasecmp(val, "ALL") == 0) { + /* not in discovery session */ + data_len = iscsi_append_text(conn, + "SendTargets", + "Reject", data, + alloc_len, data_len); + } else { + data_len = iscsi_send_tgts(conn, + conn->initiator_name, + conn->initiator_addr, + val, data, alloc_len, + data_len); + } + } + } else { + if (iscsi_param_eq_val(conn->sess->params, "SessionType", "Discovery")) { + iscsi_param_free(params); + free(data); + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + iscsi_param_free(params); + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "Negotiated Params", data, data_len); + + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + free(data); + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_text_resp *)&rsp_pdu->bhs; + + rsp_pdu->data = data; + rsph->opcode = ISCSI_OP_TEXT_RSP; + + if (F_bit) { + rsph->flags |= ISCSI_FLAG_FINAL; + } + + if (C_bit) { + rsph->flags |= ISCSI_TEXT_CONTINUE; + } + + DSET24(rsph->data_segment_len, data_len); + to_be64(&rsph->lun, lun); + to_be32(&rsph->itt, task_tag); + + if (F_bit) { + rsph->ttt = 0xffffffffU; + conn->sess->current_text_itt = 0xffffffffU; + } else { + to_be32(&rsph->ttt, 1 + conn->id); + } + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (reqh->immediate == 0) { + conn->sess->MaxCmdSN++; + 
} + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_text_pdu_complete, conn); + return 0; +} + +static void iscsi_conn_logout_pdu_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->sess == NULL) { + /* + * login failed but initiator still sent a logout rather than + * just closing the TCP connection. + */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Logout(login failed) from %s (%s) on" + " (%s:%s,%d)\n", + conn->initiator_name, conn->initiator_addr, + conn->portal_host, conn->portal_port, conn->pg_tag); + } else if (iscsi_param_eq_val(conn->sess->params, "SessionType", "Normal")) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Logout from %s (%s) on %s tgt_node%d" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->target->name, conn->target->num, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } else { + /* discovery session */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Logout(discovery) from %s (%s) on" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } +} + +static int +iscsi_pdu_hdr_op_logout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_pdu *rsp_pdu; + uint32_t task_tag; + uint32_t ExpStatSN; + int response; + struct iscsi_bhs_logout_req *reqh; + struct iscsi_bhs_logout_resp *rsph; + uint16_t cid; + + reqh = (struct iscsi_bhs_logout_req *)&pdu->bhs; + + cid = from_be16(&reqh->cid); + task_tag = from_be32(&reqh->itt); + ExpStatSN = from_be32(&reqh->exp_stat_sn); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "reason=%d, ITT=%x, cid=%d\n", + reqh->reason, task_tag, cid); + + if (conn->sess != NULL) { + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY && + reqh->reason != ISCSI_LOGOUT_REASON_CLOSE_SESSION) { + SPDK_ERRLOG("Target can accept logout only with reason \"close the session\" " + "on discovery session. 
%d is not acceptable reason.\n", + reqh->reason); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + pdu->cmd_sn, ExpStatSN, conn->StatSN, + conn->sess->ExpCmdSN, conn->sess->MaxCmdSN); + + if (pdu->cmd_sn != conn->sess->ExpCmdSN) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN(%u) might have dropped\n", pdu->cmd_sn); + /* ignore error */ + } + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN=%u, ExpStatSN=%u, StatSN=%u\n", + pdu->cmd_sn, ExpStatSN, conn->StatSN); + } + + if (ExpStatSN != conn->StatSN) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u/%u) might have dropped\n", + ExpStatSN, conn->StatSN); + /* ignore error */ + } + + if (conn->id == cid) { + /* connection or session closed successfully */ + response = 0; + iscsi_conn_logout(conn); + } else { + response = 1; + } + + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_logout_resp *)&rsp_pdu->bhs; + rsp_pdu->data = NULL; + rsph->opcode = ISCSI_OP_LOGOUT_RSP; + rsph->flags |= 0x80; /* bit 0 must be 1 */ + rsph->response = response; + DSET24(rsph->data_segment_len, 0); + to_be32(&rsph->itt, task_tag); + + if (conn->sess != NULL) { + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (conn->sess->connections == 1) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + } else { + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + to_be32(&rsph->exp_cmd_sn, pdu->cmd_sn); + to_be32(&rsph->max_cmd_sn, pdu->cmd_sn); + } + + rsph->time_2_wait = 0; + rsph->time_2_retain = 0; + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_logout_pdu_complete, conn); + + return 0; +} + +static int +iscsi_send_r2t(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, int offset, + int len, uint32_t transfer_tag, uint32_t *R2TSN) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_r2t *rsph; + uint64_t fmt_lun; + + /* R2T PDU */ + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_r2t *)&rsp_pdu->bhs; + rsp_pdu->data = NULL; + rsph->opcode = ISCSI_OP_R2T; + rsph->flags |= 0x80; /* bit 0 is default to 1 */ + fmt_lun = spdk_scsi_lun_id_int_to_fmt(task->lun_id); + to_be64(&rsph->lun, fmt_lun); + to_be32(&rsph->itt, task->tag); + to_be32(&rsph->ttt, transfer_tag); + + to_be32(&rsph->stat_sn, conn->StatSN); + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + to_be32(&rsph->r2t_sn, *R2TSN); + *R2TSN += 1; + + task->r2t_datasn = 0; /* next expected datasn to ack */ + + to_be32(&rsph->buffer_offset, (uint32_t)offset); + to_be32(&rsph->desired_xfer_len, (uint32_t)len); + task->desired_data_transfer_length = (size_t)len; + + /* we need to hold onto this task/cmd because until the PDU has been + * written out */ + rsp_pdu->task = task; + task->scsi.ref++; + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + + return 0; +} + +/* This function is used to remove the r2t pdu from snack_pdu_list by < task, r2t_sn> info */ +static struct spdk_iscsi_pdu * +iscsi_remove_r2t_pdu_from_snack_list(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + uint32_t r2t_sn) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_r2t *r2t_header; + + TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) { + if (pdu->bhs.opcode == 
ISCSI_OP_R2T) { + r2t_header = (struct iscsi_bhs_r2t *)&pdu->bhs; + if (pdu->task == task && + from_be32(&r2t_header->r2t_sn) == r2t_sn) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + return pdu; + } + } + } + + return NULL; +} + +/* This function is used re-send the r2t packet */ +static int +iscsi_send_r2t_recovery(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, uint32_t r2t_sn, + bool send_new_r2tsn) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_r2t *rsph; + uint32_t transfer_len; + uint32_t len; + int rc; + + /* remove the r2t pdu from the snack_list */ + pdu = iscsi_remove_r2t_pdu_from_snack_list(conn, task, r2t_sn); + if (!pdu) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "No pdu is found\n"); + return -1; + } + + /* flag + * false: only need to re-send the old r2t with changing statsn + * true: we send a r2t with new r2tsn + */ + if (!send_new_r2tsn) { + to_be32(&pdu->bhs.stat_sn, conn->StatSN); + iscsi_conn_write_pdu(conn, pdu, iscsi_conn_pdu_generic_complete, NULL); + } else { + rsph = (struct iscsi_bhs_r2t *)&pdu->bhs; + transfer_len = from_be32(&rsph->desired_xfer_len); + + /* still need to increase the acked r2tsn */ + task->acked_r2tsn++; + len = spdk_min(conn->sess->MaxBurstLength, + (transfer_len - task->next_expected_r2t_offset)); + + /* remove the old_r2t_pdu */ + iscsi_conn_free_pdu(conn, pdu); + + /* re-send a new r2t pdu */ + rc = iscsi_send_r2t(conn, task, task->next_expected_r2t_offset, + len, task->ttt, &task->R2TSN); + if (rc < 0) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + return 0; +} + +static int +add_transfer_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + uint32_t transfer_len; + size_t max_burst_len; + size_t segment_len; + size_t data_len; + int len; + int rc; + int data_out_req; + + transfer_len = task->scsi.transfer_len; + data_len = iscsi_task_get_pdu(task)->data_segment_len; + max_burst_len = conn->sess->MaxBurstLength; + segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; + data_out_req = 1 + (transfer_len - data_len - 1) / segment_len; + task->data_out_cnt = data_out_req; + + /* + * If we already have too many tasks using R2T, then queue this task + * and start sending R2T for it after some of the tasks using R2T/data + * out buffers complete. + */ + if (conn->pending_r2t >= DEFAULT_MAXR2T) { + TAILQ_INSERT_TAIL(&conn->queued_r2t_tasks, task, link); + return 0; + } + + conn->data_out_cnt += data_out_req; + conn->pending_r2t++; + + task->next_expected_r2t_offset = data_len; + task->current_r2t_length = 0; + task->R2TSN = 0; + /* According to RFC3720 10.8.5, 0xffffffff is + * reserved for TTT in R2T. + */ + if (++conn->ttt == 0xffffffffu) { + conn->ttt = 0; + } + task->ttt = conn->ttt; + + while (data_len != transfer_len) { + len = spdk_min(max_burst_len, (transfer_len - data_len)); + rc = iscsi_send_r2t(conn, task, data_len, len, + task->ttt, &task->R2TSN); + if (rc < 0) { + SPDK_ERRLOG("iscsi_send_r2t() failed\n"); + return rc; + } + data_len += len; + task->next_r2t_offset = data_len; + task->outstanding_r2t++; + if (conn->sess->MaxOutstandingR2T == task->outstanding_r2t) { + break; + } + } + + TAILQ_INSERT_TAIL(&conn->active_r2t_tasks, task, link); + task->is_r2t_active = true; + return 0; +} + +/* If there are additional large writes queued for R2Ts, start them now. + * This is called when a large write is just completed or when multiple LUNs + * are attached and large write tasks for the specific LUN are cleared. 
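+ * Queued tasks are moved to the active list via add_transfer_task()
+ * until the connection reaches the DEFAULT_MAXR2T limit again.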
+ */ +static void +start_queued_transfer_tasks(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_task *task, *tmp; + + TAILQ_FOREACH_SAFE(task, &conn->queued_r2t_tasks, link, tmp) { + if (conn->pending_r2t < DEFAULT_MAXR2T) { + TAILQ_REMOVE(&conn->queued_r2t_tasks, task, link); + add_transfer_task(conn, task); + } else { + break; + } + } +} + +bool +iscsi_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t task_tag) +{ + struct spdk_iscsi_task *task, *tmp; + + TAILQ_FOREACH_SAFE(task, &conn->active_r2t_tasks, link, tmp) { + if (task->tag == task_tag) { + assert(conn->data_out_cnt >= task->data_out_cnt); + conn->data_out_cnt -= task->data_out_cnt; + + conn->pending_r2t--; + + assert(task->is_r2t_active == true); + TAILQ_REMOVE(&conn->active_r2t_tasks, task, link); + task->is_r2t_active = false; + iscsi_task_put(task); + + start_queued_transfer_tasks(conn); + return true; + } + } + return false; +} + +void iscsi_clear_all_transfer_task(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun, + struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task, *task_tmp; + struct spdk_iscsi_pdu *pdu_tmp; + + TAILQ_FOREACH_SAFE(task, &conn->active_r2t_tasks, link, task_tmp) { + pdu_tmp = iscsi_task_get_pdu(task); + if ((lun == NULL || lun == task->scsi.lun) && + (pdu == NULL || spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn))) { + task->outstanding_r2t = 0; + task->next_r2t_offset = 0; + task->next_expected_r2t_offset = 0; + assert(conn->data_out_cnt >= task->data_out_cnt); + conn->data_out_cnt -= task->data_out_cnt; + conn->pending_r2t--; + + TAILQ_REMOVE(&conn->active_r2t_tasks, task, link); + task->is_r2t_active = false; + if (lun != NULL && spdk_scsi_lun_is_removing(lun)) { + spdk_scsi_task_process_null_lun(&task->scsi); + iscsi_task_response(conn, task); + } + iscsi_task_put(task); + } + } + + TAILQ_FOREACH_SAFE(task, &conn->queued_r2t_tasks, link, task_tmp) { + pdu_tmp = iscsi_task_get_pdu(task); + if ((lun == NULL || lun == task->scsi.lun) && + (pdu == NULL || spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn))) { + TAILQ_REMOVE(&conn->queued_r2t_tasks, task, link); + task->is_r2t_active = false; + if (lun != NULL && spdk_scsi_lun_is_removing(lun)) { + spdk_scsi_task_process_null_lun(&task->scsi); + iscsi_task_response(conn, task); + } + iscsi_task_put(task); + } + } + + start_queued_transfer_tasks(conn); +} + +static struct spdk_iscsi_task * +get_transfer_task(struct spdk_iscsi_conn *conn, uint32_t transfer_tag) +{ + struct spdk_iscsi_task *task; + + TAILQ_FOREACH(task, &conn->active_r2t_tasks, link) { + if (task->ttt == transfer_tag) { + return task; + } + } + + return NULL; +} + +static void +iscsi_conn_datain_pdu_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + iscsi_conn_handle_queued_datain_tasks(conn); +} + +static int +iscsi_send_datain(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, int datain_flag, + int residual_len, int offset, int DataSN, int len) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_data_in *rsph; + uint32_t task_tag; + uint32_t transfer_tag; + int F_bit, U_bit, O_bit, S_bit; + struct spdk_iscsi_task *primary; + struct spdk_scsi_lun *lun_dev; + + primary = iscsi_task_get_primary(task); + + /* DATA PDU */ + rsp_pdu = iscsi_get_pdu(conn); + rsph = (struct iscsi_bhs_data_in *)&rsp_pdu->bhs; + rsp_pdu->data = task->scsi.iovs[0].iov_base + offset; + rsp_pdu->data_buf_len = task->scsi.iovs[0].iov_len - offset; + rsp_pdu->data_from_mempool = true; + + task_tag = task->tag; + transfer_tag = 0xffffffffU; + + F_bit = datain_flag & 
ISCSI_FLAG_FINAL; + O_bit = datain_flag & ISCSI_DATAIN_OVERFLOW; + U_bit = datain_flag & ISCSI_DATAIN_UNDERFLOW; + S_bit = datain_flag & ISCSI_DATAIN_STATUS; + + /* + * we need to hold onto this task/cmd because until the + * PDU has been written out + */ + rsp_pdu->task = task; + task->scsi.ref++; + + rsph->opcode = ISCSI_OP_SCSI_DATAIN; + + if (F_bit) { + rsph->flags |= ISCSI_FLAG_FINAL; + } + + /* we leave the A_bit clear */ + + if (F_bit && S_bit) { + if (O_bit) { + rsph->flags |= ISCSI_DATAIN_OVERFLOW; + } + + if (U_bit) { + rsph->flags |= ISCSI_DATAIN_UNDERFLOW; + } + } + + if (S_bit) { + rsph->flags |= ISCSI_DATAIN_STATUS; + rsph->status = task->scsi.status; + } + + DSET24(rsph->data_segment_len, len); + + to_be32(&rsph->itt, task_tag); + to_be32(&rsph->ttt, transfer_tag); + + if (S_bit) { + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + } + + if (F_bit && S_bit && !iscsi_task_is_immediate(primary)) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + to_be32(&rsph->data_sn, DataSN); + + if (conn->sess->ErrorRecoveryLevel >= 1) { + primary->datain_datasn = DataSN; + } + DataSN++; + + if (task->parent) { + offset += primary->scsi.data_transferred; + } + to_be32(&rsph->buffer_offset, (uint32_t)offset); + task->scsi.offset = offset; + + if (F_bit && S_bit) { + to_be32(&rsph->res_cnt, residual_len); + } + + lun_dev = spdk_scsi_dev_get_lun(conn->dev, task->lun_id); + if (spdk_likely(lun_dev != NULL)) { + if (spdk_unlikely(spdk_scsi_lun_get_dif_ctx(lun_dev, &task->scsi, + &rsp_pdu->dif_ctx))) { + rsp_pdu->dif_insert_or_strip = true; + } + } + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_datain_pdu_complete, conn); + + return DataSN; +} + +static int +iscsi_transfer_in(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + uint32_t DataSN; + uint32_t transfer_len; + uint32_t data_len; + uint32_t segment_len; + uint32_t offset; + uint32_t residual_len = 0; + int sent_status; + uint32_t len; + int datain_flag = 0; + int datain_seq_cnt; + int i; + uint32_t sequence_end; + struct spdk_iscsi_task *primary; + + primary = iscsi_task_get_primary(task); + segment_len = conn->MaxRecvDataSegmentLength; + data_len = task->scsi.data_transferred; + transfer_len = task->scsi.length; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + return 0; + } + + if (data_len < transfer_len) { + /* underflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Underflow %u/%u\n", data_len, transfer_len); + residual_len = transfer_len - data_len; + transfer_len = data_len; + datain_flag |= ISCSI_DATAIN_UNDERFLOW; + } else if (data_len > transfer_len) { + /* overflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Overflow %u/%u\n", data_len, transfer_len); + residual_len = data_len - transfer_len; + datain_flag |= ISCSI_DATAIN_OVERFLOW; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer %u\n", transfer_len); + residual_len = 0; + } + + DataSN = primary->datain_datasn; + sent_status = 0; + + /* calculate the number of sequences for all data-in pdus */ + datain_seq_cnt = 1 + ((transfer_len - 1) / (int)conn->sess->MaxBurstLength); + for (i = 0; i < datain_seq_cnt; i++) { + offset = i * conn->sess->MaxBurstLength; + sequence_end = spdk_min(((i + 1) * conn->sess->MaxBurstLength), + transfer_len); + + /* send data splitted by segment_len */ + for (; offset < sequence_end; offset += segment_len) { + len = spdk_min(segment_len, (sequence_end - offset)); + + datain_flag &= ~ISCSI_FLAG_FINAL; + datain_flag &= ~ISCSI_DATAIN_STATUS; + + 
if (offset + len == sequence_end) { + /* last PDU in a sequence */ + datain_flag |= ISCSI_FLAG_FINAL; + if (task->scsi.sense_data_len == 0) { + /* The last pdu in all data-in pdus */ + if ((offset + len) == transfer_len && + (primary->bytes_completed == primary->scsi.transfer_len)) { + datain_flag |= ISCSI_DATAIN_STATUS; + sent_status = 1; + } + } + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer=%d, Offset=%d, Len=%d\n", + sequence_end, offset, len); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, DataSN=%u, Offset=%u, Len=%d\n", + conn->StatSN, DataSN, offset, len); + + DataSN = iscsi_send_datain(conn, task, datain_flag, residual_len, + offset, DataSN, len); + } + } + + if (task != primary) { + primary->scsi.data_transferred += task->scsi.data_transferred; + } + primary->datain_datasn = DataSN; + + return sent_status; +} + +void iscsi_task_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_scsi_resp *rsph; + uint32_t task_tag; + uint32_t transfer_len; + size_t residual_len; + size_t data_len; + int O_bit, U_bit; + int rc; + struct spdk_iscsi_task *primary; + + primary = iscsi_task_get_primary(task); + + transfer_len = primary->scsi.transfer_len; + task_tag = task->tag; + + /* transfer data from logical unit */ + /* (direction is view of initiator side) */ + if (iscsi_task_is_read(primary)) { + rc = iscsi_transfer_in(conn, task); + if (rc > 0) { + /* sent status by last DATAIN PDU */ + return; + } + + if (primary->bytes_completed != primary->scsi.transfer_len) { + return; + } + } + + O_bit = U_bit = 0; + residual_len = 0; + data_len = primary->scsi.data_transferred; + + if ((transfer_len != 0) && + (task->scsi.status == SPDK_SCSI_STATUS_GOOD)) { + if (data_len < transfer_len) { + /* underflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Underflow %zu/%u\n", data_len, transfer_len); + residual_len = transfer_len - data_len; + U_bit = 1; + } else if (data_len > transfer_len) { + /* overflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Overflow %zu/%u\n", data_len, transfer_len); + residual_len = data_len - transfer_len; + O_bit = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer %u\n", transfer_len); + } + } + + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + assert(rsp_pdu != NULL); + rsph = (struct iscsi_bhs_scsi_resp *)&rsp_pdu->bhs; + assert(task->scsi.sense_data_len <= sizeof(rsp_pdu->sense.data)); + memcpy(rsp_pdu->sense.data, task->scsi.sense_data, task->scsi.sense_data_len); + to_be16(&rsp_pdu->sense.length, task->scsi.sense_data_len); + rsp_pdu->data = (uint8_t *)&rsp_pdu->sense; + rsp_pdu->data_from_mempool = true; + + /* + * we need to hold onto this task/cmd because until the + * PDU has been written out + */ + rsp_pdu->task = task; + task->scsi.ref++; + + rsph->opcode = ISCSI_OP_SCSI_RSP; + rsph->flags |= 0x80; /* bit 0 is default to 1 */ + + if (O_bit) { + rsph->flags |= ISCSI_SCSI_OVERFLOW; + } + + if (U_bit) { + rsph->flags |= ISCSI_SCSI_UNDERFLOW; + } + + rsph->status = task->scsi.status; + if (task->scsi.sense_data_len) { + /* SenseLength (2 bytes) + SenseData */ + DSET24(rsph->data_segment_len, 2 + task->scsi.sense_data_len); + } + to_be32(&rsph->itt, task_tag); + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (!iscsi_task_is_immediate(primary)) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + to_be32(&rsph->bi_read_res_cnt, 0); + to_be32(&rsph->res_cnt, residual_len); + + iscsi_conn_write_pdu(conn, 
rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); +} + +/* + * This function compare the input pdu's bhs with the pdu's bhs associated by + * active_r2t_tasks and queued_r2t_tasks in a connection + */ +static bool +iscsi_compare_pdu_bhs_within_existed_r2t_tasks(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task; + + TAILQ_FOREACH(task, &conn->active_r2t_tasks, link) { + if (!memcmp(&pdu->bhs, iscsi_task_get_bhs(task), ISCSI_BHS_LEN)) { + return true; + } + } + + TAILQ_FOREACH(task, &conn->queued_r2t_tasks, link) { + if (!memcmp(&pdu->bhs, iscsi_task_get_bhs(task), ISCSI_BHS_LEN)) { + return true; + } + } + + return false; +} + +void +iscsi_queue_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + spdk_trace_record(TRACE_ISCSI_TASK_QUEUE, conn->id, task->scsi.length, + (uintptr_t)task, (uintptr_t)task->pdu); + task->is_queued = true; + spdk_scsi_dev_queue_task(conn->dev, &task->scsi); +} + +static int +iscsi_pdu_payload_op_scsi_read(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + if (task->scsi.transfer_len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE) { + task->parent = NULL; + task->scsi.offset = 0; + task->scsi.length = task->scsi.transfer_len; + spdk_scsi_task_set_data(&task->scsi, NULL, 0); + + iscsi_queue_task(conn, task); + return 0; + } else { + TAILQ_INIT(&task->subtask_list); + task->current_datain_offset = 0; + TAILQ_INSERT_TAIL(&conn->queued_datain_tasks, task, link); + + return iscsi_conn_handle_queued_datain_tasks(conn); + } +} + +static int +iscsi_pdu_payload_op_scsi_write(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_scsi_req *reqh; + uint32_t transfer_len; + uint32_t scsi_data_len; + int rc; + + pdu = iscsi_task_get_pdu(task); + reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs; + + transfer_len = task->scsi.transfer_len; + + if (spdk_likely(!pdu->dif_insert_or_strip)) { + scsi_data_len = pdu->data_segment_len; + } else { + scsi_data_len = pdu->data_buf_len; + } + + if (reqh->final_bit && + pdu->data_segment_len < transfer_len) { + /* needs R2T */ + rc = add_transfer_task(conn, task); + if (rc < 0) { + SPDK_ERRLOG("add_transfer_task() failed\n"); + iscsi_task_put(task); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + /* Non-immediate writes */ + if (pdu->data_segment_len == 0) { + return 0; + } else { + /* we are doing the first partial write task */ + task->scsi.ref++; + spdk_scsi_task_set_data(&task->scsi, pdu->data, scsi_data_len); + task->scsi.length = pdu->data_segment_len; + } + } + + if (pdu->data_segment_len == transfer_len) { + /* we are doing small writes with no R2T */ + spdk_scsi_task_set_data(&task->scsi, pdu->data, scsi_data_len); + task->scsi.length = transfer_len; + } + + iscsi_queue_task(conn, task); + return 0; +} + +static int +iscsi_pdu_hdr_op_scsi(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task; + struct spdk_scsi_dev *dev; + uint8_t *cdb; + uint64_t lun; + uint32_t task_tag; + uint32_t transfer_len; + int R_bit, W_bit; + int lun_i; + struct iscsi_bhs_scsi_req *reqh; + + if (conn->sess->session_type != SESSION_TYPE_NORMAL) { + SPDK_ERRLOG("ISCSI_OP_SCSI not allowed in discovery and invalid session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs; + + R_bit = reqh->read_bit; + W_bit = reqh->write_bit; + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + transfer_len = from_be32(&reqh->expected_data_xfer_len); + cdb = 
reqh->cdb; + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "CDB", cdb, 16); + + task = iscsi_task_get(conn, NULL, iscsi_task_cpl); + if (!task) { + SPDK_ERRLOG("Unable to acquire task\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + iscsi_task_associate_pdu(task, pdu); + lun_i = spdk_scsi_lun_id_fmt_to_int(lun); + task->lun_id = lun_i; + dev = conn->dev; + task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_i); + + if ((R_bit != 0) && (W_bit != 0)) { + SPDK_ERRLOG("Bidirectional CDB is not supported\n"); + iscsi_task_put(task); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + task->scsi.cdb = cdb; + task->tag = task_tag; + task->scsi.transfer_len = transfer_len; + task->scsi.target_port = conn->target_port; + task->scsi.initiator_port = conn->initiator_port; + task->parent = NULL; + task->rsp_scsi_status = SPDK_SCSI_STATUS_GOOD; + + if (task->scsi.lun == NULL) { + spdk_scsi_task_process_null_lun(&task->scsi); + iscsi_task_cpl(&task->scsi); + return 0; + } + + /* no bi-directional support */ + if (R_bit) { + task->scsi.dxfer_dir = SPDK_SCSI_DIR_FROM_DEV; + } else if (W_bit) { + task->scsi.dxfer_dir = SPDK_SCSI_DIR_TO_DEV; + + if ((conn->sess->ErrorRecoveryLevel >= 1) && + (iscsi_compare_pdu_bhs_within_existed_r2t_tasks(conn, pdu))) { + iscsi_task_response(conn, task); + iscsi_task_put(task); + return 0; + } + + if (pdu->data_segment_len > iscsi_get_max_immediate_data_size()) { + SPDK_ERRLOG("data segment len(=%zu) > immediate data len(=%"PRIu32")\n", + pdu->data_segment_len, iscsi_get_max_immediate_data_size()); + iscsi_task_put(task); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (pdu->data_segment_len > transfer_len) { + SPDK_ERRLOG("data segment len(=%zu) > task transfer len(=%d)\n", + pdu->data_segment_len, transfer_len); + iscsi_task_put(task); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + /* check the ImmediateData and also pdu->data_segment_len */ + if ((!conn->sess->ImmediateData && (pdu->data_segment_len > 0)) || + (pdu->data_segment_len > conn->sess->FirstBurstLength)) { + iscsi_task_put(task); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (spdk_unlikely(spdk_scsi_lun_get_dif_ctx(task->scsi.lun, &task->scsi, &pdu->dif_ctx))) { + pdu->dif_insert_or_strip = true; + } + } else { + /* neither R nor W bit set */ + task->scsi.dxfer_dir = SPDK_SCSI_DIR_NONE; + if (transfer_len > 0) { + iscsi_task_put(task); + SPDK_ERRLOG("Reject scsi cmd with EDTL > 0 but (R | W) == 0\n"); + return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD); + } + } + + pdu->task = task; + return 0; +} + +static int +iscsi_pdu_payload_op_scsi(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task; + + if (pdu->task == NULL) { + return 0; + } + + task = pdu->task; + + if (spdk_scsi_dev_get_lun(conn->dev, task->lun_id) == NULL) { + spdk_scsi_task_process_null_lun(&task->scsi); + iscsi_task_cpl(&task->scsi); + return 0; + } + + switch (task->scsi.dxfer_dir) { + case SPDK_SCSI_DIR_FROM_DEV: + return iscsi_pdu_payload_op_scsi_read(conn, task); + case SPDK_SCSI_DIR_TO_DEV: + return iscsi_pdu_payload_op_scsi_write(conn, task); + case SPDK_SCSI_DIR_NONE: + iscsi_queue_task(conn, task); + return 0; + default: + assert(false); + iscsi_task_put(task); + break; + } + + return SPDK_ISCSI_CONNECTION_FATAL; +} + +static void +abort_transfer_task_in_task_mgmt_resp(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *pdu; + + pdu = iscsi_task_get_pdu(task); + + switch (task->scsi.function) { 
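+	/* Drop any outstanding R2T/data-out state for the affected task(s)
+	 * before the task management response is sent.
+	 */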
+ /* abort task identified by Reference Task Tag field */ + case ISCSI_TASK_FUNC_ABORT_TASK: + iscsi_del_transfer_task(conn, task->scsi.abort_id); + break; + + /* abort all tasks issued via this session on the LUN */ + case ISCSI_TASK_FUNC_ABORT_TASK_SET: + iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu); + break; + + case ISCSI_TASK_FUNC_LOGICAL_UNIT_RESET: + iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu); + break; + } +} + +void +iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_task_req *reqh; + struct iscsi_bhs_task_resp *rsph; + + if (task->pdu == NULL) { + /* + * This was an internally generated task management command, + * usually from LUN cleanup when a connection closes. + */ + return; + } + + reqh = (struct iscsi_bhs_task_req *)&task->pdu->bhs; + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + rsph = (struct iscsi_bhs_task_resp *)&rsp_pdu->bhs; + rsph->opcode = ISCSI_OP_TASK_RSP; + rsph->flags |= 0x80; /* bit 0 default to 1 */ + switch (task->scsi.response) { + case SPDK_SCSI_TASK_MGMT_RESP_COMPLETE: + abort_transfer_task_in_task_mgmt_resp(conn, task); + rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE; + break; + case SPDK_SCSI_TASK_MGMT_RESP_SUCCESS: + abort_transfer_task_in_task_mgmt_resp(conn, task); + rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE; + break; + case SPDK_SCSI_TASK_MGMT_RESP_REJECT: + rsph->response = ISCSI_TASK_FUNC_REJECTED; + break; + case SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN: + rsph->response = ISCSI_TASK_FUNC_RESP_LUN_NOT_EXIST; + break; + case SPDK_SCSI_TASK_MGMT_RESP_TARGET_FAILURE: + rsph->response = ISCSI_TASK_FUNC_REJECTED; + break; + case SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED: + rsph->response = ISCSI_TASK_FUNC_RESP_FUNC_NOT_SUPPORTED; + break; + } + rsph->itt = reqh->itt; + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (reqh->immediate == 0) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); +} + +static void +iscsi_queue_mgmt_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + struct spdk_scsi_lun *lun; + + lun = spdk_scsi_dev_get_lun(conn->dev, task->lun_id); + if (lun == NULL) { + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN; + iscsi_task_mgmt_response(conn, task); + iscsi_task_put(task); + return; + } + + spdk_scsi_dev_queue_mgmt_task(conn->dev, &task->scsi); +} + +static int +_iscsi_op_abort_task(void *arg) +{ + struct spdk_iscsi_task *task = arg; + int rc; + + rc = iscsi_conn_abort_queued_datain_task(task->conn, task->scsi.abort_id); + if (rc != 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&task->mgmt_poller); + iscsi_queue_mgmt_task(task->conn, task); + return SPDK_POLLER_BUSY; +} + +static void +iscsi_op_abort_task(struct spdk_iscsi_task *task, uint32_t ref_task_tag) +{ + task->scsi.abort_id = ref_task_tag; + task->scsi.function = SPDK_SCSI_TASK_FUNC_ABORT_TASK; + task->mgmt_poller = SPDK_POLLER_REGISTER(_iscsi_op_abort_task, task, 10); +} + +static int +_iscsi_op_abort_task_set(void *arg) +{ + struct spdk_iscsi_task *task = arg; + int rc; + + rc = iscsi_conn_abort_queued_datain_tasks(task->conn, task->scsi.lun, + task->pdu); + if (rc != 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&task->mgmt_poller); + iscsi_queue_mgmt_task(task->conn, task); + return 
SPDK_POLLER_BUSY; +} + +void +iscsi_op_abort_task_set(struct spdk_iscsi_task *task, uint8_t function) +{ + task->scsi.function = function; + task->mgmt_poller = SPDK_POLLER_REGISTER(_iscsi_op_abort_task_set, task, 10); +} + +static int +iscsi_pdu_hdr_op_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_bhs_task_req *reqh; + uint64_t lun; + uint32_t task_tag; + uint32_t ref_task_tag; + uint8_t function; + int lun_i; + struct spdk_iscsi_task *task; + struct spdk_scsi_dev *dev; + + if (conn->sess->session_type != SESSION_TYPE_NORMAL) { + SPDK_ERRLOG("ISCSI_OP_TASK not allowed in discovery and invalid session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_task_req *)&pdu->bhs; + function = reqh->flags & ISCSI_TASK_FUNCTION_MASK; + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + ref_task_tag = from_be32(&reqh->ref_task_tag); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, func=%d, ITT=%x, ref TT=%x, LUN=0x%16.16"PRIx64"\n", + reqh->immediate, function, task_tag, ref_task_tag, lun); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + conn->StatSN, conn->sess->ExpCmdSN, conn->sess->MaxCmdSN); + + lun_i = spdk_scsi_lun_id_fmt_to_int(lun); + dev = conn->dev; + + task = iscsi_task_get(conn, NULL, iscsi_task_mgmt_cpl); + if (!task) { + SPDK_ERRLOG("Unable to acquire task\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + iscsi_task_associate_pdu(task, pdu); + task->scsi.target_port = conn->target_port; + task->scsi.initiator_port = conn->initiator_port; + task->tag = task_tag; + task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_i); + task->lun_id = lun_i; + + if (task->scsi.lun == NULL) { + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN; + iscsi_task_mgmt_response(conn, task); + iscsi_task_put(task); + return 0; + } + + switch (function) { + /* abort task identified by Referenced Task Tag field */ + case ISCSI_TASK_FUNC_ABORT_TASK: + SPDK_NOTICELOG("ABORT_TASK\n"); + + iscsi_op_abort_task(task, ref_task_tag); + return 0; + + /* abort all tasks issued via this session on the LUN */ + case ISCSI_TASK_FUNC_ABORT_TASK_SET: + SPDK_NOTICELOG("ABORT_TASK_SET\n"); + + iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET); + return 0; + + case ISCSI_TASK_FUNC_CLEAR_TASK_SET: + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_NOTICELOG("CLEAR_TASK_SET (Unsupported)\n"); + break; + + case ISCSI_TASK_FUNC_CLEAR_ACA: + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_NOTICELOG("CLEAR_ACA (Unsupported)\n"); + break; + + case ISCSI_TASK_FUNC_LOGICAL_UNIT_RESET: + SPDK_NOTICELOG("LOGICAL_UNIT_RESET\n"); + + iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + return 0; + + case ISCSI_TASK_FUNC_TARGET_WARM_RESET: + SPDK_NOTICELOG("TARGET_WARM_RESET (Unsupported)\n"); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + + case ISCSI_TASK_FUNC_TARGET_COLD_RESET: + SPDK_NOTICELOG("TARGET_COLD_RESET (Unsupported)\n"); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + + case ISCSI_TASK_FUNC_TASK_REASSIGN: + SPDK_NOTICELOG("TASK_REASSIGN (Unsupported)\n"); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + + default: + SPDK_ERRLOG("unsupported function %d\n", function); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT; + break; + } + + iscsi_task_mgmt_response(conn, task); + iscsi_task_put(task); + return 0; +} + +static 
int +iscsi_pdu_hdr_op_nopout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_bhs_nop_out *reqh; + uint32_t task_tag; + uint32_t transfer_tag; + int I_bit; + + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("ISCSI_OP_NOPOUT not allowed in discovery session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_nop_out *)&pdu->bhs; + I_bit = reqh->immediate; + + if (pdu->data_segment_len > SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) { + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + task_tag = from_be32(&reqh->itt); + transfer_tag = from_be32(&reqh->ttt); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, ITT=%x, TTT=%x\n", + I_bit, task_tag, transfer_tag); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + pdu->cmd_sn, conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + + if (transfer_tag != 0xFFFFFFFF && transfer_tag != (uint32_t)conn->id) { + SPDK_ERRLOG("invalid transfer tag 0x%x\n", transfer_tag); + /* + * Technically we should probably fail the connection here, but for now + * just print the error message and continue. + */ + } + + if (task_tag == 0xffffffffU && I_bit == 0) { + SPDK_ERRLOG("got NOPOUT ITT=0xffffffff, I=0\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + return 0; +} + +static int +iscsi_pdu_payload_op_nopout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_nop_out *reqh; + struct iscsi_bhs_nop_in *rsph; + uint8_t *data; + uint64_t lun; + uint32_t task_tag; + int I_bit; + int data_len; + + reqh = (struct iscsi_bhs_nop_out *)&pdu->bhs; + I_bit = reqh->immediate; + + data_len = pdu->data_segment_len; + if (data_len > conn->MaxRecvDataSegmentLength) { + data_len = conn->MaxRecvDataSegmentLength; + } + + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + + /* + * We don't actually check to see if this is a response to the NOP-In + * that we sent. 
Our goal is to just verify that the initiator is + * alive and responding to commands, not to verify that it tags + * NOP-Outs correctly + */ + conn->nop_outstanding = false; + + if (task_tag == 0xffffffffU) { + assert(I_bit == 1); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got NOPOUT ITT=0xffffffff\n"); + return 0; + } + + data = calloc(1, data_len); + if (!data) { + SPDK_ERRLOG("calloc() failed for ping data\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + /* response of NOPOUT */ + if (data_len > 0) { + /* copy ping data */ + memcpy(data, pdu->data, data_len); + } + + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + assert(rsp_pdu != NULL); + + rsph = (struct iscsi_bhs_nop_in *)&rsp_pdu->bhs; + rsp_pdu->data = data; + rsph->opcode = ISCSI_OP_NOPIN; + rsph->flags |= 0x80; /* bit 0 default to 1 */ + DSET24(rsph->data_segment_len, data_len); + to_be64(&rsph->lun, lun); + to_be32(&rsph->itt, task_tag); + to_be32(&rsph->ttt, 0xffffffffU); + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (I_bit == 0) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + conn->last_nopin = spdk_get_ticks(); + + return 0; +} + +/* This function returns the spdk_scsi_task by searching the snack list via + * task transfertag and the pdu's opcode + */ +static struct spdk_iscsi_task * +get_scsi_task_from_ttt(struct spdk_iscsi_conn *conn, uint32_t transfer_tag) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_data_in *datain_bhs; + + TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) { + if (pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + datain_bhs = (struct iscsi_bhs_data_in *)&pdu->bhs; + if (from_be32(&datain_bhs->ttt) == transfer_tag) { + return pdu->task; + } + } + } + + return NULL; +} + +/* This function returns the spdk_scsi_task by searching the snack list via + * initiator task tag and the pdu's opcode + */ +static struct spdk_iscsi_task * +get_scsi_task_from_itt(struct spdk_iscsi_conn *conn, + uint32_t task_tag, enum iscsi_op opcode) +{ + struct spdk_iscsi_pdu *pdu; + + TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) { + if (pdu->bhs.opcode == opcode && + pdu->task != NULL && + pdu->task->tag == task_tag) { + return pdu->task; + } + } + + return NULL; +} + +/* This function is used to handle the r2t snack */ +static int +iscsi_handle_r2t_snack(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_pdu *pdu, uint32_t beg_run, + uint32_t run_length, int32_t task_tag) +{ + int32_t last_r2tsn; + int i; + + if (beg_run < task->acked_r2tsn) { + SPDK_ERRLOG("ITT: 0x%08x, R2T SNACK requests retransmission of" + "R2TSN: from 0x%08x to 0x%08x. 
But it has already" + "ack to R2TSN:0x%08x, protocol error.\n", + task_tag, beg_run, (beg_run + run_length), + (task->acked_r2tsn - 1)); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (run_length) { + if ((beg_run + run_length) > task->R2TSN) { + SPDK_ERRLOG("ITT: 0x%08x, received R2T SNACK with" + "BegRun: 0x%08x, RunLength: 0x%08x, exceeds" + "current R2TSN: 0x%08x, protocol error.\n", + task_tag, beg_run, run_length, + task->R2TSN); + + return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD); + } + last_r2tsn = (beg_run + run_length); + } else { + last_r2tsn = task->R2TSN; + } + + for (i = beg_run; i < last_r2tsn; i++) { + if (iscsi_send_r2t_recovery(conn, task, i, false) < 0) { + SPDK_ERRLOG("The r2t_sn=%d of r2t_task=%p is not sent\n", i, task); + } + } + return 0; +} + +/* This function is used to recover the data in packet */ +static int +iscsi_handle_recovery_datain(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_pdu *pdu, uint32_t beg_run, + uint32_t run_length, uint32_t task_tag) +{ + struct spdk_iscsi_pdu *old_pdu, *pdu_temp; + uint32_t i; + struct iscsi_bhs_data_in *datain_header; + uint32_t last_statsn; + + task = iscsi_task_get_primary(task); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_handle_recovery_datain\n"); + + if (beg_run < task->acked_data_sn) { + SPDK_ERRLOG("ITT: 0x%08x, DATA IN SNACK requests retransmission of" + "DATASN: from 0x%08x to 0x%08x but already acked to " + "DATASN: 0x%08x protocol error\n", + task_tag, beg_run, + (beg_run + run_length), (task->acked_data_sn - 1)); + + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (run_length == 0) { + /* as the DataSN begins at 0 */ + run_length = task->datain_datasn + 1; + } + + if ((beg_run + run_length - 1) > task->datain_datasn) { + SPDK_ERRLOG("Initiator requests BegRun: 0x%08x, RunLength:" + "0x%08x greater than maximum DataSN: 0x%08x.\n", + beg_run, run_length, task->datain_datasn); + + return -1; + } else { + last_statsn = beg_run + run_length - 1; + } + + for (i = beg_run; i <= last_statsn; i++) { + TAILQ_FOREACH_SAFE(old_pdu, &conn->snack_pdu_list, tailq, pdu_temp) { + if (old_pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + datain_header = (struct iscsi_bhs_data_in *)&old_pdu->bhs; + if (from_be32(&datain_header->itt) == task_tag && + from_be32(&datain_header->data_sn) == i) { + TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq); + iscsi_conn_write_pdu(conn, old_pdu, old_pdu->cb_fn, old_pdu->cb_arg); + break; + } + } + } + } + return 0; +} + +/* This function is used to handle the status snack */ +static int +iscsi_handle_status_snack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + uint32_t beg_run; + uint32_t run_length; + struct iscsi_bhs_snack_req *reqh; + uint32_t i; + uint32_t last_statsn; + bool found_pdu; + struct spdk_iscsi_pdu *old_pdu; + + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + beg_run = from_be32(&reqh->beg_run); + run_length = from_be32(&reqh->run_len); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d, run_length=%d, conn->StatSN=" + "%d, conn->exp_statsn=%d\n", beg_run, run_length, + conn->StatSN, conn->exp_statsn); + + if (!beg_run) { + beg_run = conn->exp_statsn; + } else if (beg_run < conn->exp_statsn) { + SPDK_ERRLOG("Got Status SNACK Begrun: 0x%08x, RunLength: 0x%08x " + "but already got ExpStatSN: 0x%08x on CID:%hu.\n", + beg_run, run_length, conn->StatSN, conn->cid); + + return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD); + } + + last_statsn = (!run_length) ? 
conn->StatSN : (beg_run + run_length); + + for (i = beg_run; i < last_statsn; i++) { + found_pdu = false; + TAILQ_FOREACH(old_pdu, &conn->snack_pdu_list, tailq) { + if (from_be32(&old_pdu->bhs.stat_sn) == i) { + found_pdu = true; + break; + } + } + + if (!found_pdu) { + SPDK_ERRLOG("Unable to find StatSN: 0x%08x. For a Status" + "SNACK, assuming this is a proactive SNACK " + "for an untransmitted StatSN, ignoring.\n", + beg_run); + } else { + TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq); + iscsi_conn_write_pdu(conn, old_pdu, old_pdu->cb_fn, old_pdu->cb_arg); + } + } + + return 0; +} + +/* This function is used to handle the data ack snack */ +static int +iscsi_handle_data_ack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + uint32_t transfer_tag; + uint32_t beg_run; + uint32_t run_length; + struct spdk_iscsi_pdu *old_pdu; + uint32_t old_datasn; + struct iscsi_bhs_snack_req *reqh; + struct spdk_iscsi_task *task; + struct iscsi_bhs_data_in *datain_header; + struct spdk_iscsi_task *primary; + + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + transfer_tag = from_be32(&reqh->ttt); + beg_run = from_be32(&reqh->beg_run); + run_length = from_be32(&reqh->run_len); + task = NULL; + datain_header = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d,transfer_tag=%d,run_len=%d\n", + beg_run, transfer_tag, run_length); + + task = get_scsi_task_from_ttt(conn, transfer_tag); + if (!task) { + SPDK_ERRLOG("Data ACK SNACK for TTT: 0x%08x is invalid.\n", + transfer_tag); + goto reject_return; + } + + primary = iscsi_task_get_primary(task); + if ((run_length != 0) || (beg_run < primary->acked_data_sn)) { + SPDK_ERRLOG("TTT: 0x%08x Data ACK SNACK BegRUN: %d is less than " + "the next expected acked DataSN: %d\n", + transfer_tag, beg_run, primary->acked_data_sn); + goto reject_return; + } + + primary->acked_data_sn = beg_run; + + /* To free the pdu */ + TAILQ_FOREACH(old_pdu, &conn->snack_pdu_list, tailq) { + if (old_pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + datain_header = (struct iscsi_bhs_data_in *) &old_pdu->bhs; + old_datasn = from_be32(&datain_header->data_sn); + if ((from_be32(&datain_header->ttt) == transfer_tag) && + (old_datasn == beg_run - 1)) { + TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq); + iscsi_conn_free_pdu(conn, old_pdu); + break; + } + } + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Received Data ACK SNACK for TTT: 0x%08x," + " updated acked DataSN to 0x%08x.\n", transfer_tag, + (task->acked_data_sn - 1)); + + return 0; + +reject_return: + return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_SNACK); +} + +/* This function is used to handle the snack request from the initiator */ +static int +iscsi_pdu_hdr_op_snack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_bhs_snack_req *reqh; + struct spdk_iscsi_task *task; + int type; + uint32_t task_tag; + uint32_t beg_run; + uint32_t run_length; + int rc; + + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("ISCSI_OP_SNACK not allowed in discovery session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + if (!conn->sess->ErrorRecoveryLevel) { + SPDK_ERRLOG("Got a SNACK request in ErrorRecoveryLevel=0\n"); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + type = reqh->flags & ISCSI_FLAG_SNACK_TYPE_MASK; + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "The value of type is %d\n", type); + + switch (type) { + case 0: + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + task_tag = from_be32(&reqh->itt); + beg_run = 
from_be32(&reqh->beg_run); + run_length = from_be32(&reqh->run_len); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d, run_length=%d, " + "task_tag=%x, transfer_tag=%u\n", beg_run, + run_length, task_tag, from_be32(&reqh->ttt)); + + task = get_scsi_task_from_itt(conn, task_tag, + ISCSI_OP_SCSI_DATAIN); + if (task) { + return iscsi_handle_recovery_datain(conn, task, pdu, + beg_run, run_length, task_tag); + } + task = get_scsi_task_from_itt(conn, task_tag, ISCSI_OP_R2T); + if (task) { + return iscsi_handle_r2t_snack(conn, task, pdu, beg_run, + run_length, task_tag); + } + SPDK_ERRLOG("It is Neither datain nor r2t recovery request\n"); + rc = -1; + break; + case ISCSI_FLAG_SNACK_TYPE_STATUS: + rc = iscsi_handle_status_snack(conn, pdu); + break; + case ISCSI_FLAG_SNACK_TYPE_DATA_ACK: + rc = iscsi_handle_data_ack(conn, pdu); + break; + case ISCSI_FLAG_SNACK_TYPE_RDATA: + SPDK_ERRLOG("R-Data SNACK is Not Supported int spdk\n"); + rc = iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + break; + default: + SPDK_ERRLOG("Unknown SNACK type %d, protocol error\n", type); + rc = iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + break; + } + + return rc; +} + +static int +iscsi_pdu_hdr_op_data(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task, *subtask; + struct iscsi_bhs_data_out *reqh; + struct spdk_scsi_lun *lun_dev; + uint32_t transfer_tag; + uint32_t task_tag; + uint32_t transfer_len; + uint32_t DataSN; + uint32_t buffer_offset; + uint32_t len; + int F_bit; + int rc; + int reject_reason = ISCSI_REASON_INVALID_PDU_FIELD; + + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("ISCSI_OP_SCSI_DATAOUT not allowed in discovery session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_data_out *)&pdu->bhs; + F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL); + transfer_tag = from_be32(&reqh->ttt); + task_tag = from_be32(&reqh->itt); + DataSN = from_be32(&reqh->data_sn); + buffer_offset = from_be32(&reqh->buffer_offset); + + if (pdu->data_segment_len > SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) { + reject_reason = ISCSI_REASON_PROTOCOL_ERROR; + goto reject_return; + } + + task = get_transfer_task(conn, transfer_tag); + if (task == NULL) { + SPDK_ERRLOG("Not found task for transfer_tag=%x\n", transfer_tag); + goto reject_return; + } + + lun_dev = spdk_scsi_dev_get_lun(conn->dev, task->lun_id); + + if (pdu->data_segment_len > task->desired_data_transfer_length) { + SPDK_ERRLOG("the dataout pdu data length is larger than the value sent by R2T PDU\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + if (task->tag != task_tag) { + SPDK_ERRLOG("The r2t task tag is %u, and the dataout task tag is %u\n", + task->tag, task_tag); + goto reject_return; + } + + if (DataSN != task->r2t_datasn) { + SPDK_ERRLOG("DataSN(%u) exp=%d error\n", DataSN, task->r2t_datasn); + if (conn->sess->ErrorRecoveryLevel >= 1) { + goto send_r2t_recovery_return; + } else { + reject_reason = ISCSI_REASON_PROTOCOL_ERROR; + goto reject_return; + } + } + + if (buffer_offset != task->next_expected_r2t_offset) { + SPDK_ERRLOG("offset(%u) error\n", buffer_offset); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + transfer_len = task->scsi.transfer_len; + task->current_r2t_length += pdu->data_segment_len; + task->next_expected_r2t_offset += pdu->data_segment_len; + task->r2t_datasn++; + + if (task->current_r2t_length > conn->sess->MaxBurstLength) { + SPDK_ERRLOG("R2T burst(%u) > MaxBurstLength(%u)\n", + task->current_r2t_length, + conn->sess->MaxBurstLength); + return 
SPDK_ISCSI_CONNECTION_FATAL; + } + + if (F_bit) { + /* + * This R2T burst is done. Clear the length before we + * receive a PDU for the next R2t burst. + */ + task->current_r2t_length = 0; + } + + subtask = iscsi_task_get(conn, task, iscsi_task_cpl); + if (subtask == NULL) { + SPDK_ERRLOG("Unable to acquire subtask\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + subtask->scsi.offset = buffer_offset; + subtask->scsi.length = pdu->data_segment_len; + iscsi_task_associate_pdu(subtask, pdu); + + if (task->next_expected_r2t_offset == transfer_len) { + task->acked_r2tsn++; + } else if (F_bit && (task->next_r2t_offset < transfer_len)) { + task->acked_r2tsn++; + len = spdk_min(conn->sess->MaxBurstLength, + (transfer_len - task->next_r2t_offset)); + rc = iscsi_send_r2t(conn, task, task->next_r2t_offset, len, + task->ttt, &task->R2TSN); + if (rc < 0) { + SPDK_ERRLOG("iscsi_send_r2t() failed\n"); + } + task->next_r2t_offset += len; + } + + if (lun_dev == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "LUN %d is removed, complete the task immediately\n", + task->lun_id); + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_null_lun(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + return 0; + } + + if (spdk_unlikely(spdk_scsi_lun_get_dif_ctx(lun_dev, &subtask->scsi, &pdu->dif_ctx))) { + pdu->dif_insert_or_strip = true; + } + + pdu->task = subtask; + return 0; + +send_r2t_recovery_return: + rc = iscsi_send_r2t_recovery(conn, task, task->acked_r2tsn, true); + if (rc == 0) { + return 0; + } + +reject_return: + return iscsi_reject(conn, pdu, reject_reason); +} + +static int +iscsi_pdu_payload_op_data(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *subtask; + struct iscsi_bhs_data_out *reqh; + uint32_t transfer_tag; + + if (pdu->task == NULL) { + return 0; + } + + subtask = pdu->task; + + reqh = (struct iscsi_bhs_data_out *)&pdu->bhs; + transfer_tag = from_be32(&reqh->ttt); + + if (get_transfer_task(conn, transfer_tag) == NULL) { + SPDK_ERRLOG("Not found for transfer_tag=%x\n", transfer_tag); + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_abort(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + return 0; + } + + if (spdk_likely(!pdu->dif_insert_or_strip)) { + spdk_scsi_task_set_data(&subtask->scsi, pdu->data, pdu->data_segment_len); + } else { + spdk_scsi_task_set_data(&subtask->scsi, pdu->data, pdu->data_buf_len); + } + + if (spdk_scsi_dev_get_lun(conn->dev, subtask->lun_id) == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "LUN %d is removed, complete the task immediately\n", + subtask->lun_id); + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_null_lun(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + return 0; + } + + iscsi_queue_task(conn, subtask); + return 0; +} + +static void +init_login_reject_response(struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu) +{ + struct iscsi_bhs_login_rsp *rsph; + + memset(rsp_pdu, 0, sizeof(struct spdk_iscsi_pdu)); + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + rsph->version_max = ISCSI_VERSION; + rsph->version_act = ISCSI_VERSION; + rsph->opcode = ISCSI_OP_LOGIN_RSP; + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST; + rsph->itt = pdu->bhs.itt; +} + +static void +iscsi_pdu_dump(struct spdk_iscsi_pdu *pdu) +{ + SPDK_ERRLOGDUMP("PDU", (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN); +} + +/* This function is used to refree the pdu when it is acknowledged */ +static void 
+remove_acked_pdu(struct spdk_iscsi_conn *conn, uint32_t ExpStatSN) +{ + struct spdk_iscsi_pdu *pdu, *pdu_temp; + uint32_t stat_sn; + + conn->exp_statsn = spdk_min(ExpStatSN, conn->StatSN); + TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, pdu_temp) { + stat_sn = from_be32(&pdu->bhs.stat_sn); + if (spdk_sn32_lt(stat_sn, conn->exp_statsn)) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + iscsi_conn_free_pdu(conn, pdu); + } + } +} + +static int +iscsi_update_cmdsn(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int opcode; + uint32_t ExpStatSN; + int I_bit; + struct spdk_iscsi_sess *sess; + struct iscsi_bhs_scsi_req *reqh; + + sess = conn->sess; + if (!sess) { + SPDK_ERRLOG("Connection has no associated session!\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + opcode = pdu->bhs.opcode; + reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs; + + pdu->cmd_sn = from_be32(&reqh->cmd_sn); + + I_bit = reqh->immediate; + if (I_bit == 0) { + if (spdk_sn32_lt(pdu->cmd_sn, sess->ExpCmdSN) || + spdk_sn32_gt(pdu->cmd_sn, sess->MaxCmdSN)) { + if (sess->session_type == SESSION_TYPE_NORMAL && + opcode != ISCSI_OP_SCSI_DATAOUT) { + SPDK_ERRLOG("CmdSN(%u) ignore (ExpCmdSN=%u, MaxCmdSN=%u)\n", + pdu->cmd_sn, sess->ExpCmdSN, sess->MaxCmdSN); + + if (sess->ErrorRecoveryLevel >= 1) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Skip the error in ERL 1 and 2\n"); + } else { + return SPDK_PDU_FATAL; + } + } + } + } else if (pdu->cmd_sn != sess->ExpCmdSN) { + SPDK_ERRLOG("CmdSN(%u) error ExpCmdSN=%u\n", pdu->cmd_sn, sess->ExpCmdSN); + + if (sess->ErrorRecoveryLevel >= 1) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Skip the error in ERL 1 and 2\n"); + } else if (opcode != ISCSI_OP_NOPOUT) { + /* + * The Linux initiator does not send valid CmdSNs for + * nopout under heavy load, so do not close the + * connection in that case. + */ + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + ExpStatSN = from_be32(&reqh->exp_stat_sn); + if (spdk_sn32_gt(ExpStatSN, conn->StatSN)) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u) advanced\n", ExpStatSN); + ExpStatSN = conn->StatSN; + } + + if (sess->ErrorRecoveryLevel >= 1) { + remove_acked_pdu(conn, ExpStatSN); + } + + if (!I_bit && opcode != ISCSI_OP_SCSI_DATAOUT) { + sess->ExpCmdSN++; + } + + return 0; +} + +static int +iscsi_pdu_hdr_handle(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int opcode; + int rc; + struct spdk_iscsi_pdu *rsp_pdu = NULL; + + if (pdu == NULL) { + return -1; + } + + opcode = pdu->bhs.opcode; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "opcode %x\n", opcode); + + if (opcode == ISCSI_OP_LOGIN) { + return iscsi_pdu_hdr_op_login(conn, pdu); + } + + /* connection in login phase but receive non-login opcode + * return response code 0x020b to initiator. 
+ * */ + if (!conn->full_feature && conn->state == ISCSI_CONN_STATE_RUNNING) { + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + init_login_reject_response(pdu, rsp_pdu); + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + SPDK_ERRLOG("Received opcode %d in login phase\n", opcode); + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } else if (conn->state == ISCSI_CONN_STATE_INVALID) { + SPDK_ERRLOG("before Full Feature\n"); + iscsi_pdu_dump(pdu); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + rc = iscsi_update_cmdsn(conn, pdu); + if (rc != 0) { + return rc; + } + + switch (opcode) { + case ISCSI_OP_NOPOUT: + rc = iscsi_pdu_hdr_op_nopout(conn, pdu); + break; + + case ISCSI_OP_SCSI: + rc = iscsi_pdu_hdr_op_scsi(conn, pdu); + break; + case ISCSI_OP_TASK: + rc = iscsi_pdu_hdr_op_task(conn, pdu); + break; + + case ISCSI_OP_TEXT: + rc = iscsi_pdu_hdr_op_text(conn, pdu); + break; + + case ISCSI_OP_LOGOUT: + rc = iscsi_pdu_hdr_op_logout(conn, pdu); + break; + + case ISCSI_OP_SCSI_DATAOUT: + rc = iscsi_pdu_hdr_op_data(conn, pdu); + break; + + case ISCSI_OP_SNACK: + rc = iscsi_pdu_hdr_op_snack(conn, pdu); + break; + + default: + SPDK_ERRLOG("unsupported opcode %x\n", opcode); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (rc < 0) { + SPDK_ERRLOG("processing PDU header (opcode=%x) failed on %s(%s)\n", + opcode, + conn->target_port != NULL ? spdk_scsi_port_get_name(conn->target_port) : "NULL", + conn->initiator_port != NULL ? spdk_scsi_port_get_name(conn->initiator_port) : "NULL"); + } + + return rc; +} + +static int +iscsi_pdu_payload_handle(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int opcode; + int rc = 0; + + opcode = pdu->bhs.opcode; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "opcode %x\n", opcode); + + switch (opcode) { + case ISCSI_OP_LOGIN: + rc = iscsi_pdu_payload_op_login(conn, pdu); + break; + case ISCSI_OP_NOPOUT: + rc = iscsi_pdu_payload_op_nopout(conn, pdu); + break; + case ISCSI_OP_SCSI: + rc = iscsi_pdu_payload_op_scsi(conn, pdu); + break; + case ISCSI_OP_TASK: + break; + case ISCSI_OP_TEXT: + rc = iscsi_pdu_payload_op_text(conn, pdu); + break; + case ISCSI_OP_LOGOUT: + break; + case ISCSI_OP_SCSI_DATAOUT: + rc = iscsi_pdu_payload_op_data(conn, pdu); + break; + case ISCSI_OP_SNACK: + break; + default: + SPDK_ERRLOG("unsupported opcode %x\n", opcode); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (rc < 0) { + SPDK_ERRLOG("processing PDU payload (opcode=%x) failed on %s(%s)\n", + opcode, + conn->target_port != NULL ? spdk_scsi_port_get_name(conn->target_port) : "NULL", + conn->initiator_port != NULL ? 
spdk_scsi_port_get_name(conn->initiator_port) : "NULL"); + } + + return rc; +} + +static int +iscsi_read_pdu(struct spdk_iscsi_conn *conn) +{ + enum iscsi_pdu_recv_state prev_state; + struct spdk_iscsi_pdu *pdu; + struct spdk_mempool *pool; + uint32_t crc32c; + int ahs_len; + uint32_t data_len; + int rc; + + do { + prev_state = conn->pdu_recv_state; + pdu = conn->pdu_in_progress; + + switch (conn->pdu_recv_state) { + case ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY: + assert(conn->pdu_in_progress == NULL); + + conn->pdu_in_progress = iscsi_get_pdu(conn); + if (conn->pdu_in_progress == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_HDR; + break; + case ISCSI_PDU_RECV_STATE_AWAIT_PDU_HDR: + if (pdu->bhs_valid_bytes < ISCSI_BHS_LEN) { + rc = iscsi_conn_read_data(conn, + ISCSI_BHS_LEN - pdu->bhs_valid_bytes, + (uint8_t *)&pdu->bhs + pdu->bhs_valid_bytes); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + pdu->bhs_valid_bytes += rc; + if (pdu->bhs_valid_bytes < ISCSI_BHS_LEN) { + return 0; + } + } + + pdu->data_segment_len = ISCSI_ALIGN(DGET24(pdu->bhs.data_segment_len)); + + /* AHS */ + ahs_len = pdu->bhs.total_ahs_len * 4; + assert(ahs_len <= ISCSI_AHS_LEN); + if (pdu->ahs_valid_bytes < ahs_len) { + rc = iscsi_conn_read_data(conn, + ahs_len - pdu->ahs_valid_bytes, + pdu->ahs + pdu->ahs_valid_bytes); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + pdu->ahs_valid_bytes += rc; + if (pdu->ahs_valid_bytes < ahs_len) { + return 0; + } + } + + /* Header Digest */ + if (conn->header_digest && + pdu->hdigest_valid_bytes < ISCSI_DIGEST_LEN) { + rc = iscsi_conn_read_data(conn, + ISCSI_DIGEST_LEN - pdu->hdigest_valid_bytes, + pdu->header_digest + pdu->hdigest_valid_bytes); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + pdu->hdigest_valid_bytes += rc; + if (pdu->hdigest_valid_bytes < ISCSI_DIGEST_LEN) { + return 0; + } + } + + if (conn->header_digest) { + crc32c = iscsi_pdu_calc_header_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->header_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("header digest error (%s)\n", conn->initiator_name); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + } + + rc = iscsi_pdu_hdr_handle(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("Critical error is detected. 
Close the connection\n"); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD; + break; + case ISCSI_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + data_len = pdu->data_segment_len; + + if (data_len != 0 && pdu->data_buf == NULL) { + if (data_len <= iscsi_get_max_immediate_data_size()) { + pool = g_iscsi.pdu_immediate_data_pool; + pdu->data_buf_len = SPDK_BDEV_BUF_SIZE_WITH_MD(iscsi_get_max_immediate_data_size()); + } else if (data_len <= SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) { + pool = g_iscsi.pdu_data_out_pool; + pdu->data_buf_len = SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + } else { + SPDK_ERRLOG("Data(%d) > MaxSegment(%d)\n", + data_len, SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + pdu->mobj = spdk_mempool_get(pool); + if (pdu->mobj == NULL) { + return 0; + } + pdu->data_buf = pdu->mobj->buf; + pdu->data = pdu->mobj->buf; + pdu->data_from_mempool = true; + } + + /* copy the actual data into local buffer */ + if (pdu->data_valid_bytes < data_len) { + rc = iscsi_conn_read_data_segment(conn, pdu, data_len); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + pdu->data_valid_bytes += rc; + if (pdu->data_valid_bytes < data_len) { + return 0; + } + } + + /* copy out the data digest */ + if (conn->data_digest && data_len != 0 && + pdu->ddigest_valid_bytes < ISCSI_DIGEST_LEN) { + rc = iscsi_conn_read_data(conn, + ISCSI_DIGEST_LEN - pdu->ddigest_valid_bytes, + pdu->data_digest + pdu->ddigest_valid_bytes); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + pdu->ddigest_valid_bytes += rc; + if (pdu->ddigest_valid_bytes < ISCSI_DIGEST_LEN) { + return 0; + } + } + + /* All data for this PDU has now been read from the socket. 
*/ + spdk_trace_record(TRACE_ISCSI_READ_PDU, conn->id, pdu->data_valid_bytes, + (uintptr_t)pdu, pdu->bhs.opcode); + + /* check data digest */ + if (conn->data_digest && data_len != 0) { + crc32c = iscsi_pdu_calc_data_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("data digest error (%s)\n", conn->initiator_name); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + } + + if (conn->is_logged_out) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "pdu received after logout\n"); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + if (!pdu->is_rejected) { + rc = iscsi_pdu_payload_handle(conn, pdu); + } else { + rc = 0; + } + if (rc == 0) { + spdk_trace_record(TRACE_ISCSI_TASK_EXECUTED, 0, 0, (uintptr_t)pdu, 0); + iscsi_put_pdu(pdu); + conn->pdu_in_progress = NULL; + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY; + return 1; + } else { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + } + break; + case ISCSI_PDU_RECV_STATE_ERROR: + return SPDK_ISCSI_CONNECTION_FATAL; + default: + assert(false); + SPDK_ERRLOG("code should not come here\n"); + break; + } + } while (prev_state != conn->pdu_recv_state); + + return 0; +} + +#define GET_PDU_LOOP_COUNT 16 + +int +iscsi_handle_incoming_pdus(struct spdk_iscsi_conn *conn) +{ + int i, rc; + + /* Read new PDUs from network */ + for (i = 0; i < GET_PDU_LOOP_COUNT; i++) { + rc = iscsi_read_pdu(conn); + if (rc == 0) { + break; + } else if (rc < 0) { + return rc; + } + + if (conn->is_stopped) { + break; + } + } + + return i; +} diff --git a/src/spdk/lib/iscsi/iscsi.h b/src/spdk/lib/iscsi/iscsi.h new file mode 100644 index 000000000..b1747e4ab --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi.h @@ -0,0 +1,465 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_ISCSI_H +#define SPDK_ISCSI_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/iscsi_spec.h" +#include "spdk/thread.h" +#include "spdk/sock.h" + +#include "spdk/scsi.h" +#include "iscsi/param.h" + +#include "spdk/assert.h" +#include "spdk/dif.h" +#include "spdk/util.h" + +#define SPDK_ISCSI_DEFAULT_NODEBASE "iqn.2016-06.io.spdk" + +#define DEFAULT_MAXR2T 4 +#define MAX_INITIATOR_PORT_NAME 256 +#define MAX_INITIATOR_NAME 223 +#define MAX_TARGET_NAME 223 + +#define MAX_PORTAL 1024 +#define MAX_INITIATOR 256 +#define MAX_NETMASK 256 +#define MAX_ISCSI_CONNECTIONS 1024 +#define MAX_PORTAL_ADDR 256 +#define MAX_PORTAL_PORT 32 + +#define DEFAULT_PORT 3260 +#define DEFAULT_MAX_SESSIONS 128 +#define DEFAULT_MAX_CONNECTIONS_PER_SESSION 2 +#define DEFAULT_MAXOUTSTANDINGR2T 1 +#define DEFAULT_DEFAULTTIME2WAIT 2 +#define DEFAULT_DEFAULTTIME2RETAIN 20 +#define DEFAULT_INITIALR2T true +#define DEFAULT_IMMEDIATEDATA true +#define DEFAULT_DATAPDUINORDER true +#define DEFAULT_DATASEQUENCEINORDER true +#define DEFAULT_ERRORRECOVERYLEVEL 0 +#define DEFAULT_TIMEOUT 60 +#define MAX_NOPININTERVAL 60 +#define DEFAULT_NOPININTERVAL 30 + +/* + * SPDK iSCSI target currently only supports 64KB as the maximum data segment length + * it can receive from initiators. Other values may work, but no guarantees. + */ +#define SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH 65536 + +/* + * Defines maximum number of data out buffers each connection can have in + * use at any given time. + */ +#define MAX_DATA_OUT_PER_CONNECTION 16 + +/* + * Defines maximum number of data in buffers each connection can have in + * use at any given time. So this limit does not affect I/O smaller than + * SPDK_BDEV_SMALL_BUF_MAX_SIZE. + */ +#define MAX_LARGE_DATAIN_PER_CONNECTION 64 + +#define SPDK_ISCSI_MAX_BURST_LENGTH \ + (SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH * MAX_DATA_OUT_PER_CONNECTION) + +/* + * Defines default maximum amount in bytes of unsolicited data the iSCSI + * initiator may send to the SPDK iSCSI target during the execution of + * a single SCSI command. And it is smaller than the MaxBurstLength. + */ +#define SPDK_ISCSI_FIRST_BURST_LENGTH 8192 + +/* + * Defines minimum amount in bytes of unsolicited data the iSCSI initiator + * may send to the SPDK iSCSI target during the execution of a single + * SCSI command. + */ +#define SPDK_ISCSI_MIN_FIRST_BURST_LENGTH 512 + +#define SPDK_ISCSI_MAX_FIRST_BURST_LENGTH 16777215 + +/* + * Defines default maximum queue depth per connection and this can be + * changed by configuration file. + */ +#define DEFAULT_MAX_QUEUE_DEPTH 64 + +/** Defines how long we should wait for a logout request when the target + * requests logout to the initiator asynchronously. + */ +#define ISCSI_LOGOUT_REQUEST_TIMEOUT 30 /* in seconds */ + +/** Defines how long we should wait for a TCP close after responding to a + * logout request, before terminating the connection ourselves. + */ +#define ISCSI_LOGOUT_TIMEOUT 5 /* in seconds */ + +/* For spdk_iscsi_login_in related function use, we need to avoid the conflict + * with other errors + * */ +#define SPDK_ISCSI_LOGIN_ERROR_RESPONSE -1000 +#define SPDK_ISCSI_LOGIN_ERROR_PARAMETER -1001 +#define SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE -1002 + +#define ISCSI_AHS_LEN 60 + +struct spdk_mobj { + struct spdk_mempool *mp; + void *buf; +}; + +/* + * Maximum number of SGL elements, i.e., + * BHS, AHS, Header Digest, Data Segment and Data Digest. 
+ */ +#define SPDK_ISCSI_MAX_SGL_DESCRIPTORS (5) + +typedef void (*iscsi_conn_xfer_complete_cb)(void *cb_arg); + +struct spdk_iscsi_pdu { + struct iscsi_bhs bhs; + struct spdk_mobj *mobj; + bool is_rejected; + uint8_t *data_buf; + uint8_t *data; + uint8_t header_digest[ISCSI_DIGEST_LEN]; + uint8_t data_digest[ISCSI_DIGEST_LEN]; + size_t data_segment_len; + int bhs_valid_bytes; + int ahs_valid_bytes; + uint32_t data_valid_bytes; + int hdigest_valid_bytes; + int ddigest_valid_bytes; + int ref; + bool data_from_mempool; /* indicate whether the data buffer is allocated from mempool */ + struct spdk_iscsi_task *task; /* data tied to a task buffer */ + uint32_t cmd_sn; + uint32_t writev_offset; + uint32_t data_buf_len; + bool dif_insert_or_strip; + struct spdk_dif_ctx dif_ctx; + struct spdk_iscsi_conn *conn; + + iscsi_conn_xfer_complete_cb cb_fn; + void *cb_arg; + + /* The sock request ends with a 0 length iovec. Place the actual iovec immediately + * after it. There is a static assert below to check if the compiler inserted + * any unwanted padding */ + int32_t mapped_length; + struct spdk_sock_request sock_req; + struct iovec iov[SPDK_ISCSI_MAX_SGL_DESCRIPTORS]; + TAILQ_ENTRY(spdk_iscsi_pdu) tailq; + + + /* + * 60 bytes of AHS should suffice for now. + * This should always be at the end of PDU data structure. + * we need to not zero this out when doing memory clear. + */ + uint8_t ahs[ISCSI_AHS_LEN]; + + struct { + uint16_t length; /* iSCSI SenseLength (big-endian) */ + uint8_t data[32]; + } sense; +}; +SPDK_STATIC_ASSERT(offsetof(struct spdk_iscsi_pdu, + sock_req) + sizeof(struct spdk_sock_request) == offsetof(struct spdk_iscsi_pdu, iov), + "Compiler inserted padding between iov and sock_req"); + +enum iscsi_connection_state { + ISCSI_CONN_STATE_INVALID = 0, + ISCSI_CONN_STATE_RUNNING = 1, + ISCSI_CONN_STATE_EXITING = 2, + ISCSI_CONN_STATE_EXITED = 3, +}; + +enum iscsi_chap_phase { + ISCSI_CHAP_PHASE_NONE = 0, + ISCSI_CHAP_PHASE_WAIT_A = 1, + ISCSI_CHAP_PHASE_WAIT_NR = 2, + ISCSI_CHAP_PHASE_END = 3, +}; + +enum session_type { + SESSION_TYPE_INVALID = 0, + SESSION_TYPE_NORMAL = 1, + SESSION_TYPE_DISCOVERY = 2, +}; + +#define ISCSI_CHAP_CHALLENGE_LEN 1024 +#define ISCSI_CHAP_MAX_USER_LEN 255 +#define ISCSI_CHAP_MAX_SECRET_LEN 255 + +struct iscsi_chap_auth { + enum iscsi_chap_phase chap_phase; + + char user[ISCSI_CHAP_MAX_USER_LEN + 1]; + char secret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + char muser[ISCSI_CHAP_MAX_USER_LEN + 1]; + char msecret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + + uint8_t chap_id[1]; + uint8_t chap_mid[1]; + int chap_challenge_len; + uint8_t chap_challenge[ISCSI_CHAP_CHALLENGE_LEN]; + int chap_mchallenge_len; + uint8_t chap_mchallenge[ISCSI_CHAP_CHALLENGE_LEN]; +}; + +struct spdk_iscsi_auth_secret { + char user[ISCSI_CHAP_MAX_USER_LEN + 1]; + char secret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + char muser[ISCSI_CHAP_MAX_USER_LEN + 1]; + char msecret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + TAILQ_ENTRY(spdk_iscsi_auth_secret) tailq; +}; + +struct spdk_iscsi_auth_group { + int32_t tag; + TAILQ_HEAD(, spdk_iscsi_auth_secret) secret_head; + TAILQ_ENTRY(spdk_iscsi_auth_group) tailq; +}; + +struct spdk_iscsi_sess { + uint32_t connections; + struct spdk_iscsi_conn **conns; + + struct spdk_scsi_port *initiator_port; + int tag; + + uint64_t isid; + uint16_t tsih; + struct spdk_iscsi_tgt_node *target; + int queue_depth; + + struct iscsi_param *params; + + enum session_type session_type; + uint32_t MaxConnections; + uint32_t MaxOutstandingR2T; + uint32_t DefaultTime2Wait; + uint32_t DefaultTime2Retain; + 
uint32_t FirstBurstLength; + uint32_t MaxBurstLength; + bool InitialR2T; + bool ImmediateData; + bool DataPDUInOrder; + bool DataSequenceInOrder; + uint32_t ErrorRecoveryLevel; + + uint32_t ExpCmdSN; + uint32_t MaxCmdSN; + + uint32_t current_text_itt; +}; + +struct spdk_iscsi_poll_group { + struct spdk_poller *poller; + struct spdk_poller *nop_poller; + STAILQ_HEAD(connections, spdk_iscsi_conn) connections; + struct spdk_sock_group *sock_group; + TAILQ_ENTRY(spdk_iscsi_poll_group) link; +}; + +struct spdk_iscsi_opts { + char *authfile; + char *nodebase; + int32_t timeout; + int32_t nopininterval; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + uint32_t MaxSessions; + uint32_t MaxConnectionsPerSession; + uint32_t MaxConnections; + uint32_t MaxQueueDepth; + uint32_t DefaultTime2Wait; + uint32_t DefaultTime2Retain; + uint32_t FirstBurstLength; + bool ImmediateData; + uint32_t ErrorRecoveryLevel; + bool AllowDuplicateIsid; +}; + +struct spdk_iscsi_globals { + char *authfile; + char *nodebase; + pthread_mutex_t mutex; + uint32_t refcnt; + TAILQ_HEAD(, spdk_iscsi_portal) portal_head; + TAILQ_HEAD(, spdk_iscsi_portal_grp) pg_head; + TAILQ_HEAD(, spdk_iscsi_init_grp) ig_head; + TAILQ_HEAD(, spdk_iscsi_tgt_node) target_head; + TAILQ_HEAD(, spdk_iscsi_auth_group) auth_group_head; + TAILQ_HEAD(, spdk_iscsi_poll_group) poll_group_head; + + int32_t timeout; + int32_t nopininterval; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + + uint32_t MaxSessions; + uint32_t MaxConnectionsPerSession; + uint32_t MaxConnections; + uint32_t MaxQueueDepth; + uint32_t DefaultTime2Wait; + uint32_t DefaultTime2Retain; + uint32_t FirstBurstLength; + bool ImmediateData; + uint32_t ErrorRecoveryLevel; + bool AllowDuplicateIsid; + + struct spdk_mempool *pdu_pool; + struct spdk_mempool *pdu_immediate_data_pool; + struct spdk_mempool *pdu_data_out_pool; + struct spdk_mempool *session_pool; + struct spdk_mempool *task_pool; + + struct spdk_iscsi_sess **session; +}; + +#define ISCSI_SECURITY_NEGOTIATION_PHASE 0 +#define ISCSI_OPERATIONAL_NEGOTIATION_PHASE 1 +#define ISCSI_NSG_RESERVED_CODE 2 +#define ISCSI_FULL_FEATURE_PHASE 3 + +/* logout reason */ +#define ISCSI_LOGOUT_REASON_CLOSE_SESSION 0 +#define ISCSI_LOGOUT_REASON_CLOSE_CONNECTION 1 +#define ISCSI_LOGOUT_REASON_REMOVE_CONN_FOR_RECOVERY 2 + +enum spdk_error_codes { + SPDK_ISCSI_CONNECTION_FATAL = -1, + SPDK_PDU_FATAL = -2, +}; + +#define DGET24(B) \ + ((( (uint32_t) *((uint8_t *)(B)+0)) << 16) \ + | (((uint32_t) *((uint8_t *)(B)+1)) << 8) \ + | (((uint32_t) *((uint8_t *)(B)+2)) << 0)) + +#define DSET24(B,D) \ + (((*((uint8_t *)(B)+0)) = (uint8_t)((uint32_t)(D) >> 16)), \ + ((*((uint8_t *)(B)+1)) = (uint8_t)((uint32_t)(D) >> 8)), \ + ((*((uint8_t *)(B)+2)) = (uint8_t)((uint32_t)(D) >> 0))) + +#define xstrdup(s) (s ? 
strdup(s) : (char *)NULL) + +extern struct spdk_iscsi_globals g_iscsi; +extern struct spdk_iscsi_opts *g_spdk_iscsi_opts; + +struct spdk_iscsi_task; +struct spdk_json_write_ctx; + +typedef void (*spdk_iscsi_init_cb)(void *cb_arg, int rc); + +void spdk_iscsi_init(spdk_iscsi_init_cb cb_fn, void *cb_arg); +typedef void (*spdk_iscsi_fini_cb)(void *arg); +void spdk_iscsi_fini(spdk_iscsi_fini_cb cb_fn, void *cb_arg); +void shutdown_iscsi_conns_done(void); +void spdk_iscsi_config_text(FILE *fp); +void spdk_iscsi_config_json(struct spdk_json_write_ctx *w); + +struct spdk_iscsi_opts *iscsi_opts_alloc(void); +void iscsi_opts_free(struct spdk_iscsi_opts *opts); +struct spdk_iscsi_opts *iscsi_opts_copy(struct spdk_iscsi_opts *src); +void iscsi_opts_info_json(struct spdk_json_write_ctx *w); +int iscsi_set_discovery_auth(bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group); +int iscsi_chap_get_authinfo(struct iscsi_chap_auth *auth, const char *authuser, + int ag_tag); +int iscsi_add_auth_group(int32_t tag, struct spdk_iscsi_auth_group **_group); +struct spdk_iscsi_auth_group *iscsi_find_auth_group_by_tag(int32_t tag); +void iscsi_delete_auth_group(struct spdk_iscsi_auth_group *group); +int iscsi_auth_group_add_secret(struct spdk_iscsi_auth_group *group, + const char *user, const char *secret, + const char *muser, const char *msecret); +int iscsi_auth_group_delete_secret(struct spdk_iscsi_auth_group *group, + const char *user); +void iscsi_auth_groups_info_json(struct spdk_json_write_ctx *w); + +void iscsi_task_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task); +int iscsi_build_iovs(struct spdk_iscsi_conn *conn, struct iovec *iovs, int iovcnt, + struct spdk_iscsi_pdu *pdu, uint32_t *mapped_length); +int iscsi_handle_incoming_pdus(struct spdk_iscsi_conn *conn); +void iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task); + +void iscsi_free_sess(struct spdk_iscsi_sess *sess); +void iscsi_clear_all_transfer_task(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun, + struct spdk_iscsi_pdu *pdu); +bool iscsi_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t CmdSN); + +uint32_t iscsi_pdu_calc_header_digest(struct spdk_iscsi_pdu *pdu); +uint32_t iscsi_pdu_calc_data_digest(struct spdk_iscsi_pdu *pdu); + +/* Memory management */ +void iscsi_put_pdu(struct spdk_iscsi_pdu *pdu); +struct spdk_iscsi_pdu *iscsi_get_pdu(struct spdk_iscsi_conn *conn); +void iscsi_op_abort_task_set(struct spdk_iscsi_task *task, + uint8_t function); +void iscsi_queue_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task); + +static inline uint32_t +iscsi_get_max_immediate_data_size(void) +{ + /* + * Specify enough extra space in addition to FirstBurstLength to + * account for a header digest, data digest and additional header + * segments (AHS). These are not normally used but they do not + * take up much space and we need to make sure the worst-case scenario + * can be satisified by the size returned here. + */ + return g_iscsi.FirstBurstLength + + ISCSI_DIGEST_LEN + /* data digest */ + ISCSI_DIGEST_LEN + /* header digest */ + 8 + /* bidirectional AHS */ + 52; /* extended CDB AHS (for a 64-byte CDB) */ +} + +#endif /* SPDK_ISCSI_H */ diff --git a/src/spdk/lib/iscsi/iscsi_rpc.c b/src/spdk/lib/iscsi/iscsi_rpc.c new file mode 100644 index 000000000..8ab43d31d --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi_rpc.c @@ -0,0 +1,1639 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. 
+ * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" +#include "iscsi/tgt_node.h" +#include "iscsi/portal_grp.h" +#include "iscsi/init_grp.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +static void +rpc_iscsi_get_initiator_groups(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_initiator_groups requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + iscsi_init_grps_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_initiator_groups", rpc_iscsi_get_initiator_groups, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_initiator_groups, get_initiator_groups) + +struct rpc_initiator_list { + size_t num_initiators; + char *initiators[MAX_INITIATOR]; +}; + +static int +decode_rpc_initiator_list(const struct spdk_json_val *val, void *out) +{ + struct rpc_initiator_list *list = out; + + return spdk_json_decode_array(val, spdk_json_decode_string, list->initiators, MAX_INITIATOR, + &list->num_initiators, sizeof(char *)); +} + +static void +free_rpc_initiator_list(struct rpc_initiator_list *list) +{ + size_t i; + + for (i = 0; i < list->num_initiators; i++) { + free(list->initiators[i]); + } +} + +struct rpc_netmask_list { + size_t num_netmasks; + char *netmasks[MAX_NETMASK]; +}; + +static int +decode_rpc_netmask_list(const struct spdk_json_val *val, void *out) +{ + struct rpc_netmask_list *list = out; + + return spdk_json_decode_array(val, spdk_json_decode_string, list->netmasks, MAX_NETMASK, + &list->num_netmasks, sizeof(char *)); +} + +static void +free_rpc_netmask_list(struct rpc_netmask_list *list) +{ + size_t i; + + for (i = 0; i < list->num_netmasks; 
i++) { + free(list->netmasks[i]); + } +} + +struct rpc_initiator_group { + int32_t tag; + struct rpc_initiator_list initiator_list; + struct rpc_netmask_list netmask_list; +}; + +static void +free_rpc_initiator_group(struct rpc_initiator_group *ig) +{ + free_rpc_initiator_list(&ig->initiator_list); + free_rpc_netmask_list(&ig->netmask_list); +} + +static const struct spdk_json_object_decoder rpc_initiator_group_decoders[] = { + {"tag", offsetof(struct rpc_initiator_group, tag), spdk_json_decode_int32}, + {"initiators", offsetof(struct rpc_initiator_group, initiator_list), decode_rpc_initiator_list}, + {"netmasks", offsetof(struct rpc_initiator_group, netmask_list), decode_rpc_netmask_list}, +}; + +static void +rpc_iscsi_create_initiator_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_initiator_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_initiator_group_decoders, + SPDK_COUNTOF(rpc_initiator_group_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.initiator_list.num_initiators == 0 || + req.netmask_list.num_netmasks == 0) { + goto invalid; + } + + if (iscsi_init_grp_create_from_initiator_list(req.tag, + req.initiator_list.num_initiators, + req.initiator_list.initiators, + req.netmask_list.num_netmasks, + req.netmask_list.netmasks)) { + SPDK_ERRLOG("create_from_initiator_list failed\n"); + goto invalid; + } + + free_rpc_initiator_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_initiator_group(&req); +} +SPDK_RPC_REGISTER("iscsi_create_initiator_group", rpc_iscsi_create_initiator_group, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_initiator_group, add_initiator_group) + +static const struct spdk_json_object_decoder rpc_add_or_delete_initiators_decoders[] = { + {"tag", offsetof(struct rpc_initiator_group, tag), spdk_json_decode_int32}, + {"initiators", offsetof(struct rpc_initiator_group, initiator_list), decode_rpc_initiator_list, true}, + {"netmasks", offsetof(struct rpc_initiator_group, netmask_list), decode_rpc_netmask_list, true}, +}; + +static void +rpc_iscsi_initiator_group_add_initiators(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_initiator_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_add_or_delete_initiators_decoders, + SPDK_COUNTOF(rpc_add_or_delete_initiators_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (iscsi_init_grp_add_initiators_from_initiator_list(req.tag, + req.initiator_list.num_initiators, + req.initiator_list.initiators, + req.netmask_list.num_netmasks, + req.netmask_list.netmasks)) { + SPDK_ERRLOG("add_initiators_from_initiator_list failed\n"); + goto invalid; + } + + free_rpc_initiator_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_initiator_group(&req); +} +SPDK_RPC_REGISTER("iscsi_initiator_group_add_initiators", + rpc_iscsi_initiator_group_add_initiators, SPDK_RPC_RUNTIME) 
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_initiator_group_add_initiators, + add_initiators_to_initiator_group) + +static void +rpc_iscsi_initiator_group_remove_initiators(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_initiator_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_add_or_delete_initiators_decoders, + SPDK_COUNTOF(rpc_add_or_delete_initiators_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (iscsi_init_grp_delete_initiators_from_initiator_list(req.tag, + req.initiator_list.num_initiators, + req.initiator_list.initiators, + req.netmask_list.num_netmasks, + req.netmask_list.netmasks)) { + SPDK_ERRLOG("delete_initiators_from_initiator_list failed\n"); + goto invalid; + } + + free_rpc_initiator_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_initiator_group(&req); +} +SPDK_RPC_REGISTER("iscsi_initiator_group_remove_initiators", + rpc_iscsi_initiator_group_remove_initiators, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_initiator_group_remove_initiators, + delete_initiators_from_initiator_group) + +struct rpc_iscsi_delete_initiator_group { + int32_t tag; +}; + +static const struct spdk_json_object_decoder rpc_iscsi_delete_initiator_group_decoders[] = { + {"tag", offsetof(struct rpc_iscsi_delete_initiator_group, tag), spdk_json_decode_int32}, +}; + +static void +rpc_iscsi_delete_initiator_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_iscsi_delete_initiator_group req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_init_grp *ig; + + if (spdk_json_decode_object(params, rpc_iscsi_delete_initiator_group_decoders, + SPDK_COUNTOF(rpc_iscsi_delete_initiator_group_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + ig = iscsi_init_grp_unregister(req.tag); + if (!ig) { + goto invalid; + } + iscsi_tgt_node_delete_map(NULL, ig); + iscsi_init_grp_destroy(ig); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} +SPDK_RPC_REGISTER("iscsi_delete_initiator_group", rpc_iscsi_delete_initiator_group, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_initiator_group, delete_initiator_group) + +static void +rpc_iscsi_get_target_nodes(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_target_nodes requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + iscsi_tgt_nodes_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_target_nodes", rpc_iscsi_get_target_nodes, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_target_nodes, get_target_nodes) + +struct rpc_pg_ig_map { + int32_t pg_tag; + int32_t ig_tag; +}; + +static const struct spdk_json_object_decoder rpc_pg_ig_map_decoders[] = { + {"pg_tag", offsetof(struct 
rpc_pg_ig_map, pg_tag), spdk_json_decode_int32}, + {"ig_tag", offsetof(struct rpc_pg_ig_map, ig_tag), spdk_json_decode_int32}, +}; + +static int +decode_rpc_pg_ig_map(const struct spdk_json_val *val, void *out) +{ + struct rpc_pg_ig_map *pg_ig_map = out; + + return spdk_json_decode_object(val, rpc_pg_ig_map_decoders, + SPDK_COUNTOF(rpc_pg_ig_map_decoders), + pg_ig_map); +} + +struct rpc_pg_ig_maps { + size_t num_maps; + struct rpc_pg_ig_map maps[MAX_TARGET_MAP]; +}; + +static int +decode_rpc_pg_ig_maps(const struct spdk_json_val *val, void *out) +{ + struct rpc_pg_ig_maps *pg_ig_maps = out; + + return spdk_json_decode_array(val, decode_rpc_pg_ig_map, pg_ig_maps->maps, + MAX_TARGET_MAP, &pg_ig_maps->num_maps, + sizeof(struct rpc_pg_ig_map)); +} + +#define RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN 64 + +struct rpc_lun { + char *bdev_name; + int32_t lun_id; +}; + +static const struct spdk_json_object_decoder rpc_lun_decoders[] = { + {"bdev_name", offsetof(struct rpc_lun, bdev_name), spdk_json_decode_string}, + {"lun_id", offsetof(struct rpc_lun, lun_id), spdk_json_decode_int32}, +}; + +static int +decode_rpc_lun(const struct spdk_json_val *val, void *out) +{ + struct rpc_lun *lun = out; + + return spdk_json_decode_object(val, rpc_lun_decoders, + SPDK_COUNTOF(rpc_lun_decoders), lun); +} + +struct rpc_luns { + size_t num_luns; + struct rpc_lun luns[RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN]; +}; + +static int +decode_rpc_luns(const struct spdk_json_val *val, void *out) +{ + struct rpc_luns *luns = out; + + return spdk_json_decode_array(val, decode_rpc_lun, luns->luns, + RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN, + &luns->num_luns, sizeof(struct rpc_lun)); +} + +static void +free_rpc_luns(struct rpc_luns *p) +{ + size_t i; + + for (i = 0; i < p->num_luns; i++) { + free(p->luns[i].bdev_name); + } +} + +struct rpc_target_node { + char *name; + char *alias_name; + + struct rpc_pg_ig_maps pg_ig_maps; + struct rpc_luns luns; + + int32_t queue_depth; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + + bool header_digest; + bool data_digest; +}; + +static void +free_rpc_target_node(struct rpc_target_node *req) +{ + free(req->name); + free(req->alias_name); + free_rpc_luns(&req->luns); +} + +static const struct spdk_json_object_decoder rpc_target_node_decoders[] = { + {"name", offsetof(struct rpc_target_node, name), spdk_json_decode_string}, + {"alias_name", offsetof(struct rpc_target_node, alias_name), spdk_json_decode_string}, + {"pg_ig_maps", offsetof(struct rpc_target_node, pg_ig_maps), decode_rpc_pg_ig_maps}, + {"luns", offsetof(struct rpc_target_node, luns), decode_rpc_luns}, + {"queue_depth", offsetof(struct rpc_target_node, queue_depth), spdk_json_decode_int32}, + {"disable_chap", offsetof(struct rpc_target_node, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_target_node, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_target_node, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_target_node, chap_group), spdk_json_decode_int32, true}, + {"header_digest", offsetof(struct rpc_target_node, header_digest), spdk_json_decode_bool, true}, + {"data_digest", offsetof(struct rpc_target_node, data_digest), spdk_json_decode_bool, true}, +}; + +static void +rpc_iscsi_create_target_node(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_node req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int32_t 
pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0}; + char *bdev_names[RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN] = {0}; + int32_t lun_ids[RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN] = {0}; + size_t i; + + if (spdk_json_decode_object(params, rpc_target_node_decoders, + SPDK_COUNTOF(rpc_target_node_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + for (i = 0; i < req.pg_ig_maps.num_maps; i++) { + pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag; + ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag; + } + + for (i = 0; i < req.luns.num_luns; i++) { + bdev_names[i] = req.luns.luns[i].bdev_name; + lun_ids[i] = req.luns.luns[i].lun_id; + } + + /* + * Use default parameters in a few places: + * index = -1 : automatically pick an index for the new target node + * alias = NULL + */ + target = iscsi_tgt_node_construct(-1, req.name, req.alias_name, + pg_tags, + ig_tags, + req.pg_ig_maps.num_maps, + (const char **)bdev_names, + lun_ids, + req.luns.num_luns, + req.queue_depth, + req.disable_chap, + req.require_chap, + req.mutual_chap, + req.chap_group, + req.header_digest, + req.data_digest); + + if (target == NULL) { + goto invalid; + } + + free_rpc_target_node(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_target_node(&req); +} +SPDK_RPC_REGISTER("iscsi_create_target_node", rpc_iscsi_create_target_node, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_target_node, construct_target_node) + +struct rpc_tgt_node_pg_ig_maps { + char *name; + struct rpc_pg_ig_maps pg_ig_maps; +}; + +static const struct spdk_json_object_decoder rpc_tgt_node_pg_ig_maps_decoders[] = { + {"name", offsetof(struct rpc_tgt_node_pg_ig_maps, name), spdk_json_decode_string}, + {"pg_ig_maps", offsetof(struct rpc_tgt_node_pg_ig_maps, pg_ig_maps), decode_rpc_pg_ig_maps}, +}; + +static void +rpc_iscsi_target_node_add_pg_ig_maps(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tgt_node_pg_ig_maps req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0}; + size_t i; + int rc; + + if (spdk_json_decode_object(params, rpc_tgt_node_pg_ig_maps_decoders, + SPDK_COUNTOF(rpc_tgt_node_pg_ig_maps_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target is not found\n"); + goto invalid; + } + + for (i = 0; i < req.pg_ig_maps.num_maps; i++) { + pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag; + ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag; + } + + rc = iscsi_target_node_add_pg_ig_maps(target, pg_tags, ig_tags, + req.pg_ig_maps.num_maps); + if (rc < 0) { + SPDK_ERRLOG("add pg-ig maps failed\n"); + goto invalid; + } + + free(req.name); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(req.name); +} +SPDK_RPC_REGISTER("iscsi_target_node_add_pg_ig_maps", + rpc_iscsi_target_node_add_pg_ig_maps, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_add_pg_ig_maps, add_pg_ig_maps) + +static void 
+rpc_iscsi_target_node_remove_pg_ig_maps(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tgt_node_pg_ig_maps req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0}; + size_t i; + int rc; + + if (spdk_json_decode_object(params, rpc_tgt_node_pg_ig_maps_decoders, + SPDK_COUNTOF(rpc_tgt_node_pg_ig_maps_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target is not found\n"); + goto invalid; + } + + for (i = 0; i < req.pg_ig_maps.num_maps; i++) { + pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag; + ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag; + } + + rc = iscsi_target_node_remove_pg_ig_maps(target, pg_tags, ig_tags, + req.pg_ig_maps.num_maps); + if (rc < 0) { + SPDK_ERRLOG("remove pg-ig maps failed\n"); + goto invalid; + } + + free(req.name); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(req.name); +} +SPDK_RPC_REGISTER("iscsi_target_node_remove_pg_ig_maps", + rpc_iscsi_target_node_remove_pg_ig_maps, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_remove_pg_ig_maps, + delete_pg_ig_maps) + +struct rpc_iscsi_delete_target_node { + char *name; +}; + +static void +free_rpc_iscsi_delete_target_node(struct rpc_iscsi_delete_target_node *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_iscsi_delete_target_node_decoders[] = { + {"name", offsetof(struct rpc_iscsi_delete_target_node, name), spdk_json_decode_string}, +}; + +struct rpc_iscsi_delete_target_node_ctx { + struct rpc_iscsi_delete_target_node req; + struct spdk_jsonrpc_request *request; +}; + +static void +rpc_iscsi_delete_target_node_done(void *cb_arg, int rc) +{ + struct rpc_iscsi_delete_target_node_ctx *ctx = cb_arg; + struct spdk_json_write_ctx *w; + + free_rpc_iscsi_delete_target_node(&ctx->req); + + w = spdk_jsonrpc_begin_result(ctx->request); + spdk_json_write_bool(w, rc == 0); + spdk_jsonrpc_end_result(ctx->request, w); + + free(ctx); +} + +static void +rpc_iscsi_delete_target_node(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_iscsi_delete_target_node_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(ENOMEM)); + return; + } + + if (spdk_json_decode_object(params, rpc_iscsi_delete_target_node_decoders, + SPDK_COUNTOF(rpc_iscsi_delete_target_node_decoders), + &ctx->req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (ctx->req.name == NULL) { + SPDK_ERRLOG("missing name param\n"); + goto invalid; + } + + ctx->request = request; + + iscsi_shutdown_tgt_node_by_name(ctx->req.name, + rpc_iscsi_delete_target_node_done, ctx); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_iscsi_delete_target_node(&ctx->req); + free(ctx); +} +SPDK_RPC_REGISTER("iscsi_delete_target_node", rpc_iscsi_delete_target_node, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_target_node, delete_target_node) + +static void +rpc_iscsi_get_portal_groups(struct spdk_jsonrpc_request 
*request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_portal_groups requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + iscsi_portal_grps_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_portal_groups", rpc_iscsi_get_portal_groups, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_portal_groups, get_portal_groups) + +struct rpc_portal { + char *host; + char *port; +}; + +struct rpc_portal_list { + size_t num_portals; + struct rpc_portal portals[MAX_PORTAL]; +}; + +struct rpc_portal_group { + int32_t tag; + struct rpc_portal_list portal_list; +}; + +static void +free_rpc_portal(struct rpc_portal *portal) +{ + free(portal->host); + free(portal->port); +} + +static void +free_rpc_portal_list(struct rpc_portal_list *pl) +{ + size_t i; + + for (i = 0; i < pl->num_portals; i++) { + free_rpc_portal(&pl->portals[i]); + } + pl->num_portals = 0; +} + +static void +free_rpc_portal_group(struct rpc_portal_group *pg) +{ + free_rpc_portal_list(&pg->portal_list); +} + +static const struct spdk_json_object_decoder rpc_portal_decoders[] = { + {"host", offsetof(struct rpc_portal, host), spdk_json_decode_string}, + {"port", offsetof(struct rpc_portal, port), spdk_json_decode_string}, +}; + +static int +decode_rpc_portal(const struct spdk_json_val *val, void *out) +{ + struct rpc_portal *portal = out; + + return spdk_json_decode_object(val, rpc_portal_decoders, + SPDK_COUNTOF(rpc_portal_decoders), + portal); +} + +static int +decode_rpc_portal_list(const struct spdk_json_val *val, void *out) +{ + struct rpc_portal_list *list = out; + + return spdk_json_decode_array(val, decode_rpc_portal, list->portals, MAX_PORTAL, &list->num_portals, + sizeof(struct rpc_portal)); +} + +static const struct spdk_json_object_decoder rpc_portal_group_decoders[] = { + {"tag", offsetof(struct rpc_portal_group, tag), spdk_json_decode_int32}, + {"portals", offsetof(struct rpc_portal_group, portal_list), decode_rpc_portal_list}, +}; + +static void +rpc_iscsi_create_portal_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_portal_group req = {}; + struct spdk_iscsi_portal_grp *pg = NULL; + struct spdk_iscsi_portal *portal; + struct spdk_json_write_ctx *w; + size_t i = 0; + int rc = -1; + + if (spdk_json_decode_object(params, rpc_portal_group_decoders, + SPDK_COUNTOF(rpc_portal_group_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto out; + } + + pg = iscsi_portal_grp_create(req.tag); + if (pg == NULL) { + SPDK_ERRLOG("portal_grp_create failed\n"); + goto out; + } + for (i = 0; i < req.portal_list.num_portals; i++) { + portal = iscsi_portal_create(req.portal_list.portals[i].host, + req.portal_list.portals[i].port); + if (portal == NULL) { + SPDK_ERRLOG("portal_create failed\n"); + goto out; + } + iscsi_portal_grp_add_portal(pg, portal); + } + + rc = iscsi_portal_grp_open(pg); + if (rc != 0) { + SPDK_ERRLOG("portal_grp_open failed\n"); + goto out; + } + + rc = iscsi_portal_grp_register(pg); + if (rc != 0) { + SPDK_ERRLOG("portal_grp_register failed\n"); + } + +out: + if (rc == 0) { + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } else { + spdk_jsonrpc_send_error_response(request, 
SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + + if (pg != NULL) { + iscsi_portal_grp_release(pg); + } + } + free_rpc_portal_group(&req); +} +SPDK_RPC_REGISTER("iscsi_create_portal_group", rpc_iscsi_create_portal_group, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_portal_group, add_portal_group) + +struct rpc_iscsi_delete_portal_group { + int32_t tag; +}; + +static const struct spdk_json_object_decoder rpc_iscsi_delete_portal_group_decoders[] = { + {"tag", offsetof(struct rpc_iscsi_delete_portal_group, tag), spdk_json_decode_int32}, +}; + +static void +rpc_iscsi_delete_portal_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_iscsi_delete_portal_group req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_portal_grp *pg; + + if (spdk_json_decode_object(params, rpc_iscsi_delete_portal_group_decoders, + SPDK_COUNTOF(rpc_iscsi_delete_portal_group_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + pg = iscsi_portal_grp_unregister(req.tag); + if (!pg) { + goto invalid; + } + + iscsi_tgt_node_delete_map(pg, NULL); + iscsi_portal_grp_release(pg); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} +SPDK_RPC_REGISTER("iscsi_delete_portal_group", rpc_iscsi_delete_portal_group, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_portal_group, delete_portal_group) + +struct rpc_portal_group_auth { + int32_t tag; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; +}; + +static const struct spdk_json_object_decoder rpc_portal_group_auth_decoders[] = { + {"tag", offsetof(struct rpc_portal_group_auth, tag), spdk_json_decode_int32}, + {"disable_chap", offsetof(struct rpc_portal_group_auth, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_portal_group_auth, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_portal_group_auth, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_portal_group_auth, chap_group), spdk_json_decode_int32, true}, +}; + +static void +rpc_iscsi_portal_group_set_auth(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_portal_group_auth req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_portal_grp *pg; + int rc; + + if (spdk_json_decode_object(params, rpc_portal_group_auth_decoders, + SPDK_COUNTOF(rpc_portal_group_auth_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + pg = iscsi_portal_grp_find_by_tag(req.tag); + if (pg == NULL) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find portal group %d", req.tag); + goto exit; + } + + rc = iscsi_portal_grp_set_chap_params(pg, req.disable_chap, req.require_chap, + req.mutual_chap, req.chap_group); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid combination of auth params"); + goto exit; + } + + pthread_mutex_unlock(&g_iscsi.mutex); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, 
w); + + return; + +exit: + pthread_mutex_unlock(&g_iscsi.mutex); +} +SPDK_RPC_REGISTER("iscsi_portal_group_set_auth", rpc_iscsi_portal_group_set_auth, + SPDK_RPC_RUNTIME) + +struct rpc_iscsi_get_connections_ctx { + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; +}; + +static void +_rpc_iscsi_get_connections_done(struct spdk_io_channel_iter *i, int status) +{ + struct rpc_iscsi_get_connections_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + spdk_json_write_array_end(ctx->w); + spdk_jsonrpc_end_result(ctx->request, ctx->w); + + free(ctx); +} + +static void +_rpc_iscsi_get_connections(struct spdk_io_channel_iter *i) +{ + struct rpc_iscsi_get_connections_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_iscsi_poll_group *pg = spdk_io_channel_get_ctx(ch); + struct spdk_iscsi_conn *conn; + + STAILQ_FOREACH(conn, &pg->connections, pg_link) { + iscsi_conn_info_json(ctx->w, conn); + } + + spdk_for_each_channel_continue(i, 0); +} + +static void +rpc_iscsi_get_connections(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_iscsi_get_connections_ctx *ctx; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_connections requires no parameters"); + return; + } + + ctx = calloc(1, sizeof(struct rpc_iscsi_get_connections_ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("Failed to allocate rpc_get_iscsi_conns_ctx struct\n"); + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + return; + } + + ctx->request = request; + ctx->w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_array_begin(ctx->w); + + spdk_for_each_channel(&g_iscsi, + _rpc_iscsi_get_connections, + ctx, + _rpc_iscsi_get_connections_done); +} +SPDK_RPC_REGISTER("iscsi_get_connections", rpc_iscsi_get_connections, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_connections, get_iscsi_connections) + +struct rpc_target_lun { + char *name; + char *bdev_name; + int32_t lun_id; +}; + +static void +free_rpc_target_lun(struct rpc_target_lun *req) +{ + free(req->name); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_target_lun_decoders[] = { + {"name", offsetof(struct rpc_target_lun, name), spdk_json_decode_string}, + {"bdev_name", offsetof(struct rpc_target_lun, bdev_name), spdk_json_decode_string}, + {"lun_id", offsetof(struct rpc_target_lun, lun_id), spdk_json_decode_int32, true}, +}; + +static void +rpc_iscsi_target_node_add_lun(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_lun req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int rc; + + req.lun_id = -1; + + if (spdk_json_decode_object(params, rpc_target_lun_decoders, + SPDK_COUNTOF(rpc_target_lun_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target is not found\n"); + goto invalid; + } + + rc = iscsi_tgt_node_add_lun(target, req.bdev_name, req.lun_id); + if (rc < 0) { + SPDK_ERRLOG("add lun failed\n"); + goto invalid; + } + + free_rpc_target_lun(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + 
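+	/*
+	 * Illustrative params for iscsi_target_node_add_lun, based on
+	 * rpc_target_lun_decoders above (the target and bdev names are
+	 * examples only):
+	 *
+	 *   {"name": "iqn.2016-06.io.spdk:target1", "bdev_name": "Malloc0", "lun_id": 0}
+	 *
+	 * "lun_id" is optional; req.lun_id is preset to -1 above, so when the
+	 * key is omitted the choice of LUN ID is left to iscsi_tgt_node_add_lun.
+	 */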
free_rpc_target_lun(&req); +} +SPDK_RPC_REGISTER("iscsi_target_node_add_lun", rpc_iscsi_target_node_add_lun, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_add_lun, target_node_add_lun) + +struct rpc_target_auth { + char *name; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; +}; + +static void +free_rpc_target_auth(struct rpc_target_auth *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_target_auth_decoders[] = { + {"name", offsetof(struct rpc_target_auth, name), spdk_json_decode_string}, + {"disable_chap", offsetof(struct rpc_target_auth, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_target_auth, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_target_auth, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_target_auth, chap_group), spdk_json_decode_int32, true}, +}; + +static void +rpc_iscsi_target_node_set_auth(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_auth req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int rc; + + if (spdk_json_decode_object(params, rpc_target_auth_decoders, + SPDK_COUNTOF(rpc_target_auth_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto exit; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find target %s", req.name); + goto exit; + } + + rc = iscsi_tgt_node_set_chap_params(target, req.disable_chap, req.require_chap, + req.mutual_chap, req.chap_group); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid combination of auth params"); + goto exit; + } + + free_rpc_target_auth(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +exit: + free_rpc_target_auth(&req); +} +SPDK_RPC_REGISTER("iscsi_target_node_set_auth", rpc_iscsi_target_node_set_auth, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_set_auth, set_iscsi_target_node_auth) + +static void +rpc_iscsi_get_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_options requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + iscsi_opts_info_json(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_options", rpc_iscsi_get_options, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_options, get_iscsi_global_params) + +struct rpc_discovery_auth { + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; +}; + +static const struct spdk_json_object_decoder rpc_discovery_auth_decoders[] = { + {"disable_chap", offsetof(struct rpc_discovery_auth, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_discovery_auth, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_discovery_auth, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_discovery_auth, chap_group), 
spdk_json_decode_int32, true}, +}; + +static void +rpc_iscsi_set_discovery_auth(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_discovery_auth req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_discovery_auth_decoders, + SPDK_COUNTOF(rpc_discovery_auth_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + rc = iscsi_set_discovery_auth(req.disable_chap, req.require_chap, + req.mutual_chap, req.chap_group); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid combination of CHAP params"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_set_discovery_auth", rpc_iscsi_set_discovery_auth, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_set_discovery_auth, set_iscsi_discovery_auth) + +#define MAX_AUTH_SECRETS 64 + +struct rpc_auth_secret { + char *user; + char *secret; + char *muser; + char *msecret; +}; + +static void +free_rpc_auth_secret(struct rpc_auth_secret *_secret) +{ + free(_secret->user); + free(_secret->secret); + free(_secret->muser); + free(_secret->msecret); +} + +static const struct spdk_json_object_decoder rpc_auth_secret_decoders[] = { + {"user", offsetof(struct rpc_auth_secret, user), spdk_json_decode_string}, + {"secret", offsetof(struct rpc_auth_secret, secret), spdk_json_decode_string}, + {"muser", offsetof(struct rpc_auth_secret, muser), spdk_json_decode_string, true}, + {"msecret", offsetof(struct rpc_auth_secret, msecret), spdk_json_decode_string, true}, +}; + +static int +decode_rpc_auth_secret(const struct spdk_json_val *val, void *out) +{ + struct rpc_auth_secret *_secret = out; + + return spdk_json_decode_object(val, rpc_auth_secret_decoders, + SPDK_COUNTOF(rpc_auth_secret_decoders), _secret); +} + +struct rpc_auth_secrets { + size_t num_secret; + struct rpc_auth_secret secrets[MAX_AUTH_SECRETS]; +}; + +static void +free_rpc_auth_secrets(struct rpc_auth_secrets *secrets) +{ + size_t i; + + for (i = 0; i < secrets->num_secret; i++) { + free_rpc_auth_secret(&secrets->secrets[i]); + } +} + +static int +decode_rpc_auth_secrets(const struct spdk_json_val *val, void *out) +{ + struct rpc_auth_secrets *secrets = out; + + return spdk_json_decode_array(val, decode_rpc_auth_secret, secrets->secrets, + MAX_AUTH_SECRETS, &secrets->num_secret, + sizeof(struct rpc_auth_secret)); +} + +struct rpc_auth_group { + int32_t tag; + struct rpc_auth_secrets secrets; +}; + +static void +free_rpc_auth_group(struct rpc_auth_group *group) +{ + free_rpc_auth_secrets(&group->secrets); +} + +static const struct spdk_json_object_decoder rpc_auth_group_decoders[] = { + {"tag", offsetof(struct rpc_auth_group, tag), spdk_json_decode_int32}, + {"secrets", offsetof(struct rpc_auth_group, secrets), decode_rpc_auth_secrets, true}, +}; + +static void +rpc_iscsi_create_auth_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_auth_group req = {}; + struct rpc_auth_secret *_secret; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group = NULL; + int rc; + size_t i; + + if (spdk_json_decode_object(params, rpc_auth_group_decoders, + SPDK_COUNTOF(rpc_auth_group_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + 
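+		/*
+		 * Illustrative params for iscsi_create_auth_group, based on
+		 * rpc_auth_group_decoders and rpc_auth_secret_decoders above
+		 * (user and secret values are examples only):
+		 *
+		 *   {"tag": 2,
+		 *    "secrets": [{"user": "user1", "secret": "secret1",
+		 *                 "muser": "muser1", "msecret": "msecret1"}]}
+		 *
+		 * "secrets" is optional, as are "muser"/"msecret" in each entry.
+		 */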
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_auth_group(&req); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + rc = iscsi_add_auth_group(req.tag, &group); + if (rc != 0) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not add auth group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_auth_group(&req); + return; + } + + for (i = 0; i < req.secrets.num_secret; i++) { + _secret = &req.secrets.secrets[i]; + rc = iscsi_auth_group_add_secret(group, _secret->user, _secret->secret, + _secret->muser, _secret->msecret); + if (rc != 0) { + iscsi_delete_auth_group(group); + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not add secret to auth group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_auth_group(&req); + return; + } + } + + pthread_mutex_unlock(&g_iscsi.mutex); + + free_rpc_auth_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_create_auth_group", rpc_iscsi_create_auth_group, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_auth_group, add_iscsi_auth_group) + +struct rpc_delete_auth_group { + int32_t tag; +}; + +static const struct spdk_json_object_decoder rpc_delete_auth_group_decoders[] = { + {"tag", offsetof(struct rpc_delete_auth_group, tag), spdk_json_decode_int32}, +}; + +static void +rpc_iscsi_delete_auth_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_auth_group req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group; + + if (spdk_json_decode_object(params, rpc_delete_auth_group_decoders, + SPDK_COUNTOF(rpc_delete_auth_group_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + group = iscsi_find_auth_group_by_tag(req.tag); + if (group == NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find auth group (%d)", req.tag); + return; + } + + iscsi_delete_auth_group(group); + + pthread_mutex_unlock(&g_iscsi.mutex); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_delete_auth_group", rpc_iscsi_delete_auth_group, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_auth_group, delete_iscsi_auth_group) + +struct rpc_add_auth_secret { + int32_t tag; + char *user; + char *secret; + char *muser; + char *msecret; +}; + +static void +free_rpc_add_auth_secret(struct rpc_add_auth_secret *_secret) +{ + free(_secret->user); + free(_secret->secret); + free(_secret->muser); + free(_secret->msecret); +} + +static const struct spdk_json_object_decoder rpc_add_auth_secret_decoders[] = { + {"tag", offsetof(struct rpc_add_auth_secret, tag), spdk_json_decode_int32}, + {"user", offsetof(struct rpc_add_auth_secret, user), spdk_json_decode_string}, + {"secret", offsetof(struct rpc_add_auth_secret, secret), spdk_json_decode_string}, + {"muser", offsetof(struct rpc_add_auth_secret, muser), spdk_json_decode_string, true}, + {"msecret", offsetof(struct rpc_add_auth_secret, 
msecret), spdk_json_decode_string, true}, +}; + +static void +rpc_iscsi_auth_group_add_secret(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_add_auth_secret req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group; + int rc; + + if (spdk_json_decode_object(params, rpc_add_auth_secret_decoders, + SPDK_COUNTOF(rpc_add_auth_secret_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_add_auth_secret(&req); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + group = iscsi_find_auth_group_by_tag(req.tag); + if (group == NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find auth group (%d)", req.tag); + free_rpc_add_auth_secret(&req); + return; + } + + rc = iscsi_auth_group_add_secret(group, req.user, req.secret, req.muser, req.msecret); + if (rc != 0) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not add secret to auth group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_add_auth_secret(&req); + return; + } + + pthread_mutex_unlock(&g_iscsi.mutex); + + free_rpc_add_auth_secret(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_auth_group_add_secret", rpc_iscsi_auth_group_add_secret, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_auth_group_add_secret, add_secret_to_iscsi_auth_group) + + +struct rpc_remove_auth_secret { + int32_t tag; + char *user; +}; + +static void +free_rpc_remove_auth_secret(struct rpc_remove_auth_secret *_secret) +{ + free(_secret->user); +} + +static const struct spdk_json_object_decoder rpc_remove_auth_secret_decoders[] = { + {"tag", offsetof(struct rpc_remove_auth_secret, tag), spdk_json_decode_int32}, + {"user", offsetof(struct rpc_remove_auth_secret, user), spdk_json_decode_string}, +}; + +static void +rpc_iscsi_auth_group_remove_secret(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_auth_secret req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group; + int rc; + + if (spdk_json_decode_object(params, rpc_remove_auth_secret_decoders, + SPDK_COUNTOF(rpc_remove_auth_secret_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_remove_auth_secret(&req); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + group = iscsi_find_auth_group_by_tag(req.tag); + if (group == NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find auth group (%d)", req.tag); + free_rpc_remove_auth_secret(&req); + return; + } + + rc = iscsi_auth_group_delete_secret(group, req.user); + if (rc != 0) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not delete secret from CHAP group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_remove_auth_secret(&req); + return; + } + + pthread_mutex_unlock(&g_iscsi.mutex); + + free_rpc_remove_auth_secret(&req); + + w = spdk_jsonrpc_begin_result(request); + 
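+	/* Like the other iSCSI RPCs in this file, success is reported as a
+	 * bare JSON "true" result, i.e. the client sees something like:
+	 *
+	 *   {"jsonrpc": "2.0", "id": 1, "result": true}
+	 */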
spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_auth_group_remove_secret", + rpc_iscsi_auth_group_remove_secret, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_auth_group_remove_secret, + delete_secret_from_iscsi_auth_group) + +static void +rpc_iscsi_get_auth_groups(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_auth_groups requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + iscsi_auth_groups_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_auth_groups", rpc_iscsi_get_auth_groups, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_auth_groups, get_iscsi_auth_groups) + +static const struct spdk_json_object_decoder rpc_set_iscsi_opts_decoders[] = { + {"auth_file", offsetof(struct spdk_iscsi_opts, authfile), spdk_json_decode_string, true}, + {"node_base", offsetof(struct spdk_iscsi_opts, nodebase), spdk_json_decode_string, true}, + {"nop_timeout", offsetof(struct spdk_iscsi_opts, timeout), spdk_json_decode_int32, true}, + {"nop_in_interval", offsetof(struct spdk_iscsi_opts, nopininterval), spdk_json_decode_int32, true}, + {"no_discovery_auth", offsetof(struct spdk_iscsi_opts, disable_chap), spdk_json_decode_bool, true}, + {"req_discovery_auth", offsetof(struct spdk_iscsi_opts, require_chap), spdk_json_decode_bool, true}, + {"req_discovery_auth_mutual", offsetof(struct spdk_iscsi_opts, mutual_chap), spdk_json_decode_bool, true}, + {"discovery_auth_group", offsetof(struct spdk_iscsi_opts, chap_group), spdk_json_decode_int32, true}, + {"disable_chap", offsetof(struct spdk_iscsi_opts, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct spdk_iscsi_opts, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct spdk_iscsi_opts, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct spdk_iscsi_opts, chap_group), spdk_json_decode_int32, true}, + {"max_sessions", offsetof(struct spdk_iscsi_opts, MaxSessions), spdk_json_decode_uint32, true}, + {"max_queue_depth", offsetof(struct spdk_iscsi_opts, MaxQueueDepth), spdk_json_decode_uint32, true}, + {"max_connections_per_session", offsetof(struct spdk_iscsi_opts, MaxConnectionsPerSession), spdk_json_decode_uint32, true}, + {"default_time2wait", offsetof(struct spdk_iscsi_opts, DefaultTime2Wait), spdk_json_decode_uint32, true}, + {"default_time2retain", offsetof(struct spdk_iscsi_opts, DefaultTime2Retain), spdk_json_decode_uint32, true}, + {"first_burst_length", offsetof(struct spdk_iscsi_opts, FirstBurstLength), spdk_json_decode_uint32, true}, + {"immediate_data", offsetof(struct spdk_iscsi_opts, ImmediateData), spdk_json_decode_bool, true}, + {"error_recovery_level", offsetof(struct spdk_iscsi_opts, ErrorRecoveryLevel), spdk_json_decode_uint32, true}, + {"allow_duplicated_isid", offsetof(struct spdk_iscsi_opts, AllowDuplicateIsid), spdk_json_decode_bool, true}, +}; + +static void +rpc_iscsi_set_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_iscsi_opts *opts; + struct spdk_json_write_ctx *w; + + if (g_spdk_iscsi_opts != NULL) { + SPDK_ERRLOG("this RPC must not be called more than once.\n"); + spdk_jsonrpc_send_error_response(request, 
SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Must not call more than once"); + return; + } + + opts = iscsi_opts_alloc(); + if (opts == NULL) { + SPDK_ERRLOG("iscsi_opts_alloc() failed.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_set_iscsi_opts_decoders, + SPDK_COUNTOF(rpc_set_iscsi_opts_decoders), opts)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + iscsi_opts_free(opts); + return; + } + } + + g_spdk_iscsi_opts = iscsi_opts_copy(opts); + iscsi_opts_free(opts); + + if (g_spdk_iscsi_opts == NULL) { + SPDK_ERRLOG("iscsi_opts_copy() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_set_options", rpc_iscsi_set_options, SPDK_RPC_STARTUP) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_set_options, set_iscsi_options) diff --git a/src/spdk/lib/iscsi/iscsi_subsystem.c b/src/spdk/lib/iscsi/iscsi_subsystem.c new file mode 100644 index 000000000..1eb766233 --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi_subsystem.c @@ -0,0 +1,1577 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/sock.h" +#include "spdk/likely.h" + +#include "iscsi/iscsi.h" +#include "iscsi/init_grp.h" +#include "iscsi/portal_grp.h" +#include "iscsi/conn.h" +#include "iscsi/task.h" +#include "iscsi/tgt_node.h" + +#include "spdk_internal/event.h" +#include "spdk_internal/log.h" + +struct spdk_iscsi_opts *g_spdk_iscsi_opts = NULL; + +static struct spdk_thread *g_init_thread = NULL; +static spdk_iscsi_init_cb g_init_cb_fn = NULL; +static void *g_init_cb_arg = NULL; + +static spdk_iscsi_fini_cb g_fini_cb_fn; +static void *g_fini_cb_arg; + +#define ISCSI_CONFIG_TMPL \ +"[iSCSI]\n" \ +" # node name (not include optional part)\n" \ +" # Users can optionally change this to fit their environment.\n" \ +" NodeBase \"%s\"\n" \ +"\n" \ +" # files\n" \ +" %s %s\n" \ +"\n" \ +" # socket I/O timeout sec. (polling is infinity)\n" \ +" Timeout %d\n" \ +"\n" \ +" # authentication information for discovery session\n" \ +" DiscoveryAuthMethod %s\n" \ +" DiscoveryAuthGroup %s\n" \ +"\n" \ +" MaxSessions %d\n" \ +" MaxConnectionsPerSession %d\n" \ +" MaxConnections %d\n" \ +" MaxQueueDepth %d\n" \ +"\n" \ +" # iSCSI initial parameters negotiate with initiators\n" \ +" # NOTE: incorrect values might crash\n" \ +" DefaultTime2Wait %d\n" \ +" DefaultTime2Retain %d\n" \ +"\n" \ +" FirstBurstLength %d\n" \ +" ImmediateData %s\n" \ +" ErrorRecoveryLevel %d\n" \ +"\n" + +static void +iscsi_globals_config_text(FILE *fp) +{ + const char *authmethod = "None"; + char authgroup[32] = "None"; + + if (NULL == fp) { + return; + } + + if (g_iscsi.require_chap) { + authmethod = "CHAP"; + } else if (g_iscsi.mutual_chap) { + authmethod = "CHAP Mutual"; + } else if (!g_iscsi.disable_chap) { + authmethod = "Auto"; + } + + if (g_iscsi.chap_group) { + snprintf(authgroup, sizeof(authgroup), "AuthGroup%d", g_iscsi.chap_group); + } + + fprintf(fp, ISCSI_CONFIG_TMPL, + g_iscsi.nodebase, + g_iscsi.authfile ? "AuthFile" : "", + g_iscsi.authfile ? g_iscsi.authfile : "", + g_iscsi.timeout, authmethod, authgroup, + g_iscsi.MaxSessions, g_iscsi.MaxConnectionsPerSession, + g_iscsi.MaxConnections, + g_iscsi.MaxQueueDepth, + g_iscsi.DefaultTime2Wait, g_iscsi.DefaultTime2Retain, + g_iscsi.FirstBurstLength, + (g_iscsi.ImmediateData) ? 
"Yes" : "No", + g_iscsi.ErrorRecoveryLevel); +} + +#define ISCSI_DATA_BUFFER_ALIGNMENT (0x1000) +#define ISCSI_DATA_BUFFER_MASK (ISCSI_DATA_BUFFER_ALIGNMENT - 1) + +static void +mobj_ctor(struct spdk_mempool *mp, __attribute__((unused)) void *arg, + void *_m, __attribute__((unused)) unsigned i) +{ + struct spdk_mobj *m = _m; + + m->mp = mp; + m->buf = (uint8_t *)m + sizeof(struct spdk_mobj); + m->buf = (void *)((unsigned long)((uint8_t *)m->buf + ISCSI_DATA_BUFFER_ALIGNMENT) & + ~ISCSI_DATA_BUFFER_MASK); +} + +#define NUM_PDU_PER_CONNECTION(iscsi) (2 * (iscsi->MaxQueueDepth + MAX_LARGE_DATAIN_PER_CONNECTION + 8)) +#define PDU_POOL_SIZE(iscsi) (iscsi->MaxConnections * NUM_PDU_PER_CONNECTION(iscsi)) +#define IMMEDIATE_DATA_POOL_SIZE(iscsi) (iscsi->MaxConnections * 128) +#define DATA_OUT_POOL_SIZE(iscsi) (iscsi->MaxConnections * MAX_DATA_OUT_PER_CONNECTION) + +static int +iscsi_initialize_pdu_pool(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + int imm_mobj_size = SPDK_BDEV_BUF_SIZE_WITH_MD(iscsi_get_max_immediate_data_size()) + + sizeof(struct spdk_mobj) + ISCSI_DATA_BUFFER_ALIGNMENT; + int dout_mobj_size = SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) + + sizeof(struct spdk_mobj) + ISCSI_DATA_BUFFER_ALIGNMENT; + + /* create PDU pool */ + iscsi->pdu_pool = spdk_mempool_create("PDU_Pool", + PDU_POOL_SIZE(iscsi), + sizeof(struct spdk_iscsi_pdu), + 256, SPDK_ENV_SOCKET_ID_ANY); + if (!iscsi->pdu_pool) { + SPDK_ERRLOG("create PDU pool failed\n"); + return -1; + } + + iscsi->pdu_immediate_data_pool = spdk_mempool_create_ctor("PDU_immediate_data_Pool", + IMMEDIATE_DATA_POOL_SIZE(iscsi), + imm_mobj_size, 256, + SPDK_ENV_SOCKET_ID_ANY, + mobj_ctor, NULL); + if (!iscsi->pdu_immediate_data_pool) { + SPDK_ERRLOG("create PDU immediate data pool failed\n"); + return -1; + } + + iscsi->pdu_data_out_pool = spdk_mempool_create_ctor("PDU_data_out_Pool", + DATA_OUT_POOL_SIZE(iscsi), + dout_mobj_size, 256, + SPDK_ENV_SOCKET_ID_ANY, + mobj_ctor, NULL); + if (!iscsi->pdu_data_out_pool) { + SPDK_ERRLOG("create PDU data out pool failed\n"); + return -1; + } + + return 0; +} + +static void +iscsi_sess_ctor(struct spdk_mempool *pool, void *arg, void *session_buf, + unsigned index) +{ + struct spdk_iscsi_globals *iscsi = arg; + struct spdk_iscsi_sess *sess = session_buf; + + iscsi->session[index] = sess; + + /* tsih 0 is reserved, so start tsih values at 1. 
*/ + sess->tsih = index + 1; +} + +#define DEFAULT_TASK_POOL_SIZE 32768 + +static int +iscsi_initialize_task_pool(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + + /* create scsi_task pool */ + iscsi->task_pool = spdk_mempool_create("SCSI_TASK_Pool", + DEFAULT_TASK_POOL_SIZE, + sizeof(struct spdk_iscsi_task), + 128, SPDK_ENV_SOCKET_ID_ANY); + if (!iscsi->task_pool) { + SPDK_ERRLOG("create task pool failed\n"); + return -1; + } + + return 0; +} + +#define SESSION_POOL_SIZE(iscsi) (iscsi->MaxSessions) +static int +iscsi_initialize_session_pool(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + + iscsi->session_pool = spdk_mempool_create_ctor("Session_Pool", + SESSION_POOL_SIZE(iscsi), + sizeof(struct spdk_iscsi_sess), 0, + SPDK_ENV_SOCKET_ID_ANY, + iscsi_sess_ctor, iscsi); + if (!iscsi->session_pool) { + SPDK_ERRLOG("create session pool failed\n"); + return -1; + } + + return 0; +} + +static int +iscsi_initialize_all_pools(void) +{ + if (iscsi_initialize_pdu_pool() != 0) { + return -1; + } + + if (iscsi_initialize_session_pool() != 0) { + return -1; + } + + if (iscsi_initialize_task_pool() != 0) { + return -1; + } + + return 0; +} + +static void +iscsi_check_pool(struct spdk_mempool *pool, size_t count) +{ + if (pool && spdk_mempool_count(pool) != count) { + SPDK_ERRLOG("spdk_mempool_count(%s) == %zu, should be %zu\n", + spdk_mempool_get_name(pool), spdk_mempool_count(pool), count); + } +} + +static void +iscsi_check_pools(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + + iscsi_check_pool(iscsi->pdu_pool, PDU_POOL_SIZE(iscsi)); + iscsi_check_pool(iscsi->session_pool, SESSION_POOL_SIZE(iscsi)); + iscsi_check_pool(iscsi->pdu_immediate_data_pool, IMMEDIATE_DATA_POOL_SIZE(iscsi)); + iscsi_check_pool(iscsi->pdu_data_out_pool, DATA_OUT_POOL_SIZE(iscsi)); + iscsi_check_pool(iscsi->task_pool, DEFAULT_TASK_POOL_SIZE); +} + +static void +iscsi_free_pools(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + + spdk_mempool_free(iscsi->pdu_pool); + spdk_mempool_free(iscsi->session_pool); + spdk_mempool_free(iscsi->pdu_immediate_data_pool); + spdk_mempool_free(iscsi->pdu_data_out_pool); + spdk_mempool_free(iscsi->task_pool); +} + +void iscsi_put_pdu(struct spdk_iscsi_pdu *pdu) +{ + if (!pdu) { + return; + } + + assert(pdu->ref > 0); + pdu->ref--; + + if (pdu->ref == 0) { + if (pdu->mobj) { + spdk_mempool_put(pdu->mobj->mp, (void *)pdu->mobj); + } + + if (pdu->data && !pdu->data_from_mempool) { + free(pdu->data); + } + + spdk_mempool_put(g_iscsi.pdu_pool, (void *)pdu); + } +} + +struct spdk_iscsi_pdu *iscsi_get_pdu(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *pdu; + + assert(conn != NULL); + pdu = spdk_mempool_get(g_iscsi.pdu_pool); + if (!pdu) { + SPDK_ERRLOG("Unable to get PDU\n"); + abort(); + } + + /* we do not want to zero out the last part of the structure reserved for AHS and sense data */ + memset(pdu, 0, offsetof(struct spdk_iscsi_pdu, ahs)); + pdu->ref = 1; + pdu->conn = conn; + + return pdu; +} + +static void +iscsi_log_globals(void) +{ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthFile %s\n", + g_iscsi.authfile ? 
g_iscsi.authfile : "(none)"); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "NodeBase %s\n", g_iscsi.nodebase); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxSessions %d\n", g_iscsi.MaxSessions); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxConnectionsPerSession %d\n", + g_iscsi.MaxConnectionsPerSession); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxQueueDepth %d\n", g_iscsi.MaxQueueDepth); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "DefaultTime2Wait %d\n", + g_iscsi.DefaultTime2Wait); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "DefaultTime2Retain %d\n", + g_iscsi.DefaultTime2Retain); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "FirstBurstLength %d\n", + g_iscsi.FirstBurstLength); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ImmediateData %s\n", + g_iscsi.ImmediateData ? "Yes" : "No"); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AllowDuplicateIsid %s\n", + g_iscsi.AllowDuplicateIsid ? "Yes" : "No"); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ErrorRecoveryLevel %d\n", + g_iscsi.ErrorRecoveryLevel); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Timeout %d\n", g_iscsi.timeout); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "NopInInterval %d\n", + g_iscsi.nopininterval); + if (g_iscsi.disable_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthMethod None\n"); + } else if (!g_iscsi.require_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthMethod Auto\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthMethod %s %s\n", + g_iscsi.require_chap ? "CHAP" : "", + g_iscsi.mutual_chap ? "Mutual" : ""); + } + + if (g_iscsi.chap_group == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthGroup None\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthGroup AuthGroup%d\n", + g_iscsi.chap_group); + } +} + +static void +iscsi_opts_init(struct spdk_iscsi_opts *opts) +{ + opts->MaxSessions = DEFAULT_MAX_SESSIONS; + opts->MaxConnectionsPerSession = DEFAULT_MAX_CONNECTIONS_PER_SESSION; + opts->MaxQueueDepth = DEFAULT_MAX_QUEUE_DEPTH; + opts->DefaultTime2Wait = DEFAULT_DEFAULTTIME2WAIT; + opts->DefaultTime2Retain = DEFAULT_DEFAULTTIME2RETAIN; + opts->FirstBurstLength = SPDK_ISCSI_FIRST_BURST_LENGTH; + opts->ImmediateData = DEFAULT_IMMEDIATEDATA; + opts->AllowDuplicateIsid = false; + opts->ErrorRecoveryLevel = DEFAULT_ERRORRECOVERYLEVEL; + opts->timeout = DEFAULT_TIMEOUT; + opts->nopininterval = DEFAULT_NOPININTERVAL; + opts->disable_chap = false; + opts->require_chap = false; + opts->mutual_chap = false; + opts->chap_group = 0; + opts->authfile = NULL; + opts->nodebase = NULL; +} + +struct spdk_iscsi_opts * +iscsi_opts_alloc(void) +{ + struct spdk_iscsi_opts *opts; + + opts = calloc(1, sizeof(*opts)); + if (!opts) { + SPDK_ERRLOG("calloc() failed for iscsi options\n"); + return NULL; + } + + iscsi_opts_init(opts); + + return opts; +} + +void +iscsi_opts_free(struct spdk_iscsi_opts *opts) +{ + free(opts->authfile); + free(opts->nodebase); + free(opts); +} + +/* Deep copy of spdk_iscsi_opts */ +struct spdk_iscsi_opts * +iscsi_opts_copy(struct spdk_iscsi_opts *src) +{ + struct spdk_iscsi_opts *dst; + + dst = calloc(1, sizeof(*dst)); + if (!dst) { + SPDK_ERRLOG("calloc() failed for iscsi options\n"); + return NULL; + } + + if (src->authfile) { + dst->authfile = strdup(src->authfile); + if (!dst->authfile) { + free(dst); + SPDK_ERRLOG("failed to strdup for auth file %s\n", src->authfile); + return NULL; + } + } + + if (src->nodebase) { + dst->nodebase = strdup(src->nodebase); + if (!dst->nodebase) { + free(dst->authfile); + free(dst); + SPDK_ERRLOG("failed to strdup for nodebase %s\n", src->nodebase); + return NULL; + } + } + + dst->MaxSessions = src->MaxSessions; + dst->MaxConnectionsPerSession = 
src->MaxConnectionsPerSession; + dst->MaxQueueDepth = src->MaxQueueDepth; + dst->DefaultTime2Wait = src->DefaultTime2Wait; + dst->DefaultTime2Retain = src->DefaultTime2Retain; + dst->FirstBurstLength = src->FirstBurstLength; + dst->ImmediateData = src->ImmediateData; + dst->AllowDuplicateIsid = src->AllowDuplicateIsid; + dst->ErrorRecoveryLevel = src->ErrorRecoveryLevel; + dst->timeout = src->timeout; + dst->nopininterval = src->nopininterval; + dst->disable_chap = src->disable_chap; + dst->require_chap = src->require_chap; + dst->mutual_chap = src->mutual_chap; + dst->chap_group = src->chap_group; + + return dst; +} + +static int +iscsi_read_config_file_params(struct spdk_conf_section *sp, + struct spdk_iscsi_opts *opts) +{ + const char *val; + int MaxSessions; + int MaxConnectionsPerSession; + int MaxQueueDepth; + int DefaultTime2Wait; + int DefaultTime2Retain; + int FirstBurstLength; + int ErrorRecoveryLevel; + int timeout; + int nopininterval; + const char *ag_tag; + int ag_tag_i; + int i; + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + val = spdk_conf_section_get_val(sp, "AuthFile"); + if (val != NULL) { + opts->authfile = strdup(val); + if (!opts->authfile) { + SPDK_ERRLOG("strdup() failed for AuthFile\n"); + return -ENOMEM; + } + } + + val = spdk_conf_section_get_val(sp, "NodeBase"); + if (val != NULL) { + opts->nodebase = strdup(val); + if (!opts->nodebase) { + free(opts->authfile); + SPDK_ERRLOG("strdup() failed for NodeBase\n"); + return -ENOMEM; + } + } + + MaxSessions = spdk_conf_section_get_intval(sp, "MaxSessions"); + if (MaxSessions >= 0) { + opts->MaxSessions = MaxSessions; + } + + MaxConnectionsPerSession = spdk_conf_section_get_intval(sp, "MaxConnectionsPerSession"); + if (MaxConnectionsPerSession >= 0) { + opts->MaxConnectionsPerSession = MaxConnectionsPerSession; + } + + MaxQueueDepth = spdk_conf_section_get_intval(sp, "MaxQueueDepth"); + if (MaxQueueDepth >= 0) { + opts->MaxQueueDepth = MaxQueueDepth; + } + + DefaultTime2Wait = spdk_conf_section_get_intval(sp, "DefaultTime2Wait"); + if (DefaultTime2Wait >= 0) { + opts->DefaultTime2Wait = DefaultTime2Wait; + } + + DefaultTime2Retain = spdk_conf_section_get_intval(sp, "DefaultTime2Retain"); + if (DefaultTime2Retain >= 0) { + opts->DefaultTime2Retain = DefaultTime2Retain; + } + + FirstBurstLength = spdk_conf_section_get_intval(sp, "FirstBurstLength"); + if (FirstBurstLength >= 0) { + opts->FirstBurstLength = FirstBurstLength; + } + + opts->ImmediateData = spdk_conf_section_get_boolval(sp, "ImmediateData", + opts->ImmediateData); + + /* This option is only for test. + * If AllowDuplicateIsid is enabled, it allows different connections carrying + * TSIH=0 login the target within the same session. 
+ */ + opts->AllowDuplicateIsid = spdk_conf_section_get_boolval(sp, "AllowDuplicateIsid", + opts->AllowDuplicateIsid); + + ErrorRecoveryLevel = spdk_conf_section_get_intval(sp, "ErrorRecoveryLevel"); + if (ErrorRecoveryLevel >= 0) { + opts->ErrorRecoveryLevel = ErrorRecoveryLevel; + } + timeout = spdk_conf_section_get_intval(sp, "Timeout"); + if (timeout >= 0) { + opts->timeout = timeout; + } + nopininterval = spdk_conf_section_get_intval(sp, "NopInInterval"); + if (nopininterval >= 0) { + opts->nopininterval = nopininterval; + } + val = spdk_conf_section_get_val(sp, "DiscoveryAuthMethod"); + if (val != NULL) { + for (i = 0; ; i++) { + val = spdk_conf_section_get_nmval(sp, "DiscoveryAuthMethod", 0, i); + if (val == NULL) { + break; + } + if (strcasecmp(val, "CHAP") == 0) { + opts->require_chap = true; + } else if (strcasecmp(val, "Mutual") == 0) { + opts->require_chap = true; + opts->mutual_chap = true; + } else if (strcasecmp(val, "Auto") == 0) { + opts->disable_chap = false; + opts->require_chap = false; + opts->mutual_chap = false; + } else if (strcasecmp(val, "None") == 0) { + opts->disable_chap = true; + opts->require_chap = false; + opts->mutual_chap = false; + } else { + SPDK_ERRLOG("unknown CHAP mode %s\n", val); + } + } + if (opts->mutual_chap && !opts->require_chap) { + free(opts->authfile); + free(opts->nodebase); + SPDK_ERRLOG("CHAP must set to be required when using mutual CHAP.\n"); + return -EINVAL; + } + } + val = spdk_conf_section_get_val(sp, "DiscoveryAuthGroup"); + if (val != NULL) { + ag_tag = val; + if (strcasecmp(ag_tag, "None") == 0) { + opts->chap_group = 0; + } else { + if (strncasecmp(ag_tag, "AuthGroup", + strlen("AuthGroup")) != 0 + || sscanf(ag_tag, "%*[^0-9]%d", &ag_tag_i) != 1 + || ag_tag_i == 0) { + SPDK_ERRLOG("invalid auth group %s, ignoring\n", ag_tag); + } else { + opts->chap_group = ag_tag_i; + } + } + } + + return 0; +} + +static int +iscsi_opts_verify(struct spdk_iscsi_opts *opts) +{ + if (!opts->nodebase) { + opts->nodebase = strdup(SPDK_ISCSI_DEFAULT_NODEBASE); + if (opts->nodebase == NULL) { + SPDK_ERRLOG("strdup() failed for default nodebase\n"); + return -ENOMEM; + } + } + + if (opts->MaxSessions == 0 || opts->MaxSessions > 65535) { + SPDK_ERRLOG("%d is invalid. MaxSessions must be more than 0 and no more than 65535\n", + opts->MaxSessions); + return -EINVAL; + } + + if (opts->MaxConnectionsPerSession == 0 || opts->MaxConnectionsPerSession > 65535) { + SPDK_ERRLOG("%d is invalid. MaxConnectionsPerSession must be more than 0 and no more than 65535\n", + opts->MaxConnectionsPerSession); + return -EINVAL; + } + + if (opts->MaxQueueDepth == 0 || opts->MaxQueueDepth > 256) { + SPDK_ERRLOG("%d is invalid. MaxQueueDepth must be more than 0 and no more than 256\n", + opts->MaxQueueDepth); + return -EINVAL; + } + + if (opts->DefaultTime2Wait > 3600) { + SPDK_ERRLOG("%d is invalid. DefaultTime2Wait must be no more than 3600\n", + opts->DefaultTime2Wait); + return -EINVAL; + } + + if (opts->DefaultTime2Retain > 3600) { + SPDK_ERRLOG("%d is invalid. 
DefaultTime2Retain must be no more than 3600\n", + opts->DefaultTime2Retain); + return -EINVAL; + } + + if (opts->FirstBurstLength >= SPDK_ISCSI_MIN_FIRST_BURST_LENGTH) { + if (opts->FirstBurstLength > SPDK_ISCSI_MAX_BURST_LENGTH) { + SPDK_ERRLOG("FirstBurstLength %d shall not exceed MaxBurstLength %d\n", + opts->FirstBurstLength, SPDK_ISCSI_MAX_BURST_LENGTH); + return -EINVAL; + } + } else { + SPDK_ERRLOG("FirstBurstLength %d shall be no less than %d\n", + opts->FirstBurstLength, SPDK_ISCSI_MIN_FIRST_BURST_LENGTH); + return -EINVAL; + } + + if (opts->ErrorRecoveryLevel > 2) { + SPDK_ERRLOG("ErrorRecoveryLevel %d is not supported.\n", opts->ErrorRecoveryLevel); + return -EINVAL; + } + + if (opts->timeout < 0) { + SPDK_ERRLOG("%d is invalid. timeout must not be less than 0\n", opts->timeout); + return -EINVAL; + } + + if (opts->nopininterval < 0 || opts->nopininterval > MAX_NOPININTERVAL) { + SPDK_ERRLOG("%d is invalid. nopinterval must be between 0 and %d\n", + opts->nopininterval, MAX_NOPININTERVAL); + return -EINVAL; + } + + if (!iscsi_check_chap_params(opts->disable_chap, opts->require_chap, + opts->mutual_chap, opts->chap_group)) { + SPDK_ERRLOG("CHAP params in opts are illegal combination\n"); + return -EINVAL; + } + + return 0; +} + +static int +iscsi_parse_options(struct spdk_iscsi_opts **popts) +{ + struct spdk_iscsi_opts *opts; + struct spdk_conf_section *sp; + int rc; + + opts = iscsi_opts_alloc(); + if (!opts) { + SPDK_ERRLOG("iscsi_opts_alloc_failed() failed\n"); + return -ENOMEM; + } + + /* Process parameters */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_read_config_file_parmas\n"); + sp = spdk_conf_find_section(NULL, "iSCSI"); + if (sp != NULL) { + rc = iscsi_read_config_file_params(sp, opts); + if (rc != 0) { + free(opts); + SPDK_ERRLOG("iscsi_read_config_file_params() failed\n"); + return rc; + } + } + + *popts = opts; + + return 0; +} + +static int +iscsi_set_global_params(struct spdk_iscsi_opts *opts) +{ + int rc; + + rc = iscsi_opts_verify(opts); + if (rc != 0) { + SPDK_ERRLOG("spdk_iscsi_opts_verify() failed\n"); + return rc; + } + + if (opts->authfile != NULL) { + g_iscsi.authfile = strdup(opts->authfile); + if (!g_iscsi.authfile) { + SPDK_ERRLOG("failed to strdup for auth file %s\n", opts->authfile); + return -ENOMEM; + } + } + + g_iscsi.nodebase = strdup(opts->nodebase); + if (!g_iscsi.nodebase) { + SPDK_ERRLOG("failed to strdup for nodebase %s\n", opts->nodebase); + return -ENOMEM; + } + + g_iscsi.MaxSessions = opts->MaxSessions; + g_iscsi.MaxConnectionsPerSession = opts->MaxConnectionsPerSession; + g_iscsi.MaxQueueDepth = opts->MaxQueueDepth; + g_iscsi.DefaultTime2Wait = opts->DefaultTime2Wait; + g_iscsi.DefaultTime2Retain = opts->DefaultTime2Retain; + g_iscsi.FirstBurstLength = opts->FirstBurstLength; + g_iscsi.ImmediateData = opts->ImmediateData; + g_iscsi.AllowDuplicateIsid = opts->AllowDuplicateIsid; + g_iscsi.ErrorRecoveryLevel = opts->ErrorRecoveryLevel; + g_iscsi.timeout = opts->timeout; + g_iscsi.nopininterval = opts->nopininterval; + g_iscsi.disable_chap = opts->disable_chap; + g_iscsi.require_chap = opts->require_chap; + g_iscsi.mutual_chap = opts->mutual_chap; + g_iscsi.chap_group = opts->chap_group; + + iscsi_log_globals(); + + return 0; +} + +int +iscsi_set_discovery_auth(bool disable_chap, bool require_chap, bool mutual_chap, + int32_t chap_group) +{ + if (!iscsi_check_chap_params(disable_chap, require_chap, mutual_chap, + chap_group)) { + SPDK_ERRLOG("CHAP params are illegal combination\n"); + return -EINVAL; + } + + pthread_mutex_lock(&g_iscsi.mutex); + 
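/* Apply the already-validated discovery CHAP settings to the global state while holding g_iscsi.mutex. */ +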
g_iscsi.disable_chap = disable_chap; + g_iscsi.require_chap = require_chap; + g_iscsi.mutual_chap = mutual_chap; + g_iscsi.chap_group = chap_group; + pthread_mutex_unlock(&g_iscsi.mutex); + + return 0; +} + +int +iscsi_auth_group_add_secret(struct spdk_iscsi_auth_group *group, + const char *user, const char *secret, + const char *muser, const char *msecret) +{ + struct spdk_iscsi_auth_secret *_secret; + size_t len; + + if (user == NULL || secret == NULL) { + SPDK_ERRLOG("user and secret must be specified\n"); + return -EINVAL; + } + + if (muser != NULL && msecret == NULL) { + SPDK_ERRLOG("msecret must be specified with muser\n"); + return -EINVAL; + } + + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + if (strcmp(_secret->user, user) == 0) { + SPDK_ERRLOG("user for secret is duplicated\n"); + return -EEXIST; + } + } + + _secret = calloc(1, sizeof(*_secret)); + if (_secret == NULL) { + SPDK_ERRLOG("calloc() failed for CHAP secret\n"); + return -ENOMEM; + } + + len = strnlen(user, sizeof(_secret->user)); + if (len > sizeof(_secret->user) - 1) { + SPDK_ERRLOG("CHAP user longer than %zu characters: %s\n", + sizeof(_secret->user) - 1, user); + free(_secret); + return -EINVAL; + } + memcpy(_secret->user, user, len); + + len = strnlen(secret, sizeof(_secret->secret)); + if (len > sizeof(_secret->secret) - 1) { + SPDK_ERRLOG("CHAP secret longer than %zu characters: %s\n", + sizeof(_secret->secret) - 1, secret); + free(_secret); + return -EINVAL; + } + memcpy(_secret->secret, secret, len); + + if (muser != NULL) { + len = strnlen(muser, sizeof(_secret->muser)); + if (len > sizeof(_secret->muser) - 1) { + SPDK_ERRLOG("Mutual CHAP user longer than %zu characters: %s\n", + sizeof(_secret->muser) - 1, muser); + free(_secret); + return -EINVAL; + } + memcpy(_secret->muser, muser, len); + + len = strnlen(msecret, sizeof(_secret->msecret)); + if (len > sizeof(_secret->msecret) - 1) { + SPDK_ERRLOG("Mutual CHAP secret longer than %zu characters: %s\n", + sizeof(_secret->msecret) - 1, msecret); + free(_secret); + return -EINVAL; + } + memcpy(_secret->msecret, msecret, len); + } + + TAILQ_INSERT_TAIL(&group->secret_head, _secret, tailq); + return 0; +} + +int +iscsi_auth_group_delete_secret(struct spdk_iscsi_auth_group *group, + const char *user) +{ + struct spdk_iscsi_auth_secret *_secret; + + if (user == NULL) { + SPDK_ERRLOG("user must be specified\n"); + return -EINVAL; + } + + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + if (strcmp(_secret->user, user) == 0) { + break; + } + } + + if (_secret == NULL) { + SPDK_ERRLOG("secret is not found\n"); + return -ENODEV; + } + + TAILQ_REMOVE(&group->secret_head, _secret, tailq); + free(_secret); + + return 0; +} + +int +iscsi_add_auth_group(int32_t tag, struct spdk_iscsi_auth_group **_group) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + if (group->tag == tag) { + SPDK_ERRLOG("Auth group (%d) already exists\n", tag); + return -EEXIST; + } + } + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + SPDK_ERRLOG("calloc() failed for auth group\n"); + return -ENOMEM; + } + + TAILQ_INIT(&group->secret_head); + group->tag = tag; + + TAILQ_INSERT_TAIL(&g_iscsi.auth_group_head, group, tailq); + + *_group = group; + return 0; +} + +void +iscsi_delete_auth_group(struct spdk_iscsi_auth_group *group) +{ + struct spdk_iscsi_auth_secret *_secret, *tmp; + + TAILQ_REMOVE(&g_iscsi.auth_group_head, group, tailq); + + TAILQ_FOREACH_SAFE(_secret, &group->secret_head, tailq, tmp) { + 
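/* Detach and free every CHAP secret registered in this group before freeing the group itself. */ +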
TAILQ_REMOVE(&group->secret_head, _secret, tailq); + free(_secret); + } + free(group); +} + +struct spdk_iscsi_auth_group * +iscsi_find_auth_group_by_tag(int32_t tag) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + if (group->tag == tag) { + return group; + } + } + + return NULL; +} + +static void +iscsi_auth_groups_destroy(void) +{ + struct spdk_iscsi_auth_group *group, *tmp; + + TAILQ_FOREACH_SAFE(group, &g_iscsi.auth_group_head, tailq, tmp) { + iscsi_delete_auth_group(group); + } +} + +static int +iscsi_parse_auth_group(struct spdk_conf_section *sp) +{ + int rc; + int i; + int tag; + const char *val, *user, *secret, *muser, *msecret; + struct spdk_iscsi_auth_group *group = NULL; + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + tag = spdk_conf_section_get_num(sp); + + rc = iscsi_add_auth_group(tag, &group); + if (rc != 0) { + SPDK_ERRLOG("Failed to add auth group\n"); + return rc; + } + + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "Auth", i); + if (val == NULL) { + break; + } + + user = spdk_conf_section_get_nmval(sp, "Auth", i, 0); + secret = spdk_conf_section_get_nmval(sp, "Auth", i, 1); + muser = spdk_conf_section_get_nmval(sp, "Auth", i, 2); + msecret = spdk_conf_section_get_nmval(sp, "Auth", i, 3); + + rc = iscsi_auth_group_add_secret(group, user, secret, muser, msecret); + if (rc != 0) { + SPDK_ERRLOG("Failed to add secret to auth group\n"); + iscsi_delete_auth_group(group); + return rc; + } + } + + return 0; +} + +static int +iscsi_parse_auth_info(void) +{ + struct spdk_conf *config; + struct spdk_conf_section *sp; + int rc; + + config = spdk_conf_allocate(); + if (!config) { + SPDK_ERRLOG("Failed to allocate config file\n"); + return -ENOMEM; + } + + rc = spdk_conf_read(config, g_iscsi.authfile); + if (rc != 0) { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "Failed to load auth file\n"); + spdk_conf_free(config); + return rc; + } + + sp = spdk_conf_first_section(config); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "AuthGroup")) { + if (spdk_conf_section_get_num(sp) == 0) { + SPDK_ERRLOG("Group 0 is invalid\n"); + iscsi_auth_groups_destroy(); + spdk_conf_free(config); + return -EINVAL; + } + + rc = iscsi_parse_auth_group(sp); + if (rc != 0) { + SPDK_ERRLOG("parse_auth_group() failed\n"); + iscsi_auth_groups_destroy(); + spdk_conf_free(config); + return rc; + } + } + sp = spdk_conf_next_section(sp); + } + + spdk_conf_free(config); + return 0; +} + +static struct spdk_iscsi_auth_secret * +iscsi_find_auth_secret(const char *authuser, int ag_tag) +{ + struct spdk_iscsi_auth_group *group; + struct spdk_iscsi_auth_secret *_secret; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + if (group->tag == ag_tag) { + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + if (strcmp(_secret->user, authuser) == 0) { + return _secret; + } + } + } + } + + return NULL; +} + +int +iscsi_chap_get_authinfo(struct iscsi_chap_auth *auth, const char *authuser, + int ag_tag) +{ + struct spdk_iscsi_auth_secret *_secret; + + if (authuser == NULL) { + return -EINVAL; + } + + if (auth->user[0] != '\0') { + memset(auth->user, 0, sizeof(auth->user)); + memset(auth->secret, 0, sizeof(auth->secret)); + memset(auth->muser, 0, sizeof(auth->muser)); + memset(auth->msecret, 0, sizeof(auth->msecret)); + } + + pthread_mutex_lock(&g_iscsi.mutex); + + _secret = iscsi_find_auth_secret(authuser, ag_tag); + if (_secret == NULL) { + 
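/* No secret matches this user/tag pair; drop the lock before reporting the lookup failure. */ +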
pthread_mutex_unlock(&g_iscsi.mutex); + + SPDK_ERRLOG("CHAP secret is not found: user:%s, tag:%d\n", + authuser, ag_tag); + return -ENOENT; + } + + memcpy(auth->user, _secret->user, sizeof(auth->user)); + memcpy(auth->secret, _secret->secret, sizeof(auth->secret)); + + if (_secret->muser[0] != '\0') { + memcpy(auth->muser, _secret->muser, sizeof(auth->muser)); + memcpy(auth->msecret, _secret->msecret, sizeof(auth->msecret)); + } + + pthread_mutex_unlock(&g_iscsi.mutex); + return 0; +} + +static int +iscsi_initialize_global_params(void) +{ + int rc; + + if (!g_spdk_iscsi_opts) { + rc = iscsi_parse_options(&g_spdk_iscsi_opts); + if (rc != 0) { + SPDK_ERRLOG("iscsi_parse_options() failed\n"); + return rc; + } + } + + rc = iscsi_set_global_params(g_spdk_iscsi_opts); + if (rc != 0) { + SPDK_ERRLOG("iscsi_set_global_params() failed\n"); + } + + iscsi_opts_free(g_spdk_iscsi_opts); + g_spdk_iscsi_opts = NULL; + + return rc; +} + +static void +iscsi_init_complete(int rc) +{ + spdk_iscsi_init_cb cb_fn = g_init_cb_fn; + void *cb_arg = g_init_cb_arg; + + g_init_cb_fn = NULL; + g_init_cb_arg = NULL; + + cb_fn(cb_arg, rc); +} + +static void +iscsi_parse_configuration(void) +{ + int rc; + + rc = iscsi_parse_portal_grps(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_portal_grps() failed\n"); + goto end; + } + + rc = iscsi_parse_init_grps(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_init_grps() failed\n"); + goto end; + } + + rc = iscsi_parse_tgt_nodes(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_tgt_nodes() failed\n"); + } + + if (g_iscsi.authfile != NULL) { + if (access(g_iscsi.authfile, R_OK) == 0) { + rc = iscsi_parse_auth_info(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_auth_info() failed\n"); + } + } else { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "CHAP secret file is not found in the path %s\n", + g_iscsi.authfile); + } + } + +end: + iscsi_init_complete(rc); +} + +static int +iscsi_poll_group_poll(void *ctx) +{ + struct spdk_iscsi_poll_group *group = ctx; + struct spdk_iscsi_conn *conn, *tmp; + int rc; + + if (spdk_unlikely(STAILQ_EMPTY(&group->connections))) { + return SPDK_POLLER_IDLE; + } + + rc = spdk_sock_group_poll(group->sock_group); + if (rc < 0) { + SPDK_ERRLOG("Failed to poll sock_group=%p\n", group->sock_group); + } + + STAILQ_FOREACH_SAFE(conn, &group->connections, pg_link, tmp) { + if (conn->state == ISCSI_CONN_STATE_EXITING) { + iscsi_conn_destruct(conn); + } + } + + return rc != 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + +static int +iscsi_poll_group_handle_nop(void *ctx) +{ + struct spdk_iscsi_poll_group *group = ctx; + struct spdk_iscsi_conn *conn, *tmp; + + STAILQ_FOREACH_SAFE(conn, &group->connections, pg_link, tmp) { + iscsi_conn_handle_nop(conn); + } + + return SPDK_POLLER_BUSY; +} + +static int +iscsi_poll_group_create(void *io_device, void *ctx_buf) +{ + struct spdk_iscsi_poll_group *pg = ctx_buf; + + STAILQ_INIT(&pg->connections); + pg->sock_group = spdk_sock_group_create(NULL); + assert(pg->sock_group != NULL); + + pg->poller = SPDK_POLLER_REGISTER(iscsi_poll_group_poll, pg, 0); + /* set the period to 1 sec */ + pg->nop_poller = SPDK_POLLER_REGISTER(iscsi_poll_group_handle_nop, pg, 1000000); + + return 0; +} + +static void +iscsi_poll_group_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_iscsi_poll_group *pg = ctx_buf; + struct spdk_io_channel *ch; + struct spdk_thread *thread; + + assert(pg->poller != NULL); + assert(pg->sock_group != NULL); + + spdk_sock_group_close(&pg->sock_group); + spdk_poller_unregister(&pg->poller); + spdk_poller_unregister(&pg->nop_poller); + + ch = spdk_io_channel_from_ctx(pg); + thread = spdk_io_channel_get_thread(ch); + + assert(thread == spdk_get_thread()); + + spdk_thread_exit(thread); +} + +static void +_iscsi_init_thread_done(void *ctx) +{ + struct spdk_iscsi_poll_group *pg = ctx; + + TAILQ_INSERT_TAIL(&g_iscsi.poll_group_head, pg, link); + if (--g_iscsi.refcnt == 0) { + iscsi_parse_configuration(); + } +} + +static void +_iscsi_init_thread(void *ctx) +{ + struct spdk_io_channel *ch; + struct spdk_iscsi_poll_group *pg; + + ch = spdk_get_io_channel(&g_iscsi); + pg = spdk_io_channel_get_ctx(ch); + + spdk_thread_send_msg(g_init_thread, _iscsi_init_thread_done, pg); +} + +static void +initialize_iscsi_poll_group(void) +{ + struct spdk_cpuset tmp_cpumask = {}; + uint32_t i; + char thread_name[32]; + struct spdk_thread *thread; + + spdk_io_device_register(&g_iscsi, iscsi_poll_group_create, iscsi_poll_group_destroy, + sizeof(struct spdk_iscsi_poll_group), "iscsi_tgt"); + + /* Create threads for CPU cores active for this application, and send a + * message to each thread to create a poll group on it. + */ + g_init_thread = spdk_get_thread(); + assert(g_init_thread != NULL); + assert(g_iscsi.refcnt == 0); + + SPDK_ENV_FOREACH_CORE(i) { + spdk_cpuset_zero(&tmp_cpumask); + spdk_cpuset_set_cpu(&tmp_cpumask, i, true); + snprintf(thread_name, sizeof(thread_name), "iscsi_poll_group_%u", i); + + thread = spdk_thread_create(thread_name, &tmp_cpumask); + assert(thread != NULL); + + g_iscsi.refcnt++; + spdk_thread_send_msg(thread, _iscsi_init_thread, NULL); + } +} + +static int +iscsi_parse_globals(void) +{ + int rc; + + rc = iscsi_initialize_global_params(); + if (rc != 0) { + SPDK_ERRLOG("iscsi_initialize_iscsi_global_params() failed\n"); + return rc; + } + + g_iscsi.session = calloc(1, sizeof(struct spdk_iscsi_sess *) * g_iscsi.MaxSessions); + if (!g_iscsi.session) { + SPDK_ERRLOG("calloc() failed for session array\n"); + return -1; + } + + /* + * For now, just support same number of total connections, rather + * than MaxSessions * MaxConnectionsPerSession. After we add better + * handling for low resource conditions from our various buffer + * pools, we can bump this up to support more connections. 
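+ * For example, with MaxSessions set to 128 the target accepts at most 128 connections in total.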
+ */ + g_iscsi.MaxConnections = g_iscsi.MaxSessions; + + rc = iscsi_initialize_all_pools(); + if (rc != 0) { + SPDK_ERRLOG("initialize_all_pools() failed\n"); + free(g_iscsi.session); + g_iscsi.session = NULL; + return -1; + } + + rc = initialize_iscsi_conns(); + if (rc < 0) { + SPDK_ERRLOG("initialize_iscsi_conns() failed\n"); + free(g_iscsi.session); + g_iscsi.session = NULL; + return rc; + } + + initialize_iscsi_poll_group(); + return 0; +} + +void +spdk_iscsi_init(spdk_iscsi_init_cb cb_fn, void *cb_arg) +{ + int rc; + + assert(cb_fn != NULL); + g_init_cb_fn = cb_fn; + g_init_cb_arg = cb_arg; + + rc = iscsi_parse_globals(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_globals() failed\n"); + iscsi_init_complete(-1); + } + + /* + * iscsi_parse_configuration() will be called as the callback to + * spdk_initialize_iscsi_poll_group() and will complete iSCSI + * subsystem initialization. + */ +} + +void +spdk_iscsi_fini(spdk_iscsi_fini_cb cb_fn, void *cb_arg) +{ + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + iscsi_portal_grp_close_all(); + shutdown_iscsi_conns(); +} + +static void +iscsi_fini_done(void *io_device) +{ + free(g_iscsi.authfile); + free(g_iscsi.nodebase); + + pthread_mutex_destroy(&g_iscsi.mutex); + g_fini_cb_fn(g_fini_cb_arg); +} + +static void +_iscsi_fini_dev_unreg(struct spdk_io_channel_iter *i, int status) +{ + iscsi_check_pools(); + iscsi_free_pools(); + free(g_iscsi.session); + + assert(TAILQ_EMPTY(&g_iscsi.poll_group_head)); + + iscsi_shutdown_tgt_nodes(); + iscsi_init_grps_destroy(); + iscsi_portal_grps_destroy(); + iscsi_auth_groups_destroy(); + + spdk_io_device_unregister(&g_iscsi, iscsi_fini_done); +} + +static void +_iscsi_fini_thread(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch; + struct spdk_iscsi_poll_group *pg; + + ch = spdk_io_channel_iter_get_channel(i); + pg = spdk_io_channel_get_ctx(ch); + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_REMOVE(&g_iscsi.poll_group_head, pg, link); + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_put_io_channel(ch); + + spdk_for_each_channel_continue(i, 0); +} + +void +shutdown_iscsi_conns_done(void) +{ + spdk_for_each_channel(&g_iscsi, _iscsi_fini_thread, NULL, _iscsi_fini_dev_unreg); +} + +void +spdk_iscsi_config_text(FILE *fp) +{ + iscsi_globals_config_text(fp); + iscsi_portal_grps_config_text(fp); + iscsi_init_grps_config_text(fp); + iscsi_tgt_nodes_config_text(fp); +} + +void +iscsi_opts_info_json(struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + if (g_iscsi.authfile != NULL) { + spdk_json_write_named_string(w, "auth_file", g_iscsi.authfile); + } + spdk_json_write_named_string(w, "node_base", g_iscsi.nodebase); + + spdk_json_write_named_uint32(w, "max_sessions", g_iscsi.MaxSessions); + spdk_json_write_named_uint32(w, "max_connections_per_session", + g_iscsi.MaxConnectionsPerSession); + + spdk_json_write_named_uint32(w, "max_queue_depth", g_iscsi.MaxQueueDepth); + + spdk_json_write_named_uint32(w, "default_time2wait", g_iscsi.DefaultTime2Wait); + spdk_json_write_named_uint32(w, "default_time2retain", g_iscsi.DefaultTime2Retain); + + spdk_json_write_named_uint32(w, "first_burst_length", g_iscsi.FirstBurstLength); + + spdk_json_write_named_bool(w, "immediate_data", g_iscsi.ImmediateData); + + spdk_json_write_named_bool(w, "allow_duplicated_isid", g_iscsi.AllowDuplicateIsid); + + spdk_json_write_named_uint32(w, "error_recovery_level", g_iscsi.ErrorRecoveryLevel); + + spdk_json_write_named_int32(w, "nop_timeout", g_iscsi.timeout); + spdk_json_write_named_int32(w, "nop_in_interval", 
g_iscsi.nopininterval); + + spdk_json_write_named_bool(w, "disable_chap", g_iscsi.disable_chap); + spdk_json_write_named_bool(w, "require_chap", g_iscsi.require_chap); + spdk_json_write_named_bool(w, "mutual_chap", g_iscsi.mutual_chap); + spdk_json_write_named_int32(w, "chap_group", g_iscsi.chap_group); + + spdk_json_write_object_end(w); +} + +static void +iscsi_auth_group_info_json(struct spdk_iscsi_auth_group *group, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_auth_secret *_secret; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "tag", group->tag); + + spdk_json_write_named_array_begin(w, "secrets"); + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "user", _secret->user); + spdk_json_write_named_string(w, "secret", _secret->secret); + + if (_secret->muser[0] != '\0') { + spdk_json_write_named_string(w, "muser", _secret->muser); + spdk_json_write_named_string(w, "msecret", _secret->msecret); + } + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); +} + +static void +iscsi_auth_group_config_json(struct spdk_iscsi_auth_group *group, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "iscsi_create_auth_group"); + + spdk_json_write_name(w, "params"); + iscsi_auth_group_info_json(group, w); + + spdk_json_write_object_end(w); +} + +void +iscsi_auth_groups_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + iscsi_auth_group_info_json(group, w); + } +} + +static void +iscsi_auth_groups_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + iscsi_auth_group_config_json(group, w); + } +} + +static void +iscsi_opts_config_json(struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "iscsi_set_options"); + + spdk_json_write_name(w, "params"); + iscsi_opts_info_json(w); + + spdk_json_write_object_end(w); +} + +void +spdk_iscsi_config_json(struct spdk_json_write_ctx *w) +{ + spdk_json_write_array_begin(w); + iscsi_opts_config_json(w); + iscsi_portal_grps_config_json(w); + iscsi_init_grps_config_json(w); + iscsi_tgt_nodes_config_json(w); + iscsi_auth_groups_config_json(w); + spdk_json_write_array_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("iscsi", SPDK_LOG_ISCSI) diff --git a/src/spdk/lib/iscsi/md5.c b/src/spdk/lib/iscsi/md5.c new file mode 100644 index 000000000..c316ac354 --- /dev/null +++ b/src/spdk/lib/iscsi/md5.c @@ -0,0 +1,75 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <openssl/md5.h> + +#include "iscsi/md5.h" + +int md5init(struct spdk_md5ctx *md5ctx) +{ + int rc; + + if (md5ctx == NULL) { + return -1; + } + rc = MD5_Init(&md5ctx->md5ctx); + return rc; +} + +int md5final(void *md5, struct spdk_md5ctx *md5ctx) +{ + int rc; + + if (md5ctx == NULL || md5 == NULL) { + return -1; + } + rc = MD5_Final(md5, &md5ctx->md5ctx); + return rc; +} + +int md5update(struct spdk_md5ctx *md5ctx, const void *data, size_t len) +{ + int rc; + + if (md5ctx == NULL) { + return -1; + } + if (data == NULL || len == 0) { + return 0; + } + rc = MD5_Update(&md5ctx->md5ctx, data, len); + return rc; +} diff --git a/src/spdk/lib/iscsi/md5.h b/src/spdk/lib/iscsi/md5.h new file mode 100644 index 000000000..d6fc4c1ff --- /dev/null +++ b/src/spdk/lib/iscsi/md5.h @@ -0,0 +1,52 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_MD5_H +#define SPDK_MD5_H + +#include "spdk/stdinc.h" + +#include <openssl/md5.h> + +#define SPDK_MD5DIGEST_LEN MD5_DIGEST_LENGTH + +struct spdk_md5ctx { + MD5_CTX md5ctx; +}; + +int md5init(struct spdk_md5ctx *md5ctx); +int md5final(void *md5, struct spdk_md5ctx *md5ctx); +int md5update(struct spdk_md5ctx *md5ctx, const void *data, size_t len); + +#endif /* SPDK_MD5_H */ diff --git a/src/spdk/lib/iscsi/param.c b/src/spdk/lib/iscsi/param.c new file mode 100644 index 000000000..18f579359 --- /dev/null +++ b/src/spdk/lib/iscsi/param.c @@ -0,0 +1,1216 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/string.h" +#include "iscsi/iscsi.h" +#include "iscsi/param.h" +#include "iscsi/conn.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#define MAX_TMPBUF 1024 + +/* whose value may be bigger than 255 */ +static const char *non_simple_value_params[] = { + "CHAP_C", + "CHAP_R", + NULL, +}; + +void +iscsi_param_free(struct iscsi_param *params) +{ + struct iscsi_param *param, *next_param; + + if (params == NULL) { + return; + } + for (param = params; param != NULL; param = next_param) { + next_param = param->next; + if (param->list) { + free(param->list); + } + free(param->val); + free(param->key); + free(param); + } +} + +static int +iscsi_find_key_in_array(const char *key, const char *array[]) +{ + int i; + + for (i = 0; array[i] != NULL; i++) { + if (strcasecmp(key, array[i]) == 0) { + return 1; + } + } + return 0; +} + +struct iscsi_param * +iscsi_param_find(struct iscsi_param *params, const char *key) +{ + struct iscsi_param *param; + + if (params == NULL || key == NULL) { + return NULL; + } + for (param = params; param != NULL; param = param->next) { + if (param->key != NULL && param->key[0] == key[0] + && strcasecmp(param->key, key) == 0) { + return param; + } + } + return NULL; +} + +int +iscsi_param_del(struct iscsi_param **params, const char *key) +{ + struct iscsi_param *param, *prev_param = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "del %s\n", key); + if (params == NULL || key == NULL) { + return 0; + } + for (param = *params; param != NULL; param = param->next) { + if (param->key != NULL && param->key[0] == key[0] + && strcasecmp(param->key, key) == 0) { + if (prev_param != NULL) { + prev_param->next = param->next; + } else { + *params = param->next; + } + param->next = NULL; + iscsi_param_free(param); + return 0; + } + prev_param = param; + } + return -1; +} + +int +iscsi_param_add(struct iscsi_param **params, const char *key, + const char *val, const char *list, int type) +{ + struct iscsi_param *param, *last_param; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add %s=%s, list=[%s], type=%d\n", + key, val, list, type); + if (key == NULL) { + return -1; + } + + param = iscsi_param_find(*params, key); + if (param != NULL) { + iscsi_param_del(params, key); + } + + param = calloc(1, sizeof(*param)); + if (!param) { + SPDK_ERRLOG("calloc() failed for parameter\n"); + return -ENOMEM; + } + + param->next = NULL; + param->key = xstrdup(key); + param->val = xstrdup(val); + param->list = xstrdup(list); + param->type = type; + + last_param = *params; + if (last_param != NULL) { + while (last_param->next != NULL) { + last_param = last_param->next; + } + last_param->next = param; + } else { + *params = param; + } + + return 0; +} + +int +iscsi_param_set(struct iscsi_param *params, const char *key, + const char *val) +{ + struct iscsi_param *param; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set %s=%s\n", key, val); + param = iscsi_param_find(params, key); + if (param == NULL) { + SPDK_ERRLOG("no key %s\n", key); + return -1; + } + + free(param->val); + + param->val = xstrdup(val); + + return 0; +} + +int +iscsi_param_set_int(struct iscsi_param *params, const char *key, uint32_t val) +{ + char buf[MAX_TMPBUF]; + struct iscsi_param *param; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set %s=%d\n", key, val); + param = iscsi_param_find(params, key); + if (param == NULL) { + SPDK_ERRLOG("no key %s\n", key); + return -1; + } + + free(param->val); + snprintf(buf, sizeof buf, "%d", val); + + param->val = strdup(buf); + + return 0; +} + +/** + * Parse a single 
KEY=VAL pair + * + * data = "KEY=VAL<NUL>" + */ +static int +iscsi_parse_param(struct iscsi_param **params, const uint8_t *data, uint32_t data_len) +{ + int rc; + uint8_t *key_copy, *val_copy; + const uint8_t *key_end; + int key_len, val_len; + int max_len; + + data_len = strnlen(data, data_len); + /* No such thing as strnchr so use memchr instead. */ + key_end = memchr(data, '=', data_len); + if (!key_end) { + SPDK_ERRLOG("'=' not found\n"); + return -1; + } + + key_len = key_end - data; + if (key_len == 0) { + SPDK_ERRLOG("Empty key\n"); + return -1; + } + /* + * RFC 7143 6.1 + */ + if (key_len > ISCSI_TEXT_MAX_KEY_LEN) { + SPDK_ERRLOG("Key name length is bigger than 63\n"); + return -1; + } + + key_copy = malloc(key_len + 1); + if (!key_copy) { + SPDK_ERRLOG("malloc() failed for key_copy\n"); + return -ENOMEM; + } + + memcpy(key_copy, data, key_len); + key_copy[key_len] = '\0'; + /* check whether this key is duplicated */ + if (NULL != iscsi_param_find(*params, key_copy)) { + SPDK_ERRLOG("Duplicated Key %s\n", key_copy); + free(key_copy); + return -1; + } + + val_len = strnlen(key_end + 1, data_len - key_len - 1); + /* + * RFC 3720 5.1 + * If not otherwise specified, the maximum length of a simple-value + * (not its encoded representation) is 255 bytes, not including the delimiter + * (comma or zero byte). + */ + /* + * comma or zero is counted in, otherwise we need to iterate each parameter + * value + */ + max_len = iscsi_find_key_in_array(key_copy, non_simple_value_params) ? + ISCSI_TEXT_MAX_VAL_LEN : ISCSI_TEXT_MAX_SIMPLE_VAL_LEN; + if (val_len > max_len) { + SPDK_ERRLOG("Overflow Val %d\n", val_len); + free(key_copy); + return -1; + } + + val_copy = calloc(1, val_len + 1); + if (val_copy == NULL) { + SPDK_ERRLOG("Could not allocate value string\n"); + free(key_copy); + return -1; + } + + memcpy(val_copy, key_end + 1, val_len); + + rc = iscsi_param_add(params, key_copy, val_copy, NULL, 0); + free(val_copy); + free(key_copy); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_add() failed\n"); + return -1; + } + + /* return number of bytes consumed + * +1 for '=' and +1 for NUL + */ + return key_len + 1 + val_len + 1; +} + +/** + * Parse a sequence of KEY=VAL pairs. + * + * \param data "KEY=VAL<NUL>KEY=VAL<NUL>..." + * \param len length of data in bytes + */ +int +iscsi_parse_params(struct iscsi_param **params, const uint8_t *data, + int len, bool cbit_enabled, char **partial_parameter) +{ + int rc, offset = 0; + char *p; + int i; + + /* strip the partial text parameters if previous PDU have C enabled */ + if (partial_parameter && *partial_parameter) { + for (i = 0; i < len && data[i] != '\0'; i++) { + ; + } + p = spdk_sprintf_alloc("%s%s", *partial_parameter, (const char *)data); + if (!p) { + return -1; + } + rc = iscsi_parse_param(params, p, i + strlen(*partial_parameter)); + free(p); + if (rc < 0) { + return -1; + } + free(*partial_parameter); + *partial_parameter = NULL; + + data = data + i + 1; + len = len - (i + 1); + } + + /* strip the partial text parameters if C bit is enabled */ + if (cbit_enabled) { + if (partial_parameter == NULL) { + SPDK_ERRLOG("C bit set but no partial parameters provided\n"); + return -1; + } + + /* + * reverse iterate the string from the tail not including '\0' + */ + for (i = len - 1; data[i] != '\0' && i > 0; i--) { + ; + } + if (i != 0) { + /* We found a NULL character - don't copy it into the + * partial parameter. 
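+ * Advance past it so the partial copy starts at the beginning of the trailing key.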
+ */ + i++; + } + + *partial_parameter = calloc(1, len - i + 1); + if (*partial_parameter == NULL) { + SPDK_ERRLOG("could not allocate partial parameter\n"); + return -1; + } + memcpy(*partial_parameter, &data[i], len - i); + if (i == 0) { + /* No full parameters to parse - so return now. */ + return 0; + } else { + len = i - 1; + } + } + + while (offset < len && data[offset] != '\0') { + rc = iscsi_parse_param(params, data + offset, len - offset); + if (rc < 0) { + return -1; + } + offset += rc; + } + return 0; +} + +char * +iscsi_param_get_val(struct iscsi_param *params, const char *key) +{ + struct iscsi_param *param; + + param = iscsi_param_find(params, key); + if (param == NULL) { + return NULL; + } + return param->val; +} + +int +iscsi_param_eq_val(struct iscsi_param *params, const char *key, + const char *val) +{ + struct iscsi_param *param; + + param = iscsi_param_find(params, key); + if (param == NULL) { + return 0; + } + if (strcasecmp(param->val, val) == 0) { + return 1; + } + return 0; +} + +struct iscsi_param_table { + const char *key; + const char *val; + const char *list; + int type; +}; + +static const struct iscsi_param_table conn_param_table[] = { + { "HeaderDigest", "None", "CRC32C,None", ISPT_LIST }, + { "DataDigest", "None", "CRC32C,None", ISPT_LIST }, + { "MaxRecvDataSegmentLength", "8192", "512,16777215", ISPT_NUMERICAL_DECLARATIVE }, + { "OFMarker", "No", "Yes,No", ISPT_BOOLEAN_AND }, + { "IFMarker", "No", "Yes,No", ISPT_BOOLEAN_AND }, + { "OFMarkInt", "1", "1,65535", ISPT_NUMERICAL_MIN }, + { "IFMarkInt", "1", "1,65535", ISPT_NUMERICAL_MIN }, + { "AuthMethod", "None", "CHAP,None", ISPT_LIST }, + { "CHAP_A", "5", "5", ISPT_LIST }, + { "CHAP_N", "", "", ISPT_DECLARATIVE }, + { "CHAP_R", "", "", ISPT_DECLARATIVE }, + { "CHAP_I", "", "", ISPT_DECLARATIVE }, + { "CHAP_C", "", "", ISPT_DECLARATIVE }, + { NULL, NULL, NULL, ISPT_INVALID }, +}; + +static const struct iscsi_param_table sess_param_table[] = { + { "MaxConnections", "1", "1,65535", ISPT_NUMERICAL_MIN }, +#if 0 + /* need special handling */ + { "SendTargets", "", "", ISPT_DECLARATIVE }, +#endif + { "TargetName", "", "", ISPT_DECLARATIVE }, + { "InitiatorName", "", "", ISPT_DECLARATIVE }, + { "TargetAlias", "", "", ISPT_DECLARATIVE }, + { "InitiatorAlias", "", "", ISPT_DECLARATIVE }, + { "TargetAddress", "", "", ISPT_DECLARATIVE }, + { "TargetPortalGroupTag", "1", "1,65535", ISPT_NUMERICAL_DECLARATIVE }, + { "InitialR2T", "Yes", "Yes,No", ISPT_BOOLEAN_OR }, + { "ImmediateData", "Yes", "Yes,No", ISPT_BOOLEAN_AND }, + { "MaxBurstLength", "262144", "512,16777215", ISPT_NUMERICAL_MIN }, + { "FirstBurstLength", "65536", "512,16777215", ISPT_NUMERICAL_MIN }, + { "DefaultTime2Wait", "2", "0,3600", ISPT_NUMERICAL_MAX }, + { "DefaultTime2Retain", "20", "0,3600", ISPT_NUMERICAL_MIN }, + { "MaxOutstandingR2T", "1", "1,65536", ISPT_NUMERICAL_MIN }, + { "DataPDUInOrder", "Yes", "Yes,No", ISPT_BOOLEAN_OR }, + { "DataSequenceInOrder", "Yes", "Yes,No", ISPT_BOOLEAN_OR }, + { "ErrorRecoveryLevel", "0", "0,2", ISPT_NUMERICAL_MIN }, + { "SessionType", "Normal", "Normal,Discovery", ISPT_DECLARATIVE }, + { NULL, NULL, NULL, ISPT_INVALID }, +}; + +static int +iscsi_params_init_internal(struct iscsi_param **params, + const struct iscsi_param_table *table) +{ + int rc; + int i; + struct iscsi_param *param; + + for (i = 0; table[i].key != NULL; i++) { + rc = iscsi_param_add(params, table[i].key, table[i].val, + table[i].list, table[i].type); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_add() failed\n"); + return -1; + } + param = 
iscsi_param_find(*params, table[i].key); + if (param != NULL) { + param->state_index = i; + } else { + SPDK_ERRLOG("iscsi_param_find() failed\n"); + return -1; + } + } + + return 0; +} + +int +iscsi_conn_params_init(struct iscsi_param **params) +{ + return iscsi_params_init_internal(params, &conn_param_table[0]); +} + +int +iscsi_sess_params_init(struct iscsi_param **params) +{ + return iscsi_params_init_internal(params, &sess_param_table[0]); +} + +static const char *chap_type[] = { + "CHAP_A", + "CHAP_N", + "CHAP_R", + "CHAP_I", + "CHAP_C", + NULL, +}; + +static const char *discovery_ignored_param[] = { + "MaxConnections", + "InitialR2T", + "ImmediateData", + "MaxBurstLength", + "FirstBurstLength" + "MaxOutstandingR2T", + "DataPDUInOrder", + "DataSequenceInOrder", + NULL, +}; + +static const char *multi_negot_conn_params[] = { + "MaxRecvDataSegmentLength", + NULL, +}; + +/* The following params should be declared by target */ +static const char *target_declarative_params[] = { + "TargetAlias", + "TargetAddress", + "TargetPortalGroupTag", + NULL, +}; + +/* This function is used to construct the data from the special param (e.g., + * MaxRecvDataSegmentLength) + * return: + * normal: the total len of the data + * error: -1 + */ +static int +iscsi_special_param_construction(struct spdk_iscsi_conn *conn, + struct iscsi_param *param, + bool FirstBurstLength_flag, char *data, + int alloc_len, int total) +{ + int len; + struct iscsi_param *param_first; + struct iscsi_param *param_max; + uint32_t FirstBurstLength; + uint32_t MaxBurstLength; + char *val; + + val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!val) { + SPDK_ERRLOG("malloc() failed for temporary buffer\n"); + return -ENOMEM; + } + + if (strcasecmp(param->key, "MaxRecvDataSegmentLength") == 0) { + /* + * MaxRecvDataSegmentLength is sent by both + * initiator and target, but is declarative - meaning + * each direction can have different values. + * So when MaxRecvDataSegmentLength is found in the + * the parameter set sent from the initiator, add SPDK + * iscsi target's MaxRecvDataSegmentLength value to + * the returned parameter list. 
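+ * The value declared back to the initiator is SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH.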
+ */ + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + free(val); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "returning MaxRecvDataSegmentLength=%d\n", + SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + len = snprintf((char *)data + total, alloc_len - total, + "MaxRecvDataSegmentLength=%d", + SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + total += len + 1; + } + + if (strcasecmp(param->key, "MaxBurstLength") == 0 && + !FirstBurstLength_flag) { + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + free(val); + return -1; + } + + param_first = iscsi_param_find(conn->sess->params, + "FirstBurstLength"); + if (param_first != NULL) { + FirstBurstLength = (uint32_t)strtol(param_first->val, NULL, 10); + } else { + FirstBurstLength = SPDK_ISCSI_FIRST_BURST_LENGTH; + } + param_max = iscsi_param_find(conn->sess->params, + "MaxBurstLength"); + if (param_max != NULL) { + MaxBurstLength = (uint32_t)strtol(param_max->val, NULL, 10); + } else { + MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; + } + + if (FirstBurstLength > MaxBurstLength) { + FirstBurstLength = MaxBurstLength; + if (param_first != NULL) { + free(param_first->val); + snprintf(val, ISCSI_TEXT_MAX_VAL_LEN, "%d", + FirstBurstLength); + param_first->val = xstrdup(val); + } + } + len = snprintf((char *)data + total, alloc_len - total, + "FirstBurstLength=%d", FirstBurstLength); + total += len + 1; + } + + free(val); + return total; + +} + +/** + * iscsi_construct_data_from_param: + * To construct the data which will be returned to the initiator + * return: length of the negotiated data, -1 indicates error; + */ +static int +iscsi_construct_data_from_param(struct iscsi_param *param, char *new_val, + char *data, int alloc_len, int total) +{ + int len; + + if (param->type != ISPT_DECLARATIVE && + param->type != ISPT_NUMERICAL_DECLARATIVE) { + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "negotiated %s=%s\n", + param->key, new_val); + len = snprintf((char *)data + total, alloc_len - total, "%s=%s", + param->key, new_val); + total += len + 1; + } + return total; +} + +/** + * To negotiate param with + * type = ISPT_LIST + * return: the negotiated value of the key + */ +static char * +iscsi_negotiate_param_list(int *add_param_value, + struct iscsi_param *param, + char *valid_list, char *in_val, + char *cur_val) +{ + char *val_start, *val_end; + char *in_start, *in_end; + int flag = 0; + + if (add_param_value == NULL) { + return NULL; + } + + in_start = in_val; + do { + if ((in_end = strchr(in_start, (int)',')) != NULL) { + *in_end = '\0'; + } + val_start = valid_list; + do { + if ((val_end = strchr(val_start, (int)',')) != NULL) { + *val_end = '\0'; + } + if (strcasecmp(in_start, val_start) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "match %s\n", + val_start); + flag = 1; + break; + } + if (val_end) { + *val_end = ','; + val_start = val_end + 1; + } + } while (val_end); + if (flag) { + break; + } + if (in_end) { + *in_end = ','; + in_start = in_end + 1; + } + } while (in_end); + + return flag ? 
val_start : NULL; +} + +/** + * To negotiate param with + * type = ISPT_NUMERICAL_MIN/MAX, ISPT_NUMERICAL_DECLARATIVE + * return: the negotiated value of the key + */ +static char * +iscsi_negotiate_param_numerical(int *add_param_value, + struct iscsi_param *param, + char *valid_list, char *in_val, + char *cur_val) +{ + char *valid_next; + char *new_val = NULL; + char *min_val, *max_val; + int val_i, cur_val_i; + int min_i, max_i; + + if (add_param_value == NULL) { + return NULL; + } + + val_i = (int)strtol(param->val, NULL, 10); + /* check whether the key is FirstBurstLength, if that we use in_val */ + if (strcasecmp(param->key, "FirstBurstLength") == 0) { + val_i = (int)strtol(in_val, NULL, 10); + } + + cur_val_i = (int)strtol(cur_val, NULL, 10); + valid_next = valid_list; + min_val = spdk_strsepq(&valid_next, ","); + max_val = spdk_strsepq(&valid_next, ","); + min_i = (min_val != NULL) ? (int)strtol(min_val, NULL, 10) : 0; + max_i = (max_val != NULL) ? (int)strtol(max_val, NULL, 10) : 0; + if (val_i < min_i || val_i > max_i) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "key %.64s reject\n", param->key); + new_val = NULL; + } else { + switch (param->type) { + case ISPT_NUMERICAL_MIN: + if (val_i > cur_val_i) { + val_i = cur_val_i; + } + break; + case ISPT_NUMERICAL_MAX: + if (val_i < cur_val_i) { + val_i = cur_val_i; + } + break; + default: + break; + } + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", val_i); + new_val = in_val; + } + + return new_val; +} + +/** + * To negotiate param with + * type = ISPT_BOOLEAN_OR, ISPT_BOOLEAN_AND + * return: the negotiated value of the key + */ +static char * +iscsi_negotiate_param_boolean(int *add_param_value, + struct iscsi_param *param, + char *in_val, char *cur_val, + const char *value) +{ + char *new_val = NULL; + + if (add_param_value == NULL) { + return NULL; + } + + /* Make sure the val is Yes or No */ + if (!((strcasecmp(in_val, "Yes") == 0) || + (strcasecmp(in_val, "No") == 0))) { + /* unknown value */ + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Reject"); + new_val = in_val; + *add_param_value = 1; + return new_val; + } + + if (strcasecmp(cur_val, value) == 0) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", value); + new_val = in_val; + } else { + new_val = param->val; + } + + return new_val; +} + +/** + * The entry function to handle each type of the param + * return value: the new negotiated value + */ +static char * +iscsi_negotiate_param_all(int *add_param_value, struct iscsi_param *param, + char *valid_list, char *in_val, char *cur_val) +{ + char *new_val; + switch (param->type) { + case ISPT_LIST: + new_val = iscsi_negotiate_param_list(add_param_value, + param, + valid_list, + in_val, + cur_val); + break; + + case ISPT_NUMERICAL_MIN: + case ISPT_NUMERICAL_MAX: + case ISPT_NUMERICAL_DECLARATIVE: + new_val = iscsi_negotiate_param_numerical(add_param_value, + param, + valid_list, + in_val, + cur_val); + break; + + case ISPT_BOOLEAN_OR: + new_val = iscsi_negotiate_param_boolean(add_param_value, + param, + in_val, + cur_val, + "Yes"); + break; + case ISPT_BOOLEAN_AND: + new_val = iscsi_negotiate_param_boolean(add_param_value, + param, + in_val, + cur_val, + "No"); + break; + + default: + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val); + new_val = in_val; + break; + } + + return new_val; +} + +/** + * This function is used to judge whether the param is in session's params or + * connection's params + */ +static int +iscsi_negotiate_param_init(struct spdk_iscsi_conn *conn, + struct iscsi_param **cur_param_p, + struct 
iscsi_param **params_dst_p, + struct iscsi_param *param) +{ + int index; + + *cur_param_p = iscsi_param_find(*params_dst_p, param->key); + if (*cur_param_p == NULL) { + *params_dst_p = conn->sess->params; + *cur_param_p = iscsi_param_find(*params_dst_p, param->key); + if (*cur_param_p == NULL) { + if ((strncasecmp(param->key, "X-", 2) == 0) || + (strncasecmp(param->key, "X#", 2) == 0)) { + /* Extension Key */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "extension key %.64s\n", + param->key); + } else { + SPDK_ERRLOG("unknown key %.64s\n", param->key); + } + return 1; + } else { + index = (*cur_param_p)->state_index; + if (conn->sess_param_state_negotiated[index] && + !iscsi_find_key_in_array(param->key, + target_declarative_params)) { + return SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE; + } + conn->sess_param_state_negotiated[index] = true; + } + } else { + index = (*cur_param_p)->state_index; + if (conn->conn_param_state_negotiated[index] && + !iscsi_find_key_in_array(param->key, + multi_negot_conn_params)) { + return SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE; + } + conn->conn_param_state_negotiated[index] = true; + } + + return 0; +} + +int +iscsi_negotiate_params(struct spdk_iscsi_conn *conn, + struct iscsi_param **params, uint8_t *data, int alloc_len, + int data_len) +{ + struct iscsi_param *param; + struct iscsi_param *cur_param; + char *valid_list, *in_val; + char *cur_val; + char *new_val; + int discovery; + int total; + int rc; + uint32_t FirstBurstLength; + uint32_t MaxBurstLength; + bool FirstBurstLength_flag = false; + int type; + + total = data_len; + if (data_len < 0) { + assert(false); + return -EINVAL; + } + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + if (*params == NULL) { + /* no input */ + return total; + } + + /* discovery? 
*/ + discovery = 0; + cur_param = iscsi_param_find(*params, "SessionType"); + if (cur_param == NULL) { + cur_param = iscsi_param_find(conn->sess->params, "SessionType"); + if (cur_param == NULL) { + /* no session type */ + } else { + if (strcasecmp(cur_param->val, "Discovery") == 0) { + discovery = 1; + } + } + } else { + if (strcasecmp(cur_param->val, "Discovery") == 0) { + discovery = 1; + } + } + + /* for temporary store */ + valid_list = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!valid_list) { + SPDK_ERRLOG("malloc() failed for valid_list\n"); + return -ENOMEM; + } + + in_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!in_val) { + SPDK_ERRLOG("malloc() failed for in_val\n"); + free(valid_list); + return -ENOMEM; + } + + cur_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!cur_val) { + SPDK_ERRLOG("malloc() failed for cur_val\n"); + free(valid_list); + free(in_val); + return -ENOMEM; + } + + /* To adjust the location of FirstBurstLength location and put it to + * the end, then we can always firstly determine the MaxBurstLength + */ + param = iscsi_param_find(*params, "MaxBurstLength"); + if (param != NULL) { + param = iscsi_param_find(*params, "FirstBurstLength"); + + /* check the existence of FirstBurstLength */ + if (param != NULL) { + FirstBurstLength_flag = true; + if (param->next != NULL) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val); + type = param->type; + iscsi_param_add(params, "FirstBurstLength", + in_val, NULL, type); + } + } + } + + for (param = *params; param != NULL; param = param->next) { + struct iscsi_param *params_dst = conn->params; + int add_param_value = 0; + new_val = NULL; + param->type = ISPT_INVALID; + + /* sendtargets is special */ + if (strcasecmp(param->key, "SendTargets") == 0) { + continue; + } + /* CHAP keys */ + if (iscsi_find_key_in_array(param->key, chap_type)) { + continue; + } + + /* 12.2, 12.10, 12.11, 12.13, 12.14, 12.17, 12.18, 12.19 */ + if (discovery && + iscsi_find_key_in_array(param->key, discovery_ignored_param)) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Irrelevant"); + new_val = in_val; + add_param_value = 1; + } else { + rc = iscsi_negotiate_param_init(conn, + &cur_param, + ¶ms_dst, + param); + if (rc < 0) { + free(valid_list); + free(in_val); + free(cur_val); + return rc; + } else if (rc > 0) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "NotUnderstood"); + new_val = in_val; + add_param_value = 1; + } else { + snprintf(valid_list, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", cur_param->list); + snprintf(cur_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", cur_param->val); + param->type = cur_param->type; + } + } + + if (param->type > 0) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val); + + /* "NotUnderstood" value shouldn't be assigned to "Understood" key */ + if (strcasecmp(in_val, "NotUnderstood") == 0) { + free(in_val); + free(valid_list); + free(cur_val); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + if (strcasecmp(param->key, "FirstBurstLength") == 0) { + FirstBurstLength = (uint32_t)strtol(param->val, NULL, + 10); + new_val = iscsi_param_get_val(conn->sess->params, + "MaxBurstLength"); + if (new_val != NULL) { + MaxBurstLength = (uint32_t) strtol(new_val, NULL, + 10); + } else { + MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; + } + if (FirstBurstLength < SPDK_ISCSI_MAX_FIRST_BURST_LENGTH && + FirstBurstLength > MaxBurstLength) { + FirstBurstLength = MaxBurstLength; + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", + FirstBurstLength); + } + } + + /* prevent target's declarative params from 
being changed by initiator */ + if (iscsi_find_key_in_array(param->key, target_declarative_params)) { + add_param_value = 1; + } + + new_val = iscsi_negotiate_param_all(&add_param_value, + param, + valid_list, + in_val, + cur_val); + } + + /* check the negotiated value of the key */ + if (new_val != NULL) { + /* add_param_value = 0 means updating the value of + * existed key in the connection's parameters + */ + if (add_param_value == 0) { + iscsi_param_set(params_dst, param->key, new_val); + } + total = iscsi_construct_data_from_param(param, + new_val, + data, + alloc_len, + total); + if (total < 0) { + goto final_return; + } + + total = iscsi_special_param_construction(conn, + param, + FirstBurstLength_flag, + data, + alloc_len, + total); + if (total < 0) { + goto final_return; + } + } else { + total = -1; + break; + } + } + +final_return: + free(valid_list); + free(in_val); + free(cur_val); + + return total; +} + +int +iscsi_copy_param2var(struct spdk_iscsi_conn *conn) +{ + const char *val; + + val = iscsi_param_get_val(conn->params, "MaxRecvDataSegmentLength"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxRecvDataSegmentLength failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "copy MaxRecvDataSegmentLength=%s\n", val); + conn->MaxRecvDataSegmentLength = (int)strtol(val, NULL, 10); + if (conn->MaxRecvDataSegmentLength > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { + conn->MaxRecvDataSegmentLength = SPDK_BDEV_LARGE_BUF_MAX_SIZE; + } + + val = iscsi_param_get_val(conn->params, "HeaderDigest"); + if (val == NULL) { + SPDK_ERRLOG("Getval HeaderDigest failed\n"); + return -1; + } + if (strcasecmp(val, "CRC32C") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set HeaderDigest=1\n"); + conn->header_digest = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set HeaderDigest=0\n"); + conn->header_digest = 0; + } + val = iscsi_param_get_val(conn->params, "DataDigest"); + if (val == NULL) { + SPDK_ERRLOG("Getval DataDigest failed\n"); + return -1; + } + if (strcasecmp(val, "CRC32C") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set DataDigest=1\n"); + conn->data_digest = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set DataDigest=0\n"); + conn->data_digest = 0; + } + + val = iscsi_param_get_val(conn->sess->params, "MaxConnections"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxConnections failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxConnections=%s\n", val); + conn->sess->MaxConnections = (uint32_t) strtol(val, NULL, 10); + val = iscsi_param_get_val(conn->sess->params, "MaxOutstandingR2T"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxOutstandingR2T failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxOutstandingR2T=%s\n", val); + conn->sess->MaxOutstandingR2T = (uint32_t) strtol(val, NULL, 10); + val = iscsi_param_get_val(conn->sess->params, "FirstBurstLength"); + if (val == NULL) { + SPDK_ERRLOG("Getval FirstBurstLength failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy FirstBurstLength=%s\n", val); + conn->sess->FirstBurstLength = (uint32_t) strtol(val, NULL, 10); + val = iscsi_param_get_val(conn->sess->params, "MaxBurstLength"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxBurstLength failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxBurstLength=%s\n", val); + conn->sess->MaxBurstLength = (uint32_t) strtol(val, NULL, 10); + val = iscsi_param_get_val(conn->sess->params, "InitialR2T"); + if (val == NULL) { + SPDK_ERRLOG("Getval InitialR2T failed\n"); + return -1; + } + if (strcasecmp(val, "Yes") == 0) { + 
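/* InitialR2T negotiated to Yes: unsolicited data (other than immediate data) is not allowed. */ +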
SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set InitialR2T=1\n"); + conn->sess->InitialR2T = true; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set InitialR2T=0\n"); + conn->sess->InitialR2T = false; + } + val = iscsi_param_get_val(conn->sess->params, "ImmediateData"); + if (val == NULL) { + SPDK_ERRLOG("Getval ImmediateData failed\n"); + return -1; + } + if (strcasecmp(val, "Yes") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set ImmediateData=1\n"); + conn->sess->ImmediateData = true; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set ImmediateData=0\n"); + conn->sess->ImmediateData = false; + } + return 0; +} diff --git a/src/spdk/lib/iscsi/param.h b/src/spdk/lib/iscsi/param.h new file mode 100644 index 000000000..ce194c514 --- /dev/null +++ b/src/spdk/lib/iscsi/param.h @@ -0,0 +1,94 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_ISCSI_PARAM_H +#define SPDK_ISCSI_PARAM_H + +#include "spdk/stdinc.h" + +struct spdk_iscsi_conn; + +enum iscsi_param_type { + ISPT_INVALID = -1, + ISPT_NOTSPECIFIED = 0, + ISPT_LIST, + ISPT_NUMERICAL_MIN, + ISPT_NUMERICAL_MAX, + ISPT_NUMERICAL_DECLARATIVE, + ISPT_DECLARATIVE, + ISPT_BOOLEAN_OR, + ISPT_BOOLEAN_AND, +}; + +struct iscsi_param { + struct iscsi_param *next; + char *key; + char *val; + char *list; + int type; + int state_index; +}; + +void +iscsi_param_free(struct iscsi_param *params); +struct iscsi_param * +iscsi_param_find(struct iscsi_param *params, const char *key); +int +iscsi_param_del(struct iscsi_param **params, const char *key); +int +iscsi_param_add(struct iscsi_param **params, const char *key, + const char *val, const char *list, int type); +int +iscsi_param_set(struct iscsi_param *params, const char *key, + const char *val); +int +iscsi_param_set_int(struct iscsi_param *params, const char *key, uint32_t val); +int +iscsi_parse_params(struct iscsi_param **params, const uint8_t *data, + int len, bool cbit_enabled, char **partial_parameter); +char * +iscsi_param_get_val(struct iscsi_param *params, const char *key); +int +iscsi_param_eq_val(struct iscsi_param *params, const char *key, + const char *val); + +int iscsi_negotiate_params(struct spdk_iscsi_conn *conn, + struct iscsi_param **params_p, uint8_t *data, + int alloc_len, int data_len); +int iscsi_copy_param2var(struct spdk_iscsi_conn *conn); + +int iscsi_conn_params_init(struct iscsi_param **params); +int iscsi_sess_params_init(struct iscsi_param **params); + +#endif /* SPDK_ISCSI_PARAM_H */ diff --git a/src/spdk/lib/iscsi/portal_grp.c b/src/spdk/lib/iscsi/portal_grp.c new file mode 100644 index 000000000..986562ad7 --- /dev/null +++ b/src/spdk/lib/iscsi/portal_grp.c @@ -0,0 +1,655 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/sock.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" +#include "iscsi/portal_grp.h" +#include "iscsi/tgt_node.h" + +#define PORTNUMSTRLEN 32 +#define ACCEPT_TIMEOUT_US 1000 /* 1ms */ + +static int +iscsi_portal_accept(void *arg) +{ + struct spdk_iscsi_portal *portal = arg; + struct spdk_sock *sock; + int rc; + int count = 0; + + if (portal->sock == NULL) { + return -1; + } + + while (1) { + sock = spdk_sock_accept(portal->sock); + if (sock != NULL) { + rc = iscsi_conn_construct(portal, sock); + if (rc < 0) { + spdk_sock_close(&sock); + SPDK_ERRLOG("spdk_iscsi_connection_construct() failed\n"); + break; + } + count++; + } else { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + SPDK_ERRLOG("accept error(%d): %s\n", errno, spdk_strerror(errno)); + } + break; + } + } + + return count; +} + +static struct spdk_iscsi_portal * +iscsi_portal_find_by_addr(const char *host, const char *port) +{ + struct spdk_iscsi_portal *p; + + TAILQ_FOREACH(p, &g_iscsi.portal_head, g_tailq) { + if (!strcmp(p->host, host) && !strcmp(p->port, port)) { + return p; + } + } + + return NULL; +} + +/* Assumes caller allocated host and port strings on the heap */ +struct spdk_iscsi_portal * +iscsi_portal_create(const char *host, const char *port) +{ + struct spdk_iscsi_portal *p = NULL, *tmp; + + assert(host != NULL); + assert(port != NULL); + + if (strlen(host) > MAX_PORTAL_ADDR || strlen(port) > MAX_PORTAL_PORT) { + return NULL; + } + + p = calloc(1, sizeof(*p)); + if (!p) { + SPDK_ERRLOG("calloc() failed for portal\n"); + return NULL; + } + + /* check and overwrite abbreviation of wildcard */ + if (strcasecmp(host, "[*]") == 0) { + SPDK_WARNLOG("Please use \"[::]\" as IPv6 wildcard\n"); + SPDK_WARNLOG("Convert \"[*]\" to \"[::]\" automatically\n"); + SPDK_WARNLOG("(Use of \"[*]\" will be deprecated in a future release)"); + snprintf(p->host, sizeof(p->host), "[::]"); + } else if (strcasecmp(host, "*") == 0) { + SPDK_WARNLOG("Please use \"0.0.0.0\" as IPv4 wildcard\n"); + SPDK_WARNLOG("Convert \"*\" to \"0.0.0.0\" automatically\n"); + SPDK_WARNLOG("(Use of \"[*]\" will be deprecated in a future release)"); + snprintf(p->host, sizeof(p->host), "0.0.0.0"); + } else { + memcpy(p->host, host, strlen(host)); + } + + memcpy(p->port, port, strlen(port)); + + p->sock = NULL; + p->group = NULL; /* set at a later time by caller */ + p->acceptor_poller = NULL; + + pthread_mutex_lock(&g_iscsi.mutex); + tmp = iscsi_portal_find_by_addr(host, port); + if (tmp != NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + SPDK_ERRLOG("portal (%s, %s) already exists\n", host, port); + goto error_out; + } + + TAILQ_INSERT_TAIL(&g_iscsi.portal_head, p, g_tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + + return p; + +error_out: + free(p); + + return NULL; +} + +void +iscsi_portal_destroy(struct spdk_iscsi_portal *p) +{ + assert(p != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_destroy\n"); + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_REMOVE(&g_iscsi.portal_head, p, g_tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + + free(p); + +} + +static int +iscsi_portal_open(struct spdk_iscsi_portal *p) +{ + struct spdk_sock *sock; + int port; + + if (p->sock != NULL) { + SPDK_ERRLOG("portal (%s, %s) is already opened\n", + p->host, p->port); + return -1; + } + + port = (int)strtol(p->port, NULL, 0); + sock = spdk_sock_listen(p->host, port, NULL); + if (sock == NULL) { + SPDK_ERRLOG("listen error 
%.64s.%d\n", p->host, port); + return -1; + } + + p->sock = sock; + + /* + * When the portal is created by config file, incoming connection + * requests for the socket are pended to accept until reactors start. + * However the gap between listen() and accept() will be slight and + * the requests will be queued by the nonzero backlog of the socket + * or resend by TCP. + */ + p->acceptor_poller = SPDK_POLLER_REGISTER(iscsi_portal_accept, p, ACCEPT_TIMEOUT_US); + + return 0; +} + +static void +iscsi_portal_close(struct spdk_iscsi_portal *p) +{ + if (p->sock) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "close portal (%s, %s)\n", + p->host, p->port); + spdk_poller_unregister(&p->acceptor_poller); + spdk_sock_close(&p->sock); + } +} + +static int +iscsi_parse_portal(const char *portalstring, struct spdk_iscsi_portal **ip) +{ + char *host = NULL, *port = NULL; + int len, rc = -1; + const char *p; + + if (portalstring == NULL) { + SPDK_ERRLOG("portal error\n"); + goto error_out; + } + + /* IP address */ + if (portalstring[0] == '[') { + /* IPv6 */ + p = strchr(portalstring + 1, ']'); + if (p == NULL) { + SPDK_ERRLOG("portal error\n"); + goto error_out; + } + p++; + } else { + /* IPv4 */ + p = strchr(portalstring, ':'); + if (p == NULL) { + p = portalstring + strlen(portalstring); + } + } + + len = p - portalstring; + host = malloc(len + 1); + if (host == NULL) { + SPDK_ERRLOG("malloc() failed for host\n"); + goto error_out; + } + memcpy(host, portalstring, len); + host[len] = '\0'; + + /* Port number (IPv4 and IPv6 are the same) */ + if (p[0] == '\0') { + port = malloc(PORTNUMSTRLEN); + if (!port) { + SPDK_ERRLOG("malloc() failed for port\n"); + goto error_out; + } + snprintf(port, PORTNUMSTRLEN, "%d", DEFAULT_PORT); + } else { + p++; + len = strlen(p); + port = malloc(len + 1); + if (port == NULL) { + SPDK_ERRLOG("malloc() failed for port\n"); + goto error_out; + } + memcpy(port, p, len); + port[len] = '\0'; + } + + *ip = iscsi_portal_create(host, port); + if (!*ip) { + goto error_out; + } + + rc = 0; +error_out: + free(host); + free(port); + + return rc; +} + +struct spdk_iscsi_portal_grp * +iscsi_portal_grp_create(int tag) +{ + struct spdk_iscsi_portal_grp *pg = malloc(sizeof(*pg)); + + if (!pg) { + SPDK_ERRLOG("malloc() failed for portal group\n"); + return NULL; + } + + pg->ref = 0; + pg->tag = tag; + + pthread_mutex_lock(&g_iscsi.mutex); + pg->disable_chap = g_iscsi.disable_chap; + pg->require_chap = g_iscsi.require_chap; + pg->mutual_chap = g_iscsi.mutual_chap; + pg->chap_group = g_iscsi.chap_group; + pthread_mutex_unlock(&g_iscsi.mutex); + + TAILQ_INIT(&pg->head); + + return pg; +} + +void +iscsi_portal_grp_destroy(struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_portal *p; + + assert(pg != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_grp_destroy\n"); + while (!TAILQ_EMPTY(&pg->head)) { + p = TAILQ_FIRST(&pg->head); + TAILQ_REMOVE(&pg->head, p, per_pg_tailq); + iscsi_portal_destroy(p); + } + free(pg); +} + +int +iscsi_portal_grp_register(struct spdk_iscsi_portal_grp *pg) +{ + int rc = -1; + struct spdk_iscsi_portal_grp *tmp; + + assert(pg != NULL); + + pthread_mutex_lock(&g_iscsi.mutex); + tmp = iscsi_portal_grp_find_by_tag(pg->tag); + if (tmp == NULL) { + TAILQ_INSERT_TAIL(&g_iscsi.pg_head, pg, tailq); + rc = 0; + } + pthread_mutex_unlock(&g_iscsi.mutex); + return rc; +} + +void +iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg, + struct spdk_iscsi_portal *p) +{ + assert(pg != NULL); + assert(p != NULL); + + p->group = pg; + TAILQ_INSERT_TAIL(&pg->head, p, 
per_pg_tailq); +} + +int +iscsi_portal_grp_set_chap_params(struct spdk_iscsi_portal_grp *pg, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group) +{ + if (!iscsi_check_chap_params(disable_chap, require_chap, + mutual_chap, chap_group)) { + return -EINVAL; + } + + pg->disable_chap = disable_chap; + pg->require_chap = require_chap; + pg->mutual_chap = mutual_chap; + pg->chap_group = chap_group; + + return 0; +} + +static int +iscsi_parse_portal_grp(struct spdk_conf_section *sp) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_portal *p; + const char *val; + char *label, *portal; + int i = 0, rc = 0; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add portal group (from config file) %d\n", + spdk_conf_section_get_num(sp)); + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + pg = iscsi_portal_grp_create(spdk_conf_section_get_num(sp)); + if (!pg) { + SPDK_ERRLOG("portal group malloc error (%s)\n", spdk_conf_section_get_name(sp)); + return -1; + } + + for (i = 0; ; i++) { + label = spdk_conf_section_get_nmval(sp, "Portal", i, 0); + portal = spdk_conf_section_get_nmval(sp, "Portal", i, 1); + if (label == NULL || portal == NULL) { + break; + } + + rc = iscsi_parse_portal(portal, &p); + if (rc < 0) { + SPDK_ERRLOG("parse portal error (%s)\n", portal); + goto error; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "RIndex=%d, Host=%s, Port=%s, Tag=%d\n", + i, p->host, p->port, spdk_conf_section_get_num(sp)); + + iscsi_portal_grp_add_portal(pg, p); + } + + rc = iscsi_portal_grp_open(pg); + if (rc != 0) { + SPDK_ERRLOG("portal_grp_open failed\n"); + goto error; + } + + /* Add portal group to the end of the pg list */ + rc = iscsi_portal_grp_register(pg); + if (rc != 0) { + SPDK_ERRLOG("register portal failed\n"); + goto error; + } + + return 0; + +error: + iscsi_portal_grp_release(pg); + return -1; +} + +struct spdk_iscsi_portal_grp * +iscsi_portal_grp_find_by_tag(int tag) +{ + struct spdk_iscsi_portal_grp *pg; + + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + if (pg->tag == tag) { + return pg; + } + } + + return NULL; +} + +int +iscsi_parse_portal_grps(void) +{ + int rc = 0; + struct spdk_conf_section *sp; + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "PortalGroup")) { + if (spdk_conf_section_get_num(sp) == 0) { + SPDK_ERRLOG("Group 0 is invalid\n"); + return -1; + } + + /* Build portal group from cfg section PortalGroup */ + rc = iscsi_parse_portal_grp(sp); + if (rc < 0) { + SPDK_ERRLOG("parse_portal_group() failed\n"); + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +void +iscsi_portal_grps_destroy(void) +{ + struct spdk_iscsi_portal_grp *pg; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_grps_destroy\n"); + pthread_mutex_lock(&g_iscsi.mutex); + while (!TAILQ_EMPTY(&g_iscsi.pg_head)) { + pg = TAILQ_FIRST(&g_iscsi.pg_head); + TAILQ_REMOVE(&g_iscsi.pg_head, pg, tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + iscsi_portal_grp_destroy(pg); + pthread_mutex_lock(&g_iscsi.mutex); + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +int +iscsi_portal_grp_open(struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_portal *p; + int rc; + + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + rc = iscsi_portal_open(p); + if (rc < 0) { + return rc; + } + } + return 0; +} + +static void +iscsi_portal_grp_close(struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_portal *p; + + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + 
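/*
 * Editor's note (illustrative sketch, not part of the original source):
 * a hypothetical legacy config section consumed by
 * iscsi_parse_portal_grp() above could look like:
 *
 *   [PortalGroup1]
 *     Portal DA1 192.168.0.10:3260
 *     Portal DA2 [2001:db8::10]:3260
 *
 * iscsi_parse_portal() accepts "host", "host:port", or "[v6-host]:port";
 * when the port is omitted, DEFAULT_PORT is used (3260, per the template
 * comment further below).
 */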
iscsi_portal_close(p); + } +} + +void +iscsi_portal_grp_close_all(void) +{ + struct spdk_iscsi_portal_grp *pg; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_grp_close_all\n"); + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + iscsi_portal_grp_close(pg); + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +struct spdk_iscsi_portal_grp * +iscsi_portal_grp_unregister(int tag) +{ + struct spdk_iscsi_portal_grp *pg; + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + if (pg->tag == tag) { + TAILQ_REMOVE(&g_iscsi.pg_head, pg, tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + return pg; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return NULL; +} + +void +iscsi_portal_grp_release(struct spdk_iscsi_portal_grp *pg) +{ + iscsi_portal_grp_close(pg); + iscsi_portal_grp_destroy(pg); +} + +static const char *portal_group_section = \ + "\n" + "# Users must change the PortalGroup section(s) to match the IP addresses\n" + "# for their environment.\n" + "# PortalGroup sections define which network portals the iSCSI target\n" + "# will use to listen for incoming connections. These are also used to\n" + "# determine which targets are accessible over each portal group.\n" + "# Up to 1024 Portal directives are allowed. These define the network\n" + "# portals of the portal group. The user must specify a IP address\n" + "# for each network portal, and may optionally specify a port.\n" + "# If the port is omitted, 3260 will be used\n" + "# Syntax:\n" + "# Portal <Name> <IP address>[:<port>]\n"; + +#define PORTAL_GROUP_TMPL \ +"[PortalGroup%d]\n" \ +" Comment \"Portal%d\"\n" + +#define PORTAL_TMPL \ +" Portal DA1 %s:%s\n" + +void +iscsi_portal_grps_config_text(FILE *fp) +{ + struct spdk_iscsi_portal *p = NULL; + struct spdk_iscsi_portal_grp *pg = NULL; + + /* Create portal group section */ + fprintf(fp, "%s", portal_group_section); + + /* Dump portal groups */ + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + if (NULL == pg) { continue; } + fprintf(fp, PORTAL_GROUP_TMPL, pg->tag, pg->tag); + /* Dump portals */ + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + if (NULL == p) { continue; } + fprintf(fp, PORTAL_TMPL, p->host, p->port); + } + } +} + +static void +iscsi_portal_grp_info_json(struct spdk_iscsi_portal_grp *pg, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_portal *portal; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "tag", pg->tag); + + spdk_json_write_named_array_begin(w, "portals"); + TAILQ_FOREACH(portal, &pg->head, per_pg_tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "host", portal->host); + spdk_json_write_named_string(w, "port", portal->port); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); +} + +static void +iscsi_portal_grp_config_json(struct spdk_iscsi_portal_grp *pg, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "iscsi_create_portal_group"); + + spdk_json_write_name(w, "params"); + iscsi_portal_grp_info_json(pg, w); + + spdk_json_write_object_end(w); +} + +void +iscsi_portal_grps_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_portal_grp *pg; + + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + iscsi_portal_grp_info_json(pg, w); + } +} + +void +iscsi_portal_grps_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_portal_grp *pg; + + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + 
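/*
 * Editor's note (illustrative sketch, not part of the original source):
 * for a hypothetical group with tag 1 and a single portal,
 * iscsi_portal_grp_config_json() above emits roughly:
 *
 *   {
 *     "method": "iscsi_create_portal_group",
 *     "params": {
 *       "tag": 1,
 *       "portals": [ { "host": "192.168.0.10", "port": "3260" } ]
 *     }
 *   }
 */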
iscsi_portal_grp_config_json(pg, w); + } +} diff --git a/src/spdk/lib/iscsi/portal_grp.h b/src/spdk/lib/iscsi/portal_grp.h new file mode 100644 index 000000000..7ac72e36c --- /dev/null +++ b/src/spdk/lib/iscsi/portal_grp.h @@ -0,0 +1,90 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_PORTAL_GRP_H +#define SPDK_PORTAL_GRP_H + +#include "spdk/conf.h" +#include "spdk/cpuset.h" +#include "iscsi/iscsi.h" + +struct spdk_json_write_ctx; + +struct spdk_iscsi_portal { + struct spdk_iscsi_portal_grp *group; + char host[MAX_PORTAL_ADDR + 1]; + char port[MAX_PORTAL_PORT + 1]; + struct spdk_sock *sock; + struct spdk_poller *acceptor_poller; + TAILQ_ENTRY(spdk_iscsi_portal) per_pg_tailq; + TAILQ_ENTRY(spdk_iscsi_portal) g_tailq; +}; + +struct spdk_iscsi_portal_grp { + int ref; + int tag; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + TAILQ_ENTRY(spdk_iscsi_portal_grp) tailq; + TAILQ_HEAD(, spdk_iscsi_portal) head; +}; + +/* SPDK iSCSI Portal Group management API */ + +struct spdk_iscsi_portal *iscsi_portal_create(const char *host, const char *port); +void iscsi_portal_destroy(struct spdk_iscsi_portal *p); + +struct spdk_iscsi_portal_grp *iscsi_portal_grp_create(int tag); +void iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg, + struct spdk_iscsi_portal *p); +void iscsi_portal_grp_destroy(struct spdk_iscsi_portal_grp *pg); +void iscsi_portal_grp_release(struct spdk_iscsi_portal_grp *pg); +int iscsi_parse_portal_grps(void); +void iscsi_portal_grps_destroy(void); +int iscsi_portal_grp_register(struct spdk_iscsi_portal_grp *pg); +struct spdk_iscsi_portal_grp *iscsi_portal_grp_unregister(int tag); +struct spdk_iscsi_portal_grp *iscsi_portal_grp_find_by_tag(int tag); +int iscsi_portal_grp_open(struct spdk_iscsi_portal_grp *pg); +int iscsi_portal_grp_set_chap_params(struct spdk_iscsi_portal_grp *pg, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group); + +void iscsi_portal_grp_close_all(void); +void iscsi_portal_grps_config_text(FILE *fp); +void iscsi_portal_grps_info_json(struct spdk_json_write_ctx *w); +void iscsi_portal_grps_config_json(struct spdk_json_write_ctx *w); + +#endif /* SPDK_PORTAL_GRP_H */ diff --git a/src/spdk/lib/iscsi/spdk_iscsi.map b/src/spdk/lib/iscsi/spdk_iscsi.map new file mode 100644 index 000000000..0475a800d --- /dev/null +++ b/src/spdk/lib/iscsi/spdk_iscsi.map @@ -0,0 +1,11 @@ +{ + global: + + # Functions used by other SPDK libraries + spdk_iscsi_init; + spdk_iscsi_fini; + spdk_iscsi_config_text; + spdk_iscsi_config_json; + + local: *; +}; diff --git a/src/spdk/lib/iscsi/task.c b/src/spdk/lib/iscsi/task.c new file mode 100644 index 000000000..964621178 --- /dev/null +++ b/src/spdk/lib/iscsi/task.c @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/env.h" +#include "spdk/log.h" +#include "iscsi/conn.h" +#include "iscsi/task.h" + +static void +iscsi_task_free(struct spdk_scsi_task *scsi_task) +{ + struct spdk_iscsi_task *task = iscsi_task_from_scsi_task(scsi_task); + + if (task->parent) { + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + assert(task->conn->data_in_cnt > 0); + task->conn->data_in_cnt--; + } + + spdk_scsi_task_put(&task->parent->scsi); + task->parent = NULL; + } + + iscsi_task_disassociate_pdu(task); + assert(task->conn->pending_task_cnt > 0); + task->conn->pending_task_cnt--; + spdk_mempool_put(g_iscsi.task_pool, (void *)task); +} + +struct spdk_iscsi_task * +iscsi_task_get(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *parent, + spdk_scsi_task_cpl cpl_fn) +{ + struct spdk_iscsi_task *task; + + task = spdk_mempool_get(g_iscsi.task_pool); + if (!task) { + SPDK_ERRLOG("Unable to get task\n"); + abort(); + } + + assert(conn != NULL); + memset(task, 0, sizeof(*task)); + task->conn = conn; + assert(conn->pending_task_cnt < UINT32_MAX); + conn->pending_task_cnt++; + spdk_scsi_task_construct(&task->scsi, + cpl_fn, + iscsi_task_free); + if (parent) { + parent->scsi.ref++; + task->parent = parent; + task->tag = parent->tag; + task->lun_id = parent->lun_id; + task->scsi.dxfer_dir = parent->scsi.dxfer_dir; + task->scsi.transfer_len = parent->scsi.transfer_len; + task->scsi.lun = parent->scsi.lun; + task->scsi.cdb = parent->scsi.cdb; + task->scsi.target_port = parent->scsi.target_port; + task->scsi.initiator_port = parent->scsi.initiator_port; + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + conn->data_in_cnt++; + } + } + + return task; +} diff --git a/src/spdk/lib/iscsi/task.h b/src/spdk/lib/iscsi/task.h new file mode 100644 index 000000000..0ef48599a --- /dev/null +++ b/src/spdk/lib/iscsi/task.h @@ -0,0 +1,188 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ISCSI_TASK_H +#define SPDK_ISCSI_TASK_H + +#include "iscsi/iscsi.h" +#include "spdk/scsi.h" +#include "spdk/util.h" + +struct spdk_iscsi_task { + struct spdk_scsi_task scsi; + + struct spdk_iscsi_task *parent; + + uint8_t rsp_scsi_status; + uint8_t rsp_sense_data[32]; + size_t rsp_sense_data_len; + + struct spdk_iscsi_conn *conn; + struct spdk_iscsi_pdu *pdu; + uint32_t outstanding_r2t; + + uint32_t desired_data_transfer_length; + + /* Only valid for Read/Write */ + uint32_t bytes_completed; + + uint32_t data_out_cnt; + + /* + * Tracks the current offset of large read io. + */ + uint32_t current_datain_offset; + + /* + * next_expected_r2t_offset is used when we receive + * the DataOUT PDU. + */ + uint32_t next_expected_r2t_offset; + + /* + * Tracks the length of the R2T that is in progress. + * Used to check that an R2T burst does not exceed + * MaxBurstLength. + */ + uint32_t current_r2t_length; + + /* + * next_r2t_offset is used when we are sending the + * R2T packet to keep track of next offset of r2t. + */ + uint32_t next_r2t_offset; + uint32_t R2TSN; + uint32_t r2t_datasn; /* record next datasn for a r2tsn */ + uint32_t acked_r2tsn; /* next r2tsn to be acked */ + uint32_t datain_datasn; + uint32_t acked_data_sn; /* next expected datain datasn */ + uint32_t ttt; + bool is_r2t_active; + + uint32_t tag; + + /** + * Record the lun id just in case the lun is invalid, + * which will happen when hot removing the lun. 
+ */ + int lun_id; + + struct spdk_poller *mgmt_poller; + + TAILQ_ENTRY(spdk_iscsi_task) link; + + TAILQ_HEAD(subtask_list, spdk_iscsi_task) subtask_list; + TAILQ_ENTRY(spdk_iscsi_task) subtask_link; + bool is_queued; /* is queued in scsi layer for handling */ +}; + +static inline void +iscsi_task_put(struct spdk_iscsi_task *task) +{ + spdk_scsi_task_put(&task->scsi); +} + +static inline struct spdk_iscsi_pdu * +iscsi_task_get_pdu(struct spdk_iscsi_task *task) +{ + return task->pdu; +} + +static inline void +iscsi_task_set_pdu(struct spdk_iscsi_task *task, struct spdk_iscsi_pdu *pdu) +{ + task->pdu = pdu; +} + +static inline struct iscsi_bhs * +iscsi_task_get_bhs(struct spdk_iscsi_task *task) +{ + return &iscsi_task_get_pdu(task)->bhs; +} + +static inline void +iscsi_task_associate_pdu(struct spdk_iscsi_task *task, struct spdk_iscsi_pdu *pdu) +{ + iscsi_task_set_pdu(task, pdu); + pdu->ref++; +} + +static inline void +iscsi_task_disassociate_pdu(struct spdk_iscsi_task *task) +{ + if (iscsi_task_get_pdu(task)) { + iscsi_put_pdu(iscsi_task_get_pdu(task)); + iscsi_task_set_pdu(task, NULL); + } +} + +static inline int +iscsi_task_is_immediate(struct spdk_iscsi_task *task) +{ + struct iscsi_bhs_scsi_req *scsi_req; + + scsi_req = (struct iscsi_bhs_scsi_req *)iscsi_task_get_bhs(task); + return (scsi_req->immediate == 1); +} + +static inline int +iscsi_task_is_read(struct spdk_iscsi_task *task) +{ + struct iscsi_bhs_scsi_req *scsi_req; + + scsi_req = (struct iscsi_bhs_scsi_req *)iscsi_task_get_bhs(task); + return (scsi_req->read_bit == 1); +} + +struct spdk_iscsi_task *iscsi_task_get(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *parent, + spdk_scsi_task_cpl cpl_fn); + +static inline struct spdk_iscsi_task * +iscsi_task_from_scsi_task(struct spdk_scsi_task *task) +{ + return SPDK_CONTAINEROF(task, struct spdk_iscsi_task, scsi); +} + +static inline struct spdk_iscsi_task * +iscsi_task_get_primary(struct spdk_iscsi_task *task) +{ + if (task->parent) { + return task->parent; + } else { + return task; + } +} + +#endif /* SPDK_ISCSI_TASK_H */ diff --git a/src/spdk/lib/iscsi/tgt_node.c b/src/spdk/lib/iscsi/tgt_node.c new file mode 100644 index 000000000..0807a3384 --- /dev/null +++ b/src/spdk/lib/iscsi/tgt_node.c @@ -0,0 +1,1607 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/sock.h" +#include "spdk/scsi.h" + +#include "spdk_internal/log.h" + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" +#include "iscsi/tgt_node.h" +#include "iscsi/portal_grp.h" +#include "iscsi/init_grp.h" +#include "iscsi/task.h" + +#define MAX_TMPBUF 4096 +#define MAX_MASKBUF 128 + +static bool +iscsi_ipv6_netmask_allow_addr(const char *netmask, const char *addr) +{ + struct in6_addr in6_mask; + struct in6_addr in6_addr; + char mask[MAX_MASKBUF]; + const char *p; + size_t n; + int bits, bmask; + int i; + + if (netmask[0] != '[') { + return false; + } + p = strchr(netmask, ']'); + if (p == NULL) { + return false; + } + n = p - (netmask + 1); + if (n + 1 > sizeof mask) { + return false; + } + + memcpy(mask, netmask + 1, n); + mask[n] = '\0'; + p++; + + if (p[0] == '/') { + bits = (int) strtol(p + 1, NULL, 10); + if (bits <= 0 || bits > 128) { + return false; + } + } else { + bits = 128; + } + +#if 0 + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "input %s\n", addr); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "mask %s / %d\n", mask, bits); +#endif + + /* presentation to network order binary */ + if (inet_pton(AF_INET6, mask, &in6_mask) <= 0 + || inet_pton(AF_INET6, addr, &in6_addr) <= 0) { + return false; + } + + /* check 128bits */ + for (i = 0; i < (bits / 8); i++) { + if (in6_mask.s6_addr[i] != in6_addr.s6_addr[i]) { + return false; + } + } + if (bits % 8) { + bmask = (0xffU << (8 - (bits % 8))) & 0xffU; + if ((in6_mask.s6_addr[i] & bmask) != (in6_addr.s6_addr[i] & bmask)) { + return false; + } + } + + /* match */ + return true; +} + +static bool +iscsi_ipv4_netmask_allow_addr(const char *netmask, const char *addr) +{ + struct in_addr in4_mask; + struct in_addr in4_addr; + char mask[MAX_MASKBUF]; + const char *p; + uint32_t bmask; + size_t n; + int bits; + + p = strchr(netmask, '/'); + if (p == NULL) { + p = netmask + strlen(netmask); + } + n = p - netmask; + if (n + 1 > sizeof mask) { + return false; + } + + memcpy(mask, netmask, n); + mask[n] = '\0'; + + if (p[0] == '/') { + bits = (int) strtol(p + 1, NULL, 10); + if (bits <= 0 || bits > 32) { + return false; + } + } else { + bits = 32; + } + + /* presentation to network order binary */ + if (inet_pton(AF_INET, mask, &in4_mask) <= 0 + || inet_pton(AF_INET, addr, &in4_addr) <= 0) { + return false; + } + + /* check 32bits */ + bmask = (0xffffffffU << (32 - bits)) & 0xffffffffU; + if ((ntohl(in4_mask.s_addr) & bmask) != (ntohl(in4_addr.s_addr) & bmask)) { + return false; + } + + /* match */ + return true; +} + +static bool +iscsi_netmask_allow_addr(const char *netmask, const char *addr) +{ + if (netmask == NULL || addr == NULL) { + return false; + } + if (strcasecmp(netmask, "ANY") == 0) { + return true; + } + if (netmask[0] == '[') { + /* IPv6 */ + if (iscsi_ipv6_netmask_allow_addr(netmask, addr)) { + return true; + } + } else { + /* IPv4 */ + if (iscsi_ipv4_netmask_allow_addr(netmask, addr)) { + return true; + } + } + return false; +} + +static bool 
+iscsi_init_grp_allow_addr(struct spdk_iscsi_init_grp *igp, + const char *addr) +{ + struct spdk_iscsi_initiator_netmask *imask; + + TAILQ_FOREACH(imask, &igp->netmask_head, tailq) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "netmask=%s, addr=%s\n", + imask->mask, addr); + if (iscsi_netmask_allow_addr(imask->mask, addr)) { + return true; + } + } + return false; +} + +static int +iscsi_init_grp_allow_iscsi_name(struct spdk_iscsi_init_grp *igp, + const char *iqn, bool *result) +{ + struct spdk_iscsi_initiator_name *iname; + + TAILQ_FOREACH(iname, &igp->initiator_head, tailq) { + /* denied if iqn is matched */ + if ((iname->name[0] == '!') + && (strcasecmp(&iname->name[1], "ANY") == 0 + || strcasecmp(&iname->name[1], iqn) == 0)) { + *result = false; + return 0; + } + /* allowed if iqn is matched */ + if (strcasecmp(iname->name, "ANY") == 0 + || strcasecmp(iname->name, iqn) == 0) { + *result = true; + return 0; + } + } + return -1; +} + +static struct spdk_iscsi_pg_map * +iscsi_tgt_node_find_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg); + +bool +iscsi_tgt_node_access(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, const char *iqn, const char *addr) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + int rc; + bool allowed = false; + + if (conn == NULL || target == NULL || iqn == NULL || addr == NULL) { + return false; + } + pg = conn->portal->group; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "pg=%d, iqn=%s, addr=%s\n", + pg->tag, iqn, addr); + pg_map = iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + return false; + } + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + rc = iscsi_init_grp_allow_iscsi_name(ig_map->ig, iqn, &allowed); + if (rc == 0) { + if (allowed == false) { + goto denied; + } else { + if (iscsi_init_grp_allow_addr(ig_map->ig, addr)) { + return true; + } + } + } else { + /* netmask is denied in this initiator group */ + } + } + +denied: + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "access denied from %s (%s) to %s (%s:%s,%d)\n", + iqn, addr, target->name, conn->portal_host, + conn->portal_port, conn->pg_tag); + return false; +} + +static bool +iscsi_tgt_node_allow_iscsi_name(struct spdk_iscsi_tgt_node *target, const char *iqn) +{ + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + int rc; + bool result = false; + + if (target == NULL || iqn == NULL) { + return false; + } + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + rc = iscsi_init_grp_allow_iscsi_name(ig_map->ig, iqn, &result); + if (rc == 0) { + return result; + } + } + } + + return false; +} + +int +iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, + const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len, + int data_len) +{ + char buf[MAX_TMPBUF]; + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_portal *p; + struct spdk_iscsi_tgt_node *target; + char *host; + int total; + int len; + int rc; + + if (conn == NULL) { + return 0; + } + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total >= alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + if (strcasecmp(tiqn, "ALL") != 0 + && strcasecmp(tiqn, target->name) != 0) { + continue; + } + rc = iscsi_tgt_node_allow_iscsi_name(target, iiqn); + if (rc == 0) { + continue; + } + + /* DO 
SENDTARGETS */ + len = snprintf((char *) data + total, alloc_len - total, + "TargetName=%s", target->name); + total += len + 1; + + /* write to data */ + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + pg = pg_map->pg; + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + if (alloc_len - total < 1) { + pthread_mutex_unlock(&g_iscsi.mutex); + /* TODO: long text responses support */ + SPDK_ERRLOG("SPDK doesn't support long text responses now, " + "you can use larger MaxRecvDataSegmentLength" + "value in initiator\n"); + return alloc_len; + } + host = p->host; + /* wildcard? */ + if (strcasecmp(host, "[::]") == 0 + || strcasecmp(host, "0.0.0.0") == 0) { + if (spdk_sock_is_ipv6(conn->sock)) { + snprintf(buf, sizeof buf, "[%s]", + conn->target_addr); + host = buf; + } else if (spdk_sock_is_ipv4(conn->sock)) { + snprintf(buf, sizeof buf, "%s", + conn->target_addr); + host = buf; + } else { + /* skip portal for the family */ + continue; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "TargetAddress=%s:%s,%d\n", + host, p->port, pg->tag); + len = snprintf((char *) data + total, + alloc_len - total, + "TargetAddress=%s:%s,%d", + host, p->port, pg->tag); + total += len + 1; + } + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + + return total; +} + +struct spdk_iscsi_tgt_node * +iscsi_find_tgt_node(const char *target_name) +{ + struct spdk_iscsi_tgt_node *target; + + if (target_name == NULL) { + return NULL; + } + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + if (strcasecmp(target_name, target->name) == 0) { + return target; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "can't find target %s\n", target_name); + return NULL; +} + +static int +iscsi_tgt_node_register(struct spdk_iscsi_tgt_node *target) +{ + pthread_mutex_lock(&g_iscsi.mutex); + + if (iscsi_find_tgt_node(target->name) != NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + return -EEXIST; + } + + TAILQ_INSERT_TAIL(&g_iscsi.target_head, target, tailq); + + pthread_mutex_unlock(&g_iscsi.mutex); + return 0; +} + +static int +iscsi_tgt_node_unregister(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_tgt_node *t; + + TAILQ_FOREACH(t, &g_iscsi.target_head, tailq) { + if (t == target) { + TAILQ_REMOVE(&g_iscsi.target_head, t, tailq); + return 0; + } + } + + return -1; +} + +static struct spdk_iscsi_ig_map * +iscsi_pg_map_find_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_ig_map *ig_map; + + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + if (ig_map->ig == ig) { + return ig_map; + } + } + + return NULL; +} + +static struct spdk_iscsi_ig_map * +iscsi_pg_map_add_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_ig_map *ig_map; + + if (iscsi_pg_map_find_ig_map(pg_map, ig) != NULL) { + return NULL; + } + + ig_map = malloc(sizeof(*ig_map)); + if (ig_map == NULL) { + return NULL; + } + + ig_map->ig = ig; + ig->ref++; + pg_map->num_ig_maps++; + TAILQ_INSERT_TAIL(&pg_map->ig_map_head, ig_map, tailq); + + return ig_map; +} + +static void +_iscsi_pg_map_delete_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_ig_map *ig_map) +{ + TAILQ_REMOVE(&pg_map->ig_map_head, ig_map, tailq); + pg_map->num_ig_maps--; + ig_map->ig->ref--; + free(ig_map); +} + +static int +iscsi_pg_map_delete_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_ig_map *ig_map; + + ig_map = iscsi_pg_map_find_ig_map(pg_map, ig); + if (ig_map == NULL) { + return -ENOENT; + } + + _iscsi_pg_map_delete_ig_map(pg_map, ig_map); 
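/*
 * Editor's note (illustrative sketch, not part of the original source):
 * iscsi_send_tgts() above builds the SendTargets text response as a
 * sequence of NUL-terminated key=value pairs ("total += len + 1" keeps
 * each pair's terminating NUL), e.g. for a hypothetical target:
 *
 *   TargetName=iqn.2016-06.io.spdk:disk1\0
 *   TargetAddress=192.168.0.10:3260,1\0
 *
 * Wildcard portals ("0.0.0.0" / "[::]") are rewritten to the address the
 * initiator actually connected to (conn->target_addr).
 */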
+ return 0; +} + +static void +iscsi_pg_map_delete_all_ig_maps(struct spdk_iscsi_pg_map *pg_map) +{ + struct spdk_iscsi_ig_map *ig_map, *tmp; + + TAILQ_FOREACH_SAFE(ig_map, &pg_map->ig_map_head, tailq, tmp) { + _iscsi_pg_map_delete_ig_map(pg_map, ig_map); + } +} + +static struct spdk_iscsi_pg_map * +iscsi_tgt_node_find_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_pg_map *pg_map; + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + if (pg_map->pg == pg) { + return pg_map; + } + } + + return NULL; +} + +static struct spdk_iscsi_pg_map * +iscsi_tgt_node_add_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_pg_map *pg_map; + char port_name[MAX_TMPBUF]; + int rc; + + if (iscsi_tgt_node_find_pg_map(target, pg) != NULL) { + return NULL; + } + + if (target->num_pg_maps >= SPDK_SCSI_DEV_MAX_PORTS) { + SPDK_ERRLOG("Number of PG maps is more than allowed (max=%d)\n", + SPDK_SCSI_DEV_MAX_PORTS); + return NULL; + } + + pg_map = malloc(sizeof(*pg_map)); + if (pg_map == NULL) { + return NULL; + } + + snprintf(port_name, sizeof(port_name), "%s,t,0x%4.4x", + spdk_scsi_dev_get_name(target->dev), pg->tag); + rc = spdk_scsi_dev_add_port(target->dev, pg->tag, port_name); + if (rc != 0) { + free(pg_map); + return NULL; + } + + TAILQ_INIT(&pg_map->ig_map_head); + pg_map->num_ig_maps = 0; + pg->ref++; + pg_map->pg = pg; + target->num_pg_maps++; + TAILQ_INSERT_TAIL(&target->pg_map_head, pg_map, tailq); + + return pg_map; +} + +static void +_iscsi_tgt_node_delete_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_pg_map *pg_map) +{ + TAILQ_REMOVE(&target->pg_map_head, pg_map, tailq); + target->num_pg_maps--; + pg_map->pg->ref--; + + spdk_scsi_dev_delete_port(target->dev, pg_map->pg->tag); + + free(pg_map); +} + +static int +iscsi_tgt_node_delete_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_pg_map *pg_map; + + pg_map = iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + return -ENOENT; + } + + if (pg_map->num_ig_maps > 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "delete %d ig_maps forcefully\n", + pg_map->num_ig_maps); + } + + iscsi_pg_map_delete_all_ig_maps(pg_map); + _iscsi_tgt_node_delete_pg_map(target, pg_map); + return 0; +} + +static void +iscsi_tgt_node_delete_ig_maps(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_pg_map *pg_map, *tmp; + + TAILQ_FOREACH_SAFE(pg_map, &target->pg_map_head, tailq, tmp) { + iscsi_pg_map_delete_ig_map(pg_map, ig); + if (pg_map->num_ig_maps == 0) { + _iscsi_tgt_node_delete_pg_map(target, pg_map); + } + } +} + +static void +iscsi_tgt_node_delete_all_pg_maps(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_pg_map *pg_map, *tmp; + + TAILQ_FOREACH_SAFE(pg_map, &target->pg_map_head, tailq, tmp) { + iscsi_pg_map_delete_all_ig_maps(pg_map); + _iscsi_tgt_node_delete_pg_map(target, pg_map); + } +} + +static void +_iscsi_tgt_node_destruct(void *cb_arg, int rc) +{ + struct spdk_iscsi_tgt_node *target = cb_arg; + iscsi_tgt_node_destruct_cb destruct_cb_fn = target->destruct_cb_fn; + void *destruct_cb_arg = target->destruct_cb_arg; + + if (rc != 0) { + if (destruct_cb_fn) { + destruct_cb_fn(destruct_cb_arg, rc); + } + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + iscsi_tgt_node_delete_all_pg_maps(target); + pthread_mutex_unlock(&g_iscsi.mutex); + + pthread_mutex_destroy(&target->mutex); + free(target); + + if (destruct_cb_fn) { + 
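/*
 * Editor's note (illustrative sketch, not part of the original source):
 * when a portal group is mapped in iscsi_tgt_node_add_pg_map() above,
 * the SCSI port name is "<dev-name>,t,0x<tag>"; e.g. a hypothetical
 * device "iqn.2016-06.io.spdk:disk1" mapped to portal group tag 1 gets
 * the port name "iqn.2016-06.io.spdk:disk1,t,0x0001". The pg->ref and
 * ig->ref counters track how many target-node mappings use each group.
 */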
destruct_cb_fn(destruct_cb_arg, 0); + } +} + +static int +iscsi_tgt_node_check_active_conns(void *arg) +{ + struct spdk_iscsi_tgt_node *target = arg; + + if (iscsi_get_active_conns(target) != 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&target->destruct_poller); + + spdk_scsi_dev_destruct(target->dev, _iscsi_tgt_node_destruct, target); + + return SPDK_POLLER_BUSY; +} + +static void +iscsi_tgt_node_destruct(struct spdk_iscsi_tgt_node *target, + iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg) +{ + if (target == NULL) { + if (cb_fn) { + cb_fn(cb_arg, -ENOENT); + } + return; + } + + if (target->destructed) { + SPDK_ERRLOG("Destructing %s is already started\n", target->name); + if (cb_fn) { + cb_fn(cb_arg, -EBUSY); + } + return; + } + + target->destructed = true; + target->destruct_cb_fn = cb_fn; + target->destruct_cb_arg = cb_arg; + + iscsi_conns_request_logout(target); + + if (iscsi_get_active_conns(target) != 0) { + target->destruct_poller = SPDK_POLLER_REGISTER(iscsi_tgt_node_check_active_conns, + target, 10); + } else { + spdk_scsi_dev_destruct(target->dev, _iscsi_tgt_node_destruct, target); + } + +} + +static int +iscsi_tgt_node_delete_pg_ig_map(struct spdk_iscsi_tgt_node *target, + int pg_tag, int ig_tag) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_init_grp *ig; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + + pg = iscsi_portal_grp_find_by_tag(pg_tag); + if (pg == NULL) { + SPDK_ERRLOG("%s: PortalGroup%d not found\n", target->name, pg_tag); + return -ENOENT; + } + ig = iscsi_init_grp_find_by_tag(ig_tag); + if (ig == NULL) { + SPDK_ERRLOG("%s: InitiatorGroup%d not found\n", target->name, ig_tag); + return -ENOENT; + } + + pg_map = iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + SPDK_ERRLOG("%s: PortalGroup%d is not mapped\n", target->name, pg_tag); + return -ENOENT; + } + ig_map = iscsi_pg_map_find_ig_map(pg_map, ig); + if (ig_map == NULL) { + SPDK_ERRLOG("%s: InitiatorGroup%d is not mapped\n", target->name, pg_tag); + return -ENOENT; + } + + _iscsi_pg_map_delete_ig_map(pg_map, ig_map); + if (pg_map->num_ig_maps == 0) { + _iscsi_tgt_node_delete_pg_map(target, pg_map); + } + + return 0; +} + +static int +iscsi_tgt_node_add_pg_ig_map(struct spdk_iscsi_tgt_node *target, + int pg_tag, int ig_tag) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_init_grp *ig; + struct spdk_iscsi_ig_map *ig_map; + bool new_pg_map = false; + + pg = iscsi_portal_grp_find_by_tag(pg_tag); + if (pg == NULL) { + SPDK_ERRLOG("%s: PortalGroup%d not found\n", target->name, pg_tag); + return -ENOENT; + } + ig = iscsi_init_grp_find_by_tag(ig_tag); + if (ig == NULL) { + SPDK_ERRLOG("%s: InitiatorGroup%d not found\n", target->name, ig_tag); + return -ENOENT; + } + + /* get existing pg_map or create new pg_map and add it to target */ + pg_map = iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + pg_map = iscsi_tgt_node_add_pg_map(target, pg); + if (pg_map == NULL) { + goto failed; + } + new_pg_map = true; + } + + /* create new ig_map and add it to pg_map */ + ig_map = iscsi_pg_map_add_ig_map(pg_map, ig); + if (ig_map == NULL) { + goto failed; + } + + return 0; + +failed: + if (new_pg_map) { + _iscsi_tgt_node_delete_pg_map(target, pg_map); + } + + return -1; +} + +int +iscsi_target_node_add_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps) +{ + uint16_t i; + int rc; + + pthread_mutex_lock(&g_iscsi.mutex); + for (i = 0; i < num_maps; 
i++) { + rc = iscsi_tgt_node_add_pg_ig_map(target, pg_tag_list[i], + ig_tag_list[i]); + if (rc != 0) { + SPDK_ERRLOG("could not add map to target\n"); + goto invalid; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return 0; + +invalid: + for (; i > 0; --i) { + iscsi_tgt_node_delete_pg_ig_map(target, pg_tag_list[i - 1], + ig_tag_list[i - 1]); + } + pthread_mutex_unlock(&g_iscsi.mutex); + return -1; +} + +int +iscsi_target_node_remove_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps) +{ + uint16_t i; + int rc; + + pthread_mutex_lock(&g_iscsi.mutex); + for (i = 0; i < num_maps; i++) { + rc = iscsi_tgt_node_delete_pg_ig_map(target, pg_tag_list[i], + ig_tag_list[i]); + if (rc != 0) { + SPDK_ERRLOG("could not delete map from target\n"); + goto invalid; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return 0; + +invalid: + for (; i > 0; --i) { + rc = iscsi_tgt_node_add_pg_ig_map(target, pg_tag_list[i - 1], + ig_tag_list[i - 1]); + if (rc != 0) { + iscsi_tgt_node_delete_all_pg_maps(target); + break; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return -1; +} + +static int +check_iscsi_name(const char *name) +{ + const unsigned char *up = (const unsigned char *) name; + size_t n; + + /* valid iSCSI name no larger than 223 bytes */ + if (strlen(name) > MAX_TARGET_NAME) { + return -1; + } + + /* valid iSCSI name? */ + for (n = 0; up[n] != 0; n++) { + if (up[n] > 0x00U && up[n] <= 0x2cU) { + return -1; + } + if (up[n] == 0x2fU) { + return -1; + } + if (up[n] >= 0x3bU && up[n] <= 0x40U) { + return -1; + } + if (up[n] >= 0x5bU && up[n] <= 0x60U) { + return -1; + } + if (up[n] >= 0x7bU && up[n] <= 0x7fU) { + return -1; + } + if (isspace(up[n])) { + return -1; + } + } + /* valid format? */ + if (strncasecmp(name, "iqn.", 4) == 0) { + /* iqn.YYYY-MM.reversed.domain.name */ + if (!isdigit(up[4]) || !isdigit(up[5]) || !isdigit(up[6]) + || !isdigit(up[7]) || up[8] != '-' || !isdigit(up[9]) + || !isdigit(up[10]) || up[11] != '.') { + SPDK_ERRLOG("invalid iqn format. 
" + "expect \"iqn.YYYY-MM.reversed.domain.name\"\n"); + return -1; + } + } else if (strncasecmp(name, "eui.", 4) == 0) { + /* EUI-64 -> 16bytes */ + /* XXX */ + } else if (strncasecmp(name, "naa.", 4) == 0) { + /* 64bit -> 16bytes, 128bit -> 32bytes */ + /* XXX */ + } + /* OK */ + return 0; +} + +bool +iscsi_check_chap_params(bool disable, bool require, bool mutual, int group) +{ + if (group < 0) { + SPDK_ERRLOG("Invalid auth group ID (%d)\n", group); + return false; + } + if ((!disable && !require && !mutual) || /* Auto */ + (disable && !require && !mutual) || /* None */ + (!disable && require && !mutual) || /* CHAP */ + (!disable && require && mutual)) { /* CHAP Mutual */ + return true; + } + SPDK_ERRLOG("Invalid combination of CHAP params (d=%d,r=%d,m=%d)\n", + disable, require, mutual); + return false; +} + +struct spdk_iscsi_tgt_node *iscsi_tgt_node_construct(int target_index, + const char *name, const char *alias, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps, + const char *bdev_name_list[], int *lun_id_list, int num_luns, + int queue_depth, + bool disable_chap, bool require_chap, bool mutual_chap, int chap_group, + bool header_digest, bool data_digest) +{ + char fullname[MAX_TMPBUF]; + struct spdk_iscsi_tgt_node *target; + int rc; + + if (!iscsi_check_chap_params(disable_chap, require_chap, + mutual_chap, chap_group)) { + return NULL; + } + + if (num_maps == 0) { + SPDK_ERRLOG("num_maps = 0\n"); + return NULL; + } + + if (name == NULL) { + SPDK_ERRLOG("TargetName not found\n"); + return NULL; + } + + if (strncasecmp(name, "iqn.", 4) != 0 + && strncasecmp(name, "eui.", 4) != 0 + && strncasecmp(name, "naa.", 4) != 0) { + snprintf(fullname, sizeof(fullname), "%s:%s", g_iscsi.nodebase, name); + } else { + snprintf(fullname, sizeof(fullname), "%s", name); + } + + if (check_iscsi_name(fullname) != 0) { + SPDK_ERRLOG("TargetName %s contains an invalid character or format.\n", + name); + return NULL; + } + + target = calloc(1, sizeof(*target)); + if (!target) { + SPDK_ERRLOG("could not allocate target\n"); + return NULL; + } + + rc = pthread_mutex_init(&target->mutex, NULL); + if (rc != 0) { + SPDK_ERRLOG("tgt_node%d: mutex_init() failed\n", target->num); + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + + target->num = target_index; + + memcpy(target->name, fullname, strlen(fullname)); + + if (alias != NULL) { + if (strlen(alias) > MAX_TARGET_NAME) { + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + memcpy(target->alias, alias, strlen(alias)); + } + + target->dev = spdk_scsi_dev_construct(fullname, bdev_name_list, lun_id_list, num_luns, + SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI, NULL, NULL); + if (!target->dev) { + SPDK_ERRLOG("Could not construct SCSI device\n"); + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + + TAILQ_INIT(&target->pg_map_head); + rc = iscsi_target_node_add_pg_ig_maps(target, pg_tag_list, + ig_tag_list, num_maps); + if (rc != 0) { + SPDK_ERRLOG("could not add map to target\n"); + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + + target->disable_chap = disable_chap; + target->require_chap = require_chap; + target->mutual_chap = mutual_chap; + target->chap_group = chap_group; + target->header_digest = header_digest; + target->data_digest = data_digest; + + if (queue_depth > 0 && ((uint32_t)queue_depth <= g_iscsi.MaxQueueDepth)) { + target->queue_depth = queue_depth; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "QueueDepth %d is invalid and %d is used instead.\n", + queue_depth, 
g_iscsi.MaxQueueDepth); + target->queue_depth = g_iscsi.MaxQueueDepth; + } + + rc = iscsi_tgt_node_register(target); + if (rc != 0) { + SPDK_ERRLOG("register target is failed\n"); + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + + return target; +} + +static int +iscsi_parse_tgt_node(struct spdk_conf_section *sp) +{ + char buf[MAX_TMPBUF]; + struct spdk_iscsi_tgt_node *target; + int pg_tag_list[MAX_TARGET_MAP], ig_tag_list[MAX_TARGET_MAP]; + int num_target_maps; + const char *alias, *pg_tag, *ig_tag; + const char *ag_tag; + const char *val, *name; + int target_num, chap_group, pg_tag_i, ig_tag_i; + bool header_digest, data_digest; + bool disable_chap, require_chap, mutual_chap; + int i; + int lun_id_list[SPDK_SCSI_DEV_MAX_LUN]; + const char *bdev_name_list[SPDK_SCSI_DEV_MAX_LUN]; + int num_luns, queue_depth; + + target_num = spdk_conf_section_get_num(sp); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add unit %d\n", target_num); + + data_digest = false; + header_digest = false; + + name = spdk_conf_section_get_val(sp, "TargetName"); + + if (name == NULL) { + SPDK_ERRLOG("tgt_node%d: TargetName not found\n", target_num); + return -1; + } + + alias = spdk_conf_section_get_val(sp, "TargetAlias"); + + /* Setup initiator and portal group mapping */ + val = spdk_conf_section_get_val(sp, "Mapping"); + if (val == NULL) { + /* no map */ + SPDK_ERRLOG("tgt_node%d: no Mapping\n", target_num); + return -1; + } + + for (i = 0; i < MAX_TARGET_MAP; i++) { + val = spdk_conf_section_get_nmval(sp, "Mapping", i, 0); + if (val == NULL) { + break; + } + pg_tag = spdk_conf_section_get_nmval(sp, "Mapping", i, 0); + ig_tag = spdk_conf_section_get_nmval(sp, "Mapping", i, 1); + if (pg_tag == NULL || ig_tag == NULL) { + SPDK_ERRLOG("tgt_node%d: mapping error\n", target_num); + return -1; + } + if (strncasecmp(pg_tag, "PortalGroup", + strlen("PortalGroup")) != 0 + || sscanf(pg_tag, "%*[^0-9]%d", &pg_tag_i) != 1) { + SPDK_ERRLOG("tgt_node%d: mapping portal error\n", target_num); + return -1; + } + if (strncasecmp(ig_tag, "InitiatorGroup", + strlen("InitiatorGroup")) != 0 + || sscanf(ig_tag, "%*[^0-9]%d", &ig_tag_i) != 1) { + SPDK_ERRLOG("tgt_node%d: mapping initiator error\n", target_num); + return -1; + } + if (pg_tag_i < 1 || ig_tag_i < 1) { + SPDK_ERRLOG("tgt_node%d: invalid group tag\n", target_num); + return -1; + } + pg_tag_list[i] = pg_tag_i; + ig_tag_list[i] = ig_tag_i; + } + + num_target_maps = i; + + /* Setup AuthMethod */ + val = spdk_conf_section_get_val(sp, "AuthMethod"); + disable_chap = false; + require_chap = false; + mutual_chap = false; + if (val != NULL) { + for (i = 0; ; i++) { + val = spdk_conf_section_get_nmval(sp, "AuthMethod", 0, i); + if (val == NULL) { + break; + } + if (strcasecmp(val, "CHAP") == 0) { + require_chap = true; + } else if (strcasecmp(val, "Mutual") == 0) { + mutual_chap = true; + } else if (strcasecmp(val, "Auto") == 0) { + disable_chap = false; + require_chap = false; + mutual_chap = false; + } else if (strcasecmp(val, "None") == 0) { + disable_chap = true; + require_chap = false; + mutual_chap = false; + } else { + SPDK_ERRLOG("tgt_node%d: unknown auth\n", target_num); + return -1; + } + } + if (mutual_chap && !require_chap) { + SPDK_ERRLOG("tgt_node%d: Mutual but not CHAP\n", target_num); + return -1; + } + } + if (disable_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod None\n"); + } else if (!require_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod Auto\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod CHAP %s\n", + mutual_chap ? 
"Mutual" : ""); + } + + val = spdk_conf_section_get_val(sp, "AuthGroup"); + if (val == NULL) { + chap_group = 0; + } else { + ag_tag = val; + if (strcasecmp(ag_tag, "None") == 0) { + chap_group = 0; + } else { + if (strncasecmp(ag_tag, "AuthGroup", + strlen("AuthGroup")) != 0 + || sscanf(ag_tag, "%*[^0-9]%d", &chap_group) != 1) { + SPDK_ERRLOG("tgt_node%d: auth group error\n", target_num); + return -1; + } + if (chap_group == 0) { + SPDK_ERRLOG("tgt_node%d: invalid auth group 0\n", target_num); + return -1; + } + } + } + if (chap_group == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthGroup None\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthGroup AuthGroup%d\n", chap_group); + } + + val = spdk_conf_section_get_val(sp, "UseDigest"); + if (val != NULL) { + for (i = 0; ; i++) { + val = spdk_conf_section_get_nmval(sp, "UseDigest", 0, i); + if (val == NULL) { + break; + } + if (strcasecmp(val, "Header") == 0) { + header_digest = true; + } else if (strcasecmp(val, "Data") == 0) { + data_digest = true; + } else if (strcasecmp(val, "Auto") == 0) { + header_digest = false; + data_digest = false; + } else { + SPDK_ERRLOG("tgt_node%d: unknown digest\n", target_num); + return -1; + } + } + } + if (!header_digest && !data_digest) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "UseDigest Auto\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "UseDigest %s %s\n", + header_digest ? "Header" : "", + data_digest ? "Data" : ""); + } + + val = spdk_conf_section_get_val(sp, "QueueDepth"); + if (val == NULL) { + queue_depth = g_iscsi.MaxQueueDepth; + } else { + queue_depth = (int) strtol(val, NULL, 10); + } + + num_luns = 0; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + snprintf(buf, sizeof(buf), "LUN%d", i); + val = spdk_conf_section_get_val(sp, buf); + if (val == NULL) { + continue; + } + + bdev_name_list[num_luns] = val; + lun_id_list[num_luns] = i; + num_luns++; + } + + if (num_luns == 0) { + SPDK_ERRLOG("tgt_node%d: No LUN specified for target %s.\n", target_num, name); + return -1; + } + + target = iscsi_tgt_node_construct(target_num, name, alias, + pg_tag_list, ig_tag_list, num_target_maps, + bdev_name_list, lun_id_list, num_luns, queue_depth, + disable_chap, require_chap, mutual_chap, chap_group, + header_digest, data_digest); + + if (target == NULL) { + SPDK_ERRLOG("tgt_node%d: add_iscsi_target_node error\n", target_num); + return -1; + } + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i); + + if (lun) { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "device %d: LUN%d %s\n", + spdk_scsi_dev_get_id(target->dev), + spdk_scsi_lun_get_id(lun), + spdk_scsi_lun_get_bdev_name(lun)); + } + } + + return 0; +} + +int iscsi_parse_tgt_nodes(void) +{ + struct spdk_conf_section *sp; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_parse_tgt_nodes\n"); + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "TargetNode")) { + int tag = spdk_conf_section_get_num(sp); + + if (tag > SPDK_TN_TAG_MAX) { + SPDK_ERRLOG("tag %d is invalid\n", tag); + return -1; + } + rc = iscsi_parse_tgt_node(sp); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_parse_tgt_node() failed\n"); + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +void +iscsi_shutdown_tgt_nodes(void) +{ + struct spdk_iscsi_tgt_node *target; + + pthread_mutex_lock(&g_iscsi.mutex); + while (!TAILQ_EMPTY(&g_iscsi.target_head)) { + target = TAILQ_FIRST(&g_iscsi.target_head); + TAILQ_REMOVE(&g_iscsi.target_head, target, tailq); + + 
pthread_mutex_unlock(&g_iscsi.mutex); + + iscsi_tgt_node_destruct(target, NULL, NULL); + + pthread_mutex_lock(&g_iscsi.mutex); + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +void +iscsi_shutdown_tgt_node_by_name(const char *target_name, + iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg) +{ + struct spdk_iscsi_tgt_node *target; + + pthread_mutex_lock(&g_iscsi.mutex); + target = iscsi_find_tgt_node(target_name); + if (target != NULL) { + iscsi_tgt_node_unregister(target); + pthread_mutex_unlock(&g_iscsi.mutex); + + iscsi_tgt_node_destruct(target, cb_fn, cb_arg); + + return; + } + pthread_mutex_unlock(&g_iscsi.mutex); + + if (cb_fn) { + cb_fn(cb_arg, -ENOENT); + } +} + +bool +iscsi_tgt_node_is_destructed(struct spdk_iscsi_tgt_node *target) +{ + return target->destructed; +} + +int +iscsi_tgt_node_cleanup_luns(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target) +{ + int i; + struct spdk_iscsi_task *task; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i); + + if (!lun) { + continue; + } + + /* we create a fake management task per LUN to cleanup */ + task = iscsi_task_get(conn, NULL, iscsi_task_mgmt_cpl); + if (!task) { + SPDK_ERRLOG("Unable to acquire task\n"); + return -1; + } + + task->scsi.target_port = conn->target_port; + task->scsi.initiator_port = conn->initiator_port; + task->scsi.lun = lun; + + iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + } + + return 0; +} + +void iscsi_tgt_node_delete_map(struct spdk_iscsi_portal_grp *portal_group, + struct spdk_iscsi_init_grp *initiator_group) +{ + struct spdk_iscsi_tgt_node *target; + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + if (portal_group) { + iscsi_tgt_node_delete_pg_map(target, portal_group); + } + if (initiator_group) { + iscsi_tgt_node_delete_ig_maps(target, initiator_group); + } + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +int +iscsi_tgt_node_add_lun(struct spdk_iscsi_tgt_node *target, + const char *bdev_name, int lun_id) +{ + struct spdk_scsi_dev *dev; + int rc; + + if (target->num_active_conns > 0) { + SPDK_ERRLOG("Target has active connections (count=%d)\n", + target->num_active_conns); + return -1; + } + + if (lun_id < -1 || lun_id >= SPDK_SCSI_DEV_MAX_LUN) { + SPDK_ERRLOG("Specified LUN ID (%d) is invalid\n", lun_id); + return -1; + } + + dev = target->dev; + if (dev == NULL) { + SPDK_ERRLOG("SCSI device is not found\n"); + return -1; + } + + rc = spdk_scsi_dev_add_lun(dev, bdev_name, lun_id, NULL, NULL); + if (rc != 0) { + SPDK_ERRLOG("spdk_scsi_dev_add_lun failed\n"); + return -1; + } + + return 0; +} + +int +iscsi_tgt_node_set_chap_params(struct spdk_iscsi_tgt_node *target, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group) +{ + if (!iscsi_check_chap_params(disable_chap, require_chap, + mutual_chap, chap_group)) { + return -EINVAL; + } + + pthread_mutex_lock(&target->mutex); + target->disable_chap = disable_chap; + target->require_chap = require_chap; + target->mutual_chap = mutual_chap; + target->chap_group = chap_group; + pthread_mutex_unlock(&target->mutex); + + return 0; +} + +static const char *target_nodes_section = \ + "\n" + "# Users should change the TargetNode section(s) below to match the\n" + "# desired iSCSI target node configuration.\n" + "# TargetName, Mapping, LUN0 are minimum required\n"; + +#define TARGET_NODE_TMPL \ +"[TargetNode%d]\n" \ +" Comment \"Target%d\"\n" \ +" TargetName %s\n" \ +" TargetAlias \"%s\"\n" + +#define 
TARGET_NODE_PGIG_MAPPING_TMPL \ +" Mapping PortalGroup%d InitiatorGroup%d\n" + +#define TARGET_NODE_AUTH_TMPL \ +" AuthMethod %s\n" \ +" AuthGroup %s\n" \ +" UseDigest %s\n" + +#define TARGET_NODE_QD_TMPL \ +" QueueDepth %d\n\n" + +#define TARGET_NODE_LUN_TMPL \ +" LUN%d %s\n" + +void +iscsi_tgt_nodes_config_text(FILE *fp) +{ + int l = 0; + struct spdk_scsi_dev *dev = NULL; + struct spdk_iscsi_tgt_node *target = NULL; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + + /* Create target nodes section */ + fprintf(fp, "%s", target_nodes_section); + + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + int idx; + const char *authmethod = "None"; + char authgroup[32] = "None"; + const char *usedigest = "Auto"; + + dev = target->dev; + if (NULL == dev) { continue; } + + idx = target->num; + fprintf(fp, TARGET_NODE_TMPL, idx, idx, target->name, spdk_scsi_dev_get_name(dev)); + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + fprintf(fp, TARGET_NODE_PGIG_MAPPING_TMPL, + pg_map->pg->tag, + ig_map->ig->tag); + } + } + + if (target->disable_chap) { + authmethod = "None"; + } else if (!target->require_chap) { + authmethod = "Auto"; + } else if (target->mutual_chap) { + authmethod = "CHAP Mutual"; + } else { + authmethod = "CHAP"; + } + + if (target->chap_group > 0) { + snprintf(authgroup, sizeof(authgroup), "AuthGroup%d", target->chap_group); + } + + if (target->header_digest) { + usedigest = "Header"; + } else if (target->data_digest) { + usedigest = "Data"; + } + + fprintf(fp, TARGET_NODE_AUTH_TMPL, + authmethod, authgroup, usedigest); + + for (l = 0; l < SPDK_SCSI_DEV_MAX_LUN; l++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(dev, l); + + if (!lun) { + continue; + } + + fprintf(fp, TARGET_NODE_LUN_TMPL, + spdk_scsi_lun_get_id(lun), + spdk_scsi_lun_get_bdev_name(lun)); + } + + fprintf(fp, TARGET_NODE_QD_TMPL, + target->queue_depth); + } +} + +static void +iscsi_tgt_node_info_json(struct spdk_iscsi_tgt_node *target, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + int i; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "name", target->name); + + if (target->alias[0] != '\0') { + spdk_json_write_named_string(w, "alias_name", target->alias); + } + + spdk_json_write_named_array_begin(w, "pg_ig_maps"); + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + spdk_json_write_object_begin(w); + spdk_json_write_named_int32(w, "pg_tag", pg_map->pg->tag); + spdk_json_write_named_int32(w, "ig_tag", ig_map->ig->tag); + spdk_json_write_object_end(w); + } + } + spdk_json_write_array_end(w); + + spdk_json_write_named_array_begin(w, "luns"); + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i); + + if (lun) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + spdk_json_write_named_int32(w, "lun_id", spdk_scsi_lun_get_id(lun)); + spdk_json_write_object_end(w); + } + } + spdk_json_write_array_end(w); + + spdk_json_write_named_int32(w, "queue_depth", target->queue_depth); + + spdk_json_write_named_bool(w, "disable_chap", target->disable_chap); + spdk_json_write_named_bool(w, "require_chap", target->require_chap); + spdk_json_write_named_bool(w, "mutual_chap", target->mutual_chap); + spdk_json_write_named_int32(w, "chap_group", target->chap_group); + + 
spdk_json_write_named_bool(w, "header_digest", target->header_digest); + spdk_json_write_named_bool(w, "data_digest", target->data_digest); + + spdk_json_write_object_end(w); +} + +static void +iscsi_tgt_node_config_json(struct spdk_iscsi_tgt_node *target, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "iscsi_create_target_node"); + + spdk_json_write_name(w, "params"); + iscsi_tgt_node_info_json(target, w); + + spdk_json_write_object_end(w); +} + +void +iscsi_tgt_nodes_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_tgt_node *target; + + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + iscsi_tgt_node_info_json(target, w); + } +} + +void +iscsi_tgt_nodes_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_tgt_node *target; + + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + iscsi_tgt_node_config_json(target, w); + } +} diff --git a/src/spdk/lib/iscsi/tgt_node.h b/src/spdk/lib/iscsi/tgt_node.h new file mode 100644 index 000000000..2787fac91 --- /dev/null +++ b/src/spdk/lib/iscsi/tgt_node.h @@ -0,0 +1,147 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_ISCSI_TGT_NODE_H_ +#define SPDK_ISCSI_TGT_NODE_H_ + +#include "spdk/stdinc.h" + +#include "iscsi/iscsi.h" + +struct spdk_iscsi_conn; +struct spdk_iscsi_init_grp; +struct spdk_iscsi_portal_grp; +struct spdk_iscsi_portal; +struct spdk_json_write_ctx; + +#define MAX_TARGET_MAP 256 +#define SPDK_TN_TAG_MAX 0x0000ffff + +typedef void (*iscsi_tgt_node_destruct_cb)(void *cb_arg, int rc); + +struct spdk_iscsi_ig_map { + struct spdk_iscsi_init_grp *ig; + TAILQ_ENTRY(spdk_iscsi_ig_map) tailq; +}; + +struct spdk_iscsi_pg_map { + struct spdk_iscsi_portal_grp *pg; + int num_ig_maps; + TAILQ_HEAD(, spdk_iscsi_ig_map) ig_map_head; + TAILQ_ENTRY(spdk_iscsi_pg_map) tailq ; +}; + +struct spdk_iscsi_tgt_node { + int num; + char name[MAX_TARGET_NAME + 1]; + char alias[MAX_TARGET_NAME + 1]; + + pthread_mutex_t mutex; + + bool disable_chap; + bool require_chap; + bool mutual_chap; + int chap_group; + bool header_digest; + bool data_digest; + int queue_depth; + + struct spdk_scsi_dev *dev; + /** + * Counts number of active iSCSI connections associated with this + * target node. + */ + uint32_t num_active_conns; + struct spdk_iscsi_poll_group *pg; + + int num_pg_maps; + TAILQ_HEAD(, spdk_iscsi_pg_map) pg_map_head; + TAILQ_ENTRY(spdk_iscsi_tgt_node) tailq; + + bool destructed; + struct spdk_poller *destruct_poller; + iscsi_tgt_node_destruct_cb destruct_cb_fn; + void *destruct_cb_arg; +}; + +int iscsi_parse_tgt_nodes(void); + +void iscsi_shutdown_tgt_nodes(void); +void iscsi_shutdown_tgt_node_by_name(const char *target_name, + iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg); +bool iscsi_tgt_node_is_destructed(struct spdk_iscsi_tgt_node *target); +int iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, + const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len, + int data_len); + +/* + * bdev_name_list and lun_id_list are equal sized arrays of size num_luns. + * bdev_name_list refers to the names of the bdevs that will be used for the LUNs on the + * new target node. + * lun_id_list refers to the LUN IDs that will be used for the LUNs on the target node. 
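+ *
+ * Illustrative call sketch (editorial addition; the IQN, alias, tag values,
+ * bdev name, LUN ID and queue depth below are hypothetical, not taken from
+ * this header):
+ *
+ *   int pg_tags[] = { 1 }, ig_tags[] = { 1 }, lun_ids[] = { 0 };
+ *   const char *bdevs[] = { "Malloc0" };
+ *   struct spdk_iscsi_tgt_node *tgt;
+ *
+ *   tgt = iscsi_tgt_node_construct(0, "iqn.2016-06.io.spdk:disk1", "Data Disk1",
+ *                                  pg_tags, ig_tags, 1, bdevs, lun_ids, 1,
+ *                                  64, false, false, false, 0, false, false);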
+ */ +struct spdk_iscsi_tgt_node *iscsi_tgt_node_construct(int target_index, + const char *name, const char *alias, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps, + const char *bdev_name_list[], int *lun_id_list, int num_luns, + int queue_depth, + bool disable_chap, bool require_chap, bool mutual_chap, int chap_group, + bool header_digest, bool data_digest); + +bool iscsi_check_chap_params(bool disable, bool require, bool mutual, int group); + +int iscsi_target_node_add_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, + uint16_t num_maps); +int iscsi_target_node_remove_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, + uint16_t num_maps); + +bool iscsi_tgt_node_access(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, const char *iqn, + const char *addr); +struct spdk_iscsi_tgt_node *iscsi_find_tgt_node(const char *target_name); +int iscsi_tgt_node_cleanup_luns(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target); +void iscsi_tgt_node_delete_map(struct spdk_iscsi_portal_grp *portal_group, + struct spdk_iscsi_init_grp *initiator_group); +int iscsi_tgt_node_add_lun(struct spdk_iscsi_tgt_node *target, + const char *bdev_name, int lun_id); +int iscsi_tgt_node_set_chap_params(struct spdk_iscsi_tgt_node *target, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group); +void iscsi_tgt_nodes_config_text(FILE *fp); +void iscsi_tgt_nodes_info_json(struct spdk_json_write_ctx *w); +void iscsi_tgt_nodes_config_json(struct spdk_json_write_ctx *w); +#endif /* SPDK_ISCSI_TGT_NODE_H_ */ diff --git a/src/spdk/lib/json/Makefile b/src/spdk/lib/json/Makefile new file mode 100644 index 000000000..91cb8868f --- /dev/null +++ b/src/spdk/lib/json/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = json_parse.c json_util.c json_write.c +LIBNAME = json + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_json.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/json/json_parse.c b/src/spdk/lib/json/json_parse.c new file mode 100644 index 000000000..8639d5ff8 --- /dev/null +++ b/src/spdk/lib/json/json_parse.c @@ -0,0 +1,668 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/json.h" + +#include "spdk_internal/utf.h" + +#define SPDK_JSON_MAX_NESTING_DEPTH 64 + +static int +hex_value(uint8_t c) +{ +#define V(x, y) [x] = y + 1 + static const int8_t val[256] = { + V('0', 0), V('1', 1), V('2', 2), V('3', 3), V('4', 4), + V('5', 5), V('6', 6), V('7', 7), V('8', 8), V('9', 9), + V('A', 0xA), V('B', 0xB), V('C', 0xC), V('D', 0xD), V('E', 0xE), V('F', 0xF), + V('a', 0xA), V('b', 0xB), V('c', 0xC), V('d', 0xD), V('e', 0xE), V('f', 0xF), + }; +#undef V + + return val[c] - 1; +} + +static int +json_decode_string_escape_unicode(uint8_t **strp, uint8_t *buf_end, uint8_t *out) +{ + uint8_t *str = *strp; + int v0, v1, v2, v3; + uint32_t val; + uint32_t surrogate_high = 0; + int rc; +decode: + /* \uXXXX */ + assert(buf_end > str); + + if (*str++ != '\\') { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if (*str++ != 'u') { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v3 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v2 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v1 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v0 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + val = v0 | (v1 << 4) | (v2 << 8) | (v3 << 12); + + if (surrogate_high) { + /* We already parsed the high surrogate, so this should be the low part. */ + if (!utf16_valid_surrogate_low(val)) { + return SPDK_JSON_PARSE_INVALID; + } + + /* Convert UTF-16 surrogate pair into codepoint and fall through to utf8_encode. */ + val = utf16_decode_surrogate_pair(surrogate_high, val); + } else if (utf16_valid_surrogate_high(val)) { + surrogate_high = val; + + /* + * We parsed a \uXXXX sequence that decoded to the first half of a + * UTF-16 surrogate pair, so it must be immediately followed by another + * \uXXXX escape. + * + * Loop around to get the low half of the surrogate pair. + */ + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + goto decode; + } else if (utf16_valid_surrogate_low(val)) { + /* + * We found the second half of surrogate pair without the first half; + * this is an invalid encoding. + */ + return SPDK_JSON_PARSE_INVALID; + } + + /* + * Convert Unicode escape (or surrogate pair) to UTF-8 in place. + * + * This is safe (will not write beyond the buffer) because the \uXXXX sequence is 6 bytes + * (or 12 bytes for surrogate pairs), and the longest possible UTF-8 encoding of a + * single codepoint is 4 bytes. 
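+ *
+ * Worked example (editorial note): "\u00e9" consumes 6 input bytes and emits
+ * the 2-byte UTF-8 sequence 0xC3 0xA9; the surrogate pair "\ud83d\ude00"
+ * (U+1F600) consumes 12 input bytes and emits 4 bytes (0xF0 0x9F 0x98 0x80),
+ * so the decoded output never outruns the input that produced it.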
+ */ + if (out) { + rc = utf8_encode_unsafe(out, val); + } else { + rc = utf8_codepoint_len(val); + } + if (rc < 0) { + return SPDK_JSON_PARSE_INVALID; + } + + *strp = str; /* update input pointer */ + return rc; /* return number of bytes decoded */ +} + +static int +json_decode_string_escape_twochar(uint8_t **strp, uint8_t *buf_end, uint8_t *out) +{ + static const uint8_t escapes[256] = { + ['b'] = '\b', + ['f'] = '\f', + ['n'] = '\n', + ['r'] = '\r', + ['t'] = '\t', + ['/'] = '/', + ['"'] = '"', + ['\\'] = '\\', + }; + uint8_t *str = *strp; + uint8_t c; + + assert(buf_end > str); + if (buf_end - str < 2) { + return SPDK_JSON_PARSE_INCOMPLETE; + } + + assert(str[0] == '\\'); + + c = escapes[str[1]]; + if (c) { + if (out) { + *out = c; + } + *strp += 2; /* consumed two bytes */ + return 1; /* produced one byte */ + } + + return SPDK_JSON_PARSE_INVALID; +} + +/* + * Decode JSON string backslash escape. + * \param strp pointer to pointer to first character of escape (the backslash). + * *strp is also advanced to indicate how much input was consumed. + * + * \return Number of bytes appended to out + */ +static int +json_decode_string_escape(uint8_t **strp, uint8_t *buf_end, uint8_t *out) +{ + int rc; + + rc = json_decode_string_escape_twochar(strp, buf_end, out); + if (rc > 0) { + return rc; + } + + return json_decode_string_escape_unicode(strp, buf_end, out); +} + +/* + * Decode JSON string in place. + * + * \param str_start Pointer to the beginning of the string (the opening " character). + * + * \return Number of bytes in decoded string (beginning from start). + */ +static int +json_decode_string(uint8_t *str_start, uint8_t *buf_end, uint8_t **str_end, uint32_t flags) +{ + uint8_t *str = str_start; + uint8_t *out = str_start + 1; /* Decode string in place (skip the initial quote) */ + int rc; + + if (buf_end - str_start < 2) { + /* + * Shortest valid string (the empty string) is two bytes (""), + * so this can't possibly be valid + */ + *str_end = str; + return SPDK_JSON_PARSE_INCOMPLETE; + } + + if (*str++ != '"') { + *str_end = str; + return SPDK_JSON_PARSE_INVALID; + } + + while (str < buf_end) { + if (str[0] == '"') { + /* + * End of string. + * Update str_end to point at next input byte and return output length. + */ + *str_end = str + 1; + return out - str_start - 1; + } else if (str[0] == '\\') { + rc = json_decode_string_escape(&str, buf_end, + flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE ? out : NULL); + assert(rc != 0); + if (rc < 0) { + *str_end = str; + return rc; + } + out += rc; + } else if (str[0] <= 0x1f) { + /* control characters must be escaped */ + *str_end = str; + return SPDK_JSON_PARSE_INVALID; + } else { + rc = utf8_valid(str, buf_end); + if (rc == 0) { + *str_end = str; + return SPDK_JSON_PARSE_INCOMPLETE; + } else if (rc < 0) { + *str_end = str; + return SPDK_JSON_PARSE_INVALID; + } + + if (out && out != str && (flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE)) { + memmove(out, str, rc); + } + out += rc; + str += rc; + } + } + + /* If execution gets here, we ran out of buffer. 
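An SPDK_JSON_PARSE_INCOMPLETE result means the bytes scanned so far form a valid prefix, so the caller may retry once more data arrives, in contrast to SPDK_JSON_PARSE_INVALID, which is a hard error (editorial note).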
*/ + *str_end = str; + return SPDK_JSON_PARSE_INCOMPLETE; +} + +static int +json_valid_number(uint8_t *start, uint8_t *buf_end) +{ + uint8_t *p = start; + uint8_t c; + + if (p >= buf_end) { return -1; } + + c = *p++; + if (c >= '1' && c <= '9') { goto num_int_digits; } + if (c == '0') { goto num_frac_or_exp; } + if (c == '-') { goto num_int_first_digit; } + p--; + goto done_invalid; + +num_int_first_digit: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c == '0') { goto num_frac_or_exp; } + if (c >= '1' && c <= '9') { goto num_int_digits; } + p--; + } + goto done_invalid; + +num_int_digits: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_int_digits; } + if (c == '.') { goto num_frac_first_digit; } + if (c == 'e' || c == 'E') { goto num_exp_sign; } + p--; + } + goto done_valid; + +num_frac_or_exp: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c == '.') { goto num_frac_first_digit; } + if (c == 'e' || c == 'E') { goto num_exp_sign; } + p--; + } + goto done_valid; + +num_frac_first_digit: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_frac_digits; } + p--; + } + goto done_invalid; + +num_frac_digits: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_frac_digits; } + if (c == 'e' || c == 'E') { goto num_exp_sign; } + p--; + } + goto done_valid; + +num_exp_sign: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_exp_digits; } + if (c == '-' || c == '+') { goto num_exp_first_digit; } + p--; + } + goto done_invalid; + +num_exp_first_digit: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_exp_digits; } + p--; + } + goto done_invalid; + +num_exp_digits: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_exp_digits; } + p--; + } + goto done_valid; + +done_valid: + /* Valid end state */ + return p - start; + +done_invalid: + /* Invalid end state */ + if (p == buf_end) { + /* Hit the end of the buffer - the stream is incomplete. */ + return SPDK_JSON_PARSE_INCOMPLETE; + } + + /* Found an invalid character in an invalid end state */ + return SPDK_JSON_PARSE_INVALID; +} + +static int +json_valid_comment(const uint8_t *start, const uint8_t *buf_end) +{ + const uint8_t *p = start; + bool multiline; + + assert(buf_end > p); + if (buf_end - p < 2) { + return SPDK_JSON_PARSE_INCOMPLETE; + } + + if (p[0] != '/') { + return SPDK_JSON_PARSE_INVALID; + } + if (p[1] == '*') { + multiline = true; + } else if (p[1] == '/') { + multiline = false; + } else { + return SPDK_JSON_PARSE_INVALID; + } + p += 2; + + if (multiline) { + while (p != buf_end - 1) { + if (p[0] == '*' && p[1] == '/') { + /* Include the terminating star and slash in the comment */ + return p - start + 2; + } + p++; + } + } else { + while (p != buf_end) { + if (*p == '\r' || *p == '\n') { + /* Do not include the line terminator in the comment */ + return p - start; + } + p++; + } + } + + return SPDK_JSON_PARSE_INCOMPLETE; +} + +struct json_literal { + enum spdk_json_val_type type; + uint32_t len; + uint8_t str[8]; +}; + +/* + * JSON only defines 3 possible literals; they can be uniquely identified by bits + * 3 and 4 of the first character: + * 'f' = 0b11[00]110 + * 'n' = 0b11[01]110 + * 't' = 0b11[10]100 + * These two bits can be used as an index into the g_json_literals array. 
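+ *
+ * Worked out (editorial note): ('f' >> 3) & 3 == 0, ('n' >> 3) & 3 == 1 and
+ * ('t' >> 3) & 3 == 2, matching the order of the entries below, which is why
+ * the parser can index the table with
+ *   lit = &g_json_literals[(c >> 3) & 3];
+ * and then confirm the match with a single memcmp() of the full literal.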
+ */ +static const struct json_literal g_json_literals[] = { + {SPDK_JSON_VAL_FALSE, 5, "false"}, + {SPDK_JSON_VAL_NULL, 4, "null"}, + {SPDK_JSON_VAL_TRUE, 4, "true"}, + {} +}; + +static int +match_literal(const uint8_t *start, const uint8_t *end, const uint8_t *literal, size_t len) +{ + assert(end >= start); + if ((size_t)(end - start) < len) { + return SPDK_JSON_PARSE_INCOMPLETE; + } + + if (memcmp(start, literal, len) != 0) { + return SPDK_JSON_PARSE_INVALID; + } + + return len; +} + +ssize_t +spdk_json_parse(void *json, size_t size, struct spdk_json_val *values, size_t num_values, + void **end, uint32_t flags) +{ + uint8_t *json_end = json + size; + enum spdk_json_val_type containers[SPDK_JSON_MAX_NESTING_DEPTH]; + size_t con_value[SPDK_JSON_MAX_NESTING_DEPTH]; + enum spdk_json_val_type con_type = SPDK_JSON_VAL_INVALID; + bool trailing_comma = false; + size_t depth = 0; /* index into containers */ + size_t cur_value = 0; /* index into values */ + size_t con_start_value; + uint8_t *data = json; + uint8_t *new_data; + int rc = 0; + const struct json_literal *lit; + enum { + STATE_VALUE, /* initial state */ + STATE_VALUE_SEPARATOR, /* value separator (comma) */ + STATE_NAME, /* "name": value */ + STATE_NAME_SEPARATOR, /* colon */ + STATE_END, /* parsed the complete value, so only whitespace is valid */ + } state = STATE_VALUE; + +#define ADD_VALUE(t, val_start_ptr, val_end_ptr) \ + if (values && cur_value < num_values) { \ + values[cur_value].type = t; \ + values[cur_value].start = val_start_ptr; \ + values[cur_value].len = val_end_ptr - val_start_ptr; \ + } \ + cur_value++ + + while (data < json_end) { + uint8_t c = *data; + + switch (c) { + case ' ': + case '\t': + case '\r': + case '\n': + /* Whitespace is allowed between any tokens. */ + data++; + break; + + case 't': + case 'f': + case 'n': + /* true, false, or null */ + if (state != STATE_VALUE) { goto done_invalid; } + lit = &g_json_literals[(c >> 3) & 3]; /* See comment above g_json_literals[] */ + assert(lit->str[0] == c); + rc = match_literal(data, json_end, lit->str, lit->len); + if (rc < 0) { goto done_rc; } + ADD_VALUE(lit->type, data, data + rc); + data += rc; + state = depth ? STATE_VALUE_SEPARATOR : STATE_END; + trailing_comma = false; + break; + + case '"': + if (state != STATE_VALUE && state != STATE_NAME) { goto done_invalid; } + rc = json_decode_string(data, json_end, &new_data, flags); + if (rc < 0) { + data = new_data; + goto done_rc; + } + /* + * Start is data + 1 to skip initial quote. + * Length is data + rc - 1 to skip both quotes. + */ + ADD_VALUE(state == STATE_VALUE ? SPDK_JSON_VAL_STRING : SPDK_JSON_VAL_NAME, + data + 1, data + rc - 1); + data = new_data; + if (state == STATE_NAME) { + state = STATE_NAME_SEPARATOR; + } else { + state = depth ? STATE_VALUE_SEPARATOR : STATE_END; + } + trailing_comma = false; + break; + + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (state != STATE_VALUE) { goto done_invalid; } + rc = json_valid_number(data, json_end); + if (rc < 0) { goto done_rc; } + ADD_VALUE(SPDK_JSON_VAL_NUMBER, data, data + rc); + data += rc; + state = depth ? 
STATE_VALUE_SEPARATOR : STATE_END; + trailing_comma = false; + break; + + case '{': + case '[': + if (state != STATE_VALUE) { goto done_invalid; } + if (depth == SPDK_JSON_MAX_NESTING_DEPTH) { + rc = SPDK_JSON_PARSE_MAX_DEPTH_EXCEEDED; + goto done_rc; + } + if (c == '{') { + con_type = SPDK_JSON_VAL_OBJECT_BEGIN; + state = STATE_NAME; + } else { + con_type = SPDK_JSON_VAL_ARRAY_BEGIN; + state = STATE_VALUE; + } + con_value[depth] = cur_value; + containers[depth++] = con_type; + ADD_VALUE(con_type, data, data + 1); + data++; + trailing_comma = false; + break; + + case '}': + case ']': + if (trailing_comma) { goto done_invalid; } + if (depth == 0) { goto done_invalid; } + con_type = containers[--depth]; + con_start_value = con_value[depth]; + if (values && con_start_value < num_values) { + values[con_start_value].len = cur_value - con_start_value - 1; + } + if (c == '}') { + if (state != STATE_NAME && state != STATE_VALUE_SEPARATOR) { + goto done_invalid; + } + if (con_type != SPDK_JSON_VAL_OBJECT_BEGIN) { + goto done_invalid; + } + ADD_VALUE(SPDK_JSON_VAL_OBJECT_END, data, data + 1); + } else { + if (state != STATE_VALUE && state != STATE_VALUE_SEPARATOR) { + goto done_invalid; + } + if (con_type != SPDK_JSON_VAL_ARRAY_BEGIN) { + goto done_invalid; + } + ADD_VALUE(SPDK_JSON_VAL_ARRAY_END, data, data + 1); + } + con_type = depth == 0 ? SPDK_JSON_VAL_INVALID : containers[depth - 1]; + data++; + state = depth ? STATE_VALUE_SEPARATOR : STATE_END; + trailing_comma = false; + break; + + case ',': + if (state != STATE_VALUE_SEPARATOR) { goto done_invalid; } + data++; + assert(con_type == SPDK_JSON_VAL_ARRAY_BEGIN || + con_type == SPDK_JSON_VAL_OBJECT_BEGIN); + state = con_type == SPDK_JSON_VAL_ARRAY_BEGIN ? STATE_VALUE : STATE_NAME; + trailing_comma = true; + break; + + case ':': + if (state != STATE_NAME_SEPARATOR) { goto done_invalid; } + data++; + state = STATE_VALUE; + break; + + case '/': + if (!(flags & SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS)) { + goto done_invalid; + } + rc = json_valid_comment(data, json_end); + if (rc < 0) { goto done_rc; } + /* Skip over comment */ + data += rc; + break; + + default: + goto done_invalid; + } + + if (state == STATE_END) { + break; + } + } + + if (state == STATE_END) { + /* Skip trailing whitespace */ + while (data < json_end) { + uint8_t c = *data; + + if (c == ' ' || c == '\t' || c == '\r' || c == '\n') { + data++; + } else { + break; + } + } + + /* + * These asserts are just for sanity checking - they are guaranteed by the allowed + * state transitions. + */ + assert(depth == 0); + assert(trailing_comma == false); + assert(data <= json_end); + if (end) { + *end = data; + } + return cur_value; + } + + /* Invalid end state - ran out of data */ + rc = SPDK_JSON_PARSE_INCOMPLETE; + +done_rc: + assert(rc < 0); + if (end) { + *end = data; + } + return rc; + +done_invalid: + rc = SPDK_JSON_PARSE_INVALID; + goto done_rc; +} diff --git a/src/spdk/lib/json/json_util.c b/src/spdk/lib/json/json_util.c new file mode 100644 index 000000000..18d751047 --- /dev/null +++ b/src/spdk/lib/json/json_util.c @@ -0,0 +1,653 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/json.h" + +#include "spdk_internal/utf.h" +#include "spdk_internal/log.h" + +#define SPDK_JSON_DEBUG(...) SPDK_DEBUGLOG(SPDK_LOG_JSON, __VA_ARGS__) + +size_t +spdk_json_val_len(const struct spdk_json_val *val) +{ + if (val == NULL) { + return 0; + } + + if (val->type == SPDK_JSON_VAL_ARRAY_BEGIN || val->type == SPDK_JSON_VAL_OBJECT_BEGIN) { + return val->len + 2; + } + + return 1; +} + +bool +spdk_json_strequal(const struct spdk_json_val *val, const char *str) +{ + size_t len; + + if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NAME) { + return false; + } + + len = strlen(str); + if (val->len != len) { + return false; + } + + return memcmp(val->start, str, len) == 0; +} + +char * +spdk_json_strdup(const struct spdk_json_val *val) +{ + size_t len; + char *s; + + if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NAME) { + return NULL; + } + + len = val->len; + + if (memchr(val->start, '\0', len)) { + /* String contains embedded NUL, so it is not a valid C string. 
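Such a value can legitimately appear in parsed JSON (e.g. the escape \u0000 decodes to a 0x00 byte), but it cannot be represented as a NUL-terminated string, hence NULL (editorial note).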
*/ + return NULL; + } + + s = malloc(len + 1); + if (s == NULL) { + return s; + } + + memcpy(s, val->start, len); + s[len] = '\0'; + + return s; +} + +struct spdk_json_num { + bool negative; + uint64_t significand; + int64_t exponent; +}; + +static int +json_number_split(const struct spdk_json_val *val, struct spdk_json_num *num) +{ + const char *iter; + size_t remaining; + uint64_t *pval; + uint64_t frac_digits = 0; + uint64_t exponent_u64 = 0; + bool exponent_negative = false; + enum { + NUM_STATE_INT, + NUM_STATE_FRAC, + NUM_STATE_EXP, + } state; + + memset(num, 0, sizeof(*num)); + + if (val->type != SPDK_JSON_VAL_NUMBER) { + return -EINVAL; + } + + remaining = val->len; + if (remaining == 0) { + return -EINVAL; + } + + iter = val->start; + if (*iter == '-') { + num->negative = true; + iter++; + remaining--; + } + + state = NUM_STATE_INT; + pval = &num->significand; + while (remaining--) { + char c = *iter++; + + if (c == '.') { + state = NUM_STATE_FRAC; + } else if (c == 'e' || c == 'E') { + state = NUM_STATE_EXP; + pval = &exponent_u64; + } else if (c == '-') { + assert(state == NUM_STATE_EXP); + exponent_negative = true; + } else if (c == '+') { + assert(state == NUM_STATE_EXP); + /* exp_negative = false; */ /* already false by default */ + } else { + uint64_t new_val; + + assert(c >= '0' && c <= '9'); + new_val = *pval * 10 + c - '0'; + if (new_val < *pval) { + return -ERANGE; + } + + if (state == NUM_STATE_FRAC) { + frac_digits++; + } + + *pval = new_val; + } + } + + if (exponent_negative) { + if (exponent_u64 > 9223372036854775808ULL) { /* abs(INT64_MIN) */ + return -ERANGE; + } + num->exponent = (int64_t) - exponent_u64; + } else { + if (exponent_u64 > INT64_MAX) { + return -ERANGE; + } + num->exponent = exponent_u64; + } + num->exponent -= frac_digits; + + /* Apply as much of the exponent as possible without overflow or truncation */ + if (num->exponent < 0) { + while (num->exponent && num->significand >= 10 && num->significand % 10 == 0) { + num->significand /= 10; + num->exponent++; + } + } else { /* positive exponent */ + while (num->exponent) { + uint64_t new_val = num->significand * 10; + + if (new_val < num->significand) { + break; + } + + num->significand = new_val; + num->exponent--; + } + } + + return 0; +} + +int +spdk_json_number_to_uint16(const struct spdk_json_val *val, uint16_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent || split_num.negative) { + return -ERANGE; + } + + if (split_num.significand > UINT16_MAX) { + return -ERANGE; + } + *num = (uint16_t)split_num.significand; + return 0; +} + +int +spdk_json_number_to_int32(const struct spdk_json_val *val, int32_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent) { + return -ERANGE; + } + + if (split_num.negative) { + if (split_num.significand > 2147483648) { /* abs(INT32_MIN) */ + return -ERANGE; + } + *num = (int32_t) - (int64_t)split_num.significand; + return 0; + } + + /* positive */ + if (split_num.significand > INT32_MAX) { + return -ERANGE; + } + *num = (int32_t)split_num.significand; + return 0; +} + +int +spdk_json_number_to_uint32(const struct spdk_json_val *val, uint32_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent || split_num.negative) { + return -ERANGE; + } + + if 
(split_num.significand > UINT32_MAX) { + return -ERANGE; + } + *num = (uint32_t)split_num.significand; + return 0; +} + +int +spdk_json_number_to_uint64(const struct spdk_json_val *val, uint64_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent || split_num.negative) { + return -ERANGE; + } + + *num = split_num.significand; + return 0; +} + +int +spdk_json_decode_object(const struct spdk_json_val *values, + const struct spdk_json_object_decoder *decoders, size_t num_decoders, void *out) +{ + uint32_t i; + bool invalid = false; + size_t decidx; + bool *seen; + + if (values == NULL || values->type != SPDK_JSON_VAL_OBJECT_BEGIN) { + return -1; + } + + seen = calloc(sizeof(bool), num_decoders); + if (seen == NULL) { + return -1; + } + + for (i = 0; i < values->len;) { + const struct spdk_json_val *name = &values[i + 1]; + const struct spdk_json_val *v = &values[i + 2]; + bool found = false; + + for (decidx = 0; decidx < num_decoders; decidx++) { + const struct spdk_json_object_decoder *dec = &decoders[decidx]; + if (spdk_json_strequal(name, dec->name)) { + void *field = (void *)((uintptr_t)out + dec->offset); + + found = true; + + if (seen[decidx]) { + /* duplicate field name */ + invalid = true; + SPDK_JSON_DEBUG("Duplicate key '%s'\n", dec->name); + } else { + seen[decidx] = true; + if (dec->decode_func(v, field)) { + invalid = true; + SPDK_JSON_DEBUG("Decoder failed to decode key '%s'\n", dec->name); + /* keep going to fill out any other valid keys */ + } + } + break; + } + } + + if (!found) { + invalid = true; + SPDK_JSON_DEBUG("Decoder not found for key '%.*s'\n", name->len, (char *)name->start); + } + + i += 1 + spdk_json_val_len(v); + } + + for (decidx = 0; decidx < num_decoders; decidx++) { + if (!decoders[decidx].optional && !seen[decidx]) { + /* required field is missing */ + invalid = true; + break; + } + } + + free(seen); + return invalid ? 
-1 : 0; +} + +int +spdk_json_decode_array(const struct spdk_json_val *values, spdk_json_decode_fn decode_func, + void *out, size_t max_size, size_t *out_size, size_t stride) +{ + uint32_t i; + char *field; + char *out_end; + + if (values == NULL || values->type != SPDK_JSON_VAL_ARRAY_BEGIN) { + return -1; + } + + *out_size = 0; + field = out; + out_end = field + max_size * stride; + for (i = 0; i < values->len;) { + const struct spdk_json_val *v = &values[i + 1]; + + if (field == out_end) { + return -1; + } + + if (decode_func(v, field)) { + return -1; + } + + i += spdk_json_val_len(v); + field += stride; + (*out_size)++; + } + + return 0; +} + +int +spdk_json_decode_bool(const struct spdk_json_val *val, void *out) +{ + bool *f = out; + + if (val->type != SPDK_JSON_VAL_TRUE && val->type != SPDK_JSON_VAL_FALSE) { + return -1; + } + + *f = val->type == SPDK_JSON_VAL_TRUE; + return 0; +} + +int +spdk_json_decode_uint16(const struct spdk_json_val *val, void *out) +{ + uint16_t *i = out; + + return spdk_json_number_to_uint16(val, i); +} + +int +spdk_json_decode_int32(const struct spdk_json_val *val, void *out) +{ + int32_t *i = out; + + return spdk_json_number_to_int32(val, i); +} + +int +spdk_json_decode_uint32(const struct spdk_json_val *val, void *out) +{ + uint32_t *i = out; + + return spdk_json_number_to_uint32(val, i); +} + +int +spdk_json_decode_uint64(const struct spdk_json_val *val, void *out) +{ + uint64_t *i = out; + + return spdk_json_number_to_uint64(val, i); +} + +int +spdk_json_decode_string(const struct spdk_json_val *val, void *out) +{ + char **s = out; + + free(*s); + + *s = spdk_json_strdup(val); + + if (*s) { + return 0; + } else { + return -1; + } +} + +static struct spdk_json_val * +json_first(struct spdk_json_val *object, enum spdk_json_val_type type) +{ + /* 'object' must be JSON object or array. 'type' might be combination of these two. */ + assert((type & (SPDK_JSON_VAL_ARRAY_BEGIN | SPDK_JSON_VAL_OBJECT_BEGIN)) != 0); + + assert(object != NULL); + + if ((object->type & type) == 0) { + return NULL; + } + + object++; + if (object->len == 0) { + return NULL; + } + + return object; +} + +static struct spdk_json_val * +json_value(struct spdk_json_val *key) +{ + return key->type == SPDK_JSON_VAL_NAME ? key + 1 : NULL; +} + +int +spdk_json_find(struct spdk_json_val *object, const char *key_name, struct spdk_json_val **key, + struct spdk_json_val **val, enum spdk_json_val_type type) +{ + struct spdk_json_val *_key = NULL; + struct spdk_json_val *_val = NULL; + struct spdk_json_val *it; + + assert(object != NULL); + + for (it = json_first(object, SPDK_JSON_VAL_ARRAY_BEGIN | SPDK_JSON_VAL_OBJECT_BEGIN); + it != NULL; + it = spdk_json_next(it)) { + if (it->type != SPDK_JSON_VAL_NAME) { + continue; + } + + if (spdk_json_strequal(it, key_name) != true) { + continue; + } + + if (_key) { + SPDK_JSON_DEBUG("Duplicate key '%s'", key_name); + return -EINVAL; + } + + _key = it; + _val = json_value(_key); + + if (type != SPDK_JSON_VAL_INVALID && (_val->type & type) == 0) { + SPDK_JSON_DEBUG("key '%s' type is %#x but expected one of %#x\n", key_name, _val->type, type); + return -EDOM; + } + } + + if (key) { + *key = _key; + } + + if (val) { + *val = _val; + } + + return _val ? 
0 : -ENOENT; +} + +int +spdk_json_find_string(struct spdk_json_val *object, const char *key_name, + struct spdk_json_val **key, struct spdk_json_val **val) +{ + return spdk_json_find(object, key_name, key, val, SPDK_JSON_VAL_STRING); +} + +int +spdk_json_find_array(struct spdk_json_val *object, const char *key_name, + struct spdk_json_val **key, struct spdk_json_val **val) +{ + return spdk_json_find(object, key_name, key, val, SPDK_JSON_VAL_ARRAY_BEGIN); +} + +struct spdk_json_val * +spdk_json_object_first(struct spdk_json_val *object) +{ + struct spdk_json_val *first = json_first(object, SPDK_JSON_VAL_OBJECT_BEGIN); + + /* Empty object? */ + return first && first->type != SPDK_JSON_VAL_OBJECT_END ? first : NULL; +} + +struct spdk_json_val * +spdk_json_array_first(struct spdk_json_val *array_begin) +{ + struct spdk_json_val *first = json_first(array_begin, SPDK_JSON_VAL_ARRAY_BEGIN); + + /* Empty array? */ + return first && first->type != SPDK_JSON_VAL_ARRAY_END ? first : NULL; +} + +static struct spdk_json_val * +json_skip_object_or_array(struct spdk_json_val *val) +{ + unsigned lvl; + enum spdk_json_val_type end_type; + struct spdk_json_val *it; + + if (val->type == SPDK_JSON_VAL_OBJECT_BEGIN) { + end_type = SPDK_JSON_VAL_OBJECT_END; + } else if (val->type == SPDK_JSON_VAL_ARRAY_BEGIN) { + end_type = SPDK_JSON_VAL_ARRAY_END; + } else { + SPDK_JSON_DEBUG("Expected JSON object (%#x) or array (%#x) but got %#x\n", + SPDK_JSON_VAL_OBJECT_BEGIN, SPDK_JSON_VAL_ARRAY_BEGIN, val->type); + return NULL; + } + + lvl = 1; + for (it = val + 1; it->type != SPDK_JSON_VAL_INVALID && lvl != 0; it++) { + if (it->type == val->type) { + lvl++; + } else if (it->type == end_type) { + lvl--; + } + } + + /* if lvl != 0 we have invalid JSON object */ + if (lvl != 0) { + SPDK_JSON_DEBUG("Can't find end of object (type: %#x): lvl (%u) != 0)\n", val->type, lvl); + it = NULL; + } + + return it; +} + +struct spdk_json_val * +spdk_json_next(struct spdk_json_val *it) +{ + struct spdk_json_val *val, *next; + + switch (it->type) { + case SPDK_JSON_VAL_NAME: + val = json_value(it); + next = spdk_json_next(val); + break; + + /* We are in the middle of an array - get to next entry */ + case SPDK_JSON_VAL_NULL: + case SPDK_JSON_VAL_TRUE: + case SPDK_JSON_VAL_FALSE: + case SPDK_JSON_VAL_NUMBER: + case SPDK_JSON_VAL_STRING: + val = it + 1; + return val; + + case SPDK_JSON_VAL_ARRAY_BEGIN: + case SPDK_JSON_VAL_OBJECT_BEGIN: + next = json_skip_object_or_array(it); + break; + + /* Can't go to the next object if started from the end of array or object */ + case SPDK_JSON_VAL_ARRAY_END: + case SPDK_JSON_VAL_OBJECT_END: + case SPDK_JSON_VAL_INVALID: + return NULL; + default: + assert(false); + return NULL; + + } + + /* EOF ? */ + if (next == NULL) { + return NULL; + } + + switch (next->type) { + case SPDK_JSON_VAL_ARRAY_END: + case SPDK_JSON_VAL_OBJECT_END: + case SPDK_JSON_VAL_INVALID: + return NULL; + default: + /* Next value */ + return next; + } +} + +SPDK_LOG_REGISTER_COMPONENT("json_util", SPDK_LOG_JSON) diff --git a/src/spdk/lib/json/json_write.c b/src/spdk/lib/json/json_write.c new file mode 100644 index 000000000..7e9fbb5c3 --- /dev/null +++ b/src/spdk/lib/json/json_write.c @@ -0,0 +1,687 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/json.h" + +#include "spdk_internal/utf.h" + +struct spdk_json_write_ctx { + spdk_json_write_cb write_cb; + void *cb_ctx; + uint32_t flags; + uint32_t indent; + bool new_indent; + bool first_value; + bool failed; + size_t buf_filled; + uint8_t buf[4096]; +}; + +static int emit_buf_full(struct spdk_json_write_ctx *w, const void *data, size_t size); + +static int +fail(struct spdk_json_write_ctx *w) +{ + w->failed = true; + return -1; +} + +static int +flush_buf(struct spdk_json_write_ctx *w) +{ + int rc; + + rc = w->write_cb(w->cb_ctx, w->buf, w->buf_filled); + if (rc != 0) { + return fail(w); + } + + w->buf_filled = 0; + + return 0; +} + +struct spdk_json_write_ctx * +spdk_json_write_begin(spdk_json_write_cb write_cb, void *cb_ctx, uint32_t flags) +{ + struct spdk_json_write_ctx *w; + + w = calloc(1, sizeof(*w)); + if (w == NULL) { + return w; + } + + w->write_cb = write_cb; + w->cb_ctx = cb_ctx; + w->flags = flags; + w->indent = 0; + w->new_indent = false; + w->first_value = true; + w->failed = false; + w->buf_filled = 0; + + return w; +} + +int +spdk_json_write_end(struct spdk_json_write_ctx *w) +{ + bool failed; + int rc; + + if (w == NULL) { + return 0; + } + + failed = w->failed; + + rc = flush_buf(w); + if (rc != 0) { + failed = true; + } + + free(w); + + return failed ? -1 : 0; +} + +static inline int +emit(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + size_t buf_remain = sizeof(w->buf) - w->buf_filled; + + if (spdk_unlikely(size > buf_remain)) { + /* Not enough space in buffer for the new data. */ + return emit_buf_full(w, data, size); + } + + /* Copy the new data into buf. */ + memcpy(w->buf + w->buf_filled, data, size); + w->buf_filled += size; + return 0; +} + +static int +emit_buf_full(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + size_t buf_remain = sizeof(w->buf) - w->buf_filled; + int rc; + + assert(size > buf_remain); + + /* Copy as much of the new data as possible into the buffer and flush it. 
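Whatever does not fit is handed back to emit(), which either buffers it or lands here again; e.g. a 10000-byte write into the empty 4096-byte buffer flushes twice and leaves 1808 bytes buffered (editorial note).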
*/ + memcpy(w->buf + w->buf_filled, data, buf_remain); + w->buf_filled += buf_remain; + + rc = flush_buf(w); + if (rc != 0) { + return fail(w); + } + + /* Recurse to emit the rest of the data. */ + return emit(w, data + buf_remain, size - buf_remain); +} + +static int +emit_fmt(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + if (w->flags & SPDK_JSON_WRITE_FLAG_FORMATTED) { + return emit(w, data, size); + } + return 0; +} + +static int +emit_indent(struct spdk_json_write_ctx *w) +{ + uint32_t i; + + if (w->flags & SPDK_JSON_WRITE_FLAG_FORMATTED) { + for (i = 0; i < w->indent; i++) { + if (emit(w, " ", 2)) { return fail(w); } + } + } + return 0; +} + +static int +begin_value(struct spdk_json_write_ctx *w) +{ + /* TODO: check for value state */ + if (w->new_indent) { + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + if (!w->first_value) { + if (emit(w, ",", 1)) { return fail(w); } + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + w->first_value = false; + w->new_indent = false; + return 0; +} + +int +spdk_json_write_val_raw(struct spdk_json_write_ctx *w, const void *data, size_t len) +{ + if (begin_value(w)) { return fail(w); } + return emit(w, data, len); +} + +int +spdk_json_write_null(struct spdk_json_write_ctx *w) +{ + if (begin_value(w)) { return fail(w); } + return emit(w, "null", 4); +} + +int +spdk_json_write_bool(struct spdk_json_write_ctx *w, bool val) +{ + if (begin_value(w)) { return fail(w); } + if (val) { + return emit(w, "true", 4); + } else { + return emit(w, "false", 5); + } +} + +int +spdk_json_write_int32(struct spdk_json_write_ctx *w, int32_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRId32, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +int +spdk_json_write_uint32(struct spdk_json_write_ctx *w, uint32_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRIu32, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +int +spdk_json_write_int64(struct spdk_json_write_ctx *w, int64_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRId64, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +int +spdk_json_write_uint64(struct spdk_json_write_ctx *w, uint64_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRIu64, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +static void +write_hex_4(void *dest, uint16_t val) +{ + uint8_t *p = dest; + char hex[] = "0123456789ABCDEF"; + + p[0] = hex[(val >> 12)]; + p[1] = hex[(val >> 8) & 0xF]; + p[2] = hex[(val >> 4) & 0xF]; + p[3] = hex[val & 0xF]; +} + +static inline int +write_codepoint(struct spdk_json_write_ctx *w, uint32_t codepoint) +{ + static const uint8_t escapes[] = { + ['\b'] = 'b', + ['\f'] = 'f', + ['\n'] = 'n', + ['\r'] = 'r', + ['\t'] = 't', + ['"'] = '"', + ['\\'] = '\\', + /* + * Forward slash (/) is intentionally not converted to an escape + * (it is valid unescaped). 
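+ * Control characters that are not listed here are emitted as \uXXXX escapes below.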
+ */ + }; + uint16_t high, low; + char out[13]; + size_t out_len; + + if (codepoint < sizeof(escapes) && escapes[codepoint]) { + out[0] = '\\'; + out[1] = escapes[codepoint]; + out_len = 2; + } else if (codepoint >= 0x20 && codepoint < 0x7F) { + /* + * Encode plain ASCII directly (except 0x7F, since it is really + * a control character, despite the JSON spec not considering it one). + */ + out[0] = (uint8_t)codepoint; + out_len = 1; + } else if (codepoint < 0x10000) { + out[0] = '\\'; + out[1] = 'u'; + write_hex_4(&out[2], (uint16_t)codepoint); + out_len = 6; + } else { + utf16_encode_surrogate_pair(codepoint, &high, &low); + out[0] = '\\'; + out[1] = 'u'; + write_hex_4(&out[2], high); + out[6] = '\\'; + out[7] = 'u'; + write_hex_4(&out[8], low); + out_len = 12; + } + + return emit(w, out, out_len); +} + +static int +write_string_or_name(struct spdk_json_write_ctx *w, const char *val, size_t len) +{ + const uint8_t *p = val; + const uint8_t *end = val + len; + + if (emit(w, "\"", 1)) { return fail(w); } + + while (p != end) { + int codepoint_len; + uint32_t codepoint; + + codepoint_len = utf8_valid(p, end); + switch (codepoint_len) { + case 1: + codepoint = utf8_decode_unsafe_1(p); + break; + case 2: + codepoint = utf8_decode_unsafe_2(p); + break; + case 3: + codepoint = utf8_decode_unsafe_3(p); + break; + case 4: + codepoint = utf8_decode_unsafe_4(p); + break; + default: + return fail(w); + } + + if (write_codepoint(w, codepoint)) { return fail(w); } + p += codepoint_len; + } + + return emit(w, "\"", 1); +} + +static int +write_string_or_name_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len) +{ + const uint16_t *p = val; + const uint16_t *end = val + len; + + if (emit(w, "\"", 1)) { return fail(w); } + + while (p != end) { + int codepoint_len; + uint32_t codepoint; + + codepoint_len = utf16le_valid(p, end); + switch (codepoint_len) { + case 1: + codepoint = from_le16(&p[0]); + break; + case 2: + codepoint = utf16_decode_surrogate_pair(from_le16(&p[0]), from_le16(&p[1])); + break; + default: + return fail(w); + } + + if (write_codepoint(w, codepoint)) { return fail(w); } + p += codepoint_len; + } + + return emit(w, "\"", 1); +} + +int +spdk_json_write_string_raw(struct spdk_json_write_ctx *w, const char *val, size_t len) +{ + if (begin_value(w)) { return fail(w); } + return write_string_or_name(w, val, len); +} + +int +spdk_json_write_string(struct spdk_json_write_ctx *w, const char *val) +{ + return spdk_json_write_string_raw(w, val, strlen(val)); +} + +int +spdk_json_write_string_utf16le_raw(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len) +{ + if (begin_value(w)) { return fail(w); } + return write_string_or_name_utf16le(w, val, len); +} + +int +spdk_json_write_string_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val) +{ + const uint16_t *p; + size_t len; + + for (len = 0, p = val; *p; p++) { + len++; + } + + return spdk_json_write_string_utf16le_raw(w, val, len); +} + +int +spdk_json_write_string_fmt(struct spdk_json_write_ctx *w, const char *fmt, ...) 
+{ + va_list args; + int rc; + + va_start(args, fmt); + rc = spdk_json_write_string_fmt_v(w, fmt, args); + va_end(args); + + return rc; +} + +int +spdk_json_write_string_fmt_v(struct spdk_json_write_ctx *w, const char *fmt, va_list args) +{ + char *s; + int rc; + + s = spdk_vsprintf_alloc(fmt, args); + if (s == NULL) { + return -1; + } + + rc = spdk_json_write_string(w, s); + free(s); + return rc; +} + +int +spdk_json_write_array_begin(struct spdk_json_write_ctx *w) +{ + if (begin_value(w)) { return fail(w); } + w->first_value = true; + w->new_indent = true; + w->indent++; + if (emit(w, "[", 1)) { return fail(w); } + return 0; +} + +int +spdk_json_write_array_end(struct spdk_json_write_ctx *w) +{ + w->first_value = false; + if (w->indent == 0) { return fail(w); } + w->indent--; + if (!w->new_indent) { + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + w->new_indent = false; + return emit(w, "]", 1); +} + +int +spdk_json_write_object_begin(struct spdk_json_write_ctx *w) +{ + if (begin_value(w)) { return fail(w); } + w->first_value = true; + w->new_indent = true; + w->indent++; + if (emit(w, "{", 1)) { return fail(w); } + return 0; +} + +int +spdk_json_write_object_end(struct spdk_json_write_ctx *w) +{ + w->first_value = false; + w->indent--; + if (!w->new_indent) { + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + w->new_indent = false; + return emit(w, "}", 1); +} + +int +spdk_json_write_name_raw(struct spdk_json_write_ctx *w, const char *name, size_t len) +{ + /* TODO: check that container is an object */ + if (begin_value(w)) { return fail(w); } + if (write_string_or_name(w, name, len)) { return fail(w); } + w->first_value = true; + if (emit(w, ":", 1)) { return fail(w); } + return emit_fmt(w, " ", 1); +} + +int +spdk_json_write_name(struct spdk_json_write_ctx *w, const char *name) +{ + return spdk_json_write_name_raw(w, name, strlen(name)); +} + +int +spdk_json_write_val(struct spdk_json_write_ctx *w, const struct spdk_json_val *val) +{ + size_t num_values, i; + + switch (val->type) { + case SPDK_JSON_VAL_NUMBER: + return spdk_json_write_val_raw(w, val->start, val->len); + + case SPDK_JSON_VAL_STRING: + return spdk_json_write_string_raw(w, val->start, val->len); + + case SPDK_JSON_VAL_NAME: + return spdk_json_write_name_raw(w, val->start, val->len); + + case SPDK_JSON_VAL_TRUE: + return spdk_json_write_bool(w, true); + + case SPDK_JSON_VAL_FALSE: + return spdk_json_write_bool(w, false); + + case SPDK_JSON_VAL_NULL: + return spdk_json_write_null(w); + + case SPDK_JSON_VAL_ARRAY_BEGIN: + case SPDK_JSON_VAL_OBJECT_BEGIN: + num_values = val[0].len; + + if (val[0].type == SPDK_JSON_VAL_OBJECT_BEGIN) { + if (spdk_json_write_object_begin(w)) { + return fail(w); + } + } else { + if (spdk_json_write_array_begin(w)) { + return fail(w); + } + } + + /* Loop up to and including the _END value */ + for (i = 0; i < num_values + 1;) { + if (spdk_json_write_val(w, &val[i + 1])) { + return fail(w); + } + if (val[i + 1].type == SPDK_JSON_VAL_ARRAY_BEGIN || + val[i + 1].type == SPDK_JSON_VAL_OBJECT_BEGIN) { + i += val[i + 1].len + 2; + } else { + i++; + } + } + return 0; + + case SPDK_JSON_VAL_ARRAY_END: + return spdk_json_write_array_end(w); + + case SPDK_JSON_VAL_OBJECT_END: + return spdk_json_write_object_end(w); + + case SPDK_JSON_VAL_INVALID: + /* Handle INVALID to make the compiler happy (and catch other unhandled types) */ + return fail(w); + } + + return fail(w); +} + +int spdk_json_write_named_null(struct 
spdk_json_write_ctx *w, const char *name) +{ + int rc = spdk_json_write_name(w, name); + return rc ? rc : spdk_json_write_null(w); +} + +int spdk_json_write_named_bool(struct spdk_json_write_ctx *w, const char *name, bool val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_bool(w, val); +} + +int spdk_json_write_named_int32(struct spdk_json_write_ctx *w, const char *name, int32_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_int32(w, val); +} + +int spdk_json_write_named_uint32(struct spdk_json_write_ctx *w, const char *name, uint32_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_uint32(w, val); +} + +int spdk_json_write_named_uint64(struct spdk_json_write_ctx *w, const char *name, uint64_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_uint64(w, val); +} + +int spdk_json_write_named_int64(struct spdk_json_write_ctx *w, const char *name, int64_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_int64(w, val); +} + +int spdk_json_write_named_string(struct spdk_json_write_ctx *w, const char *name, const char *val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_string(w, val); +} + +int spdk_json_write_named_string_fmt(struct spdk_json_write_ctx *w, const char *name, + const char *fmt, ...) +{ + va_list args; + int rc; + + va_start(args, fmt); + rc = spdk_json_write_named_string_fmt_v(w, name, fmt, args); + va_end(args); + + return rc; +} + +int spdk_json_write_named_string_fmt_v(struct spdk_json_write_ctx *w, const char *name, + const char *fmt, va_list args) +{ + char *s; + int rc; + + rc = spdk_json_write_name(w, name); + if (rc) { + return rc; + } + + s = spdk_vsprintf_alloc(fmt, args); + + if (s == NULL) { + return -1; + } + + rc = spdk_json_write_string(w, s); + free(s); + return rc; +} + +int spdk_json_write_named_array_begin(struct spdk_json_write_ctx *w, const char *name) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_array_begin(w); +} + +int spdk_json_write_named_object_begin(struct spdk_json_write_ctx *w, const char *name) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? 
rc : spdk_json_write_object_begin(w); +} diff --git a/src/spdk/lib/json/spdk_json.map b/src/spdk/lib/json/spdk_json.map new file mode 100644 index 000000000..0699feaad --- /dev/null +++ b/src/spdk/lib/json/spdk_json.map @@ -0,0 +1,67 @@ +{ + global: + + # public functions + spdk_json_parse; + spdk_json_decode_object; + spdk_json_decode_array; + spdk_json_decode_bool; + spdk_json_decode_uint16; + spdk_json_decode_int32; + spdk_json_decode_uint32; + spdk_json_decode_uint64; + spdk_json_decode_string; + + spdk_json_val_len; + spdk_json_strequal; + spdk_json_strdup; + + spdk_json_number_to_uint16; + spdk_json_number_to_int32; + spdk_json_number_to_uint32; + spdk_json_number_to_uint64; + + spdk_json_write_begin; + spdk_json_write_end; + spdk_json_write_null; + spdk_json_write_bool; + spdk_json_write_int32; + spdk_json_write_uint32; + spdk_json_write_int64; + spdk_json_write_uint64; + spdk_json_write_string; + spdk_json_write_string_raw; + spdk_json_write_string_utf16le; + spdk_json_write_string_utf16le_raw; + spdk_json_write_string_fmt; + spdk_json_write_string_fmt_v; + spdk_json_write_array_begin; + spdk_json_write_array_end; + spdk_json_write_object_begin; + spdk_json_write_object_end; + spdk_json_write_name; + spdk_json_write_name_raw; + spdk_json_write_val; + spdk_json_write_val_raw; + + spdk_json_write_named_null; + spdk_json_write_named_bool; + spdk_json_write_named_int32; + spdk_json_write_named_uint32; + spdk_json_write_named_uint64; + spdk_json_write_named_int64; + spdk_json_write_named_string; + spdk_json_write_named_string_fmt; + spdk_json_write_named_string_fmt_v; + spdk_json_write_named_array_begin; + spdk_json_write_named_object_begin; + + spdk_json_find; + spdk_json_find_string; + spdk_json_find_array; + spdk_json_object_first; + spdk_json_array_first; + spdk_json_next; + + local: *; +}; diff --git a/src/spdk/lib/jsonrpc/Makefile b/src/spdk/lib/jsonrpc/Makefile new file mode 100644 index 000000000..7eb8dd683 --- /dev/null +++ b/src/spdk/lib/jsonrpc/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +LIBNAME = jsonrpc +C_SRCS = jsonrpc_server.c jsonrpc_server_tcp.c +C_SRCS += jsonrpc_client.c jsonrpc_client_tcp.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_jsonrpc.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/jsonrpc/jsonrpc_client.c b/src/spdk/lib/jsonrpc/jsonrpc_client.c new file mode 100644 index 000000000..e3940a4d4 --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_client.c @@ -0,0 +1,227 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/util.h" +#include "jsonrpc_internal.h" + +static int +capture_version(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (spdk_json_strequal(val, "2.0") != true) { + return SPDK_JSON_PARSE_INVALID; + } + + *vptr = val; + return 0; +} + +static int +capture_id(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NUMBER) { + return -EINVAL; + } + + *vptr = val; + return 0; +} + +static int +capture_any(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + *vptr = val; + return 0; +} + +static const struct spdk_json_object_decoder jsonrpc_response_decoders[] = { + {"jsonrpc", offsetof(struct spdk_jsonrpc_client_response, version), capture_version}, + {"id", offsetof(struct spdk_jsonrpc_client_response, id), capture_id, true}, + {"result", offsetof(struct spdk_jsonrpc_client_response, result), capture_any, true}, + {"error", offsetof(struct spdk_jsonrpc_client_response, error), capture_any, true}, +}; + +int +jsonrpc_parse_response(struct spdk_jsonrpc_client *client) +{ + struct spdk_jsonrpc_client_response_internal *r; + ssize_t rc; + size_t buf_len; + size_t values_cnt; + void *end = NULL; + + + /* Check to see if we have received a full JSON value. */ + rc = spdk_json_parse(client->recv_buf, client->recv_offset, NULL, 0, &end, 0); + if (rc == SPDK_JSON_PARSE_INCOMPLETE) { + return 0; + } + + SPDK_DEBUGLOG(SPDK_LOG_RPC_CLIENT, "JSON string is :\n%s\n", client->recv_buf); + if (rc < 0 || rc > SPDK_JSONRPC_CLIENT_MAX_VALUES) { + SPDK_ERRLOG("JSON parse error (rc: %zd)\n", rc); + /* + * Can't recover from parse error (no guaranteed resync point in streaming JSON). + * Return an error to indicate that the connection should be closed. + */ + return -EINVAL; + } + + values_cnt = rc; + + r = calloc(1, sizeof(*r) + sizeof(struct spdk_json_val) * (values_cnt + 1)); + if (!r) { + return -errno; + } + + if (client->resp) { + free(r); + return -ENOSPC; + } + + client->resp = r; + + r->buf = client->recv_buf; + buf_len = client->recv_offset; + r->values_cnt = values_cnt; + + client->recv_buf_size = 0; + client->recv_offset = 0; + client->recv_buf = NULL; + + /* Decode a second time now that there is a full JSON value available. 
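+ * The first pass above only checked completeness and counted the values; this
+ * pass fills r->values and decodes strings in place inside r->buf, so the
+ * resulting value pointers stay valid until the response is freed.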
*/ + rc = spdk_json_parse(r->buf, buf_len, r->values, values_cnt, &end, + SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); + if (rc != (ssize_t)values_cnt) { + SPDK_ERRLOG("JSON parse error on second pass (rc: %zd, expected: %zu)\n", rc, values_cnt); + goto err; + } + + assert(end != NULL); + + if (r->values[0].type != SPDK_JSON_VAL_OBJECT_BEGIN) { + SPDK_ERRLOG("top-level JSON value was not object\n"); + goto err; + } + + if (spdk_json_decode_object(r->values, jsonrpc_response_decoders, + SPDK_COUNTOF(jsonrpc_response_decoders), &r->jsonrpc)) { + goto err; + } + + r->ready = 1; + return 1; + +err: + client->resp = NULL; + spdk_jsonrpc_client_free_response(&r->jsonrpc); + return -EINVAL; +} + +static int +jsonrpc_client_write_cb(void *cb_ctx, const void *data, size_t size) +{ + struct spdk_jsonrpc_client_request *request = cb_ctx; + size_t new_size = request->send_buf_size; + + while (new_size - request->send_len < size) { + if (new_size >= SPDK_JSONRPC_SEND_BUF_SIZE_MAX) { + SPDK_ERRLOG("Send buf exceeded maximum size (%zu)\n", + (size_t)SPDK_JSONRPC_SEND_BUF_SIZE_MAX); + return -ENOSPC; + } + + new_size *= 2; + } + + if (new_size != request->send_buf_size) { + uint8_t *new_buf; + + new_buf = realloc(request->send_buf, new_size); + if (new_buf == NULL) { + SPDK_ERRLOG("Resizing send_buf failed (current size %zu, new size %zu)\n", + request->send_buf_size, new_size); + return -ENOMEM; + } + + request->send_buf = new_buf; + request->send_buf_size = new_size; + } + + memcpy(request->send_buf + request->send_len, data, size); + request->send_len += size; + + return 0; +} + +struct spdk_json_write_ctx * +spdk_jsonrpc_begin_request(struct spdk_jsonrpc_client_request *request, int32_t id, + const char *method) +{ + struct spdk_json_write_ctx *w; + + w = spdk_json_write_begin(jsonrpc_client_write_cb, request, 0); + if (w == NULL) { + return NULL; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "jsonrpc", "2.0"); + + if (id >= 0) { + spdk_json_write_named_int32(w, "id", id); + } + + if (method) { + spdk_json_write_named_string(w, "method", method); + } + + return w; +} + +void +spdk_jsonrpc_end_request(struct spdk_jsonrpc_client_request *request, struct spdk_json_write_ctx *w) +{ + assert(w != NULL); + + spdk_json_write_object_end(w); + spdk_json_write_end(w); + jsonrpc_client_write_cb(request, "\n", 1); +} + +SPDK_LOG_REGISTER_COMPONENT("rpc_client", SPDK_LOG_RPC_CLIENT) diff --git a/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c b/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c new file mode 100644 index 000000000..512f6261c --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c @@ -0,0 +1,431 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "spdk/string.h" +#include "jsonrpc_internal.h" +#include "spdk/util.h" + +#define RPC_DEFAULT_PORT "5260" + +static int +jsonrpc_client_send_request(struct spdk_jsonrpc_client *client) +{ + ssize_t rc; + struct spdk_jsonrpc_client_request *request = client->request; + + if (!request) { + return 0; + } + + if (request->send_len > 0) { + rc = send(client->sockfd, request->send_buf + request->send_offset, + request->send_len, 0); + if (rc < 0) { + /* For EINTR we pretend that nothing was send. */ + if (errno == EINTR) { + rc = 0; + } else { + rc = -errno; + SPDK_ERRLOG("poll() failed (%d): %s\n", errno, spdk_strerror(errno)); + } + + return rc; + } + + request->send_offset += rc; + request->send_len -= rc; + } + + if (request->send_len == 0) { + client->request = NULL; + spdk_jsonrpc_client_free_request(request); + } + + return 0; +} + +static int +recv_buf_expand(struct spdk_jsonrpc_client *client) +{ + uint8_t *new_buf; + + if (client->recv_buf_size * 2 > SPDK_JSONRPC_SEND_BUF_SIZE_MAX) { + return -ENOSPC; + } + + new_buf = realloc(client->recv_buf, client->recv_buf_size * 2); + if (new_buf == NULL) { + SPDK_ERRLOG("Resizing recv_buf failed (current size %zu, new size %zu)\n", + client->recv_buf_size, client->recv_buf_size * 2); + return -ENOMEM; + } + + client->recv_buf = new_buf; + client->recv_buf_size *= 2; + + return 0; +} + +static int +jsonrpc_client_resp_ready_count(struct spdk_jsonrpc_client *client) +{ + return client->resp != NULL && client->resp->ready ? 1 : 0; +} + +static int +jsonrpc_client_recv(struct spdk_jsonrpc_client *client) +{ + ssize_t rc; + + if (client->recv_buf == NULL) { + client->recv_buf = malloc(SPDK_JSONRPC_SEND_BUF_SIZE_INIT); + if (!client->recv_buf) { + rc = errno; + SPDK_ERRLOG("malloc() failed (%d): %s\n", (int)rc, spdk_strerror(rc)); + return -rc; + } + client->recv_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT; + client->recv_offset = 0; + } else if (client->recv_offset == client->recv_buf_size - 1) { + rc = recv_buf_expand(client); + if (rc) { + return rc; + } + } + + rc = recv(client->sockfd, client->recv_buf + client->recv_offset, + client->recv_buf_size - client->recv_offset - 1, 0); + if (rc < 0) { + /* For EINTR we pretend that nothing was reveived. */ + if (errno == EINTR) { + return 0; + } else { + rc = -errno; + SPDK_ERRLOG("recv() failed (%d): %s\n", errno, spdk_strerror(errno)); + return rc; + } + } else if (rc == 0) { + return -EIO; + } + + client->recv_offset += rc; + client->recv_buf[client->recv_offset] = '\0'; + + /* Check to see if we have received a full JSON value. 
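+ * jsonrpc_parse_response() returns 0 while the buffered data is still an
+ * incomplete JSON value; the caller keeps polling and later recv() calls
+ * append more data to recv_buf.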
*/ + return jsonrpc_parse_response(client); +} + +static int +jsonrpc_client_poll(struct spdk_jsonrpc_client *client, int timeout) +{ + int rc; + struct pollfd pfd = { .fd = client->sockfd, .events = POLLIN | POLLOUT }; + + rc = poll(&pfd, 1, timeout); + if (rc == -1) { + if (errno == EINTR) { + /* For EINTR we pretend that nothing was received nor send. */ + rc = 0; + } else { + rc = -errno; + SPDK_ERRLOG("poll() failed (%d): %s\n", errno, spdk_strerror(errno)); + } + } else if (rc > 0) { + rc = 0; + + if (pfd.revents & POLLOUT) { + rc = jsonrpc_client_send_request(client); + } + + if (rc == 0 && (pfd.revents & POLLIN)) { + rc = jsonrpc_client_recv(client); + /* Incomplete message in buffer isn't an error. */ + if (rc == -EAGAIN) { + rc = 0; + } + } + } + + return rc ? rc : jsonrpc_client_resp_ready_count(client); +} + +static int +jsonrpc_client_poll_connecting(struct spdk_jsonrpc_client *client, int timeout) +{ + socklen_t rc_len; + int rc; + + struct pollfd pfd = { + .fd = client->sockfd, + .events = POLLOUT + }; + + rc = poll(&pfd, 1, timeout); + if (rc == 0) { + return -ENOTCONN; + } else if (rc == -1) { + if (errno != EINTR) { + SPDK_ERRLOG("poll() failed (%d): %s\n", errno, spdk_strerror(errno)); + goto err; + } + + /* We are still not connected. Caller will have to call us again. */ + return -ENOTCONN; + } else if (pfd.revents & ~POLLOUT) { + /* We only poll for POLLOUT */ + goto err; + } else if ((pfd.revents & POLLOUT) == 0) { + /* Is this even possible to get here? */ + return -ENOTCONN; + } + + rc_len = sizeof(int); + /* connection might fail so need to check SO_ERROR. */ + if (getsockopt(client->sockfd, SOL_SOCKET, SO_ERROR, &rc, &rc_len) == -1) { + goto err; + } + + if (rc == 0) { + client->connected = true; + return 0; + } + +err: + return -EIO; +} + +static int +jsonrpc_client_connect(struct spdk_jsonrpc_client *client, int domain, int protocol, + struct sockaddr *server_addr, socklen_t addrlen) +{ + int rc, flags; + + client->sockfd = socket(domain, SOCK_STREAM, protocol); + if (client->sockfd < 0) { + rc = errno; + SPDK_ERRLOG("socket() failed\n"); + return -rc; + } + + flags = fcntl(client->sockfd, F_GETFL); + if (flags < 0 || fcntl(client->sockfd, F_SETFL, flags | O_NONBLOCK) < 0) { + rc = errno; + SPDK_ERRLOG("fcntl(): can't set nonblocking mode for socket (%d): %s\n", + errno, spdk_strerror(errno)); + goto err; + } + + rc = connect(client->sockfd, server_addr, addrlen); + if (rc != 0) { + rc = errno; + if (rc != EINPROGRESS) { + SPDK_ERRLOG("could not connect to JSON-RPC server: %s\n", spdk_strerror(errno)); + goto err; + } + } else { + client->connected = true; + } + + return -rc; +err: + close(client->sockfd); + client->sockfd = -1; + return -rc; +} + +struct spdk_jsonrpc_client * +spdk_jsonrpc_client_connect(const char *addr, int addr_family) +{ + struct spdk_jsonrpc_client *client = calloc(1, sizeof(struct spdk_jsonrpc_client)); + /* Unix Domain Socket */ + struct sockaddr_un addr_un = {}; + char *add_in = NULL; + int rc; + + if (client == NULL) { + SPDK_ERRLOG("%s\n", spdk_strerror(errno)); + return NULL; + } + + if (addr_family == AF_UNIX) { + addr_un.sun_family = AF_UNIX; + rc = snprintf(addr_un.sun_path, sizeof(addr_un.sun_path), "%s", addr); + if (rc < 0 || (size_t)rc >= sizeof(addr_un.sun_path)) { + rc = -EINVAL; + SPDK_ERRLOG("RPC Listen address Unix socket path too long\n"); + goto err; + } + + rc = jsonrpc_client_connect(client, AF_UNIX, 0, (struct sockaddr *)&addr_un, sizeof(addr_un)); + } else { + /* TCP/IP socket */ + struct addrinfo hints; + struct 
addrinfo *res; + char *host, *port; + + add_in = strdup(addr); + if (!add_in) { + rc = -errno; + SPDK_ERRLOG("%s\n", spdk_strerror(errno)); + goto err; + } + + rc = spdk_parse_ip_addr(add_in, &host, &port); + if (rc) { + SPDK_ERRLOG("Invalid listen address '%s'\n", addr); + goto err; + } + + if (port == NULL) { + port = RPC_DEFAULT_PORT; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = IPPROTO_TCP; + + rc = getaddrinfo(host, port, &hints, &res); + if (rc != 0) { + SPDK_ERRLOG("Unable to look up RPC connnect address '%s' (%d): %s\n", addr, rc, gai_strerror(rc)); + rc = -EINVAL; + goto err; + } + + rc = jsonrpc_client_connect(client, res->ai_family, res->ai_protocol, res->ai_addr, + res->ai_addrlen); + freeaddrinfo(res); + } + +err: + if (rc != 0 && rc != -EINPROGRESS) { + free(client); + client = NULL; + errno = -rc; + } + + free(add_in); + return client; +} + +void +spdk_jsonrpc_client_close(struct spdk_jsonrpc_client *client) +{ + if (client->sockfd >= 0) { + close(client->sockfd); + } + + free(client->recv_buf); + if (client->resp) { + spdk_jsonrpc_client_free_response(&client->resp->jsonrpc); + } + + free(client); +} + +struct spdk_jsonrpc_client_request * +spdk_jsonrpc_client_create_request(void) +{ + struct spdk_jsonrpc_client_request *request; + + request = calloc(1, sizeof(*request)); + if (request == NULL) { + return NULL; + } + + /* memory malloc for send-buf */ + request->send_buf = malloc(SPDK_JSONRPC_SEND_BUF_SIZE_INIT); + if (!request->send_buf) { + SPDK_ERRLOG("memory malloc for send-buf failed\n"); + free(request); + return NULL; + } + request->send_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT; + + return request; +} + +void +spdk_jsonrpc_client_free_request(struct spdk_jsonrpc_client_request *req) +{ + free(req->send_buf); + free(req); +} + +int +spdk_jsonrpc_client_poll(struct spdk_jsonrpc_client *client, int timeout) +{ + if (client->connected) { + return jsonrpc_client_poll(client, timeout); + } else { + return jsonrpc_client_poll_connecting(client, timeout); + } +} + +int spdk_jsonrpc_client_send_request(struct spdk_jsonrpc_client *client, + struct spdk_jsonrpc_client_request *req) +{ + if (client->request != NULL) { + return -ENOSPC; + } + + client->request = req; + return 0; +} + +struct spdk_jsonrpc_client_response * +spdk_jsonrpc_client_get_response(struct spdk_jsonrpc_client *client) +{ + struct spdk_jsonrpc_client_response_internal *r; + + r = client->resp; + if (r == NULL || r->ready == false) { + return NULL; + } + + client->resp = NULL; + return &r->jsonrpc; +} + +void +spdk_jsonrpc_client_free_response(struct spdk_jsonrpc_client_response *resp) +{ + struct spdk_jsonrpc_client_response_internal *r; + + if (!resp) { + return; + } + + r = SPDK_CONTAINEROF(resp, struct spdk_jsonrpc_client_response_internal, jsonrpc); + free(r->buf); + free(r); +} diff --git a/src/spdk/lib/jsonrpc/jsonrpc_internal.h b/src/spdk/lib/jsonrpc/jsonrpc_internal.h new file mode 100644 index 000000000..f51bedf62 --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_internal.h @@ -0,0 +1,166 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_JSONRPC_INTERNAL_H_ +#define SPDK_JSONRPC_INTERNAL_H_ + +#include "spdk/stdinc.h" + +#include "spdk/jsonrpc.h" + +#include "spdk_internal/log.h" + +#define SPDK_JSONRPC_RECV_BUF_SIZE (32 * 1024) +#define SPDK_JSONRPC_SEND_BUF_SIZE_INIT (32 * 1024) +#define SPDK_JSONRPC_SEND_BUF_SIZE_MAX (32 * 1024 * 1024) +#define SPDK_JSONRPC_ID_MAX_LEN 128 +#define SPDK_JSONRPC_MAX_CONNS 64 +#define SPDK_JSONRPC_MAX_VALUES 1024 +#define SPDK_JSONRPC_CLIENT_MAX_VALUES 8192 + +struct spdk_jsonrpc_request { + struct spdk_jsonrpc_server_conn *conn; + + /* Copy of request id value */ + const struct spdk_json_val *id; + + /* Total space allocated for send_buf */ + size_t send_buf_size; + + /* Number of bytes used in send_buf (<= send_buf_size) */ + size_t send_len; + + size_t send_offset; + + uint8_t *recv_buffer; + struct spdk_json_val *values; + size_t values_cnt; + + uint8_t *send_buf; + + struct spdk_json_write_ctx *response; + + STAILQ_ENTRY(spdk_jsonrpc_request) link; +}; + +struct spdk_jsonrpc_server_conn { + struct spdk_jsonrpc_server *server; + int sockfd; + bool closed; + size_t recv_len; + uint8_t recv_buf[SPDK_JSONRPC_RECV_BUF_SIZE]; + uint32_t outstanding_requests; + + pthread_spinlock_t queue_lock; + STAILQ_HEAD(, spdk_jsonrpc_request) send_queue; + + struct spdk_jsonrpc_request *send_request; + + spdk_jsonrpc_conn_closed_fn close_cb; + void *close_cb_ctx; + + TAILQ_ENTRY(spdk_jsonrpc_server_conn) link; +}; + +struct spdk_jsonrpc_server { + int sockfd; + spdk_jsonrpc_handle_request_fn handle_request; + + TAILQ_HEAD(, spdk_jsonrpc_server_conn) free_conns; + TAILQ_HEAD(, spdk_jsonrpc_server_conn) conns; + + struct spdk_jsonrpc_server_conn conns_array[SPDK_JSONRPC_MAX_CONNS]; +}; + +struct spdk_jsonrpc_client_request { + /* Total space allocated for send_buf */ + size_t send_buf_size; + + /* Number of bytes used in send_buf (<= send_buf_size) */ + size_t send_len; + + size_t send_offset; + + uint8_t *send_buf; +}; + +struct spdk_jsonrpc_client_response_internal { + struct spdk_jsonrpc_client_response jsonrpc; + bool ready; + uint8_t *buf; + size_t values_cnt; + struct spdk_json_val values[]; +}; + +struct spdk_jsonrpc_client { + int sockfd; + bool connected; + + size_t recv_buf_size; + size_t recv_offset; + char *recv_buf; + + /* Parsed response */ + struct 
spdk_jsonrpc_client_response_internal *resp; + struct spdk_jsonrpc_client_request *request; +}; + +/* jsonrpc_server_tcp */ +void jsonrpc_server_handle_request(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *method, + const struct spdk_json_val *params); +void jsonrpc_server_handle_error(struct spdk_jsonrpc_request *request, int error); + +/* Might be called from any thread */ +void jsonrpc_server_send_response(struct spdk_jsonrpc_request *request); + +/* jsonrpc_server */ +int jsonrpc_parse_request(struct spdk_jsonrpc_server_conn *conn, const void *json, + size_t size); + +/* Must be called only from server poll thread */ +void jsonrpc_free_request(struct spdk_jsonrpc_request *request); + +/* + * Parse JSON data as RPC command response. + * + * \param client structure pointer of jsonrpc client + * + * \return 0 On success. Negative error code in error + * -EAGAIN - If the provided data is not a complete JSON value (SPDK_JSON_PARSE_INCOMPLETE) + * -EINVAL - If the provided data has invalid JSON syntax and can't be parsed (SPDK_JSON_PARSE_INVALID). + * -ENOSPC - No space left to store parsed response. + */ +int jsonrpc_parse_response(struct spdk_jsonrpc_client *client); + +#endif diff --git a/src/spdk/lib/jsonrpc/jsonrpc_server.c b/src/spdk/lib/jsonrpc/jsonrpc_server.c new file mode 100644 index 000000000..774612b25 --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_server.c @@ -0,0 +1,361 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "jsonrpc_internal.h" + +#include "spdk/util.h" + +struct jsonrpc_request { + const struct spdk_json_val *version; + const struct spdk_json_val *method; + const struct spdk_json_val *params; + const struct spdk_json_val *id; +}; + +static int +capture_val(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + *vptr = val; + return 0; +} + +static const struct spdk_json_object_decoder jsonrpc_request_decoders[] = { + {"jsonrpc", offsetof(struct jsonrpc_request, version), capture_val, true}, + {"method", offsetof(struct jsonrpc_request, method), capture_val}, + {"params", offsetof(struct jsonrpc_request, params), capture_val, true}, + {"id", offsetof(struct jsonrpc_request, id), capture_val, true}, +}; + +static void +parse_single_request(struct spdk_jsonrpc_request *request, struct spdk_json_val *values) +{ + struct jsonrpc_request req = {}; + const struct spdk_json_val *params = NULL; + + if (spdk_json_decode_object(values, jsonrpc_request_decoders, + SPDK_COUNTOF(jsonrpc_request_decoders), + &req)) { + goto invalid; + } + + if (req.version && (req.version->type != SPDK_JSON_VAL_STRING || + !spdk_json_strequal(req.version, "2.0"))) { + goto invalid; + } + + if (!req.method || req.method->type != SPDK_JSON_VAL_STRING) { + goto invalid; + } + + if (req.id) { + if (req.id->type == SPDK_JSON_VAL_STRING || + req.id->type == SPDK_JSON_VAL_NUMBER || + req.id->type == SPDK_JSON_VAL_NULL) { + request->id = req.id; + } else { + goto invalid; + } + } + + if (req.params) { + /* null json value is as if there were no parameters */ + if (req.params->type != SPDK_JSON_VAL_NULL) { + if (req.params->type != SPDK_JSON_VAL_ARRAY_BEGIN && + req.params->type != SPDK_JSON_VAL_OBJECT_BEGIN) { + goto invalid; + } + params = req.params; + } + } + + jsonrpc_server_handle_request(request, req.method, params); + return; + +invalid: + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST); +} + +static int +jsonrpc_server_write_cb(void *cb_ctx, const void *data, size_t size) +{ + struct spdk_jsonrpc_request *request = cb_ctx; + size_t new_size = request->send_buf_size; + + while (new_size - request->send_len < size) { + if (new_size >= SPDK_JSONRPC_SEND_BUF_SIZE_MAX) { + SPDK_ERRLOG("Send buf exceeded maximum size (%zu)\n", + (size_t)SPDK_JSONRPC_SEND_BUF_SIZE_MAX); + return -1; + } + + new_size *= 2; + } + + if (new_size != request->send_buf_size) { + uint8_t *new_buf; + + new_buf = realloc(request->send_buf, new_size); + if (new_buf == NULL) { + SPDK_ERRLOG("Resizing send_buf failed (current size %zu, new size %zu)\n", + request->send_buf_size, new_size); + return -1; + } + + request->send_buf = new_buf; + request->send_buf_size = new_size; + } + + memcpy(request->send_buf + request->send_len, data, size); + request->send_len += size; + + return 0; +} + +int +jsonrpc_parse_request(struct spdk_jsonrpc_server_conn *conn, const void *json, size_t size) +{ + struct spdk_jsonrpc_request *request; + ssize_t rc; + size_t len; + void *end = NULL; + + /* Check to see if we have received a full JSON value. It is safe to cast away const + * as we don't decode in place. 
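+ * Passing a NULL values array means this first pass only validates the data
+ * and counts the values; the in-place decode below runs on a private copy.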
*/ + rc = spdk_json_parse((void *)json, size, NULL, 0, &end, 0); + if (rc == SPDK_JSON_PARSE_INCOMPLETE) { + return 0; + } + + request = calloc(1, sizeof(*request)); + if (request == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "Out of memory allocating request\n"); + return -1; + } + + conn->outstanding_requests++; + + request->conn = conn; + + len = end - json; + request->recv_buffer = malloc(len + 1); + if (request->recv_buffer == NULL) { + SPDK_ERRLOG("Failed to allocate buffer to copy request (%zu bytes)\n", len + 1); + jsonrpc_free_request(request); + return -1; + } + + memcpy(request->recv_buffer, json, len); + request->recv_buffer[len] = '\0'; + + if (rc > 0 && rc <= SPDK_JSONRPC_MAX_VALUES) { + request->values_cnt = rc; + request->values = malloc(request->values_cnt * sizeof(request->values[0])); + if (request->values == NULL) { + SPDK_ERRLOG("Failed to allocate buffer for JSON values (%zu bytes)\n", + request->values_cnt * sizeof(request->values[0])); + jsonrpc_free_request(request); + return -1; + } + } + + request->send_offset = 0; + request->send_len = 0; + request->send_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT; + request->send_buf = malloc(request->send_buf_size); + if (request->send_buf == NULL) { + SPDK_ERRLOG("Failed to allocate send_buf (%zu bytes)\n", request->send_buf_size); + jsonrpc_free_request(request); + return -1; + } + + request->response = spdk_json_write_begin(jsonrpc_server_write_cb, request, 0); + if (request->response == NULL) { + SPDK_ERRLOG("Failed to allocate response JSON write context.\n"); + jsonrpc_free_request(request); + return -1; + } + + if (rc <= 0 || rc > SPDK_JSONRPC_MAX_VALUES) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "JSON parse error\n"); + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_PARSE_ERROR); + + /* + * Can't recover from parse error (no guaranteed resync point in streaming JSON). + * Return an error to indicate that the connection should be closed. + */ + return -1; + } + + /* Decode a second time now that there is a full JSON value available. 
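+ * This pass targets request->recv_buffer, a private copy of the request, so
+ * strings can be decoded in place without disturbing the connection's receive buffer.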
*/ + rc = spdk_json_parse(request->recv_buffer, size, request->values, request->values_cnt, &end, + SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); + if (rc < 0 || rc > SPDK_JSONRPC_MAX_VALUES) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "JSON parse error on second pass\n"); + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_PARSE_ERROR); + return -1; + } + + assert(end != NULL); + + if (request->values[0].type == SPDK_JSON_VAL_OBJECT_BEGIN) { + parse_single_request(request, request->values); + } else if (request->values[0].type == SPDK_JSON_VAL_ARRAY_BEGIN) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "Got batch array (not currently supported)\n"); + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST); + } else { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "top-level JSON value was not array or object\n"); + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST); + } + + return len; +} + +struct spdk_jsonrpc_server_conn * +spdk_jsonrpc_get_conn(struct spdk_jsonrpc_request *request) +{ + return request->conn; +} + +/* Never return NULL */ +static struct spdk_json_write_ctx * +begin_response(struct spdk_jsonrpc_request *request) +{ + struct spdk_json_write_ctx *w = request->response; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "jsonrpc", "2.0"); + + spdk_json_write_name(w, "id"); + if (request->id) { + spdk_json_write_val(w, request->id); + } else { + spdk_json_write_null(w); + } + + return w; +} + +static void +skip_response(struct spdk_jsonrpc_request *request) +{ + request->send_len = 0; + spdk_json_write_end(request->response); + request->response = NULL; + jsonrpc_server_send_response(request); +} + +static void +end_response(struct spdk_jsonrpc_request *request) +{ + spdk_json_write_object_end(request->response); + spdk_json_write_end(request->response); + request->response = NULL; + + jsonrpc_server_write_cb(request, "\n", 1); + jsonrpc_server_send_response(request); +} + +void +jsonrpc_free_request(struct spdk_jsonrpc_request *request) +{ + if (!request) { + return; + } + + /* We must send or skip response explicitly */ + assert(request->response == NULL); + + request->conn->outstanding_requests--; + free(request->recv_buffer); + free(request->values); + free(request->send_buf); + free(request); +} + +struct spdk_json_write_ctx * +spdk_jsonrpc_begin_result(struct spdk_jsonrpc_request *request) +{ + struct spdk_json_write_ctx *w = begin_response(request); + + spdk_json_write_name(w, "result"); + return w; +} + +void +spdk_jsonrpc_end_result(struct spdk_jsonrpc_request *request, struct spdk_json_write_ctx *w) +{ + assert(w != NULL); + assert(w == request->response); + + /* If there was no ID in request we skip response. */ + if (request->id && request->id->type != SPDK_JSON_VAL_NULL) { + end_response(request); + } else { + skip_response(request); + } +} + +void +spdk_jsonrpc_send_error_response(struct spdk_jsonrpc_request *request, + int error_code, const char *msg) +{ + struct spdk_json_write_ctx *w = begin_response(request); + + spdk_json_write_named_object_begin(w, "error"); + spdk_json_write_named_int32(w, "code", error_code); + spdk_json_write_named_string(w, "message", msg); + spdk_json_write_object_end(w); + + end_response(request); +} + +void +spdk_jsonrpc_send_error_response_fmt(struct spdk_jsonrpc_request *request, + int error_code, const char *fmt, ...) 
+{ + struct spdk_json_write_ctx *w = begin_response(request); + va_list args; + + spdk_json_write_named_object_begin(w, "error"); + spdk_json_write_named_int32(w, "code", error_code); + va_start(args, fmt); + spdk_json_write_named_string_fmt_v(w, "message", fmt, args); + va_end(args); + spdk_json_write_object_end(w); + + end_response(request); +} + +SPDK_LOG_REGISTER_COMPONENT("rpc", SPDK_LOG_RPC) diff --git a/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c b/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c new file mode 100644 index 000000000..1e38f713f --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c @@ -0,0 +1,441 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "jsonrpc_internal.h" +#include "spdk/string.h" +#include "spdk/util.h" + +struct spdk_jsonrpc_server * +spdk_jsonrpc_server_listen(int domain, int protocol, + struct sockaddr *listen_addr, socklen_t addrlen, + spdk_jsonrpc_handle_request_fn handle_request) +{ + struct spdk_jsonrpc_server *server; + int rc, val, flag, i; + + server = calloc(1, sizeof(struct spdk_jsonrpc_server)); + if (server == NULL) { + return NULL; + } + + TAILQ_INIT(&server->free_conns); + TAILQ_INIT(&server->conns); + + for (i = 0; i < SPDK_JSONRPC_MAX_CONNS; i++) { + TAILQ_INSERT_TAIL(&server->free_conns, &server->conns_array[i], link); + } + + server->handle_request = handle_request; + + server->sockfd = socket(domain, SOCK_STREAM, protocol); + if (server->sockfd < 0) { + SPDK_ERRLOG("socket() failed\n"); + free(server); + return NULL; + } + + val = 1; + setsockopt(server->sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)); + + flag = fcntl(server->sockfd, F_GETFL); + if (fcntl(server->sockfd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + server->sockfd, spdk_strerror(errno)); + close(server->sockfd); + free(server); + return NULL; + } + + rc = bind(server->sockfd, listen_addr, addrlen); + if (rc != 0) { + SPDK_ERRLOG("could not bind JSON-RPC server: %s\n", spdk_strerror(errno)); + close(server->sockfd); + free(server); + return NULL; + } + + rc = listen(server->sockfd, 512); + if (rc != 0) { + SPDK_ERRLOG("listen() failed, errno = %d\n", errno); + close(server->sockfd); + free(server); + return NULL; + } + + return server; +} + +static struct spdk_jsonrpc_request * +jsonrpc_server_dequeue_request(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_request *request = NULL; + + pthread_spin_lock(&conn->queue_lock); + request = STAILQ_FIRST(&conn->send_queue); + if (request) { + STAILQ_REMOVE_HEAD(&conn->send_queue, link); + } + pthread_spin_unlock(&conn->queue_lock); + return request; +} + +static void +jsonrpc_server_free_conn_request(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_request *request; + + jsonrpc_free_request(conn->send_request); + conn->send_request = NULL ; + while ((request = jsonrpc_server_dequeue_request(conn)) != NULL) { + jsonrpc_free_request(request); + } +} + +static void +jsonrpc_server_conn_close(struct spdk_jsonrpc_server_conn *conn) +{ + conn->closed = true; + + if (conn->sockfd >= 0) { + jsonrpc_server_free_conn_request(conn); + close(conn->sockfd); + conn->sockfd = -1; + + if (conn->close_cb) { + conn->close_cb(conn, conn->close_cb_ctx); + } + } +} + +void +spdk_jsonrpc_server_shutdown(struct spdk_jsonrpc_server *server) +{ + struct spdk_jsonrpc_server_conn *conn; + + close(server->sockfd); + + TAILQ_FOREACH(conn, &server->conns, link) { + jsonrpc_server_conn_close(conn); + } + + free(server); +} + +static void +jsonrpc_server_conn_remove(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_server *server = conn->server; + + jsonrpc_server_conn_close(conn); + + pthread_spin_destroy(&conn->queue_lock); + assert(STAILQ_EMPTY(&conn->send_queue)); + + TAILQ_REMOVE(&server->conns, conn, link); + TAILQ_INSERT_HEAD(&server->free_conns, conn, link); +} + +int +spdk_jsonrpc_conn_add_close_cb(struct spdk_jsonrpc_server_conn *conn, + spdk_jsonrpc_conn_closed_fn cb, void *ctx) +{ + int rc = 0; + + pthread_spin_lock(&conn->queue_lock); + if (conn->close_cb == NULL) { + conn->close_cb = cb; + conn->close_cb_ctx = ctx; + } else { + rc = conn->close_cb == cb && conn->close_cb_ctx 
== ctx ? -EEXIST : -ENOSPC; + } + pthread_spin_unlock(&conn->queue_lock); + + return rc; +} + +int +spdk_jsonrpc_conn_del_close_cb(struct spdk_jsonrpc_server_conn *conn, + spdk_jsonrpc_conn_closed_fn cb, void *ctx) +{ + int rc = 0; + + pthread_spin_lock(&conn->queue_lock); + if (conn->close_cb == NULL || conn->close_cb != cb || conn->close_cb_ctx != ctx) { + rc = -ENOENT; + } else { + conn->close_cb = NULL; + } + pthread_spin_unlock(&conn->queue_lock); + + return rc; +} + +static int +jsonrpc_server_accept(struct spdk_jsonrpc_server *server) +{ + struct spdk_jsonrpc_server_conn *conn; + int rc, flag; + + rc = accept(server->sockfd, NULL, NULL); + if (rc >= 0) { + conn = TAILQ_FIRST(&server->free_conns); + assert(conn != NULL); + + conn->server = server; + conn->sockfd = rc; + conn->closed = false; + conn->recv_len = 0; + conn->outstanding_requests = 0; + STAILQ_INIT(&conn->send_queue); + conn->send_request = NULL; + + if (pthread_spin_init(&conn->queue_lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("Unable to create queue lock for socket: %d", conn->sockfd); + close(conn->sockfd); + return -1; + } + + flag = fcntl(conn->sockfd, F_GETFL); + if (fcntl(conn->sockfd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + conn->sockfd, spdk_strerror(errno)); + close(conn->sockfd); + pthread_spin_destroy(&conn->queue_lock); + return -1; + } + + TAILQ_REMOVE(&server->free_conns, conn, link); + TAILQ_INSERT_TAIL(&server->conns, conn, link); + return 0; + } + + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return 0; + } + + return -1; +} + +void +jsonrpc_server_handle_request(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *method, const struct spdk_json_val *params) +{ + request->conn->server->handle_request(request, method, params); +} + +void +jsonrpc_server_handle_error(struct spdk_jsonrpc_request *request, int error) +{ + const char *msg; + + switch (error) { + case SPDK_JSONRPC_ERROR_PARSE_ERROR: + msg = "Parse error"; + break; + + case SPDK_JSONRPC_ERROR_INVALID_REQUEST: + msg = "Invalid request"; + break; + + case SPDK_JSONRPC_ERROR_METHOD_NOT_FOUND: + msg = "Method not found"; + break; + + case SPDK_JSONRPC_ERROR_INVALID_PARAMS: + msg = "Invalid parameters"; + break; + + case SPDK_JSONRPC_ERROR_INTERNAL_ERROR: + msg = "Internal error"; + break; + + default: + msg = "Error"; + break; + } + + spdk_jsonrpc_send_error_response(request, error, msg); +} + +static int +jsonrpc_server_conn_recv(struct spdk_jsonrpc_server_conn *conn) +{ + ssize_t rc, offset; + size_t recv_avail = SPDK_JSONRPC_RECV_BUF_SIZE - conn->recv_len; + + rc = recv(conn->sockfd, conn->recv_buf + conn->recv_len, recv_avail, 0); + if (rc == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return 0; + } + SPDK_DEBUGLOG(SPDK_LOG_RPC, "recv() failed: %s\n", spdk_strerror(errno)); + return -1; + } + + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "remote closed connection\n"); + conn->closed = true; + return 0; + } + + conn->recv_len += rc; + + offset = 0; + do { + rc = jsonrpc_parse_request(conn, conn->recv_buf + offset, conn->recv_len - offset); + if (rc < 0) { + SPDK_ERRLOG("jsonrpc parse request failed\n"); + return -1; + } + + offset += rc; + } while (rc > 0); + + if (offset > 0) { + /* + * Successfully parsed a requests - move any data past the end of the + * parsed requests down to the beginning. 
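+ * Any partial request left in recv_buf will be completed by data from a later recv().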
+ */ + assert((size_t)offset <= conn->recv_len); + memmove(conn->recv_buf, conn->recv_buf + offset, conn->recv_len - offset); + conn->recv_len -= offset; + } + + return 0; +} + +void +jsonrpc_server_send_response(struct spdk_jsonrpc_request *request) +{ + struct spdk_jsonrpc_server_conn *conn = request->conn; + + /* Queue the response to be sent */ + pthread_spin_lock(&conn->queue_lock); + STAILQ_INSERT_TAIL(&conn->send_queue, request, link); + pthread_spin_unlock(&conn->queue_lock); +} + + +static int +jsonrpc_server_conn_send(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_request *request; + ssize_t rc; + +more: + if (conn->outstanding_requests == 0) { + return 0; + } + + if (conn->send_request == NULL) { + conn->send_request = jsonrpc_server_dequeue_request(conn); + } + + request = conn->send_request; + if (request == NULL) { + /* Nothing to send right now */ + return 0; + } + + if (request->send_len > 0) { + rc = send(conn->sockfd, request->send_buf + request->send_offset, + request->send_len, 0); + if (rc < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return 0; + } + + SPDK_DEBUGLOG(SPDK_LOG_RPC, "send() failed: %s\n", spdk_strerror(errno)); + return -1; + } + + request->send_offset += rc; + request->send_len -= rc; + } + + if (request->send_len == 0) { + /* + * Full response has been sent. + * Free it and set send_request to NULL to move on to the next queued response. + */ + conn->send_request = NULL; + jsonrpc_free_request(request); + goto more; + } + + return 0; +} + +int +spdk_jsonrpc_server_poll(struct spdk_jsonrpc_server *server) +{ + int rc; + struct spdk_jsonrpc_server_conn *conn, *conn_tmp; + + TAILQ_FOREACH_SAFE(conn, &server->conns, link, conn_tmp) { + /* If we can't receive and there are no outstanding requests close the connection. 
*/ + if (conn->closed == true && conn->outstanding_requests == 0) { + jsonrpc_server_conn_close(conn); + } + + if (conn->sockfd == -1 && conn->outstanding_requests == 0) { + jsonrpc_server_conn_remove(conn); + } + } + + /* Check listen socket */ + if (!TAILQ_EMPTY(&server->free_conns)) { + jsonrpc_server_accept(server); + } + + TAILQ_FOREACH(conn, &server->conns, link) { + if (conn->sockfd == -1) { + continue; + } + + rc = jsonrpc_server_conn_send(conn); + if (rc != 0) { + jsonrpc_server_conn_close(conn); + continue; + } + + if (!conn->closed) { + rc = jsonrpc_server_conn_recv(conn); + if (rc != 0) { + jsonrpc_server_conn_close(conn); + } + } + } + + return 0; +} diff --git a/src/spdk/lib/jsonrpc/spdk_jsonrpc.map b/src/spdk/lib/jsonrpc/spdk_jsonrpc.map new file mode 100644 index 000000000..461fd0766 --- /dev/null +++ b/src/spdk/lib/jsonrpc/spdk_jsonrpc.map @@ -0,0 +1,28 @@ +{ + global: + + # public functions + spdk_jsonrpc_server_listen; + spdk_jsonrpc_server_poll; + spdk_jsonrpc_server_shutdown; + spdk_jsonrpc_get_conn; + spdk_jsonrpc_conn_add_close_cb; + spdk_jsonrpc_conn_del_close_cb; + spdk_jsonrpc_begin_result; + spdk_jsonrpc_end_result; + spdk_jsonrpc_send_error_response; + spdk_jsonrpc_send_error_response_fmt; + spdk_jsonrpc_begin_request; + spdk_jsonrpc_end_request; + spdk_jsonrpc_client_connect; + spdk_jsonrpc_client_close; + spdk_jsonrpc_client_create_request; + spdk_jsonrpc_client_free_request; + spdk_jsonrpc_client_send_request; + spdk_jsonrpc_client_poll; + spdk_jsonrpc_client_get_response; + spdk_jsonrpc_client_free_response; + + + local: *; +}; diff --git a/src/spdk/lib/log/Makefile b/src/spdk/lib/log/Makefile new file mode 100644 index 000000000..4e7c25758 --- /dev/null +++ b/src/spdk/lib/log/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 +SO_SUFFIX := $(SO_VER).$(SO_MINOR) + +C_SRCS = log.c log_flags.c +LIBNAME = log + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_log.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/log/log.c b/src/spdk/lib/log/log.c new file mode 100644 index 000000000..0ab50d69c --- /dev/null +++ b/src/spdk/lib/log/log.c @@ -0,0 +1,203 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" + +static const char *const spdk_level_names[] = { + [SPDK_LOG_ERROR] = "ERROR", + [SPDK_LOG_WARN] = "WARNING", + [SPDK_LOG_NOTICE] = "NOTICE", + [SPDK_LOG_INFO] = "INFO", + [SPDK_LOG_DEBUG] = "DEBUG", +}; + +#define MAX_TMPBUF 1024 + +static logfunc *g_log = NULL; + +void +spdk_log_open(logfunc *logf) +{ + if (logf) { + g_log = logf; + } else { + openlog("spdk", LOG_PID, LOG_LOCAL7); + } +} + +void +spdk_log_close(void) +{ + if (!g_log) { + closelog(); + } +} + +static void +get_timestamp_prefix(char *buf, int buf_size) +{ + struct tm *info; + char date[24]; + struct timespec ts; + long usec; + + clock_gettime(CLOCK_REALTIME, &ts); + info = localtime(&ts.tv_sec); + usec = ts.tv_nsec / 1000; + if (info == NULL) { + snprintf(buf, buf_size, "[%s.%06ld] ", "unknown date", usec); + return; + } + + strftime(date, sizeof(date), "%Y-%m-%d %H:%M:%S", info); + snprintf(buf, buf_size, "[%s.%06ld] ", date, usec); +} + +void +spdk_log(enum spdk_log_level level, const char *file, const int line, const char *func, + const char *format, ...) 
+{ + va_list ap; + + va_start(ap, format); + spdk_vlog(level, file, line, func, format, ap); + va_end(ap); +} + +void +spdk_vlog(enum spdk_log_level level, const char *file, const int line, const char *func, + const char *format, va_list ap) +{ + int severity = LOG_INFO; + char buf[MAX_TMPBUF]; + char timestamp[64]; + + if (g_log) { + g_log(level, file, line, func, format, ap); + return; + } + + if (level > g_spdk_log_print_level && level > g_spdk_log_level) { + return; + } + + switch (level) { + case SPDK_LOG_ERROR: + severity = LOG_ERR; + break; + case SPDK_LOG_WARN: + severity = LOG_WARNING; + break; + case SPDK_LOG_NOTICE: + severity = LOG_NOTICE; + break; + case SPDK_LOG_INFO: + case SPDK_LOG_DEBUG: + severity = LOG_INFO; + break; + case SPDK_LOG_DISABLED: + return; + } + + vsnprintf(buf, sizeof(buf), format, ap); + + if (level <= g_spdk_log_print_level) { + get_timestamp_prefix(timestamp, sizeof(timestamp)); + if (file) { + fprintf(stderr, "%s%s:%4d:%s: *%s*: %s", timestamp, file, line, func, spdk_level_names[level], buf); + } else { + fprintf(stderr, "%s%s", timestamp, buf); + } + } + + if (level <= g_spdk_log_level) { + if (file) { + syslog(severity, "%s:%4d:%s: *%s*: %s", file, line, func, spdk_level_names[level], buf); + } else { + syslog(severity, "%s", buf); + } + } +} + +static void +fdump(FILE *fp, const char *label, const uint8_t *buf, size_t len) +{ + char tmpbuf[MAX_TMPBUF]; + char buf16[16 + 1]; + size_t total; + unsigned int idx; + + fprintf(fp, "%s\n", label); + + memset(buf16, 0, sizeof buf16); + total = 0; + for (idx = 0; idx < len; idx++) { + if (idx != 0 && idx % 16 == 0) { + snprintf(tmpbuf + total, sizeof tmpbuf - total, + " %s", buf16); + memset(buf16, 0, sizeof buf16); + fprintf(fp, "%s\n", tmpbuf); + total = 0; + } + if (idx % 16 == 0) { + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + "%08x ", idx); + } + if (idx % 8 == 0) { + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + "%s", " "); + } + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + "%2.2x ", buf[idx] & 0xff); + buf16[idx % 16] = isprint(buf[idx]) ? buf[idx] : '.'; + } + for (; idx % 16 != 0; idx++) { + if (idx == 8) { + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + " "); + } + + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, " "); + } + snprintf(tmpbuf + total, sizeof tmpbuf - total, " %s", buf16); + fprintf(fp, "%s\n", tmpbuf); + fflush(fp); +} + +void +spdk_log_dump(FILE *fp, const char *label, const void *buf, size_t len) +{ + fdump(fp, label, buf, len); +} diff --git a/src/spdk/lib/log/log_flags.c b/src/spdk/lib/log/log_flags.c new file mode 100644 index 000000000..c767a3786 --- /dev/null +++ b/src/spdk/lib/log/log_flags.c @@ -0,0 +1,188 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" + +static TAILQ_HEAD(, spdk_log_flag) g_log_flags = TAILQ_HEAD_INITIALIZER(g_log_flags); + +enum spdk_log_level g_spdk_log_level = SPDK_LOG_NOTICE; +enum spdk_log_level g_spdk_log_print_level = SPDK_LOG_NOTICE; + +SPDK_LOG_REGISTER_COMPONENT("log", SPDK_LOG_LOG) + +#define MAX_TMPBUF 1024 + +void +spdk_log_set_level(enum spdk_log_level level) +{ + assert(level >= SPDK_LOG_DISABLED); + assert(level <= SPDK_LOG_DEBUG); + g_spdk_log_level = level; +} + +enum spdk_log_level +spdk_log_get_level(void) { + return g_spdk_log_level; +} + +void +spdk_log_set_print_level(enum spdk_log_level level) +{ + assert(level >= SPDK_LOG_DISABLED); + assert(level <= SPDK_LOG_DEBUG); + g_spdk_log_print_level = level; +} + +enum spdk_log_level +spdk_log_get_print_level(void) { + return g_spdk_log_print_level; +} + +static struct spdk_log_flag * +get_log_flag(const char *name) +{ + struct spdk_log_flag *flag; + + TAILQ_FOREACH(flag, &g_log_flags, tailq) { + if (strcasecmp(name, flag->name) == 0) { + return flag; + } + } + + return NULL; +} + +void +spdk_log_register_flag(const char *name, struct spdk_log_flag *flag) +{ + struct spdk_log_flag *iter; + + if (name == NULL || flag == NULL) { + SPDK_ERRLOG("missing spdk_log_flag parameters\n"); + assert(false); + return; + } + + if (get_log_flag(name)) { + SPDK_ERRLOG("duplicate spdk_log_flag '%s'\n", name); + assert(false); + return; + } + + TAILQ_FOREACH(iter, &g_log_flags, tailq) { + if (strcasecmp(iter->name, flag->name) > 0) { + TAILQ_INSERT_BEFORE(iter, flag, tailq); + return; + } + } + + TAILQ_INSERT_TAIL(&g_log_flags, flag, tailq); +} + +bool +spdk_log_get_flag(const char *name) +{ + struct spdk_log_flag *flag = get_log_flag(name); + + if (flag && flag->enabled) { + return true; + } + + return false; +} + +static int +log_set_flag(const char *name, bool value) +{ + struct spdk_log_flag *flag; + + if (strcasecmp(name, "all") == 0) { + TAILQ_FOREACH(flag, &g_log_flags, tailq) { + flag->enabled = value; + } + return 0; + } + + flag = get_log_flag(name); + if (flag == NULL) { + return -1; + } + + flag->enabled = value; + + return 0; +} + +int +spdk_log_set_flag(const char *name) +{ + return log_set_flag(name, true); +} + +int +spdk_log_clear_flag(const char *name) +{ + return log_set_flag(name, false); +} + +struct spdk_log_flag * +spdk_log_get_first_flag(void) +{ + return TAILQ_FIRST(&g_log_flags); +} + +struct spdk_log_flag * +spdk_log_get_next_flag(struct spdk_log_flag *flag) +{ + return TAILQ_NEXT(flag, tailq); +} + +void +spdk_log_usage(FILE *f, const char *log_arg) +{ +#ifdef DEBUG + struct spdk_log_flag *flag; + fprintf(f, " %s, --logflag <flag> enable debug log flag (all", log_arg); + + 
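+	/* List every registered debug log flag name in the help output. */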
TAILQ_FOREACH(flag, &g_log_flags, tailq) { + fprintf(f, ", %s", flag->name); + } + + fprintf(f, ")\n"); +#else + fprintf(f, " %s, --logflag <flag> enable debug log flag (not supported" + " - must reconfigure with --enable-debug)\n", log_arg); +#endif +} diff --git a/src/spdk/lib/log/spdk_log.map b/src/spdk/lib/log/spdk_log.map new file mode 100644 index 000000000..84629d555 --- /dev/null +++ b/src/spdk/lib/log/spdk_log.map @@ -0,0 +1,25 @@ +{ + global: + + # public functions + spdk_log_open; + spdk_log_close; + spdk_log_set_level; + spdk_log_get_level; + spdk_log_set_print_level; + spdk_log_get_print_level; + spdk_log; + spdk_vlog; + spdk_log_dump; + spdk_log_get_flag; + spdk_log_set_flag; + spdk_log_clear_flag; + spdk_log_usage; + + # functions used by other SPDK libraries + spdk_log_register_flag; + spdk_log_get_first_flag; + spdk_log_get_next_flag; + + local: *; +}; diff --git a/src/spdk/lib/log_rpc/Makefile b/src/spdk/lib/log_rpc/Makefile new file mode 100644 index 000000000..2c7a78deb --- /dev/null +++ b/src/spdk/lib/log_rpc/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = log_rpc.c +LIBNAME = log_rpc + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_log_rpc.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/log_rpc/log_rpc.c b/src/spdk/lib/log_rpc/log_rpc.c new file mode 100644 index 000000000..78b74c1f5 --- /dev/null +++ b/src/spdk/lib/log_rpc/log_rpc.c @@ -0,0 +1,340 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +struct rpc_log_flag { + char *flag; +}; + +struct rpc_log_level { + char *level; +}; + +static void +free_rpc_log_flag(struct rpc_log_flag *p) +{ + free(p->flag); +} + +static void +free_rpc_log_level(struct rpc_log_level *p) +{ + free(p->level); +} + +static const struct spdk_json_object_decoder rpc_log_flag_decoders[] = { + {"flag", offsetof(struct rpc_log_flag, flag), spdk_json_decode_string}, +}; + +static const struct spdk_json_object_decoder rpc_log_level_decoders[] = { + {"level", offsetof(struct rpc_log_level, level), spdk_json_decode_string}, +}; + +static int +_parse_log_level(char *level) +{ + if (!strcasecmp(level, "ERROR")) { + return SPDK_LOG_ERROR; + } else if (!strcasecmp(level, "WARNING")) { + return SPDK_LOG_WARN; + } else if (!strcasecmp(level, "NOTICE")) { + return SPDK_LOG_NOTICE; + } else if (!strcasecmp(level, "INFO")) { + return SPDK_LOG_INFO; + } else if (!strcasecmp(level, "DEBUG")) { + return SPDK_LOG_DEBUG; + } + return -1; +} + +static const char * +_log_get_level_name(int level) +{ + if (level == SPDK_LOG_ERROR) { + return "ERROR"; + } else if (level == SPDK_LOG_WARN) { + return "WARNING"; + } else if (level == SPDK_LOG_NOTICE) { + return "NOTICE"; + } else if (level == SPDK_LOG_INFO) { + return "INFO"; + } else if (level == SPDK_LOG_DEBUG) { + return "DEBUG"; + } + return NULL; +} + +static void +rpc_log_set_print_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_level req = {}; + int level; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_level_decoders, + SPDK_COUNTOF(rpc_log_level_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + level = _parse_log_level(req.level); + if (level == -1) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "tried to set invalid log level\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "invalid log level"); + goto invalid; + } + + spdk_log_set_print_level(level); + free_rpc_log_level(&req); + + w = spdk_jsonrpc_begin_result(request); + 
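+	/* Report success to the RPC client as a plain JSON boolean. */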
spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_log_level(&req); +} +SPDK_RPC_REGISTER("log_set_print_level", rpc_log_set_print_level, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_set_print_level, set_log_print_level) + +static void +rpc_log_get_print_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + int level; + const char *name; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "log_get_print_level requires no parameters"); + return; + } + + level = spdk_log_get_print_level(); + name = _log_get_level_name(level); + if (name == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "invalid log level"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, name); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("log_get_print_level", rpc_log_get_print_level, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_get_print_level, get_log_print_level) + +static void +rpc_log_set_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_level req = {}; + int level; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_level_decoders, + SPDK_COUNTOF(rpc_log_level_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + level = _parse_log_level(req.level); + if (level == -1) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "tried to set invalid log level\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "invalid log level"); + goto invalid; + } + + + spdk_log_set_level(level); + free_rpc_log_level(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_log_level(&req); +} +SPDK_RPC_REGISTER("log_set_level", rpc_log_set_level, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_set_level, set_log_level) + +static void +rpc_log_get_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + int level; + const char *name; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "log_get_level requires no parameters"); + return; + } + + level = spdk_log_get_level(); + name = _log_get_level_name(level); + if (name == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "invalid log level"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, name); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("log_get_level", rpc_log_get_level, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_get_level, get_log_level) + +static void +rpc_log_set_flag(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_flag req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_flag_decoders, + SPDK_COUNTOF(rpc_log_flag_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n"); + 
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + if (req.flag == 0) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "invalid flag 0\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "invalid flag 0"); + goto invalid; + } + + spdk_log_set_flag(req.flag); + free_rpc_log_flag(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_log_flag(&req); +} +SPDK_RPC_REGISTER("log_set_flag", rpc_log_set_flag, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_set_flag, set_log_flag) + +static void +rpc_log_clear_flag(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_flag req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_flag_decoders, + SPDK_COUNTOF(rpc_log_flag_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + if (req.flag == 0) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "Invalid flag 0\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "invalid flag 0"); + goto invalid; + } + + spdk_log_clear_flag(req.flag); + free_rpc_log_flag(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_log_flag(&req); +} +SPDK_RPC_REGISTER("log_clear_flag", rpc_log_clear_flag, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_clear_flag, clear_log_flag) + +static void +rpc_log_get_flags(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_log_flag *flag; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "log_get_flags requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + flag = spdk_log_get_first_flag(); + while (flag) { + spdk_json_write_name(w, flag->name); + spdk_json_write_bool(w, flag->enabled); + flag = spdk_log_get_next_flag(flag); + } + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("log_get_flags", rpc_log_get_flags, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_get_flags, get_log_flags) + +SPDK_LOG_REGISTER_COMPONENT("log_rpc", SPDK_LOG_LOG_RPC) diff --git a/src/spdk/lib/log_rpc/spdk_log_rpc.map b/src/spdk/lib/log_rpc/spdk_log_rpc.map new file mode 100644 index 000000000..8bee6cdd3 --- /dev/null +++ b/src/spdk/lib/log_rpc/spdk_log_rpc.map @@ -0,0 +1,3 @@ +{ + local: *; +}; diff --git a/src/spdk/lib/lvol/Makefile b/src/spdk/lib/lvol/Makefile new file mode 100644 index 000000000..c370a19a5 --- /dev/null +++ b/src/spdk/lib/lvol/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = lvol.c +LIBNAME = lvol + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_lvol.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/lvol/lvol.c b/src/spdk/lib/lvol/lvol.c new file mode 100644 index 000000000..50b42d7b0 --- /dev/null +++ b/src/spdk/lib/lvol/lvol.c @@ -0,0 +1,1509 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk_internal/lvolstore.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/blob_bdev.h" +#include "spdk/util.h" + +/* Default blob channel opts for lvol */ +#define SPDK_LVOL_BLOB_OPTS_CHANNEL_OPS 512 + +#define LVOL_NAME "name" + +SPDK_LOG_REGISTER_COMPONENT("lvol", SPDK_LOG_LVOL) + +static TAILQ_HEAD(, spdk_lvol_store) g_lvol_stores = TAILQ_HEAD_INITIALIZER(g_lvol_stores); +static pthread_mutex_t g_lvol_stores_mutex = PTHREAD_MUTEX_INITIALIZER; + +static int +add_lvs_to_list(struct spdk_lvol_store *lvs) +{ + struct spdk_lvol_store *tmp; + bool name_conflict = false; + + pthread_mutex_lock(&g_lvol_stores_mutex); + TAILQ_FOREACH(tmp, &g_lvol_stores, link) { + if (!strncmp(lvs->name, tmp->name, SPDK_LVS_NAME_MAX)) { + name_conflict = true; + break; + } + } + if (!name_conflict) { + lvs->on_list = true; + TAILQ_INSERT_TAIL(&g_lvol_stores, lvs, link); + } + pthread_mutex_unlock(&g_lvol_stores_mutex); + + return name_conflict ? -1 : 0; +} + +static void +lvs_free(struct spdk_lvol_store *lvs) +{ + pthread_mutex_lock(&g_lvol_stores_mutex); + if (lvs->on_list) { + TAILQ_REMOVE(&g_lvol_stores, lvs, link); + } + pthread_mutex_unlock(&g_lvol_stores_mutex); + + free(lvs); +} + +static void +lvol_free(struct spdk_lvol *lvol) +{ + free(lvol); +} + +static void +lvol_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Failed to open lvol %s\n", lvol->unique_id); + goto end; + } + + lvol->ref_count++; + lvol->blob = blob; +end: + req->cb_fn(req->cb_arg, lvol, lvolerrno); + free(req); +} + +void +spdk_lvol_open(struct spdk_lvol *lvol, spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + struct spdk_blob_open_opts opts; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, NULL, -ENODEV); + return; + } + + if (lvol->action_in_progress == true) { + SPDK_ERRLOG("Cannot open lvol - operations on lvol pending\n"); + cb_fn(cb_arg, lvol, -EBUSY); + return; + } + + if (lvol->ref_count > 0) { + lvol->ref_count++; + cb_fn(cb_arg, lvol, 0); + return; + } + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + SPDK_ERRLOG("Cannot alloc memory for request structure\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_blob_open_opts_init(&opts); + opts.clear_method = lvol->clear_method; + + spdk_bs_open_blob_ext(lvol->lvol_store->blobstore, lvol->blob_id, &opts, lvol_open_cb, req); +} + +static void +bs_unload_with_error_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + + req->cb_fn(req->cb_arg, NULL, req->lvserrno); + free(req); +} + +static void +load_next_lvol(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + struct spdk_lvol *lvol, *tmp; + spdk_blob_id blob_id; + const char *attr; + size_t value_len; + int rc; + + if (lvolerrno == -ENOENT) { + /* Finished iterating */ + req->cb_fn(req->cb_arg, lvs, 0); + free(req); + return; + } else if (lvolerrno < 0) { + SPDK_ERRLOG("Failed to fetch blobs list\n"); + req->lvserrno = lvolerrno; + goto invalid; + } + + blob_id = spdk_blob_get_id(blob); + + if 
(blob_id == lvs->super_blob_id) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "found superblob %"PRIu64"\n", (uint64_t)blob_id); + spdk_bs_iter_next(bs, blob, load_next_lvol, req); + return; + } + + lvol = calloc(1, sizeof(*lvol)); + if (!lvol) { + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + req->lvserrno = -ENOMEM; + goto invalid; + } + + lvol->blob = blob; + lvol->blob_id = blob_id; + lvol->lvol_store = lvs; + lvol->thin_provision = spdk_blob_is_thin_provisioned(blob); + + rc = spdk_blob_get_xattr_value(blob, "uuid", (const void **)&attr, &value_len); + if (rc != 0 || value_len != SPDK_UUID_STRING_LEN || attr[SPDK_UUID_STRING_LEN - 1] != '\0' || + spdk_uuid_parse(&lvol->uuid, attr) != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Missing or corrupt lvol uuid\n"); + memset(&lvol->uuid, 0, sizeof(lvol->uuid)); + } + spdk_uuid_fmt_lower(lvol->uuid_str, sizeof(lvol->uuid_str), &lvol->uuid); + + if (!spdk_mem_all_zero(&lvol->uuid, sizeof(lvol->uuid))) { + snprintf(lvol->unique_id, sizeof(lvol->unique_id), "%s", lvol->uuid_str); + } else { + spdk_uuid_fmt_lower(lvol->unique_id, sizeof(lvol->unique_id), &lvol->lvol_store->uuid); + value_len = strlen(lvol->unique_id); + snprintf(lvol->unique_id + value_len, sizeof(lvol->unique_id) - value_len, "_%"PRIu64, + (uint64_t)blob_id); + } + + rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&attr, &value_len); + if (rc != 0 || value_len > SPDK_LVOL_NAME_MAX) { + SPDK_ERRLOG("Cannot assign lvol name\n"); + lvol_free(lvol); + req->lvserrno = -EINVAL; + goto invalid; + } + + snprintf(lvol->name, sizeof(lvol->name), "%s", attr); + + TAILQ_INSERT_TAIL(&lvs->lvols, lvol, link); + + lvs->lvol_count++; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "added lvol %s (%s)\n", lvol->unique_id, lvol->uuid_str); + + spdk_bs_iter_next(bs, blob, load_next_lvol, req); + + return; + +invalid: + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + TAILQ_REMOVE(&lvs->lvols, lvol, link); + free(lvol); + } + + lvs_free(lvs); + spdk_bs_unload(bs, bs_unload_with_error_cb, req); +} + +static void +close_super_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Could not close super blob\n"); + lvs_free(lvs); + req->lvserrno = -ENODEV; + spdk_bs_unload(bs, bs_unload_with_error_cb, req); + return; + } + + /* Start loading lvols */ + spdk_bs_iter_first(lvs->blobstore, load_next_lvol, req); +} + +static void +close_super_blob_with_error_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + + lvs_free(lvs); + + spdk_bs_unload(bs, bs_unload_with_error_cb, req); +} + +static void +lvs_read_uuid(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + const char *attr; + size_t value_len; + int rc; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Could not open super blob\n"); + lvs_free(lvs); + req->lvserrno = -ENODEV; + spdk_bs_unload(bs, bs_unload_with_error_cb, req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "uuid", (const void **)&attr, &value_len); + if (rc != 0 || value_len != SPDK_UUID_STRING_LEN || 
attr[SPDK_UUID_STRING_LEN - 1] != '\0') { + SPDK_INFOLOG(SPDK_LOG_LVOL, "missing or incorrect UUID\n"); + req->lvserrno = -EINVAL; + spdk_blob_close(blob, close_super_blob_with_error_cb, req); + return; + } + + if (spdk_uuid_parse(&lvs->uuid, attr)) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "incorrect UUID '%s'\n", attr); + req->lvserrno = -EINVAL; + spdk_blob_close(blob, close_super_blob_with_error_cb, req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&attr, &value_len); + if (rc != 0 || value_len > SPDK_LVS_NAME_MAX) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "missing or invalid name\n"); + req->lvserrno = -EINVAL; + spdk_blob_close(blob, close_super_blob_with_error_cb, req); + return; + } + + snprintf(lvs->name, sizeof(lvs->name), "%s", attr); + + rc = add_lvs_to_list(lvs); + if (rc) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "lvolstore with name %s already exists\n", lvs->name); + req->lvserrno = -EEXIST; + spdk_blob_close(blob, close_super_blob_with_error_cb, req); + return; + } + + lvs->super_blob_id = spdk_blob_get_id(blob); + + spdk_blob_close(blob, close_super_cb, req); +} + +static void +lvs_open_super(void *cb_arg, spdk_blob_id blobid, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Super blob not found\n"); + lvs_free(lvs); + req->lvserrno = -ENODEV; + spdk_bs_unload(bs, bs_unload_with_error_cb, req); + return; + } + + spdk_bs_open_blob(bs, blobid, lvs_read_uuid, req); +} + +static void +lvs_load_cb(void *cb_arg, struct spdk_blob_store *bs, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs; + + if (lvolerrno != 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + free(req); + return; + } + + lvs = calloc(1, sizeof(*lvs)); + if (lvs == NULL) { + SPDK_ERRLOG("Cannot alloc memory for lvol store\n"); + spdk_bs_unload(bs, bs_unload_with_error_cb, req); + return; + } + + lvs->blobstore = bs; + lvs->bs_dev = req->bs_dev; + TAILQ_INIT(&lvs->lvols); + TAILQ_INIT(&lvs->pending_lvols); + + req->lvol_store = lvs; + + spdk_bs_get_super(bs, lvs_open_super, req); +} + +static void +lvs_bs_opts_init(struct spdk_bs_opts *opts) +{ + spdk_bs_opts_init(opts); + opts->max_channel_ops = SPDK_LVOL_BLOB_OPTS_CHANNEL_OPS; +} + +void +spdk_lvs_load(struct spdk_bs_dev *bs_dev, spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvs_with_handle_req *req; + struct spdk_bs_opts opts = {}; + + assert(cb_fn != NULL); + + if (bs_dev == NULL) { + SPDK_ERRLOG("Blobstore device does not exist\n"); + cb_fn(cb_arg, NULL, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + SPDK_ERRLOG("Cannot alloc memory for request structure\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->bs_dev = bs_dev; + + lvs_bs_opts_init(&opts); + snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), "LVOLSTORE"); + + spdk_bs_load(bs_dev, &opts, lvs_load_cb, req); +} + +static void +remove_bs_on_error_cb(void *cb_arg, int bserrno) +{ +} + +static void +super_create_close_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Lvol store init failed: could not close super blob\n"); + req->cb_fn(req->cb_arg, NULL, lvolerrno); + 
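+		/* Tear down the partially initialized blobstore; remove_bs_on_error_cb ignores the result. */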
spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + req->cb_fn(req->cb_arg, lvs, lvolerrno); + free(req); +} + +static void +super_blob_set_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob *blob = lvs->super_blob; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not set uuid for super blob\n"); + spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + spdk_blob_close(blob, super_create_close_cb, req); +} + +static void +super_blob_init_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob *blob = lvs->super_blob; + char uuid[SPDK_UUID_STRING_LEN]; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not set super blob\n"); + spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + spdk_uuid_fmt_lower(uuid, sizeof(uuid), &lvs->uuid); + + spdk_blob_set_xattr(blob, "uuid", uuid, sizeof(uuid)); + spdk_blob_set_xattr(blob, "name", lvs->name, strnlen(lvs->name, SPDK_LVS_NAME_MAX) + 1); + spdk_blob_sync_md(blob, super_blob_set_cb, req); +} + +static void +super_blob_create_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not open super blob\n"); + spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + lvs->super_blob = blob; + lvs->super_blob_id = spdk_blob_get_id(blob); + + spdk_bs_set_super(lvs->blobstore, lvs->super_blob_id, super_blob_init_cb, req); +} + +static void +super_blob_create_cb(void *cb_arg, spdk_blob_id blobid, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not create super blob\n"); + spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + bs = req->lvol_store->blobstore; + + spdk_bs_open_blob(bs, blobid, super_blob_create_open_cb, req); +} + +static void +lvs_init_cb(void *cb_arg, struct spdk_blob_store *bs, int lvserrno) +{ + struct spdk_lvs_with_handle_req *lvs_req = cb_arg; + struct spdk_lvol_store *lvs = lvs_req->lvol_store; + + if (lvserrno != 0) { + assert(bs == NULL); + lvs_req->cb_fn(lvs_req->cb_arg, NULL, lvserrno); + SPDK_ERRLOG("Lvol store init failed: could not initialize blobstore\n"); + lvs_free(lvs); + free(lvs_req); + return; + } + + assert(bs != NULL); + lvs->blobstore = bs; + TAILQ_INIT(&lvs->lvols); + TAILQ_INIT(&lvs->pending_lvols); + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store initialized\n"); + + /* create super blob */ + spdk_bs_create_blob(lvs->blobstore, super_blob_create_cb, lvs_req); +} + +void +spdk_lvs_opts_init(struct spdk_lvs_opts *o) +{ + o->cluster_sz = SPDK_LVS_OPTS_CLUSTER_SZ; + o->clear_method = LVS_CLEAR_WITH_UNMAP; + memset(o->name, 0, sizeof(o->name)); +} + +static void +setup_lvs_opts(struct spdk_bs_opts *bs_opts, struct 
spdk_lvs_opts *o) +{ + assert(o != NULL); + lvs_bs_opts_init(bs_opts); + bs_opts->cluster_sz = o->cluster_sz; + bs_opts->clear_method = (enum bs_clear_method)o->clear_method; +} + +int +spdk_lvs_init(struct spdk_bs_dev *bs_dev, struct spdk_lvs_opts *o, + spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_store *lvs; + struct spdk_lvs_with_handle_req *lvs_req; + struct spdk_bs_opts opts = {}; + int rc; + + if (bs_dev == NULL) { + SPDK_ERRLOG("Blobstore device does not exist\n"); + return -ENODEV; + } + + if (o == NULL) { + SPDK_ERRLOG("spdk_lvs_opts not specified\n"); + return -EINVAL; + } + + setup_lvs_opts(&opts, o); + + if (strnlen(o->name, SPDK_LVS_NAME_MAX) == SPDK_LVS_NAME_MAX) { + SPDK_ERRLOG("Name has no null terminator.\n"); + return -EINVAL; + } + + if (strnlen(o->name, SPDK_LVS_NAME_MAX) == 0) { + SPDK_ERRLOG("No name specified.\n"); + return -EINVAL; + } + + lvs = calloc(1, sizeof(*lvs)); + if (!lvs) { + SPDK_ERRLOG("Cannot alloc memory for lvol store base pointer\n"); + return -ENOMEM; + } + + spdk_uuid_generate(&lvs->uuid); + snprintf(lvs->name, sizeof(lvs->name), "%s", o->name); + + rc = add_lvs_to_list(lvs); + if (rc) { + SPDK_ERRLOG("lvolstore with name %s already exists\n", lvs->name); + lvs_free(lvs); + return -EEXIST; + } + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + lvs_free(lvs); + SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n"); + return -ENOMEM; + } + + assert(cb_fn != NULL); + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + lvs_req->lvol_store = lvs; + lvs->bs_dev = bs_dev; + lvs->destruct = false; + + snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), "LVOLSTORE"); + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Initializing lvol store\n"); + spdk_bs_init(bs_dev, &opts, lvs_init_cb, lvs_req); + + return 0; +} + +static void +lvs_rename_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_req *req = cb_arg; + + if (lvolerrno != 0) { + req->lvserrno = lvolerrno; + } + if (req->lvserrno != 0) { + SPDK_ERRLOG("Lvol store rename operation failed\n"); + /* Lvs renaming failed, so we should 'clear' new_name. + * Otherwise it could cause a failure on the next attepmt to change the name to 'new_name' */ + snprintf(req->lvol_store->new_name, + sizeof(req->lvol_store->new_name), + "%s", req->lvol_store->name); + } else { + /* Update lvs name with new_name */ + snprintf(req->lvol_store->name, + sizeof(req->lvol_store->name), + "%s", req->lvol_store->new_name); + } + + req->cb_fn(req->cb_arg, req->lvserrno); + free(req); +} + +static void +lvs_rename_sync_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_req *req = cb_arg; + struct spdk_blob *blob = req->lvol_store->super_blob; + + if (lvolerrno < 0) { + req->lvserrno = lvolerrno; + } + + spdk_blob_close(blob, lvs_rename_cb, req); +} + +static void +lvs_rename_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_req *req = cb_arg; + int rc; + + if (lvolerrno < 0) { + lvs_rename_cb(cb_arg, lvolerrno); + return; + } + + rc = spdk_blob_set_xattr(blob, "name", req->lvol_store->new_name, + strlen(req->lvol_store->new_name) + 1); + if (rc < 0) { + req->lvserrno = rc; + lvs_rename_sync_cb(req, rc); + return; + } + + req->lvol_store->super_blob = blob; + + spdk_blob_sync_md(blob, lvs_rename_sync_cb, req); +} + +void +spdk_lvs_rename(struct spdk_lvol_store *lvs, const char *new_name, + spdk_lvs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvs_req *req; + struct spdk_lvol_store *tmp; + + /* Check if new name is current lvs name. 
+ * If so, return success immediately */ + if (strncmp(lvs->name, new_name, SPDK_LVS_NAME_MAX) == 0) { + cb_fn(cb_arg, 0); + return; + } + + /* Check if new or new_name is already used in other lvs */ + pthread_mutex_lock(&g_lvol_stores_mutex); + TAILQ_FOREACH(tmp, &g_lvol_stores, link) { + if (!strncmp(new_name, tmp->name, SPDK_LVS_NAME_MAX) || + !strncmp(new_name, tmp->new_name, SPDK_LVS_NAME_MAX)) { + pthread_mutex_unlock(&g_lvol_stores_mutex); + cb_fn(cb_arg, -EEXIST); + return; + } + } + pthread_mutex_unlock(&g_lvol_stores_mutex); + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + snprintf(lvs->new_name, sizeof(lvs->new_name), "%s", new_name); + req->lvol_store = lvs; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_bs_open_blob(lvs->blobstore, lvs->super_blob_id, lvs_rename_open_cb, req); +} + +static void +_lvs_unload_cb(void *cb_arg, int lvserrno) +{ + struct spdk_lvs_req *lvs_req = cb_arg; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store unloaded\n"); + assert(lvs_req->cb_fn != NULL); + lvs_req->cb_fn(lvs_req->cb_arg, lvserrno); + free(lvs_req); +} + +int +spdk_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, + void *cb_arg) +{ + struct spdk_lvs_req *lvs_req; + struct spdk_lvol *lvol, *tmp; + + if (lvs == NULL) { + SPDK_ERRLOG("Lvol store is NULL\n"); + return -ENODEV; + } + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + if (lvol->action_in_progress == true) { + SPDK_ERRLOG("Cannot unload lvol store - operations on lvols pending\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } else if (lvol->ref_count != 0) { + SPDK_ERRLOG("Lvols still open on lvol store\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } + } + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + TAILQ_REMOVE(&lvs->lvols, lvol, link); + lvol_free(lvol); + } + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n"); + return -ENOMEM; + } + + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Unloading lvol store\n"); + spdk_bs_unload(lvs->blobstore, _lvs_unload_cb, lvs_req); + lvs_free(lvs); + + return 0; +} + +static void +_lvs_destroy_cb(void *cb_arg, int lvserrno) +{ + struct spdk_lvs_destroy_req *lvs_req = cb_arg; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store destroyed\n"); + assert(lvs_req->cb_fn != NULL); + lvs_req->cb_fn(lvs_req->cb_arg, lvserrno); + free(lvs_req); +} + +static void +_lvs_destroy_super_cb(void *cb_arg, int bserrno) +{ + struct spdk_lvs_destroy_req *lvs_req = cb_arg; + struct spdk_lvol_store *lvs = lvs_req->lvs; + + assert(lvs != NULL); + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Destroying lvol store\n"); + spdk_bs_destroy(lvs->blobstore, _lvs_destroy_cb, lvs_req); + lvs_free(lvs); +} + +int +spdk_lvs_destroy(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, + void *cb_arg) +{ + struct spdk_lvs_destroy_req *lvs_req; + struct spdk_lvol *iter_lvol, *tmp; + + if (lvs == NULL) { + SPDK_ERRLOG("Lvol store is NULL\n"); + return -ENODEV; + } + + TAILQ_FOREACH_SAFE(iter_lvol, &lvs->lvols, link, tmp) { + if (iter_lvol->action_in_progress == true) { + SPDK_ERRLOG("Cannot destroy lvol store - operations on lvols pending\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } else if (iter_lvol->ref_count != 0) { + SPDK_ERRLOG("Lvols still open on lvol store\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } + } + + TAILQ_FOREACH_SAFE(iter_lvol, &lvs->lvols, link, tmp) { + 
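+		/* The checks above guarantee no lvol is open or busy, so just release the memory. */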
free(iter_lvol); + } + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n"); + return -ENOMEM; + } + + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + lvs_req->lvs = lvs; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Deleting super blob\n"); + spdk_bs_delete_blob(lvs->blobstore, lvs->super_blob_id, _lvs_destroy_super_cb, lvs_req); + + return 0; +} + +static void +lvol_close_blob_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Could not close blob on lvol\n"); + lvol_free(lvol); + goto end; + } + + lvol->ref_count--; + lvol->action_in_progress = false; + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol %s closed\n", lvol->unique_id); + +end: + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +bool +spdk_lvol_deletable(struct spdk_lvol *lvol) +{ + size_t count = 0; + + spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count); + return (count == 0); +} + +static void +lvol_delete_blob_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Could not remove blob on lvol gracefully - forced removal\n"); + } else { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol %s deleted\n", lvol->unique_id); + } + + TAILQ_REMOVE(&lvol->lvol_store->lvols, lvol, link); + lvol_free(lvol); + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +static void +lvol_create_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + TAILQ_REMOVE(&req->lvol->lvol_store->pending_lvols, req->lvol, link); + + if (lvolerrno < 0) { + free(lvol); + req->cb_fn(req->cb_arg, NULL, lvolerrno); + free(req); + return; + } + + lvol->blob = blob; + lvol->blob_id = spdk_blob_get_id(blob); + + TAILQ_INSERT_TAIL(&lvol->lvol_store->lvols, lvol, link); + + snprintf(lvol->unique_id, sizeof(lvol->unique_id), "%s", lvol->uuid_str); + lvol->ref_count++; + + assert(req->cb_fn != NULL); + req->cb_fn(req->cb_arg, req->lvol, lvolerrno); + free(req); +} + +static void +lvol_create_cb(void *cb_arg, spdk_blob_id blobid, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + struct spdk_blob_store *bs; + struct spdk_blob_open_opts opts; + + if (lvolerrno < 0) { + TAILQ_REMOVE(&req->lvol->lvol_store->pending_lvols, req->lvol, link); + free(req->lvol); + assert(req->cb_fn != NULL); + req->cb_fn(req->cb_arg, NULL, lvolerrno); + free(req); + return; + } + + spdk_blob_open_opts_init(&opts); + opts.clear_method = req->lvol->clear_method; + bs = req->lvol->lvol_store->blobstore; + + spdk_bs_open_blob_ext(bs, blobid, &opts, lvol_create_open_cb, req); +} + +static void +lvol_get_xattr_value(void *xattr_ctx, const char *name, + const void **value, size_t *value_len) +{ + struct spdk_lvol *lvol = xattr_ctx; + + if (!strcmp(LVOL_NAME, name)) { + *value = lvol->name; + *value_len = SPDK_LVOL_NAME_MAX; + } else if (!strcmp("uuid", name)) { + *value = lvol->uuid_str; + *value_len = sizeof(lvol->uuid_str); + } +} + +static int +lvs_verify_lvol_name(struct spdk_lvol_store *lvs, const char *name) +{ + struct spdk_lvol *tmp; + + if (name == NULL || strnlen(name, SPDK_LVOL_NAME_MAX) == 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "lvol name not provided.\n"); + return -EINVAL; + } + + if (strnlen(name, SPDK_LVOL_NAME_MAX) == SPDK_LVOL_NAME_MAX) { + SPDK_ERRLOG("Name has no null terminator.\n"); + return 
-EINVAL; + } + + TAILQ_FOREACH(tmp, &lvs->lvols, link) { + if (!strncmp(name, tmp->name, SPDK_LVOL_NAME_MAX)) { + SPDK_ERRLOG("lvol with name %s already exists\n", name); + return -EEXIST; + } + } + + TAILQ_FOREACH(tmp, &lvs->pending_lvols, link) { + if (!strncmp(name, tmp->name, SPDK_LVOL_NAME_MAX)) { + SPDK_ERRLOG("lvol with name %s is being already created\n", name); + return -EEXIST; + } + } + + return 0; +} + +int +spdk_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz, + bool thin_provision, enum lvol_clear_method clear_method, spdk_lvol_op_with_handle_complete cb_fn, + void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + struct spdk_blob_store *bs; + struct spdk_lvol *lvol; + struct spdk_blob_opts opts; + uint64_t num_clusters; + char *xattr_names[] = {LVOL_NAME, "uuid"}; + int rc; + + if (lvs == NULL) { + SPDK_ERRLOG("lvol store does not exist\n"); + return -EINVAL; + } + + rc = lvs_verify_lvol_name(lvs, name); + if (rc < 0) { + return rc; + } + + bs = lvs->blobstore; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + return -ENOMEM; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + lvol = calloc(1, sizeof(*lvol)); + if (!lvol) { + free(req); + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + return -ENOMEM; + } + lvol->lvol_store = lvs; + num_clusters = spdk_divide_round_up(sz, spdk_bs_get_cluster_size(bs)); + lvol->thin_provision = thin_provision; + lvol->clear_method = (enum blob_clear_method)clear_method; + snprintf(lvol->name, sizeof(lvol->name), "%s", name); + TAILQ_INSERT_TAIL(&lvol->lvol_store->pending_lvols, lvol, link); + spdk_uuid_generate(&lvol->uuid); + spdk_uuid_fmt_lower(lvol->uuid_str, sizeof(lvol->uuid_str), &lvol->uuid); + req->lvol = lvol; + + spdk_blob_opts_init(&opts); + opts.thin_provision = thin_provision; + opts.num_clusters = num_clusters; + opts.clear_method = lvol->clear_method; + opts.xattrs.count = SPDK_COUNTOF(xattr_names); + opts.xattrs.names = xattr_names; + opts.xattrs.ctx = lvol; + opts.xattrs.get_value = lvol_get_xattr_value; + + spdk_bs_create_blob_ext(lvs->blobstore, &opts, lvol_create_cb, req); + + return 0; +} + +void +spdk_lvol_create_snapshot(struct spdk_lvol *origlvol, const char *snapshot_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_store *lvs; + struct spdk_lvol *newlvol; + struct spdk_blob *origblob; + struct spdk_lvol_with_handle_req *req; + struct spdk_blob_xattr_opts snapshot_xattrs; + char *xattr_names[] = {LVOL_NAME, "uuid"}; + int rc; + + if (origlvol == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol not provided.\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + origblob = origlvol->blob; + lvs = origlvol->lvol_store; + if (lvs == NULL) { + SPDK_ERRLOG("lvol store does not exist\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = lvs_verify_lvol_name(lvs, snapshot_name); + if (rc < 0) { + cb_fn(cb_arg, NULL, rc); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol = calloc(1, sizeof(*newlvol)); + if (!newlvol) { + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + free(req); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol->lvol_store = origlvol->lvol_store; + snprintf(newlvol->name, sizeof(newlvol->name), "%s", snapshot_name); + TAILQ_INSERT_TAIL(&newlvol->lvol_store->pending_lvols, newlvol, link); + 
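/* The new lvol stays on the pending_lvols list until the blob is opened and
 * lvol_create_open_cb() moves it onto lvs->lvols (or frees it on error), which
 * is what lets lvs_verify_lvol_name() reject names that are still being created.
 *
 * Illustrative call into the creation path above (callback name and size are
 * hypothetical; the clear method value is assumed from spdk/lvol.h):
 *
 *   static void app_lvol_done(void *ctx, struct spdk_lvol *lvol, int rc);
 *
 *   spdk_lvol_create(lvs, "lvol0", 10 * 1024 * 1024, true,
 *                    LVOL_CLEAR_WITH_DEFAULT, app_lvol_done, NULL);
 */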
spdk_uuid_generate(&newlvol->uuid); + spdk_uuid_fmt_lower(newlvol->uuid_str, sizeof(newlvol->uuid_str), &newlvol->uuid); + snapshot_xattrs.count = SPDK_COUNTOF(xattr_names); + snapshot_xattrs.ctx = newlvol; + snapshot_xattrs.names = xattr_names; + snapshot_xattrs.get_value = lvol_get_xattr_value; + req->lvol = newlvol; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_bs_create_snapshot(lvs->blobstore, spdk_blob_get_id(origblob), &snapshot_xattrs, + lvol_create_cb, req); +} + +void +spdk_lvol_create_clone(struct spdk_lvol *origlvol, const char *clone_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol *newlvol; + struct spdk_lvol_with_handle_req *req; + struct spdk_lvol_store *lvs; + struct spdk_blob *origblob; + struct spdk_blob_xattr_opts clone_xattrs; + char *xattr_names[] = {LVOL_NAME, "uuid"}; + int rc; + + if (origlvol == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol not provided.\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + origblob = origlvol->blob; + lvs = origlvol->lvol_store; + if (lvs == NULL) { + SPDK_ERRLOG("lvol store does not exist\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = lvs_verify_lvol_name(lvs, clone_name); + if (rc < 0) { + cb_fn(cb_arg, NULL, rc); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol = calloc(1, sizeof(*newlvol)); + if (!newlvol) { + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + free(req); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol->lvol_store = lvs; + snprintf(newlvol->name, sizeof(newlvol->name), "%s", clone_name); + TAILQ_INSERT_TAIL(&newlvol->lvol_store->pending_lvols, newlvol, link); + spdk_uuid_generate(&newlvol->uuid); + spdk_uuid_fmt_lower(newlvol->uuid_str, sizeof(newlvol->uuid_str), &newlvol->uuid); + clone_xattrs.count = SPDK_COUNTOF(xattr_names); + clone_xattrs.ctx = newlvol; + clone_xattrs.names = xattr_names; + clone_xattrs.get_value = lvol_get_xattr_value; + req->lvol = newlvol; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_bs_create_clone(lvs->blobstore, spdk_blob_get_id(origblob), &clone_xattrs, + lvol_create_cb, + req); +} + +static void +lvol_resize_done(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +static void +lvol_blob_resize_cb(void *cb_arg, int bserrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (bserrno != 0) { + req->cb_fn(req->cb_arg, bserrno); + free(req); + return; + } + + spdk_blob_sync_md(lvol->blob, lvol_resize_done, req); +} + +void +spdk_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, + spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_blob *blob = lvol->blob; + struct spdk_lvol_store *lvs = lvol->lvol_store; + struct spdk_lvol_req *req; + uint64_t new_clusters = spdk_divide_round_up(sz, spdk_bs_get_cluster_size(lvs->blobstore)); + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_blob_resize(blob, new_clusters, lvol_blob_resize_cb, req); +} + +static void +lvol_set_read_only_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +spdk_lvol_set_read_only(struct spdk_lvol *lvol, 
spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_blob_set_read_only(lvol->blob); + spdk_blob_sync_md(lvol->blob, lvol_set_read_only_cb, req); +} + +static void +lvol_rename_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + if (lvolerrno != 0) { + SPDK_ERRLOG("Lvol rename operation failed\n"); + } else { + snprintf(req->lvol->name, sizeof(req->lvol->name), "%s", req->name); + } + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +spdk_lvol_rename(struct spdk_lvol *lvol, const char *new_name, + spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol *tmp; + struct spdk_blob *blob = lvol->blob; + struct spdk_lvol_req *req; + int rc; + + /* Check if new name is current lvol name. + * If so, return success immediately */ + if (strncmp(lvol->name, new_name, SPDK_LVOL_NAME_MAX) == 0) { + cb_fn(cb_arg, 0); + return; + } + + /* Check if lvol with 'new_name' already exists in lvolstore */ + TAILQ_FOREACH(tmp, &lvol->lvol_store->lvols, link) { + if (strncmp(tmp->name, new_name, SPDK_LVOL_NAME_MAX) == 0) { + SPDK_ERRLOG("Lvol %s already exists in lvol store %s\n", new_name, lvol->lvol_store->name); + cb_fn(cb_arg, -EEXIST); + return; + } + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + snprintf(req->name, sizeof(req->name), "%s", new_name); + + rc = spdk_blob_set_xattr(blob, "name", new_name, strlen(new_name) + 1); + if (rc < 0) { + free(req); + cb_fn(cb_arg, rc); + return; + } + + spdk_blob_sync_md(blob, lvol_rename_cb, req); +} + +void +spdk_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + struct spdk_blob_store *bs; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + if (lvol->ref_count != 0) { + SPDK_ERRLOG("Cannot destroy lvol %s because it is still open\n", lvol->unique_id); + cb_fn(cb_arg, -EBUSY); + return; + } + + lvol->action_in_progress = true; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + bs = lvol->lvol_store->blobstore; + + spdk_bs_delete_blob(bs, lvol->blob_id, lvol_delete_blob_cb, req); +} + +void +spdk_lvol_close(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + if (lvol->ref_count > 1) { + lvol->ref_count--; + cb_fn(cb_arg, 0); + return; + } else if (lvol->ref_count == 0) { + cb_fn(cb_arg, -EINVAL); + return; + } + + lvol->action_in_progress = true; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_blob_close(lvol->blob, lvol_close_blob_cb, req); +} + +struct spdk_io_channel * +spdk_lvol_get_io_channel(struct spdk_lvol *lvol) +{ + return 
spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore); +} + +static void +lvol_inflate_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + spdk_bs_free_io_channel(req->channel); + + if (lvolerrno < 0) { + SPDK_ERRLOG("Could not inflate lvol\n"); + } + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +spdk_lvol_inflate(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + spdk_blob_id blob_id; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("Lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->channel = spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore); + if (req->channel == NULL) { + SPDK_ERRLOG("Cannot alloc io channel for lvol inflate request\n"); + free(req); + cb_fn(cb_arg, -ENOMEM); + return; + } + + blob_id = spdk_blob_get_id(lvol->blob); + spdk_bs_inflate_blob(lvol->lvol_store->blobstore, req->channel, blob_id, lvol_inflate_cb, + req); +} + +void +spdk_lvol_decouple_parent(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + spdk_blob_id blob_id; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("Lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->channel = spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore); + if (req->channel == NULL) { + SPDK_ERRLOG("Cannot alloc io channel for lvol inflate request\n"); + free(req); + cb_fn(cb_arg, -ENOMEM); + return; + } + + blob_id = spdk_blob_get_id(lvol->blob); + spdk_bs_blob_decouple_parent(lvol->lvol_store->blobstore, req->channel, blob_id, + lvol_inflate_cb, req); +} diff --git a/src/spdk/lib/lvol/spdk_lvol.map b/src/spdk/lib/lvol/spdk_lvol.map new file mode 100644 index 000000000..6ddeb3be6 --- /dev/null +++ b/src/spdk/lib/lvol/spdk_lvol.map @@ -0,0 +1,28 @@ +{ + global: + + # public functions + spdk_lvs_opts_init; + spdk_lvs_init; + spdk_lvs_rename; + spdk_lvs_unload; + spdk_lvs_destroy; + spdk_lvol_create; + spdk_lvol_create_snapshot; + spdk_lvol_create_clone; + spdk_lvol_rename; + spdk_lvol_deletable; + spdk_lvol_destroy; + spdk_lvol_close; + spdk_lvol_get_io_channel; + spdk_lvs_load; + spdk_lvol_open; + spdk_lvol_inflate; + spdk_lvol_decouple_parent; + + # internal functions + spdk_lvol_resize; + spdk_lvol_set_read_only; + + local: *; +}; diff --git a/src/spdk/lib/nbd/Makefile b/src/spdk/lib/nbd/Makefile new file mode 100644 index 000000000..69b13d133 --- /dev/null +++ b/src/spdk/lib/nbd/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +LIBNAME = nbd +C_SRCS = nbd.c nbd_rpc.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nbd.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nbd/nbd.c b/src/spdk/lib/nbd/nbd.c new file mode 100644 index 000000000..7d96b9315 --- /dev/null +++ b/src/spdk/lib/nbd/nbd.c @@ -0,0 +1,1093 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" + +#include <linux/nbd.h> + +#include "spdk/nbd.h" +#include "nbd_internal.h" +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/util.h" +#include "spdk/thread.h" + +#include "spdk_internal/log.h" +#include "spdk/queue.h" + +#define GET_IO_LOOP_COUNT 16 +#define NBD_BUSY_WAITING_MS 1000 +#define NBD_BUSY_POLLING_INTERVAL_US 20000 + +enum nbd_io_state_t { + /* Receiving or ready to receive nbd request header */ + NBD_IO_RECV_REQ = 0, + /* Receiving write payload */ + NBD_IO_RECV_PAYLOAD, + /* Transmitting or ready to transmit nbd response header */ + NBD_IO_XMIT_RESP, + /* Transmitting read payload */ + NBD_IO_XMIT_PAYLOAD, +}; + +struct nbd_io { + struct spdk_nbd_disk *nbd; + enum nbd_io_state_t state; + + void *payload; + uint32_t payload_size; + + struct nbd_request req; + struct nbd_reply resp; + + /* + * Tracks current progress on reading/writing a request, + * response, or payload from the nbd socket. + */ + uint32_t offset; + + /* for bdev io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + + TAILQ_ENTRY(nbd_io) tailq; +}; + +enum nbd_disk_state_t { + NBD_DISK_STATE_RUNNING = 0, + /* soft disconnection caused by receiving nbd_cmd_disc */ + NBD_DISK_STATE_SOFTDISC, + /* hard disconnection caused by mandatory conditions */ + NBD_DISK_STATE_HARDDISC, +}; + +struct spdk_nbd_disk { + struct spdk_bdev *bdev; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *ch; + int dev_fd; + char *nbd_path; + int kernel_sp_fd; + int spdk_sp_fd; + struct spdk_poller *nbd_poller; + uint32_t buf_align; + + struct nbd_io *io_in_recv; + TAILQ_HEAD(, nbd_io) received_io_list; + TAILQ_HEAD(, nbd_io) executed_io_list; + + enum nbd_disk_state_t state; + /* count of nbd_io in spdk_nbd_disk */ + int io_count; + + TAILQ_ENTRY(spdk_nbd_disk) tailq; +}; + +struct spdk_nbd_disk_globals { + TAILQ_HEAD(, spdk_nbd_disk) disk_head; +}; + +static struct spdk_nbd_disk_globals g_spdk_nbd; + +static int +nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io); + +int +spdk_nbd_init(void) +{ + TAILQ_INIT(&g_spdk_nbd.disk_head); + + return 0; +} + +void +spdk_nbd_fini(void) +{ + struct spdk_nbd_disk *nbd_idx, *nbd_tmp; + + /* + * Stop running spdk_nbd_disk. + * Here, nbd removing are unnecessary, but _SAFE variant + * is needed, since internal nbd_disk_unregister will + * remove nbd from TAILQ. + */ + TAILQ_FOREACH_SAFE(nbd_idx, &g_spdk_nbd.disk_head, tailq, nbd_tmp) { + spdk_nbd_stop(nbd_idx); + } +} + +static int +nbd_disk_register(struct spdk_nbd_disk *nbd) +{ + if (nbd_disk_find_by_nbd_path(nbd->nbd_path)) { + SPDK_NOTICELOG("%s is already exported\n", nbd->nbd_path); + return -EBUSY; + } + + TAILQ_INSERT_TAIL(&g_spdk_nbd.disk_head, nbd, tailq); + + return 0; +} + +static void +nbd_disk_unregister(struct spdk_nbd_disk *nbd) +{ + struct spdk_nbd_disk *nbd_idx, *nbd_tmp; + + /* + * nbd disk may be stopped before registered. + * check whether it was registered. + */ + TAILQ_FOREACH_SAFE(nbd_idx, &g_spdk_nbd.disk_head, tailq, nbd_tmp) { + if (nbd == nbd_idx) { + TAILQ_REMOVE(&g_spdk_nbd.disk_head, nbd_idx, tailq); + break; + } + } +} + +struct spdk_nbd_disk * +nbd_disk_find_by_nbd_path(const char *nbd_path) +{ + struct spdk_nbd_disk *nbd; + + /* + * check whether nbd has already been registered by nbd path. 
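 * Each exported bdev registers exactly one entry, so a linear scan of this
 * short list is sufficient.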
+ */ + TAILQ_FOREACH(nbd, &g_spdk_nbd.disk_head, tailq) { + if (!strcmp(nbd->nbd_path, nbd_path)) { + return nbd; + } + } + + return NULL; +} + +struct spdk_nbd_disk *nbd_disk_first(void) +{ + return TAILQ_FIRST(&g_spdk_nbd.disk_head); +} + +struct spdk_nbd_disk *nbd_disk_next(struct spdk_nbd_disk *prev) +{ + return TAILQ_NEXT(prev, tailq); +} + +const char * +nbd_disk_get_nbd_path(struct spdk_nbd_disk *nbd) +{ + return nbd->nbd_path; +} + +const char * +nbd_disk_get_bdev_name(struct spdk_nbd_disk *nbd) +{ + return spdk_bdev_get_name(nbd->bdev); +} + +void +spdk_nbd_write_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_nbd_disk *nbd; + + spdk_json_write_array_begin(w); + + TAILQ_FOREACH(nbd, &g_spdk_nbd.disk_head, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "nbd_start_disk"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "nbd_device", nbd_disk_get_nbd_path(nbd)); + spdk_json_write_named_string(w, "bdev_name", nbd_disk_get_bdev_name(nbd)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +void +nbd_disconnect(struct spdk_nbd_disk *nbd) +{ + /* + * nbd soft-disconnection to terminate transmission phase. + * After receiving this ioctl command, nbd kernel module will send + * a NBD_CMD_DISC type io to nbd server in order to inform server. + */ + ioctl(nbd->dev_fd, NBD_DISCONNECT); +} + +static struct nbd_io * +nbd_get_io(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + + io = calloc(1, sizeof(*io)); + if (!io) { + return NULL; + } + + io->nbd = nbd; + to_be32(&io->resp.magic, NBD_REPLY_MAGIC); + + nbd->io_count++; + + return io; +} + +static void +nbd_put_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) +{ + if (io->payload) { + spdk_free(io->payload); + } + free(io); + + nbd->io_count--; +} + +/* + * Check whether received nbd_io are all transmitted. + * + * \return 1 there is still some nbd_io not transmitted. + * 0 all nbd_io received are transmitted. + */ +static int +nbd_io_xmit_check(struct spdk_nbd_disk *nbd) +{ + if (nbd->io_count == 0) { + return 0; + } else if (nbd->io_count == 1 && nbd->io_in_recv != NULL) { + return 0; + } + + return 1; +} + +/* + * Check whether received nbd_io are all executed, + * and put back executed nbd_io instead of transmitting them + * + * \return 1 there is still some nbd_io under executing + * 0 all nbd_io gotten are freed. + */ +static int +nbd_cleanup_io(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io, *io_tmp; + + /* free io_in_recv */ + if (nbd->io_in_recv != NULL) { + nbd_put_io(nbd, nbd->io_in_recv); + nbd->io_in_recv = NULL; + } + + /* free io in received_io_list */ + if (!TAILQ_EMPTY(&nbd->received_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->received_io_list, io, tailq); + nbd_put_io(nbd, io); + } + } + + /* free io in executed_io_list */ + if (!TAILQ_EMPTY(&nbd->executed_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->executed_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->executed_io_list, io, tailq); + nbd_put_io(nbd, io); + } + } + + /* + * Some nbd_io may be under executing in bdev. + * Wait for their done operation. 
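 * nbd_io_done() repeats this cleanup check as each outstanding bdev I/O
 * completes and, in the hard-disconnect case, calls _nbd_stop() once the
 * last one finishes.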
+ */ + if (nbd->io_count != 0) { + return 1; + } + + return 0; +} + +static void +_nbd_stop(struct spdk_nbd_disk *nbd) +{ + if (nbd->ch) { + spdk_put_io_channel(nbd->ch); + } + + if (nbd->bdev_desc) { + spdk_bdev_close(nbd->bdev_desc); + } + + if (nbd->spdk_sp_fd >= 0) { + close(nbd->spdk_sp_fd); + } + + if (nbd->kernel_sp_fd >= 0) { + close(nbd->kernel_sp_fd); + } + + if (nbd->dev_fd >= 0) { + /* Clear nbd device only if it is occupied by SPDK app */ + if (nbd->nbd_path && nbd_disk_find_by_nbd_path(nbd->nbd_path)) { + ioctl(nbd->dev_fd, NBD_CLEAR_QUE); + ioctl(nbd->dev_fd, NBD_CLEAR_SOCK); + } + close(nbd->dev_fd); + } + + if (nbd->nbd_path) { + free(nbd->nbd_path); + } + + if (nbd->nbd_poller) { + spdk_poller_unregister(&nbd->nbd_poller); + } + + nbd_disk_unregister(nbd); + + free(nbd); +} + +void +spdk_nbd_stop(struct spdk_nbd_disk *nbd) +{ + if (nbd == NULL) { + return; + } + + nbd->state = NBD_DISK_STATE_HARDDISC; + + /* + * Stop action should be called only after all nbd_io are executed. + */ + if (!nbd_cleanup_io(nbd)) { + _nbd_stop(nbd); + } +} + +static int64_t +read_from_socket(int fd, void *buf, size_t length) +{ + ssize_t bytes_read; + + bytes_read = read(fd, buf, length); + if (bytes_read == 0) { + return -EIO; + } else if (bytes_read == -1) { + if (errno != EAGAIN) { + return -errno; + } + return 0; + } else { + return bytes_read; + } +} + +static int64_t +write_to_socket(int fd, void *buf, size_t length) +{ + ssize_t bytes_written; + + bytes_written = write(fd, buf, length); + if (bytes_written == 0) { + return -EIO; + } else if (bytes_written == -1) { + if (errno != EAGAIN) { + return -errno; + } + return 0; + } else { + return bytes_written; + } +} + +static void +nbd_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct nbd_io *io = cb_arg; + struct spdk_nbd_disk *nbd = io->nbd; + + if (success) { + io->resp.error = 0; + } else { + to_be32(&io->resp.error, EIO); + } + + memcpy(&io->resp.handle, &io->req.handle, sizeof(io->resp.handle)); + TAILQ_INSERT_TAIL(&nbd->executed_io_list, io, tailq); + + if (bdev_io != NULL) { + spdk_bdev_free_io(bdev_io); + } + + if (nbd->state == NBD_DISK_STATE_HARDDISC && !nbd_cleanup_io(nbd)) { + _nbd_stop(nbd); + } +} + +static void +nbd_resubmit_io(void *arg) +{ + struct nbd_io *io = (struct nbd_io *)arg; + struct spdk_nbd_disk *nbd = io->nbd; + int rc = 0; + + rc = nbd_submit_bdev_io(nbd, io); + if (rc) { + SPDK_INFOLOG(SPDK_LOG_NBD, "nbd: io resubmit for dev %s , io_type %d, returned %d.\n", + nbd_disk_get_bdev_name(nbd), from_be32(&io->req.type), rc); + } +} + +static void +nbd_queue_io(struct nbd_io *io) +{ + int rc; + struct spdk_bdev *bdev = io->nbd->bdev; + + io->bdev_io_wait.bdev = bdev; + io->bdev_io_wait.cb_fn = nbd_resubmit_io; + io->bdev_io_wait.cb_arg = io; + + rc = spdk_bdev_queue_io_wait(bdev, io->nbd->ch, &io->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in nbd_queue_io, rc=%d.\n", rc); + nbd_io_done(NULL, false, io); + } +} + +static int +nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) +{ + struct spdk_bdev_desc *desc = nbd->bdev_desc; + struct spdk_io_channel *ch = nbd->ch; + int rc = 0; + + switch (from_be32(&io->req.type)) { + case NBD_CMD_READ: + rc = spdk_bdev_read(desc, ch, io->payload, from_be64(&io->req.from), + io->payload_size, nbd_io_done, io); + break; + case NBD_CMD_WRITE: + rc = spdk_bdev_write(desc, ch, io->payload, from_be64(&io->req.from), + io->payload_size, nbd_io_done, io); + break; +#ifdef NBD_FLAG_SEND_FLUSH + case NBD_CMD_FLUSH: + rc = 
spdk_bdev_flush(desc, ch, 0, + spdk_bdev_get_num_blocks(nbd->bdev) * spdk_bdev_get_block_size(nbd->bdev), + nbd_io_done, io); + break; +#endif +#ifdef NBD_FLAG_SEND_TRIM + case NBD_CMD_TRIM: + rc = spdk_bdev_unmap(desc, ch, from_be64(&io->req.from), + from_be32(&io->req.len), nbd_io_done, io); + break; +#endif + case NBD_CMD_DISC: + nbd_put_io(nbd, io); + nbd->state = NBD_DISK_STATE_SOFTDISC; + break; + default: + rc = -1; + } + + if (rc < 0) { + if (rc == -ENOMEM) { + SPDK_INFOLOG(SPDK_LOG_NBD, "No memory, start to queue io.\n"); + nbd_queue_io(io); + } else { + SPDK_ERRLOG("nbd io failed in nbd_queue_io, rc=%d.\n", rc); + nbd_io_done(NULL, false, io); + } + } + + return 0; +} + +static int +nbd_io_exec(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io, *io_tmp; + int io_count = 0; + int ret = 0; + + /* + * For soft disconnection, nbd server must handle all outstanding + * request before closing connection. + */ + if (nbd->state == NBD_DISK_STATE_HARDDISC) { + return 0; + } + + if (!TAILQ_EMPTY(&nbd->received_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->received_io_list, io, tailq); + ret = nbd_submit_bdev_io(nbd, io); + if (ret < 0) { + return ret; + } + + io_count++; + } + } + + return io_count; +} + +static int +nbd_io_recv_internal(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + int ret = 0; + int received = 0; + + if (nbd->io_in_recv == NULL) { + nbd->io_in_recv = nbd_get_io(nbd); + if (!nbd->io_in_recv) { + return -ENOMEM; + } + } + + io = nbd->io_in_recv; + + if (io->state == NBD_IO_RECV_REQ) { + ret = read_from_socket(nbd->spdk_sp_fd, (char *)&io->req + io->offset, + sizeof(io->req) - io->offset); + if (ret < 0) { + nbd_put_io(nbd, io); + nbd->io_in_recv = NULL; + return ret; + } + + io->offset += ret; + received = ret; + + /* request is fully received */ + if (io->offset == sizeof(io->req)) { + io->offset = 0; + + /* req magic check */ + if (from_be32(&io->req.magic) != NBD_REQUEST_MAGIC) { + SPDK_ERRLOG("invalid request magic\n"); + nbd_put_io(nbd, io); + nbd->io_in_recv = NULL; + return -EINVAL; + } + + /* io except read/write should ignore payload */ + if (from_be32(&io->req.type) == NBD_CMD_WRITE || + from_be32(&io->req.type) == NBD_CMD_READ) { + io->payload_size = from_be32(&io->req.len); + } else { + io->payload_size = 0; + } + + /* io payload allocate */ + if (io->payload_size) { + io->payload = spdk_malloc(io->payload_size, nbd->buf_align, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (io->payload == NULL) { + SPDK_ERRLOG("could not allocate io->payload of size %d\n", io->payload_size); + nbd_put_io(nbd, io); + nbd->io_in_recv = NULL; + return -ENOMEM; + } + } else { + io->payload = NULL; + } + + /* next io step */ + if (from_be32(&io->req.type) == NBD_CMD_WRITE) { + io->state = NBD_IO_RECV_PAYLOAD; + } else { + io->state = NBD_IO_XMIT_RESP; + nbd->io_in_recv = NULL; + TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq); + } + } + } + + if (io->state == NBD_IO_RECV_PAYLOAD) { + ret = read_from_socket(nbd->spdk_sp_fd, io->payload + io->offset, io->payload_size - io->offset); + if (ret < 0) { + nbd_put_io(nbd, io); + nbd->io_in_recv = NULL; + return ret; + } + + io->offset += ret; + received += ret; + + /* request payload is fully received */ + if (io->offset == io->payload_size) { + io->offset = 0; + io->state = NBD_IO_XMIT_RESP; + nbd->io_in_recv = NULL; + TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq); + } + + } + + return received; +} + +static int +nbd_io_recv(struct spdk_nbd_disk *nbd) +{ + int 
i, rc, ret = 0; + + /* + * nbd server should not accept request in both soft and hard + * disconnect states. + */ + if (nbd->state != NBD_DISK_STATE_RUNNING) { + return 0; + } + + for (i = 0; i < GET_IO_LOOP_COUNT; i++) { + rc = nbd_io_recv_internal(nbd); + if (rc < 0) { + return rc; + } + ret += rc; + } + + return ret; +} + +static int +nbd_io_xmit_internal(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + int ret = 0; + int sent = 0; + + io = TAILQ_FIRST(&nbd->executed_io_list); + if (io == NULL) { + return 0; + } + + /* Remove IO from list now assuming it will be completed. It will be inserted + * back to the head if it cannot be completed. This approach is specifically + * taken to work around a scan-build use-after-free mischaracterization. + */ + TAILQ_REMOVE(&nbd->executed_io_list, io, tailq); + + /* resp error and handler are already set in io_done */ + + if (io->state == NBD_IO_XMIT_RESP) { + ret = write_to_socket(nbd->spdk_sp_fd, (char *)&io->resp + io->offset, + sizeof(io->resp) - io->offset); + if (ret <= 0) { + goto reinsert; + } + + io->offset += ret; + sent = ret; + + /* response is fully transmitted */ + if (io->offset == sizeof(io->resp)) { + io->offset = 0; + + /* transmit payload only when NBD_CMD_READ with no resp error */ + if (from_be32(&io->req.type) != NBD_CMD_READ || io->resp.error != 0) { + nbd_put_io(nbd, io); + return 0; + } else { + io->state = NBD_IO_XMIT_PAYLOAD; + } + } + } + + if (io->state == NBD_IO_XMIT_PAYLOAD) { + ret = write_to_socket(nbd->spdk_sp_fd, io->payload + io->offset, io->payload_size - io->offset); + if (ret <= 0) { + goto reinsert; + } + + io->offset += ret; + sent += ret; + + /* read payload is fully transmitted */ + if (io->offset == io->payload_size) { + nbd_put_io(nbd, io); + return sent; + } + } + +reinsert: + TAILQ_INSERT_HEAD(&nbd->executed_io_list, io, tailq); + return ret < 0 ? ret : sent; +} + +static int +nbd_io_xmit(struct spdk_nbd_disk *nbd) +{ + int ret = 0; + int rc; + + /* + * For soft disconnection, nbd server must handle all outstanding + * request before closing connection. + */ + if (nbd->state == NBD_DISK_STATE_HARDDISC) { + return 0; + } + + while (!TAILQ_EMPTY(&nbd->executed_io_list)) { + rc = nbd_io_xmit_internal(nbd); + if (rc < 0) { + return rc; + } + + ret += rc; + } + + /* + * For soft disconnection, nbd server can close connection after all + * outstanding request are transmitted. + */ + if (nbd->state == NBD_DISK_STATE_SOFTDISC && !nbd_io_xmit_check(nbd)) { + return -1; + } + + return ret; +} + +/** + * Poll an NBD instance. + * + * \return 0 on success or negated errno values on error (e.g. connection closed). + */ +static int +_nbd_poll(struct spdk_nbd_disk *nbd) +{ + int received, sent, executed; + + /* transmit executed io first */ + sent = nbd_io_xmit(nbd); + if (sent < 0) { + return sent; + } + + received = nbd_io_recv(nbd); + if (received < 0) { + return received; + } + + executed = nbd_io_exec(nbd); + if (executed < 0) { + return executed; + } + + return sent + received + executed; +} + +static int +nbd_poll(void *arg) +{ + struct spdk_nbd_disk *nbd = arg; + int rc; + + rc = _nbd_poll(nbd); + if (rc < 0) { + SPDK_INFOLOG(SPDK_LOG_NBD, "nbd_poll() returned %s (%d); closing connection\n", + spdk_strerror(-rc), rc); + spdk_nbd_stop(nbd); + } + + return rc > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + +static void * +nbd_start_kernel(void *arg) +{ + int dev_fd = (int)(intptr_t)arg; + + spdk_unaffinitize_thread(); + + /* This will block in the kernel until we close the spdk_sp_fd. 
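 * NBD_DO_IT runs the kernel side of the nbd transmission loop on this
 * detached helper thread; it returns only once the socket pair is torn down
 * (spdk_sp_fd closed) or NBD_DISCONNECT/NBD_CLEAR_SOCK is issued.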
*/ + ioctl(dev_fd, NBD_DO_IT); + + pthread_exit(NULL); +} + +static void +nbd_bdev_hot_remove(void *remove_ctx) +{ + struct spdk_nbd_disk *nbd = remove_ctx; + + spdk_nbd_stop(nbd); +} + +struct spdk_nbd_start_ctx { + struct spdk_nbd_disk *nbd; + spdk_nbd_start_cb cb_fn; + void *cb_arg; + struct spdk_poller *poller; + int polling_count; +}; + +static void +nbd_start_complete(struct spdk_nbd_start_ctx *ctx) +{ + int rc; + pthread_t tid; + int flag; + + /* Add nbd_disk to the end of disk list */ + rc = nbd_disk_register(ctx->nbd); + if (rc != 0) { + SPDK_ERRLOG("Failed to register %s, it should not happen.\n", ctx->nbd->nbd_path); + assert(false); + goto err; + } + + rc = ioctl(ctx->nbd->dev_fd, NBD_SET_BLKSIZE, spdk_bdev_get_block_size(ctx->nbd->bdev)); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_BLKSIZE) failed: %s\n", spdk_strerror(errno)); + rc = -errno; + goto err; + } + + rc = ioctl(ctx->nbd->dev_fd, NBD_SET_SIZE_BLOCKS, spdk_bdev_get_num_blocks(ctx->nbd->bdev)); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_SIZE_BLOCKS) failed: %s\n", spdk_strerror(errno)); + rc = -errno; + goto err; + } + +#ifdef NBD_FLAG_SEND_TRIM + rc = ioctl(ctx->nbd->dev_fd, NBD_SET_FLAGS, NBD_FLAG_SEND_TRIM); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_FLAGS) failed: %s\n", spdk_strerror(errno)); + rc = -errno; + goto err; + } +#endif + + rc = pthread_create(&tid, NULL, nbd_start_kernel, (void *)(intptr_t)ctx->nbd->dev_fd); + if (rc != 0) { + SPDK_ERRLOG("could not create thread: %s\n", spdk_strerror(rc)); + rc = -rc; + goto err; + } + + rc = pthread_detach(tid); + if (rc != 0) { + SPDK_ERRLOG("could not detach thread for nbd kernel: %s\n", spdk_strerror(rc)); + rc = -rc; + goto err; + } + + flag = fcntl(ctx->nbd->spdk_sp_fd, F_GETFL); + if (fcntl(ctx->nbd->spdk_sp_fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + ctx->nbd->spdk_sp_fd, spdk_strerror(errno)); + rc = -errno; + goto err; + } + + ctx->nbd->nbd_poller = SPDK_POLLER_REGISTER(nbd_poll, ctx->nbd, 0); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, ctx->nbd, 0); + } + + free(ctx); + return; + +err: + spdk_nbd_stop(ctx->nbd); + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, NULL, rc); + } + free(ctx); +} + +static int +nbd_enable_kernel(void *arg) +{ + struct spdk_nbd_start_ctx *ctx = arg; + int rc; + + /* Declare device setup by this process */ + rc = ioctl(ctx->nbd->dev_fd, NBD_SET_SOCK, ctx->nbd->kernel_sp_fd); + if (rc == -1) { + if (errno == EBUSY && ctx->polling_count-- > 0) { + if (ctx->poller == NULL) { + ctx->poller = SPDK_POLLER_REGISTER(nbd_enable_kernel, ctx, + NBD_BUSY_POLLING_INTERVAL_US); + } + /* If the kernel is busy, check back later */ + return SPDK_POLLER_BUSY; + } + + SPDK_ERRLOG("ioctl(NBD_SET_SOCK) failed: %s\n", spdk_strerror(errno)); + if (ctx->poller) { + spdk_poller_unregister(&ctx->poller); + } + + spdk_nbd_stop(ctx->nbd); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, NULL, -errno); + } + + free(ctx); + return SPDK_POLLER_BUSY; + } + + if (ctx->poller) { + spdk_poller_unregister(&ctx->poller); + } + + nbd_start_complete(ctx); + + return SPDK_POLLER_BUSY; +} + +void +spdk_nbd_start(const char *bdev_name, const char *nbd_path, + spdk_nbd_start_cb cb_fn, void *cb_arg) +{ + struct spdk_nbd_start_ctx *ctx = NULL; + struct spdk_nbd_disk *nbd = NULL; + struct spdk_bdev *bdev; + int rc; + int sp[2]; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("no bdev %s exists\n", bdev_name); + rc = -EINVAL; + goto err; + } + + nbd = calloc(1, 
sizeof(*nbd)); + if (nbd == NULL) { + rc = -ENOMEM; + goto err; + } + + nbd->dev_fd = -1; + nbd->spdk_sp_fd = -1; + nbd->kernel_sp_fd = -1; + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + rc = -ENOMEM; + goto err; + } + + ctx->nbd = nbd; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->polling_count = NBD_BUSY_WAITING_MS * 1000ULL / NBD_BUSY_POLLING_INTERVAL_US; + + rc = spdk_bdev_open(bdev, true, nbd_bdev_hot_remove, nbd, &nbd->bdev_desc); + if (rc != 0) { + SPDK_ERRLOG("could not open bdev %s, error=%d\n", spdk_bdev_get_name(bdev), rc); + goto err; + } + + nbd->bdev = bdev; + + nbd->ch = spdk_bdev_get_io_channel(nbd->bdev_desc); + nbd->buf_align = spdk_max(spdk_bdev_get_buf_align(bdev), 64); + + rc = socketpair(AF_UNIX, SOCK_STREAM, 0, sp); + if (rc != 0) { + SPDK_ERRLOG("socketpair failed\n"); + rc = -errno; + goto err; + } + + nbd->spdk_sp_fd = sp[0]; + nbd->kernel_sp_fd = sp[1]; + nbd->nbd_path = strdup(nbd_path); + if (!nbd->nbd_path) { + SPDK_ERRLOG("strdup allocation failure\n"); + rc = -ENOMEM; + goto err; + } + + TAILQ_INIT(&nbd->received_io_list); + TAILQ_INIT(&nbd->executed_io_list); + + /* Make sure nbd_path is not used in this SPDK app */ + if (nbd_disk_find_by_nbd_path(nbd->nbd_path)) { + SPDK_NOTICELOG("%s is already exported\n", nbd->nbd_path); + rc = -EBUSY; + goto err; + } + + nbd->dev_fd = open(nbd_path, O_RDWR); + if (nbd->dev_fd == -1) { + SPDK_ERRLOG("open(\"%s\") failed: %s\n", nbd_path, spdk_strerror(errno)); + rc = -errno; + goto err; + } + + SPDK_INFOLOG(SPDK_LOG_NBD, "Enabling kernel access to bdev %s via %s\n", + spdk_bdev_get_name(bdev), nbd_path); + + nbd_enable_kernel(ctx); + return; + +err: + free(ctx); + if (nbd) { + spdk_nbd_stop(nbd); + } + + if (cb_fn) { + cb_fn(cb_arg, NULL, rc); + } +} + +const char * +spdk_nbd_get_path(struct spdk_nbd_disk *nbd) +{ + return nbd->nbd_path; +} + +SPDK_LOG_REGISTER_COMPONENT("nbd", SPDK_LOG_NBD) diff --git a/src/spdk/lib/nbd/nbd_internal.h b/src/spdk/lib/nbd/nbd_internal.h new file mode 100644 index 000000000..c0d7ee220 --- /dev/null +++ b/src/spdk/lib/nbd/nbd_internal.h @@ -0,0 +1,52 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NBD_INTERNAL_H +#define SPDK_NBD_INTERNAL_H + +#include "spdk/stdinc.h" +#include "spdk/nbd.h" + +struct spdk_nbd_disk *nbd_disk_find_by_nbd_path(const char *nbd_path); + +struct spdk_nbd_disk *nbd_disk_first(void); + +struct spdk_nbd_disk *nbd_disk_next(struct spdk_nbd_disk *prev); + +const char *nbd_disk_get_nbd_path(struct spdk_nbd_disk *nbd); + +const char *nbd_disk_get_bdev_name(struct spdk_nbd_disk *nbd); + +void nbd_disconnect(struct spdk_nbd_disk *nbd); + +#endif /* SPDK_NBD_INTERNAL_H */ diff --git a/src/spdk/lib/nbd/nbd_rpc.c b/src/spdk/lib/nbd/nbd_rpc.c new file mode 100644 index 000000000..a00c0a7e6 --- /dev/null +++ b/src/spdk/lib/nbd/nbd_rpc.c @@ -0,0 +1,422 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/string.h" +#include "spdk/env.h" +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include <linux/nbd.h> + +#include "nbd_internal.h" +#include "spdk_internal/log.h" + +struct rpc_nbd_start_disk { + char *bdev_name; + char *nbd_device; + /* Used to search one available nbd device */ + int nbd_idx; + bool nbd_idx_specified; + struct spdk_jsonrpc_request *request; +}; + +static void +free_rpc_nbd_start_disk(struct rpc_nbd_start_disk *req) +{ + free(req->bdev_name); + free(req->nbd_device); + free(req); +} + +static const struct spdk_json_object_decoder rpc_nbd_start_disk_decoders[] = { + {"bdev_name", offsetof(struct rpc_nbd_start_disk, bdev_name), spdk_json_decode_string}, + {"nbd_device", offsetof(struct rpc_nbd_start_disk, nbd_device), spdk_json_decode_string, true}, +}; + +/* Return 0 to indicate the nbd_device might be available, + * or non-zero to indicate the nbd_device is invalid or in using. + */ +static int +check_available_nbd_disk(char *nbd_device) +{ + char nbd_block_path[256]; + char tail[2]; + int rc; + unsigned int nbd_idx; + struct spdk_nbd_disk *nbd; + + /* nbd device path must be in format of /dev/nbd<num>, with no tail. */ + rc = sscanf(nbd_device, "/dev/nbd%u%1s", &nbd_idx, tail); + if (rc != 1) { + return -errno; + } + + /* make sure nbd_device is not registered inside SPDK */ + nbd = nbd_disk_find_by_nbd_path(nbd_device); + if (nbd) { + /* nbd_device is in using */ + return -EBUSY; + } + + /* A valid pid file in /sys/block indicates the device is in using */ + snprintf(nbd_block_path, 256, "/sys/block/nbd%u/pid", nbd_idx); + + rc = open(nbd_block_path, O_RDONLY); + if (rc < 0) { + if (errno == ENOENT) { + /* nbd_device might be available */ + return 0; + } else { + SPDK_ERRLOG("Failed to check PID file %s: %s\n", nbd_block_path, spdk_strerror(errno)); + return -errno; + } + } + + close(rc); + + /* nbd_device is in using */ + return -EBUSY; +} + +static char * +find_available_nbd_disk(int nbd_idx, int *next_nbd_idx) +{ + int i, rc; + char nbd_device[20]; + + for (i = nbd_idx; ; i++) { + snprintf(nbd_device, 20, "/dev/nbd%d", i); + /* Check whether an nbd device exists in order to reach the last one nbd device */ + rc = access(nbd_device, F_OK); + if (rc != 0) { + break; + } + + rc = check_available_nbd_disk(nbd_device); + if (rc == 0) { + if (next_nbd_idx != NULL) { + *next_nbd_idx = i + 1; + } + + return strdup(nbd_device); + } + } + + return NULL; +} + +static void +rpc_start_nbd_done(void *cb_arg, struct spdk_nbd_disk *nbd, int rc) +{ + struct rpc_nbd_start_disk *req = cb_arg; + struct spdk_jsonrpc_request *request = req->request; + struct spdk_json_write_ctx *w; + + /* Check whether it's automatic nbd-device assignment */ + if (rc == -EBUSY && req->nbd_idx_specified == false) { + free(req->nbd_device); + + req->nbd_device = find_available_nbd_disk(req->nbd_idx, &req->nbd_idx); + if (req->nbd_device != NULL) { + spdk_nbd_start(req->bdev_name, req->nbd_device, + rpc_start_nbd_done, req); + return; + } + + SPDK_INFOLOG(SPDK_LOG_NBD, "There is no available nbd device.\n"); + } + + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, spdk_nbd_get_path(nbd)); + spdk_jsonrpc_end_result(request, w); + + free_rpc_nbd_start_disk(req); +} + +static void +rpc_nbd_start_disk(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nbd_start_disk *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == 
NULL) { + SPDK_ERRLOG("could not allocate nbd_start_disk request.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, rpc_nbd_start_disk_decoders, + SPDK_COUNTOF(rpc_nbd_start_disk_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + if (req->bdev_name == NULL) { + goto invalid; + } + + if (req->nbd_device != NULL) { + req->nbd_idx_specified = true; + rc = check_available_nbd_disk(req->nbd_device); + if (rc == -EBUSY) { + SPDK_DEBUGLOG(SPDK_LOG_NBD, "NBD device %s is in using.\n", req->nbd_device); + spdk_jsonrpc_send_error_response(request, -EBUSY, spdk_strerror(-rc)); + goto invalid; + } + + if (rc != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NBD, "Illegal nbd_device %s.\n", req->nbd_device); + spdk_jsonrpc_send_error_response_fmt(request, -ENODEV, + "illegal nbd device %s", req->nbd_device); + goto invalid; + } + } else { + req->nbd_idx = 0; + req->nbd_device = find_available_nbd_disk(req->nbd_idx, &req->nbd_idx); + if (req->nbd_device == NULL) { + SPDK_INFOLOG(SPDK_LOG_NBD, "There is no available nbd device.\n"); + spdk_jsonrpc_send_error_response(request, -ENODEV, + "nbd device not found"); + goto invalid; + } + } + + req->request = request; + spdk_nbd_start(req->bdev_name, req->nbd_device, + rpc_start_nbd_done, req); + + return; + +invalid: + free_rpc_nbd_start_disk(req); +} + +SPDK_RPC_REGISTER("nbd_start_disk", rpc_nbd_start_disk, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nbd_start_disk, start_nbd_disk) + +struct rpc_nbd_stop_disk { + char *nbd_device; +}; + +static void +free_rpc_nbd_stop_disk(struct rpc_nbd_stop_disk *req) +{ + free(req->nbd_device); +} + +static const struct spdk_json_object_decoder rpc_nbd_stop_disk_decoders[] = { + {"nbd_device", offsetof(struct rpc_nbd_stop_disk, nbd_device), spdk_json_decode_string}, +}; + +struct nbd_disconnect_arg { + struct spdk_jsonrpc_request *request; + struct spdk_nbd_disk *nbd; +}; + +static void * +nbd_disconnect_thread(void *arg) +{ + struct nbd_disconnect_arg *thd_arg = arg; + struct spdk_json_write_ctx *w; + + spdk_unaffinitize_thread(); + + nbd_disconnect(thd_arg->nbd); + + w = spdk_jsonrpc_begin_result(thd_arg->request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(thd_arg->request, w); + + free(thd_arg); + pthread_exit(NULL); +} + +static void +rpc_nbd_stop_disk(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nbd_stop_disk req = {}; + struct spdk_nbd_disk *nbd; + pthread_t tid; + struct nbd_disconnect_arg *thd_arg = NULL; + int rc; + + if (spdk_json_decode_object(params, rpc_nbd_stop_disk_decoders, + SPDK_COUNTOF(rpc_nbd_stop_disk_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto out; + } + + if (req.nbd_device == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, "invalid nbd device"); + goto out; + } + + /* make sure nbd_device is registered */ + nbd = nbd_disk_find_by_nbd_path(req.nbd_device); + if (!nbd) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto out; + } + + /* + * thd_arg should be freed by created thread + * if thread is created successfully. 
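 * If pthread_create() fails, the error path below sends an error response
 * and frees thd_arg itself.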
+ */ + thd_arg = malloc(sizeof(*thd_arg)); + if (!thd_arg) { + SPDK_ERRLOG("could not allocate nbd disconnect thread arg\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + goto out; + } + + thd_arg->request = request; + thd_arg->nbd = nbd; + + /* + * NBD ioctl of disconnect will block until data are flushed. + * Create separate thread to execute it. + */ + rc = pthread_create(&tid, NULL, nbd_disconnect_thread, (void *)thd_arg); + if (rc != 0) { + SPDK_ERRLOG("could not create nbd disconnect thread: %s\n", spdk_strerror(rc)); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(rc)); + free(thd_arg); + goto out; + } + + rc = pthread_detach(tid); + if (rc != 0) { + SPDK_ERRLOG("could not detach nbd disconnect thread: %s\n", spdk_strerror(rc)); + goto out; + } + +out: + free_rpc_nbd_stop_disk(&req); +} + +SPDK_RPC_REGISTER("nbd_stop_disk", rpc_nbd_stop_disk, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nbd_stop_disk, stop_nbd_disk) + +static void +rpc_dump_nbd_info(struct spdk_json_write_ctx *w, + struct spdk_nbd_disk *nbd) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "nbd_device", nbd_disk_get_nbd_path(nbd)); + + spdk_json_write_named_string(w, "bdev_name", nbd_disk_get_bdev_name(nbd)); + + spdk_json_write_object_end(w); +} + +struct rpc_nbd_get_disks { + char *nbd_device; +}; + +static void +free_rpc_nbd_get_disks(struct rpc_nbd_get_disks *r) +{ + free(r->nbd_device); +} + +static const struct spdk_json_object_decoder rpc_nbd_get_disks_decoders[] = { + {"nbd_device", offsetof(struct rpc_nbd_get_disks, nbd_device), spdk_json_decode_string, true}, +}; + +static void +rpc_nbd_get_disks(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nbd_get_disks req = {}; + struct spdk_json_write_ctx *w; + struct spdk_nbd_disk *nbd = NULL; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_nbd_get_disks_decoders, + SPDK_COUNTOF(rpc_nbd_get_disks_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + if (req.nbd_device) { + nbd = nbd_disk_find_by_nbd_path(req.nbd_device); + if (nbd == NULL) { + SPDK_ERRLOG("nbd device '%s' does not exist\n", req.nbd_device); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto invalid; + } + + free_rpc_nbd_get_disks(&req); + } + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + if (nbd != NULL) { + rpc_dump_nbd_info(w, nbd); + } else { + for (nbd = nbd_disk_first(); nbd != NULL; nbd = nbd_disk_next(nbd)) { + rpc_dump_nbd_info(w, nbd); + } + } + + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); + + return; + +invalid: + free_rpc_nbd_get_disks(&req); +} +SPDK_RPC_REGISTER("nbd_get_disks", rpc_nbd_get_disks, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nbd_get_disks, get_nbd_disks) diff --git a/src/spdk/lib/nbd/spdk_nbd.map b/src/spdk/lib/nbd/spdk_nbd.map new file mode 100644 index 000000000..0b7d8de81 --- /dev/null +++ b/src/spdk/lib/nbd/spdk_nbd.map @@ -0,0 +1,13 @@ +{ + global: + + # public functions + spdk_nbd_init; + spdk_nbd_fini; + spdk_nbd_start; + spdk_nbd_stop; + spdk_nbd_get_path; + spdk_nbd_write_config_json; + + local: *; +}; diff --git a/src/spdk/lib/net/Makefile b/src/spdk/lib/net/Makefile new file mode 100644 index 
000000000..918df6cfb --- /dev/null +++ b/src/spdk/lib/net/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = interface.c net_rpc.c + +LIBNAME = net + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_net.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/net/interface.c b/src/spdk/lib/net/interface.c new file mode 100644 index 000000000..358cbc308 --- /dev/null +++ b/src/spdk/lib/net/interface.c @@ -0,0 +1,551 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "net_internal.h" + +#include "spdk/stdinc.h" +#include "spdk/string.h" + +#include "spdk/log.h" +#include "spdk/net.h" + +#ifdef __linux__ /* Interface management is Linux-specific */ + +#include <linux/netlink.h> +#include <linux/rtnetlink.h> + +static TAILQ_HEAD(, spdk_interface) g_interface_head; + +static pthread_mutex_t interface_lock = PTHREAD_MUTEX_INITIALIZER; + +static int get_ifc_ipv4(void) +{ + int ret; + int rtattrlen; + int netlink_fd; + uint32_t ipv4_addr; + + struct { + struct nlmsghdr n; + struct ifaddrmsg r; + struct rtattr rta; + } req; + char buf[16384]; + struct nlmsghdr *nlmp; + struct ifaddrmsg *rtmp; + struct rtattr *rtatp; + struct spdk_interface *ifc; + + netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); + if (netlink_fd < 0) { + SPDK_ERRLOG("socket failed!\n"); + return 1; + } + + /* + * Prepare a message structure + */ + memset(&req, 0, sizeof(req)); + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)); + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT; + req.n.nlmsg_type = RTM_GETADDR; + + /* IPv4 only */ + req.r.ifa_family = AF_INET; + + /* + * Fill up all the attributes for the rtnetlink header. + */ + assert(&req.rta == (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.n.nlmsg_len))); + req.rta.rta_len = RTA_LENGTH(16); + + /* Send and recv the message from kernel */ + ret = send(netlink_fd, &req, req.n.nlmsg_len, 0); + if (ret < 0) { + SPDK_ERRLOG("netlink send failed: %s\n", spdk_strerror(errno)); + ret = 1; + goto exit; + } + + ret = recv(netlink_fd, buf, sizeof(buf), 0); + if (ret <= 0) { + SPDK_ERRLOG("netlink recv failed: %s\n", spdk_strerror(errno)); + ret = 1; + goto exit; + } + + for (nlmp = (struct nlmsghdr *)buf; ret > (int)sizeof(*nlmp);) { + int len = nlmp->nlmsg_len; + int req_len = len - sizeof(*nlmp); + + if (req_len < 0 || len > ret) { + SPDK_ERRLOG("error\n"); + ret = 1; + goto exit; + } + + if (!NLMSG_OK(nlmp, (uint32_t)ret)) { + SPDK_ERRLOG("NLMSG not OK\n"); + ret = 1; + goto exit; + } + + rtmp = (struct ifaddrmsg *)NLMSG_DATA(nlmp); + rtatp = (struct rtattr *)IFA_RTA(rtmp); + + rtattrlen = IFA_PAYLOAD(nlmp); + + for (; RTA_OK(rtatp, rtattrlen); rtatp = RTA_NEXT(rtatp, rtattrlen)) { + if (rtatp->rta_type == IFA_LOCAL) { + memcpy(&ipv4_addr, (struct in_addr *)RTA_DATA(rtatp), + sizeof(struct in_addr)); + TAILQ_FOREACH(ifc, &g_interface_head, tailq) { + if (ifc->index == rtmp->ifa_index) { + /* add a new IP address to interface */ + if (ifc->num_ip_addresses >= SPDK_MAX_IP_PER_IFC) { + SPDK_ERRLOG("SPDK: number of IP addresses supported for %s exceeded. 
limit=%d\n", + ifc->name, + SPDK_MAX_IP_PER_IFC); + break; + } + ifc->ip_address[ifc->num_ip_addresses] = ipv4_addr; + ifc->num_ip_addresses++; + break; + } + } + } + } + ret -= NLMSG_ALIGN(len); + nlmp = (struct nlmsghdr *)((char *)nlmp + NLMSG_ALIGN(len)); + } + ret = 0; + +exit: + close(netlink_fd); + return ret; +} + + +static int process_new_interface_msg(struct nlmsghdr *h) +{ + int len; + struct spdk_interface *ifc; + struct ifinfomsg *iface; + struct rtattr *attribute; + + iface = (struct ifinfomsg *)NLMSG_DATA(h); + + ifc = (struct spdk_interface *) malloc(sizeof(*ifc)); + if (ifc == NULL) { + SPDK_ERRLOG("Malloc failed\n"); + return 1; + } + + memset(ifc, 0, sizeof(*ifc)); + + /* Set interface index */ + ifc->index = iface->ifi_index; + + len = h->nlmsg_len - NLMSG_LENGTH(sizeof(*iface)); + + /* Loop over all attributes for the NEWLINK message */ + for (attribute = IFLA_RTA(iface); RTA_OK(attribute, len); attribute = RTA_NEXT(attribute, len)) { + switch (attribute->rta_type) { + case IFLA_IFNAME: + if (if_indextoname(iface->ifi_index, ifc->name) == NULL) { + SPDK_ERRLOG("Indextoname failed!\n"); + free(ifc); + return 2; + } + break; + default: + break; + } + } + TAILQ_INSERT_TAIL(&g_interface_head, ifc, tailq); + return 0; +} + +static int prepare_ifc_list(void) +{ + int ret = 0; + struct nl_req_s { + struct nlmsghdr hdr; + struct rtgenmsg gen; + struct ifinfomsg ifi; + }; + int netlink_fd; + struct sockaddr_nl local; /* Our local (user space) side of the communication */ + struct sockaddr_nl kernel; /* The remote (kernel space) side of the communication */ + + struct msghdr rtnl_msg; /* Generic msghdr struct for use with sendmsg */ + struct iovec io; /* IO vector for sendmsg */ + + struct nl_req_s req; /* Structure that describes the rtnetlink packet itself */ + char reply[16384]; /* a large buffer to receive lots of link information */ + + pid_t pid = getpid(); /* Our process ID to build the correct netlink address */ + int end = 0; /* some flag to end loop parsing */ + + /* + * Prepare netlink socket for kernel/user space communication + */ + netlink_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (netlink_fd < 0) { + SPDK_ERRLOG("socket failed!\n"); + return 1; + } + + memset(&local, 0, sizeof(local)); /* Fill-in local address information */ + local.nl_family = AF_NETLINK; + local.nl_pid = pid; + local.nl_groups = 0; + + /* RTNL socket is ready to use, prepare and send L2 request. 
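The RTM_GETLINK request below carries NLM_F_DUMP, so the kernel replies with a multipart dump: one RTM_NEWLINK message per link, terminated by NLMSG_DONE, which the receive loop further down consumes. 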
*/ + memset(&rtnl_msg, 0, sizeof(rtnl_msg)); + memset(&kernel, 0, sizeof(kernel)); + memset(&req, 0, sizeof(req)); + + kernel.nl_family = AF_NETLINK; /* Fill-in kernel address (destination of our message) */ + + req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)); + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_seq = 1; + req.hdr.nlmsg_pid = pid; + + req.ifi.ifi_family = AF_UNSPEC; + req.ifi.ifi_type = 1; + + io.iov_base = &req; + io.iov_len = req.hdr.nlmsg_len; + rtnl_msg.msg_iov = &io; + rtnl_msg.msg_iovlen = 1; + rtnl_msg.msg_name = &kernel; + rtnl_msg.msg_namelen = sizeof(kernel); + + if (sendmsg(netlink_fd, &rtnl_msg, 0) == -1) { + SPDK_ERRLOG("Sendmsg failed!\n"); + ret = 1; + goto exit; + } + + /* Parse reply */ + while (!end) { + int len; + struct nlmsghdr *msg_ptr; /* Pointer to current message part */ + + struct msghdr rtnl_reply; /* Generic msghdr structure for use with recvmsg */ + struct iovec io_reply; + + memset(&io_reply, 0, sizeof(io_reply)); + memset(&rtnl_reply, 0, sizeof(rtnl_reply)); + + io.iov_base = reply; + io.iov_len = 8192; + rtnl_reply.msg_iov = &io; + rtnl_reply.msg_iovlen = 1; + rtnl_reply.msg_name = &kernel; + rtnl_reply.msg_namelen = sizeof(kernel); + + /* Read as much data as fits in the receive buffer */ + len = recvmsg(netlink_fd, &rtnl_reply, 0); + if (len) { + for (msg_ptr = (struct nlmsghdr *) reply; NLMSG_OK(msg_ptr, (uint32_t)len); + msg_ptr = NLMSG_NEXT(msg_ptr, len)) { + switch (msg_ptr->nlmsg_type) { + case NLMSG_DONE: /* This is the special NLMSG_DONE message we asked for by using the NLM_F_DUMP flag */ + end++; + break; + case RTM_NEWLINK: /* This is an RTM_NEWLINK message, which contains lots of information about a link */ + ret = process_new_interface_msg(msg_ptr); + if (ret != 0) { + goto exit; + } + break; + default: + break; + } + } + } + } +exit: + close(netlink_fd); + return ret; +} + +static struct spdk_interface * +interface_find_by_index(uint32_t ifc_index) +{ + struct spdk_interface *ifc_entry; + + /* Mutex must have been held by the caller */ + TAILQ_FOREACH(ifc_entry, &g_interface_head, tailq) { + if (ifc_entry->index == ifc_index) { + return ifc_entry; + } + } + + return NULL; +} + +static int netlink_addr_msg(uint32_t ifc_idx, uint32_t ip_address, uint32_t create) +{ + int fd; + struct sockaddr_nl la; + struct sockaddr_nl pa; + struct msghdr msg; + struct iovec iov; + int ifal; + struct { + struct nlmsghdr n; + struct ifaddrmsg r; + char buf[16384]; + } req; + struct rtattr *rta; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (fd < 0) { + SPDK_ERRLOG("socket failed!\n"); + return errno; + } + + /* setup local address & bind using this address. */ + bzero(&la, sizeof(la)); + la.nl_family = AF_NETLINK; + la.nl_pid = getpid(); + bind(fd, (struct sockaddr *) &la, sizeof(la)); + + /* initialize RTNETLINK request buffer. */ + bzero(&req, sizeof(req)); + + /* compute the initial length of the service request. */ + ifal = sizeof(struct ifaddrmsg); + + /* add first attrib: set IP addr and RTNETLINK buffer size. */ + rta = (struct rtattr *) req.buf; + rta->rta_type = IFA_ADDRESS; + rta->rta_len = sizeof(struct rtattr) + 4; + memcpy(((char *)rta) + sizeof(struct rtattr), &ip_address, sizeof(ip_address)); + ifal += rta->rta_len; + + /* add second attrib. 
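The second attribute, IFA_LOCAL, carries the same IPv4 address as the IFA_ADDRESS attribute above; on ordinary (non point-to-point) interfaces the kernel treats both as the interface's own address, IFA_ADDRESS differing only on point-to-point links, where it names the peer. 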
*/ + rta = (struct rtattr *)(((char *)rta) + rta->rta_len); + rta->rta_type = IFA_LOCAL; + rta->rta_len = sizeof(struct rtattr) + 4; + memcpy(((char *)rta) + sizeof(struct rtattr), &ip_address, sizeof(ip_address)); + ifal += rta->rta_len; + + /* setup the NETLINK header. */ + req.n.nlmsg_len = NLMSG_LENGTH(ifal); + if (create) { + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_APPEND; + req.n.nlmsg_type = RTM_NEWADDR; + } else { + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = RTM_DELADDR; + } + + /* setup the service header (struct ifaddrmsg). */ + req.r.ifa_family = AF_INET; + req.r.ifa_prefixlen = 32; /* hardcoded */ + req.r.ifa_flags = IFA_F_PERMANENT | IFA_F_SECONDARY; + req.r.ifa_index = ifc_idx; + req.r.ifa_scope = 0; + + /* create the remote address to communicate. */ + bzero(&pa, sizeof(pa)); + pa.nl_family = AF_NETLINK; + + /* initialize & create the struct msghdr supplied to the sendmsg() function. */ + bzero(&msg, sizeof(msg)); + msg.msg_name = (void *) &pa; + msg.msg_namelen = sizeof(pa); + + /* place the pointer & size of the RTNETLINK message in the struct msghdr. */ + iov.iov_base = (void *) &req.n; + iov.iov_len = req.n.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + /* send the RTNETLINK message to kernel. */ + sendmsg(fd, &msg, 0); + close(fd); + return 0; +} + +static void interface_ip_update(void) +{ + struct spdk_interface *ifc_entry; + + pthread_mutex_lock(&interface_lock); + TAILQ_FOREACH(ifc_entry, &g_interface_head, tailq) { + ifc_entry->num_ip_addresses = 0; + memset(ifc_entry->ip_address, 0, sizeof(ifc_entry->ip_address)); + } + get_ifc_ipv4(); + pthread_mutex_unlock(&interface_lock); +} + +static int +interface_is_ip_address_in_use(int ifc_index, uint32_t addr, bool add) +{ + struct spdk_interface *ifc_entry; + bool in_use = false; + uint32_t idx = 0; + + interface_ip_update(); + + pthread_mutex_lock(&interface_lock); + ifc_entry = interface_find_by_index(ifc_index); + if (ifc_entry == NULL) { + pthread_mutex_unlock(&interface_lock); + return -ENODEV; + } + + for (idx = 0; idx < ifc_entry->num_ip_addresses; idx++) { + if (ifc_entry->ip_address[idx] == addr) { + in_use = true; + break; + } + } + pthread_mutex_unlock(&interface_lock); + + /* The IP address to add is already in use */ + if (add == true && in_use == true) { + return -EADDRINUSE; + } + + /* The IP address to delete is not in use */ + if (add == false && in_use == false) { + return -ENXIO; + } + + return 0; +} + +int +spdk_interface_init(void) +{ + int rc = 0; + + TAILQ_INIT(&g_interface_head); + rc = prepare_ifc_list(); + if (!rc) { + rc = get_ifc_ipv4(); + } + + return rc; +} + +void +spdk_interface_destroy(void) +{ + struct spdk_interface *ifc_entry; + + while (!TAILQ_EMPTY(&g_interface_head)) { + ifc_entry = TAILQ_FIRST(&g_interface_head); + TAILQ_REMOVE(&g_interface_head, ifc_entry, tailq); + free(ifc_entry); + } +} + +int +interface_net_interface_add_ip_address(int ifc_index, char *ip_addr) +{ + uint32_t addr; + int ret; + + addr = inet_addr(ip_addr); + + ret = interface_is_ip_address_in_use(ifc_index, addr, true); + if (ret < 0) { + return ret; + } + + return netlink_addr_msg(ifc_index, addr, 1); +} + +int +interface_net_interface_delete_ip_address(int ifc_index, char *ip_addr) +{ + uint32_t addr; + int ret; + + addr = inet_addr(ip_addr); + + ret = interface_is_ip_address_in_use(ifc_index, addr, false); + if (ret < 0) { + return ret; + } + + return netlink_addr_msg(ifc_index, addr, 0); +} + +void *interface_get_list(void) +{ + interface_ip_update(); + return 
&g_interface_head; +} + +#else /* Not Linux */ + +int +spdk_interface_init(void) +{ + return 0; +} + +void +spdk_interface_destroy(void) +{ +} + +int +interface_net_interface_add_ip_address(int ifc_index, char *ip_addr) +{ + return -1; +} + +int +interface_net_interface_delete_ip_address(int ifc_index, char *ip_addr) +{ + return -1; +} + +void * +interface_get_list(void) +{ + return NULL; +} + +#endif diff --git a/src/spdk/lib/net/net_internal.h b/src/spdk/lib/net/net_internal.h new file mode 100644 index 000000000..4a1422939 --- /dev/null +++ b/src/spdk/lib/net/net_internal.h @@ -0,0 +1,79 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NET_INTERNAL_H +#define SPDK_NET_INTERNAL_H + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" + +#define SPDK_IFNAMSIZE 32 +#define SPDK_MAX_IP_PER_IFC 32 + +struct spdk_interface { + char name[SPDK_IFNAMSIZE]; + uint32_t index; + uint32_t num_ip_addresses; /* number of IP addresses defined */ + uint32_t ip_address[SPDK_MAX_IP_PER_IFC]; + TAILQ_ENTRY(spdk_interface) tailq; +}; + +/** + * Add an ip address to the network interface. + * + * \param ifc_index Index of the network interface. + * \param ip_addr Ip address to add. + * + * \return 0 on success, -1 on failure. + */ +int interface_net_interface_add_ip_address(int ifc_index, char *ip_addr); + +/** + * Delete an ip address from the network interface. + * + * \param ifc_index Index of the network interface. + * \param ip_addr Ip address to delete. + * + * \return 0 on success, -1 on failure. + */ +int interface_net_interface_delete_ip_address(int ifc_index, char *ip_addr); + +/** + * Get the list of all the network interfaces. + * + * \return a pointer to the head of the linked list of all the network interfaces. 
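+ * + * The returned pointer is the module's internal TAILQ_HEAD(, spdk_interface) list head. + * Callers are expected to cast it and walk it with TAILQ_FOREACH(), for example (this mirrors net_rpc.c): + * + * TAILQ_HEAD(, spdk_interface) *head = interface_get_list(); + * struct spdk_interface *ifc; + * TAILQ_FOREACH(ifc, head, tailq) { ... }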
+ */ +void *interface_get_list(void); + +#endif /* SPDK_NET_INTERNAL_H */ diff --git a/src/spdk/lib/net/net_rpc.c b/src/spdk/lib/net/net_rpc.c new file mode 100644 index 000000000..47a302a6b --- /dev/null +++ b/src/spdk/lib/net/net_rpc.c @@ -0,0 +1,198 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "net_internal.h" + +#include "spdk/stdinc.h" + +#include "spdk/rpc.h" +#include "spdk/net.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +struct rpc_ip_address { + int32_t ifc_index; + char *ip_address; +}; + +static void +free_rpc_ip_address(struct rpc_ip_address *req) +{ + free(req->ip_address); +} + +static const struct spdk_json_object_decoder rpc_ip_address_decoders[] = { + {"ifc_index", offsetof(struct rpc_ip_address, ifc_index), spdk_json_decode_int32}, + {"ip_address", offsetof(struct rpc_ip_address, ip_address), spdk_json_decode_string}, +}; + +static void +rpc_net_interface_add_ip_address(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_ip_address req = {}; + struct spdk_json_write_ctx *w; + int ret_val = 0; + + if (spdk_json_decode_object(params, rpc_ip_address_decoders, + SPDK_COUNTOF(rpc_ip_address_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_NET, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + ret_val = interface_net_interface_add_ip_address(req.ifc_index, req.ip_address); + if (ret_val) { + if (ret_val == -ENODEV) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE, + "Interface %d not available", req.ifc_index); + } else if (ret_val == -EADDRINUSE) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "IP address %s is already added to interface %d", + req.ip_address, req.ifc_index); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + strerror(ret_val)); + } + goto invalid; + } + + free_rpc_ip_address(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_ip_address(&req); +} +SPDK_RPC_REGISTER("net_interface_add_ip_address", rpc_net_interface_add_ip_address, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(net_interface_add_ip_address, add_ip_address) + +static void +rpc_net_interface_delete_ip_address(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_ip_address req = {}; + struct spdk_json_write_ctx *w; + int ret_val = 0; + + if (spdk_json_decode_object(params, rpc_ip_address_decoders, + SPDK_COUNTOF(rpc_ip_address_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_NET, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + ret_val = interface_net_interface_delete_ip_address(req.ifc_index, req.ip_address); + if (ret_val) { + if (ret_val == -ENODEV) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE, + "Interface %d not available", req.ifc_index); + } else if (ret_val == -ENXIO) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "IP address %s is not found in interface %d", + req.ip_address, req.ifc_index); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + strerror(ret_val)); + } + goto invalid; + } + + free_rpc_ip_address(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_ip_address(&req); +} +SPDK_RPC_REGISTER("net_interface_delete_ip_address", rpc_net_interface_delete_ip_address, + 
SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(net_interface_delete_ip_address, delete_ip_address) + +static void +rpc_net_get_interfaces(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + TAILQ_HEAD(, spdk_interface) *interface_head = interface_get_list(); + struct spdk_interface *ifc; + char *ip_address; + struct in_addr inaddr; + uint32_t i; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "net_get_interfaces requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + TAILQ_FOREACH(ifc, interface_head, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "name", ifc->name); + + spdk_json_write_named_int32(w, "ifc_index", ifc->index); + + spdk_json_write_named_array_begin(w, "ip_addr"); + for (i = 0; i < ifc->num_ip_addresses; i++) { + memcpy(&inaddr, &ifc->ip_address[i], sizeof(uint32_t)); + ip_address = inet_ntoa(inaddr); + spdk_json_write_string(w, ip_address); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("net_get_interfaces", rpc_net_get_interfaces, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(net_get_interfaces, get_interfaces) + +SPDK_LOG_REGISTER_COMPONENT("net", SPDK_LOG_NET) diff --git a/src/spdk/lib/net/spdk_net.map b/src/spdk/lib/net/spdk_net.map new file mode 100644 index 000000000..944bc4c6e --- /dev/null +++ b/src/spdk/lib/net/spdk_net.map @@ -0,0 +1,9 @@ +{ + global: + + # public functions + spdk_interface_init; + spdk_interface_destroy; + + local: *; +}; diff --git a/src/spdk/lib/notify/Makefile b/src/spdk/lib/notify/Makefile new file mode 100644 index 000000000..82249a5b2 --- /dev/null +++ b/src/spdk/lib/notify/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = notify.c notify_rpc.c +LIBNAME = notify + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_notify.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/notify/notify.c b/src/spdk/lib/notify/notify.c new file mode 100644 index 000000000..88c5d633b --- /dev/null +++ b/src/spdk/lib/notify/notify.c @@ -0,0 +1,150 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/queue.h> + +#include "spdk/stdinc.h" +#include "spdk/util.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/log.h" + +#include "spdk/notify.h" + +#define SPDK_NOTIFY_MAX_EVENTS 1024 + +struct spdk_notify_type { + char name[SPDK_NOTIFY_MAX_NAME_SIZE]; + TAILQ_ENTRY(spdk_notify_type) tailq; +}; + +pthread_mutex_t g_events_lock = PTHREAD_MUTEX_INITIALIZER; +static struct spdk_notify_event g_events[SPDK_NOTIFY_MAX_EVENTS]; +static uint64_t g_events_head; + +static TAILQ_HEAD(, spdk_notify_type) g_notify_types = TAILQ_HEAD_INITIALIZER(g_notify_types); + +struct spdk_notify_type * +spdk_notify_type_register(const char *type) +{ + struct spdk_notify_type *it = NULL; + + if (!type) { + SPDK_ERRLOG("Invalid notification type %p\n", type); + return NULL; + } else if (!type[0] || strlen(type) >= SPDK_NOTIFY_MAX_NAME_SIZE) { + SPDK_ERRLOG("Notification type '%s' too short or too long\n", type); + return NULL; + } + + pthread_mutex_lock(&g_events_lock); + TAILQ_FOREACH(it, &g_notify_types, tailq) { + if (strcmp(type, it->name) == 0) { + SPDK_NOTICELOG("Notification type '%s' already registered.\n", type); + goto out; + } + } + + it = calloc(1, sizeof(*it)); + if (it == NULL) { + goto out; + } + + snprintf(it->name, sizeof(it->name), "%s", type); + TAILQ_INSERT_TAIL(&g_notify_types, it, tailq); + +out: + pthread_mutex_unlock(&g_events_lock); + return it; +} + +const char * +spdk_notify_type_get_name(const struct spdk_notify_type *type) +{ + return type->name; +} + + +void +spdk_notify_foreach_type(spdk_notify_foreach_type_cb cb, void *ctx) +{ + struct spdk_notify_type *it; + + pthread_mutex_lock(&g_events_lock); + TAILQ_FOREACH(it, &g_notify_types, tailq) { + if (cb(it, ctx)) { + break; + } + } + pthread_mutex_unlock(&g_events_lock); +} + +uint64_t +spdk_notify_send(const char *type, const char *ctx) +{ + uint64_t head; + struct spdk_notify_event *ev; + + pthread_mutex_lock(&g_events_lock); + head = g_events_head; + g_events_head++; + + ev = &g_events[head % SPDK_NOTIFY_MAX_EVENTS]; + spdk_strcpy_pad(ev->type, type, sizeof(ev->type), '\0'); + spdk_strcpy_pad(ev->ctx, ctx, sizeof(ev->ctx), '\0'); + pthread_mutex_unlock(&g_events_lock); + + return head; +} + +uint64_t +spdk_notify_foreach_event(uint64_t start_idx, uint64_t max, + spdk_notify_foreach_event_cb cb_fn, void *ctx) +{ + uint64_t i; + + pthread_mutex_lock(&g_events_lock); + + if (g_events_head > SPDK_NOTIFY_MAX_EVENTS && start_idx < g_events_head - SPDK_NOTIFY_MAX_EVENTS) { + start_idx = g_events_head - SPDK_NOTIFY_MAX_EVENTS; + } + + for (i = 0; start_idx < g_events_head && i < max; start_idx++, i++) { + if (cb_fn(start_idx, &g_events[start_idx % SPDK_NOTIFY_MAX_EVENTS], ctx)) { + break; + } + } + pthread_mutex_unlock(&g_events_lock); + + return i; +} diff --git a/src/spdk/lib/notify/notify_rpc.c b/src/spdk/lib/notify/notify_rpc.c new file mode 100644 index 000000000..fc40502c2 --- /dev/null +++ b/src/spdk/lib/notify/notify_rpc.c @@ -0,0 +1,126 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/notify.h" +#include "spdk/env.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +static int +notify_get_types_cb(const struct spdk_notify_type *type, void *ctx) +{ + spdk_json_write_string((struct spdk_json_write_ctx *)ctx, spdk_notify_type_get_name(type)); + return 0; +} + +static void +rpc_notify_get_types(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "No parameters required"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + spdk_notify_foreach_type(notify_get_types_cb, w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("notify_get_types", rpc_notify_get_types, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(notify_get_types, get_notification_types) + +struct rpc_notify_get_notifications { + uint64_t id; + uint64_t max; + + struct spdk_json_write_ctx *w; +}; + +static const struct spdk_json_object_decoder rpc_notify_get_notifications_decoders[] = { + {"id", offsetof(struct rpc_notify_get_notifications, id), spdk_json_decode_uint64, true}, + {"max", offsetof(struct rpc_notify_get_notifications, max), spdk_json_decode_uint64, true}, +}; + + +static int +notify_get_notifications_cb(uint64_t id, const struct spdk_notify_event *ev, void *ctx) +{ + struct rpc_notify_get_notifications *req = ctx; + + spdk_json_write_object_begin(req->w); + spdk_json_write_named_string(req->w, "type", ev->type); + spdk_json_write_named_string(req->w, "ctx", ev->ctx); + spdk_json_write_named_uint64(req->w, "id", id); + spdk_json_write_object_end(req->w); + return 0; +} + +static void +rpc_notify_get_notifications(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_notify_get_notifications req = {0, UINT64_MAX}; + + if (params && + spdk_json_decode_object(params, rpc_notify_get_notifications_decoders, + SPDK_COUNTOF(rpc_notify_get_notifications_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_NOTIFY_RPC, "spdk_json_decode_object failed\n"); + + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + 
spdk_strerror(EINVAL)); + return; + } + + + req.w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_array_begin(req.w); + spdk_notify_foreach_event(req.id, req.max, notify_get_notifications_cb, &req); + spdk_json_write_array_end(req.w); + + spdk_jsonrpc_end_result(request, req.w); +} +SPDK_RPC_REGISTER("notify_get_notifications", rpc_notify_get_notifications, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(notify_get_notifications, get_notifications) + +SPDK_LOG_REGISTER_COMPONENT("notify_rpc", SPDK_NOTIFY_RPC) diff --git a/src/spdk/lib/notify/spdk_notify.map b/src/spdk/lib/notify/spdk_notify.map new file mode 100644 index 000000000..4023a8e66 --- /dev/null +++ b/src/spdk/lib/notify/spdk_notify.map @@ -0,0 +1,10 @@ +{ + global: + spdk_notify_type_register; + spdk_notify_type_get_name; + spdk_notify_foreach_type; + spdk_notify_send; + spdk_notify_foreach_event; + + local: *; +}; diff --git a/src/spdk/lib/nvme/Makefile b/src/spdk/lib/nvme/Makefile new file mode 100644 index 000000000..1c02965f5 --- /dev/null +++ b/src/spdk/lib/nvme/Makefile @@ -0,0 +1,73 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 4 +SO_MINOR := 0 + +C_SRCS = nvme_ctrlr_cmd.c nvme_ctrlr.c nvme_fabric.c nvme_ns_cmd.c nvme_ns.c nvme_pcie.c nvme_qpair.c nvme.c nvme_quirks.c nvme_transport.c nvme_uevent.c nvme_ctrlr_ocssd_cmd.c \ + nvme_ns_ocssd_cmd.c nvme_tcp.c nvme_opal.c nvme_io_msg.c nvme_poll_group.c +C_SRCS-$(CONFIG_RDMA) += nvme_rdma.c +C_SRCS-$(CONFIG_NVME_CUSE) += nvme_cuse.c + +LIBNAME = nvme +LOCAL_SYS_LIBS = -luuid +ifeq ($(CONFIG_RDMA),y) +LOCAL_SYS_LIBS += -libverbs -lrdmacm +#Attach only if FreeBSD and RDMA is specified with configure +ifeq ($(OS),FreeBSD) +# Mellanox - MLX4 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx4.*)","") +LOCAL_SYS_LIBS += -lmlx4 +endif +# Mellanox - MLX5 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx5.*)","") +LOCAL_SYS_LIBS += -lmlx5 +endif +# Chelsio HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libcxgb4.*)","") +LOCAL_SYS_LIBS += -lcxgb4 +endif +endif +endif + +ifeq ($(CONFIG_NVME_CUSE),y) +# fuse requires to set _FILE_OFFSET_BITS to 64 bits even for 64 bit machines +CFLAGS += -D_FILE_OFFSET_BITS=64 +endif + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nvme.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nvme/nvme.c b/src/spdk/lib/nvme/nvme.c new file mode 100644 index 000000000..9393810a6 --- /dev/null +++ b/src/spdk/lib/nvme/nvme.c @@ -0,0 +1,1423 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "nvme_internal.h" +#include "nvme_io_msg.h" +#include "nvme_uevent.h" + +#define SPDK_NVME_DRIVER_NAME "spdk_nvme_driver" + +struct nvme_driver *g_spdk_nvme_driver; +pid_t g_spdk_nvme_pid; + +/* gross timeout of 180 seconds in milliseconds */ +static int g_nvme_driver_timeout_ms = 3 * 60 * 1000; + +/* Per-process attached controller list */ +static TAILQ_HEAD(, spdk_nvme_ctrlr) g_nvme_attached_ctrlrs = + TAILQ_HEAD_INITIALIZER(g_nvme_attached_ctrlrs); + +/* Returns true if ctrlr should be stored on the multi-process shared_attached_ctrlrs list */ +static bool +nvme_ctrlr_shared(const struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE; +} + +void +nvme_ctrlr_connected(struct spdk_nvme_probe_ctx *probe_ctx, + struct spdk_nvme_ctrlr *ctrlr) +{ + TAILQ_INSERT_TAIL(&probe_ctx->init_ctrlrs, ctrlr, tailq); +} + +int +spdk_nvme_detach(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + nvme_ctrlr_proc_put_ref(ctrlr); + + if (nvme_ctrlr_get_ref_count(ctrlr) == 0) { + nvme_io_msg_ctrlr_detach(ctrlr); + if (nvme_ctrlr_shared(ctrlr)) { + TAILQ_REMOVE(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq); + } else { + TAILQ_REMOVE(&g_nvme_attached_ctrlrs, ctrlr, tailq); + } + nvme_ctrlr_destruct(ctrlr); + } + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + return 0; +} + +void +nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_completion_poll_status *status = arg; + + if (status->timed_out) { + /* There is no routine waiting for the completion of this request, free allocated memory */ + free(status); + return; + } + + /* + * Copy status into the argument passed by the caller, so that + * the caller can check the status to determine if the + * the request passed or failed. + */ + memcpy(&status->cpl, cpl, sizeof(*cpl)); + status->done = true; +} + +/** + * Poll qpair for completions until a command completes. + * + * \param qpair queue to poll + * \param status completion status. The user must fill this structure with zeroes before calling + * this function + * \param robust_mutex optional robust mutex to lock while polling qpair + * + * \return 0 if command completed without error, + * -EIO if command completed with error, + * -ECANCELED if command is not completed due to transport/device error + * + * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback + * and status as the callback argument. + */ +int +nvme_wait_for_completion_robust_lock( + struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + pthread_mutex_t *robust_mutex) +{ + int rc; + + while (status->done == false) { + if (robust_mutex) { + nvme_robust_mutex_lock(robust_mutex); + } + + rc = spdk_nvme_qpair_process_completions(qpair, 0); + + if (robust_mutex) { + nvme_robust_mutex_unlock(robust_mutex); + } + + if (rc < 0) { + status->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + status->cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + if (status->done == false) { + status->timed_out = true; + } + return -ECANCELED; + } + } + + return spdk_nvme_cpl_is_error(&status->cpl) ? -EIO : 0; +} + +int +nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status) +{ + return nvme_wait_for_completion_robust_lock(qpair, status, NULL); +} + +/** + * Poll qpair for completions until a command completes. 
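+ * Unlike nvme_wait_for_completion(), polling gives up once the optional timeout expires and the status is then marked as timed out.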
+ * + * \param qpair queue to poll + * \param status completion status. The user must fill this structure with zeroes before calling + * this function + * \param timeout_in_secs optional timeout + * + * \return 0 if command completed without error, + * -EIO if command completed with error, + * -ECANCELED if command is not completed due to transport/device error or time expired + * + * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback + * and status as the callback argument. + */ +int +nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + uint64_t timeout_in_secs) +{ + uint64_t timeout_tsc = 0; + int rc = 0; + + if (timeout_in_secs) { + timeout_tsc = spdk_get_ticks() + timeout_in_secs * spdk_get_ticks_hz(); + } + + while (status->done == false) { + rc = spdk_nvme_qpair_process_completions(qpair, 0); + + if (rc < 0) { + status->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + status->cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + break; + } + if (timeout_tsc && spdk_get_ticks() > timeout_tsc) { + break; + } + } + + if (status->done == false || rc < 0) { + if (status->done == false) { + status->timed_out = true; + } + return -ECANCELED; + } + + return spdk_nvme_cpl_is_error(&status->cpl) ? -EIO : 0; +} + +static void +nvme_user_copy_cmd_complete(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = arg; + enum spdk_nvme_data_transfer xfer; + + if (req->user_buffer && req->payload_size) { + /* Copy back to the user buffer and free the contig buffer */ + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc); + if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST || + xfer == SPDK_NVME_DATA_BIDIRECTIONAL) { + assert(req->pid == getpid()); + memcpy(req->user_buffer, req->payload.contig_or_cb_arg, req->payload_size); + } + + spdk_free(req->payload.contig_or_cb_arg); + } + + /* Call the user's original callback now that the buffer has been copied */ + req->user_cb_fn(req->user_cb_arg, cpl); +} + +/** + * Allocate a request as well as a DMA-capable buffer to copy to/from the user's buffer. + * + * This is intended for use in non-fast-path functions (admin commands, reservations, etc.) + * where the overhead of a copy is not a problem. + */ +struct nvme_request * +nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, bool host_to_controller) +{ + struct nvme_request *req; + void *dma_buffer = NULL; + + if (buffer && payload_size) { + dma_buffer = spdk_zmalloc(payload_size, 4096, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!dma_buffer) { + return NULL; + } + + if (host_to_controller) { + memcpy(dma_buffer, buffer, payload_size); + } + } + + req = nvme_allocate_request_contig(qpair, dma_buffer, payload_size, nvme_user_copy_cmd_complete, + NULL); + if (!req) { + spdk_free(dma_buffer); + return NULL; + } + + req->user_cb_fn = cb_fn; + req->user_cb_arg = cb_arg; + req->user_buffer = buffer; + req->cb_arg = req; + + return req; +} + +/** + * Check if a request has exceeded the controller timeout. + * + * \param req request to check for timeout. 
+ * \param cid command ID for command submitted by req (will be passed to timeout_cb_fn) + * \param active_proc per-process data for the controller associated with req + * \param now_tick current time from spdk_get_ticks() + * \return 0 if requests submitted more recently than req should still be checked for timeouts, or + * 1 if requests newer than req need not be checked. + * + * The request's timeout callback will be called if needed; the caller is only responsible for + * calling this function on each outstanding request. + */ +int +nvme_request_check_timeout(struct nvme_request *req, uint16_t cid, + struct spdk_nvme_ctrlr_process *active_proc, + uint64_t now_tick) +{ + struct spdk_nvme_qpair *qpair = req->qpair; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + assert(active_proc->timeout_cb_fn != NULL); + + if (req->timed_out || req->submit_tick == 0) { + return 0; + } + + if (req->pid != g_spdk_nvme_pid) { + return 0; + } + + if (nvme_qpair_is_admin_queue(qpair) && + req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + return 0; + } + + if (req->submit_tick + active_proc->timeout_ticks > now_tick) { + return 1; + } + + req->timed_out = true; + + /* + * We don't want to expose the admin queue to the user, + * so when we're timing out admin commands set the + * qpair to NULL. + */ + active_proc->timeout_cb_fn(active_proc->timeout_cb_arg, ctrlr, + nvme_qpair_is_admin_queue(qpair) ? NULL : qpair, + cid); + return 0; +} + +int +nvme_robust_mutex_init_shared(pthread_mutex_t *mtx) +{ + int rc = 0; + +#ifdef __FreeBSD__ + pthread_mutex_init(mtx, NULL); +#else + pthread_mutexattr_t attr; + + if (pthread_mutexattr_init(&attr)) { + return -1; + } + if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) || + pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) || + pthread_mutex_init(mtx, &attr)) { + rc = -1; + } + pthread_mutexattr_destroy(&attr); +#endif + + return rc; +} + +int +nvme_driver_init(void) +{ + static pthread_mutex_t g_init_mutex = PTHREAD_MUTEX_INITIALIZER; + int ret = 0; + /* Any socket ID */ + int socket_id = -1; + + /* Use a special process-private mutex to ensure the global + * nvme driver object (g_spdk_nvme_driver) gets initialized by + * only one thread. Once that object is established and its + * mutex is initialized, we can unlock this mutex and use that + * one instead. + */ + pthread_mutex_lock(&g_init_mutex); + + /* Each process needs its own pid. */ + g_spdk_nvme_pid = getpid(); + + /* + * Only one thread from one process will do this driver init work. + * The primary process will reserve the shared memory and do the + * initialization. + * The secondary process will lookup the existing reserved memory. + */ + if (spdk_process_is_primary()) { + /* The unique named memzone already reserved. */ + if (g_spdk_nvme_driver != NULL) { + pthread_mutex_unlock(&g_init_mutex); + return 0; + } else { + g_spdk_nvme_driver = spdk_memzone_reserve(SPDK_NVME_DRIVER_NAME, + sizeof(struct nvme_driver), socket_id, + SPDK_MEMZONE_NO_IOVA_CONTIG); + } + + if (g_spdk_nvme_driver == NULL) { + SPDK_ERRLOG("primary process failed to reserve memory\n"); + pthread_mutex_unlock(&g_init_mutex); + return -1; + } + } else { + g_spdk_nvme_driver = spdk_memzone_lookup(SPDK_NVME_DRIVER_NAME); + + /* The unique named memzone already reserved by the primary process. */ + if (g_spdk_nvme_driver != NULL) { + int ms_waited = 0; + + /* Wait the nvme driver to get initialized. 
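A secondary process polls the shared driver object in 1 ms steps until the primary process finishes initialization or the 180 second g_nvme_driver_timeout_ms limit expires. 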
*/ + while ((g_spdk_nvme_driver->initialized == false) && + (ms_waited < g_nvme_driver_timeout_ms)) { + ms_waited++; + nvme_delay(1000); /* delay 1ms */ + } + if (g_spdk_nvme_driver->initialized == false) { + SPDK_ERRLOG("timeout waiting for primary process to init\n"); + pthread_mutex_unlock(&g_init_mutex); + return -1; + } + } else { + SPDK_ERRLOG("primary process is not started yet\n"); + pthread_mutex_unlock(&g_init_mutex); + return -1; + } + + pthread_mutex_unlock(&g_init_mutex); + return 0; + } + + /* + * At this moment, only one thread from the primary process will do + * the g_spdk_nvme_driver initialization + */ + assert(spdk_process_is_primary()); + + ret = nvme_robust_mutex_init_shared(&g_spdk_nvme_driver->lock); + if (ret != 0) { + SPDK_ERRLOG("failed to initialize mutex\n"); + spdk_memzone_free(SPDK_NVME_DRIVER_NAME); + pthread_mutex_unlock(&g_init_mutex); + return ret; + } + + /* The lock in the shared g_spdk_nvme_driver object is now ready to + * be used - so we can unlock the g_init_mutex here. + */ + pthread_mutex_unlock(&g_init_mutex); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + g_spdk_nvme_driver->initialized = false; + g_spdk_nvme_driver->hotplug_fd = nvme_uevent_connect(); + if (g_spdk_nvme_driver->hotplug_fd < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n"); + } + + TAILQ_INIT(&g_spdk_nvme_driver->shared_attached_ctrlrs); + + spdk_uuid_generate(&g_spdk_nvme_driver->default_extended_host_id); + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return ret; +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +int +nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_probe_ctx *probe_ctx, void *devhandle) +{ + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ctrlr_opts opts; + + assert(trid != NULL); + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts)); + + if (!probe_ctx->probe_cb || probe_ctx->probe_cb(probe_ctx->cb_ctx, trid, &opts)) { + ctrlr = nvme_get_ctrlr_by_trid_unsafe(trid); + if (ctrlr) { + /* This ctrlr already exists. + * Increase the ref count before calling attach_cb() as the user may + * call nvme_detach() immediately. */ + nvme_ctrlr_proc_get_ref(ctrlr); + + if (probe_ctx->attach_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + return 0; + } + + ctrlr = nvme_transport_ctrlr_construct(trid, &opts, devhandle); + if (ctrlr == NULL) { + SPDK_ERRLOG("Failed to construct NVMe controller for SSD: %s\n", trid->traddr); + return -1; + } + ctrlr->remove_cb = probe_ctx->remove_cb; + ctrlr->cb_ctx = probe_ctx->cb_ctx; + + if (ctrlr->quirks & NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE && + ctrlr->opts.io_queue_size == DEFAULT_IO_QUEUE_SIZE) { + /* If the user specifically set an IO queue size different than the + * default, use that value. Otherwise overwrite with the quirked value. + * This allows this quirk to be overridden when necessary. + * However, cap.mqes still needs to be respected. 
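+ * Note that CAP.MQES is a 0's based value, hence the + 1 in the clamp below. 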
+ */ + ctrlr->opts.io_queue_size = spdk_min(DEFAULT_IO_QUEUE_SIZE_FOR_QUIRK, ctrlr->cap.bits.mqes + 1u); + } + + nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_ENABLED); + TAILQ_INSERT_TAIL(&probe_ctx->init_ctrlrs, ctrlr, tailq); + return 0; + } + + return 1; +} + +static int +nvme_ctrlr_poll_internal(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_probe_ctx *probe_ctx) +{ + int rc = 0; + + rc = nvme_ctrlr_process_init(ctrlr); + + if (rc) { + /* Controller failed to initialize. */ + TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq); + SPDK_ERRLOG("Failed to initialize SSD: %s\n", ctrlr->trid.traddr); + nvme_ctrlr_fail(ctrlr, false); + nvme_ctrlr_destruct(ctrlr); + return rc; + } + + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return 0; + } + + STAILQ_INIT(&ctrlr->io_producers); + + /* + * Controller has been initialized. + * Move it to the attached_ctrlrs list. + */ + TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq); + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + if (nvme_ctrlr_shared(ctrlr)) { + TAILQ_INSERT_TAIL(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq); + } else { + TAILQ_INSERT_TAIL(&g_nvme_attached_ctrlrs, ctrlr, tailq); + } + + /* + * Increase the ref count before calling attach_cb() as the user may + * call nvme_detach() immediately. + */ + nvme_ctrlr_proc_get_ref(ctrlr); + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + if (probe_ctx->attach_cb) { + probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + return 0; + } + + return 0; +} + +static int +nvme_init_controllers(struct spdk_nvme_probe_ctx *probe_ctx) +{ + int rc = 0; + + while (true) { + rc = spdk_nvme_probe_poll_async(probe_ctx); + if (rc != -EAGAIN) { + return rc; + } + } + + return rc; +} + +/* This function must not be called while holding g_spdk_nvme_driver->lock */ +static struct spdk_nvme_ctrlr * +nvme_get_ctrlr_by_trid(const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_ctrlr *ctrlr; + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + ctrlr = nvme_get_ctrlr_by_trid_unsafe(trid); + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return ctrlr; +} + +/* This function must be called while holding g_spdk_nvme_driver->lock */ +struct spdk_nvme_ctrlr * +nvme_get_ctrlr_by_trid_unsafe(const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_ctrlr *ctrlr; + + /* Search per-process list */ + TAILQ_FOREACH(ctrlr, &g_nvme_attached_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) { + return ctrlr; + } + } + + /* Search multi-process shared list */ + TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) { + return ctrlr; + } + } + + return NULL; +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +static int +nvme_probe_internal(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + int rc; + struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp; + + spdk_nvme_trid_populate_transport(&probe_ctx->trid, probe_ctx->trid.trtype); + if (!spdk_nvme_transport_available_by_name(probe_ctx->trid.trstring)) { + SPDK_ERRLOG("NVMe trtype %u not available\n", probe_ctx->trid.trtype); + return -1; + } + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + rc = nvme_transport_ctrlr_scan(probe_ctx, direct_connect); + if (rc != 0) { + SPDK_ERRLOG("NVMe ctrlr scan failed\n"); + TAILQ_FOREACH_SAFE(ctrlr, &probe_ctx->init_ctrlrs, tailq, ctrlr_tmp) { + TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, 
tailq); + nvme_transport_ctrlr_destruct(ctrlr); + } + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + return -1; + } + + /* + * Probe controllers on the shared_attached_ctrlrs list + */ + if (!spdk_process_is_primary() && (probe_ctx->trid.trtype == SPDK_NVME_TRANSPORT_PCIE)) { + TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) { + /* Do not attach other ctrlrs if user specify a valid trid */ + if ((strlen(probe_ctx->trid.traddr) != 0) && + (spdk_nvme_transport_id_compare(&probe_ctx->trid, &ctrlr->trid))) { + continue; + } + + /* Do not attach if we failed to initialize it in this process */ + if (nvme_ctrlr_get_current_process(ctrlr) == NULL) { + continue; + } + + nvme_ctrlr_proc_get_ref(ctrlr); + + /* + * Unlock while calling attach_cb() so the user can call other functions + * that may take the driver lock, like nvme_detach(). + */ + if (probe_ctx->attach_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return 0; +} + +static void +nvme_probe_ctx_init(struct spdk_nvme_probe_ctx *probe_ctx, + const struct spdk_nvme_transport_id *trid, + void *cb_ctx, + spdk_nvme_probe_cb probe_cb, + spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + probe_ctx->trid = *trid; + probe_ctx->cb_ctx = cb_ctx; + probe_ctx->probe_cb = probe_cb; + probe_ctx->attach_cb = attach_cb; + probe_ctx->remove_cb = remove_cb; + TAILQ_INIT(&probe_ctx->init_ctrlrs); +} + +int +spdk_nvme_probe(const struct spdk_nvme_transport_id *trid, void *cb_ctx, + spdk_nvme_probe_cb probe_cb, spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + struct spdk_nvme_transport_id trid_pcie; + struct spdk_nvme_probe_ctx *probe_ctx; + + if (trid == NULL) { + memset(&trid_pcie, 0, sizeof(trid_pcie)); + spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); + trid = &trid_pcie; + } + + probe_ctx = spdk_nvme_probe_async(trid, cb_ctx, probe_cb, + attach_cb, remove_cb); + if (!probe_ctx) { + SPDK_ERRLOG("Create probe context failed\n"); + return -1; + } + + /* + * Keep going even if one or more nvme_attach() calls failed, + * but maintain the value of rc to signal errors when we return. 
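+ * nvme_init_controllers() simply re-polls spdk_nvme_probe_poll_async() until it stops returning -EAGAIN. 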
+ */ + return nvme_init_controllers(probe_ctx); +} + +static bool +nvme_connect_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + struct spdk_nvme_ctrlr_opts *requested_opts = cb_ctx; + + assert(requested_opts); + memcpy(opts, requested_opts, sizeof(*opts)); + + return true; +} + +static void +nvme_ctrlr_opts_init(struct spdk_nvme_ctrlr_opts *opts, + const struct spdk_nvme_ctrlr_opts *opts_user, + size_t opts_size_user) +{ + assert(opts); + assert(opts_user); + + spdk_nvme_ctrlr_get_default_ctrlr_opts(opts, opts_size_user); + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= (opts->opts_size) + + if (FIELD_OK(num_io_queues)) { + opts->num_io_queues = opts_user->num_io_queues; + } + + if (FIELD_OK(use_cmb_sqs)) { + opts->use_cmb_sqs = opts_user->use_cmb_sqs; + } + + if (FIELD_OK(no_shn_notification)) { + opts->no_shn_notification = opts_user->no_shn_notification; + } + + if (FIELD_OK(arb_mechanism)) { + opts->arb_mechanism = opts_user->arb_mechanism; + } + + if (FIELD_OK(arbitration_burst)) { + opts->arbitration_burst = opts_user->arbitration_burst; + } + + if (FIELD_OK(low_priority_weight)) { + opts->low_priority_weight = opts_user->low_priority_weight; + } + + if (FIELD_OK(medium_priority_weight)) { + opts->medium_priority_weight = opts_user->medium_priority_weight; + } + + if (FIELD_OK(high_priority_weight)) { + opts->high_priority_weight = opts_user->high_priority_weight; + } + + if (FIELD_OK(keep_alive_timeout_ms)) { + opts->keep_alive_timeout_ms = opts_user->keep_alive_timeout_ms; + } + + if (FIELD_OK(transport_retry_count)) { + opts->transport_retry_count = opts_user->transport_retry_count; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = opts_user->io_queue_size; + } + + if (FIELD_OK(hostnqn)) { + memcpy(opts->hostnqn, opts_user->hostnqn, sizeof(opts_user->hostnqn)); + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = opts_user->io_queue_requests; + } + + if (FIELD_OK(src_addr)) { + memcpy(opts->src_addr, opts_user->src_addr, sizeof(opts_user->src_addr)); + } + + if (FIELD_OK(src_svcid)) { + memcpy(opts->src_svcid, opts_user->src_svcid, sizeof(opts_user->src_svcid)); + } + + if (FIELD_OK(host_id)) { + memcpy(opts->host_id, opts_user->host_id, sizeof(opts_user->host_id)); + } + if (FIELD_OK(extended_host_id)) { + memcpy(opts->extended_host_id, opts_user->extended_host_id, + sizeof(opts_user->extended_host_id)); + } + + if (FIELD_OK(command_set)) { + opts->command_set = opts_user->command_set; + } + + if (FIELD_OK(admin_timeout_ms)) { + opts->admin_timeout_ms = opts_user->admin_timeout_ms; + } + + if (FIELD_OK(header_digest)) { + opts->header_digest = opts_user->header_digest; + } + + if (FIELD_OK(data_digest)) { + opts->data_digest = opts_user->data_digest; + } + + if (FIELD_OK(disable_error_logging)) { + opts->disable_error_logging = opts_user->disable_error_logging; + } + + if (FIELD_OK(transport_ack_timeout)) { + opts->transport_ack_timeout = opts_user->transport_ack_timeout; + } + + if (FIELD_OK(admin_queue_size)) { + opts->admin_queue_size = opts_user->admin_queue_size; + } +#undef FIELD_OK +} + +struct spdk_nvme_ctrlr * +spdk_nvme_connect(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, size_t opts_size) +{ + int rc; + struct spdk_nvme_ctrlr *ctrlr = NULL; + struct spdk_nvme_probe_ctx *probe_ctx; + struct spdk_nvme_ctrlr_opts *opts_local_p = NULL; + struct spdk_nvme_ctrlr_opts opts_local; + + if (trid == 
NULL) { + SPDK_ERRLOG("No transport ID specified\n"); + return NULL; + } + + if (opts) { + opts_local_p = &opts_local; + nvme_ctrlr_opts_init(opts_local_p, opts, opts_size); + } + + probe_ctx = spdk_nvme_connect_async(trid, opts_local_p, NULL); + if (!probe_ctx) { + SPDK_ERRLOG("Create probe context failed\n"); + return NULL; + } + + rc = nvme_init_controllers(probe_ctx); + if (rc != 0) { + return NULL; + } + + ctrlr = nvme_get_ctrlr_by_trid(trid); + + return ctrlr; +} + +void +spdk_nvme_trid_populate_transport(struct spdk_nvme_transport_id *trid, + enum spdk_nvme_transport_type trtype) +{ + const char *trstring = ""; + + trid->trtype = trtype; + switch (trtype) { + case SPDK_NVME_TRANSPORT_FC: + trstring = SPDK_NVME_TRANSPORT_NAME_FC; + break; + case SPDK_NVME_TRANSPORT_PCIE: + trstring = SPDK_NVME_TRANSPORT_NAME_PCIE; + break; + case SPDK_NVME_TRANSPORT_RDMA: + trstring = SPDK_NVME_TRANSPORT_NAME_RDMA; + break; + case SPDK_NVME_TRANSPORT_TCP: + trstring = SPDK_NVME_TRANSPORT_NAME_TCP; + break; + case SPDK_NVME_TRANSPORT_CUSTOM: + default: + SPDK_ERRLOG("don't use this for custom transports\n"); + assert(0); + return; + } + snprintf(trid->trstring, SPDK_NVMF_TRSTRING_MAX_LEN, "%s", trstring); +} + +int +spdk_nvme_transport_id_populate_trstring(struct spdk_nvme_transport_id *trid, const char *trstring) +{ + int len, i, rc; + + if (trstring == NULL) { + return -EINVAL; + } + + len = strnlen(trstring, SPDK_NVMF_TRSTRING_MAX_LEN); + if (len == SPDK_NVMF_TRSTRING_MAX_LEN) { + return -EINVAL; + } + + rc = snprintf(trid->trstring, SPDK_NVMF_TRSTRING_MAX_LEN, "%s", trstring); + if (rc < 0) { + return rc; + } + + /* cast official trstring to uppercase version of input. */ + for (i = 0; i < len; i++) { + trid->trstring[i] = toupper(trid->trstring[i]); + } + return 0; +} + +int +spdk_nvme_transport_id_parse_trtype(enum spdk_nvme_transport_type *trtype, const char *str) +{ + if (trtype == NULL || str == NULL) { + return -EINVAL; + } + + if (strcasecmp(str, "PCIe") == 0) { + *trtype = SPDK_NVME_TRANSPORT_PCIE; + } else if (strcasecmp(str, "RDMA") == 0) { + *trtype = SPDK_NVME_TRANSPORT_RDMA; + } else if (strcasecmp(str, "FC") == 0) { + *trtype = SPDK_NVME_TRANSPORT_FC; + } else if (strcasecmp(str, "TCP") == 0) { + *trtype = SPDK_NVME_TRANSPORT_TCP; + } else { + *trtype = SPDK_NVME_TRANSPORT_CUSTOM; + } + return 0; +} + +const char * +spdk_nvme_transport_id_trtype_str(enum spdk_nvme_transport_type trtype) +{ + switch (trtype) { + case SPDK_NVME_TRANSPORT_PCIE: + return "PCIe"; + case SPDK_NVME_TRANSPORT_RDMA: + return "RDMA"; + case SPDK_NVME_TRANSPORT_FC: + return "FC"; + case SPDK_NVME_TRANSPORT_TCP: + return "TCP"; + case SPDK_NVME_TRANSPORT_CUSTOM: + return "CUSTOM"; + default: + return NULL; + } +} + +int +spdk_nvme_transport_id_parse_adrfam(enum spdk_nvmf_adrfam *adrfam, const char *str) +{ + if (adrfam == NULL || str == NULL) { + return -EINVAL; + } + + if (strcasecmp(str, "IPv4") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IPV4; + } else if (strcasecmp(str, "IPv6") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IPV6; + } else if (strcasecmp(str, "IB") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IB; + } else if (strcasecmp(str, "FC") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_FC; + } else { + return -ENOENT; + } + return 0; +} + +const char * +spdk_nvme_transport_id_adrfam_str(enum spdk_nvmf_adrfam adrfam) +{ + switch (adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + return "IPv4"; + case SPDK_NVMF_ADRFAM_IPV6: + return "IPv6"; + case SPDK_NVMF_ADRFAM_IB: + return "IB"; + case SPDK_NVMF_ADRFAM_FC: + return "FC"; + default: + return 
NULL; + } +} + +static size_t +parse_next_key(const char **str, char *key, char *val, size_t key_buf_size, size_t val_buf_size) +{ + + const char *sep, *sep1; + const char *whitespace = " \t\n"; + size_t key_len, val_len; + + *str += strspn(*str, whitespace); + + sep = strchr(*str, ':'); + if (!sep) { + sep = strchr(*str, '='); + if (!sep) { + SPDK_ERRLOG("Key without ':' or '=' separator\n"); + return 0; + } + } else { + sep1 = strchr(*str, '='); + if ((sep1 != NULL) && (sep1 < sep)) { + sep = sep1; + } + } + + key_len = sep - *str; + if (key_len >= key_buf_size) { + SPDK_ERRLOG("Key length %zu greater than maximum allowed %zu\n", + key_len, key_buf_size - 1); + return 0; + } + + memcpy(key, *str, key_len); + key[key_len] = '\0'; + + *str += key_len + 1; /* Skip key: */ + val_len = strcspn(*str, whitespace); + if (val_len == 0) { + SPDK_ERRLOG("Key without value\n"); + return 0; + } + + if (val_len >= val_buf_size) { + SPDK_ERRLOG("Value length %zu greater than maximum allowed %zu\n", + val_len, val_buf_size - 1); + return 0; + } + + memcpy(val, *str, val_len); + val[val_len] = '\0'; + + *str += val_len; + + return val_len; +} + +int +spdk_nvme_transport_id_parse(struct spdk_nvme_transport_id *trid, const char *str) +{ + size_t val_len; + char key[32]; + char val[1024]; + + if (trid == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + + val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val)); + + if (val_len == 0) { + SPDK_ERRLOG("Failed to parse transport ID\n"); + return -EINVAL; + } + + if (strcasecmp(key, "trtype") == 0) { + if (spdk_nvme_transport_id_populate_trstring(trid, val) != 0) { + SPDK_ERRLOG("invalid transport '%s'\n", val); + return -EINVAL; + } + if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, val) != 0) { + SPDK_ERRLOG("Unknown trtype '%s'\n", val); + return -EINVAL; + } + } else if (strcasecmp(key, "adrfam") == 0) { + if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, val) != 0) { + SPDK_ERRLOG("Unknown adrfam '%s'\n", val); + return -EINVAL; + } + } else if (strcasecmp(key, "traddr") == 0) { + if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) { + SPDK_ERRLOG("traddr length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRADDR_MAX_LEN); + return -EINVAL; + } + memcpy(trid->traddr, val, val_len + 1); + } else if (strcasecmp(key, "trsvcid") == 0) { + if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) { + SPDK_ERRLOG("trsvcid length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRSVCID_MAX_LEN); + return -EINVAL; + } + memcpy(trid->trsvcid, val, val_len + 1); + } else if (strcasecmp(key, "priority") == 0) { + if (val_len > SPDK_NVMF_PRIORITY_MAX_LEN) { + SPDK_ERRLOG("priority length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_PRIORITY_MAX_LEN); + return -EINVAL; + } + trid->priority = spdk_strtol(val, 10); + } else if (strcasecmp(key, "subnqn") == 0) { + if (val_len > SPDK_NVMF_NQN_MAX_LEN) { + SPDK_ERRLOG("subnqn length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_NQN_MAX_LEN); + return -EINVAL; + } + memcpy(trid->subnqn, val, val_len + 1); + } else if (strcasecmp(key, "hostaddr") == 0) { + continue; + } else if (strcasecmp(key, "hostsvcid") == 0) { + continue; + } else if (strcasecmp(key, "ns") == 0) { + /* + * Special case. The namespace id parameter may + * optionally be passed in the transport id string + * for an SPDK application (e.g. nvme/perf) + * and additionally parsed therein to limit + * targeting a specific namespace. 
For this + * scenario, just silently ignore this key + * rather than letting it default to logging + * it as an invalid key. + */ + continue; + } else if (strcasecmp(key, "alt_traddr") == 0) { + /* + * Used by applications for enabling transport ID failover. + * Please see the case above for more information on custom parameters. + */ + continue; + } else { + SPDK_ERRLOG("Unknown transport ID key '%s'\n", key); + } + } + + return 0; +} + +int +spdk_nvme_host_id_parse(struct spdk_nvme_host_id *hostid, const char *str) +{ + + size_t key_size = 32; + size_t val_size = 1024; + size_t val_len; + char key[key_size]; + char val[val_size]; + + if (hostid == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + + val_len = parse_next_key(&str, key, val, key_size, val_size); + + if (val_len == 0) { + SPDK_ERRLOG("Failed to parse host ID\n"); + return val_len; + } + + /* Ignore the rest of the options from the transport ID. */ + if (strcasecmp(key, "trtype") == 0) { + continue; + } else if (strcasecmp(key, "adrfam") == 0) { + continue; + } else if (strcasecmp(key, "traddr") == 0) { + continue; + } else if (strcasecmp(key, "trsvcid") == 0) { + continue; + } else if (strcasecmp(key, "subnqn") == 0) { + continue; + } else if (strcasecmp(key, "priority") == 0) { + continue; + } else if (strcasecmp(key, "ns") == 0) { + continue; + } else if (strcasecmp(key, "hostaddr") == 0) { + if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) { + SPDK_ERRLOG("hostaddr length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRADDR_MAX_LEN); + return -EINVAL; + } + memcpy(hostid->hostaddr, val, val_len + 1); + + } else if (strcasecmp(key, "hostsvcid") == 0) { + if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) { + SPDK_ERRLOG("trsvcid length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRSVCID_MAX_LEN); + return -EINVAL; + } + memcpy(hostid->hostsvcid, val, val_len + 1); + } else { + SPDK_ERRLOG("Unknown transport ID key '%s'\n", key); + } + } + + return 0; +} + +static int +cmp_int(int a, int b) +{ + return a - b; +} + +int +spdk_nvme_transport_id_compare(const struct spdk_nvme_transport_id *trid1, + const struct spdk_nvme_transport_id *trid2) +{ + int cmp; + + if (trid1->trtype == SPDK_NVME_TRANSPORT_CUSTOM) { + cmp = strcasecmp(trid1->trstring, trid2->trstring); + } else { + cmp = cmp_int(trid1->trtype, trid2->trtype); + } + + if (cmp) { + return cmp; + } + + if (trid1->trtype == SPDK_NVME_TRANSPORT_PCIE) { + struct spdk_pci_addr pci_addr1 = {}; + struct spdk_pci_addr pci_addr2 = {}; + + /* Normalize PCI addresses before comparing */ + if (spdk_pci_addr_parse(&pci_addr1, trid1->traddr) < 0 || + spdk_pci_addr_parse(&pci_addr2, trid2->traddr) < 0) { + return -1; + } + + /* PCIe transport ID only uses trtype and traddr */ + return spdk_pci_addr_compare(&pci_addr1, &pci_addr2); + } + + cmp = strcasecmp(trid1->traddr, trid2->traddr); + if (cmp) { + return cmp; + } + + cmp = cmp_int(trid1->adrfam, trid2->adrfam); + if (cmp) { + return cmp; + } + + cmp = strcasecmp(trid1->trsvcid, trid2->trsvcid); + if (cmp) { + return cmp; + } + + cmp = strcmp(trid1->subnqn, trid2->subnqn); + if (cmp) { + return cmp; + } + + return 0; +} + +int +spdk_nvme_prchk_flags_parse(uint32_t *prchk_flags, const char *str) +{ + size_t val_len; + char key[32]; + char val[1024]; + + if (prchk_flags == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val)); + + if (val_len == 0) { + SPDK_ERRLOG("Failed to parse prchk\n"); + return -EINVAL; + 
} + + if (strcasecmp(key, "prchk") == 0) { + if (strcasestr(val, "reftag") != NULL) { + *prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; + } + if (strcasestr(val, "guard") != NULL) { + *prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD; + } + } else { + SPDK_ERRLOG("Unknown key '%s'\n", key); + return -EINVAL; + } + } + + return 0; +} + +const char * +spdk_nvme_prchk_flags_str(uint32_t prchk_flags) +{ + if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) { + if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) { + return "prchk:reftag|guard"; + } else { + return "prchk:reftag"; + } + } else { + if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) { + return "prchk:guard"; + } else { + return NULL; + } + } +} + +struct spdk_nvme_probe_ctx * +spdk_nvme_probe_async(const struct spdk_nvme_transport_id *trid, + void *cb_ctx, + spdk_nvme_probe_cb probe_cb, + spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + int rc; + struct spdk_nvme_probe_ctx *probe_ctx; + + rc = nvme_driver_init(); + if (rc != 0) { + return NULL; + } + + probe_ctx = calloc(1, sizeof(*probe_ctx)); + if (!probe_ctx) { + return NULL; + } + + nvme_probe_ctx_init(probe_ctx, trid, cb_ctx, probe_cb, attach_cb, remove_cb); + rc = nvme_probe_internal(probe_ctx, false); + if (rc != 0) { + free(probe_ctx); + return NULL; + } + + return probe_ctx; +} + +int +spdk_nvme_probe_poll_async(struct spdk_nvme_probe_ctx *probe_ctx) +{ + int rc = 0; + struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp; + + if (!spdk_process_is_primary() && probe_ctx->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + free(probe_ctx); + return 0; + } + + TAILQ_FOREACH_SAFE(ctrlr, &probe_ctx->init_ctrlrs, tailq, ctrlr_tmp) { + rc = nvme_ctrlr_poll_internal(ctrlr, probe_ctx); + if (rc != 0) { + rc = -EIO; + break; + } + } + + if (rc != 0 || TAILQ_EMPTY(&probe_ctx->init_ctrlrs)) { + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + g_spdk_nvme_driver->initialized = true; + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + free(probe_ctx); + return rc; + } + + return -EAGAIN; +} + +struct spdk_nvme_probe_ctx * +spdk_nvme_connect_async(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + spdk_nvme_attach_cb attach_cb) +{ + int rc; + spdk_nvme_probe_cb probe_cb = NULL; + struct spdk_nvme_probe_ctx *probe_ctx; + + rc = nvme_driver_init(); + if (rc != 0) { + return NULL; + } + + probe_ctx = calloc(1, sizeof(*probe_ctx)); + if (!probe_ctx) { + return NULL; + } + + if (opts) { + probe_cb = nvme_connect_probe_cb; + } + + nvme_probe_ctx_init(probe_ctx, trid, (void *)opts, probe_cb, attach_cb, NULL); + rc = nvme_probe_internal(probe_ctx, true); + if (rc != 0) { + free(probe_ctx); + return NULL; + } + + return probe_ctx; +} + +SPDK_LOG_REGISTER_COMPONENT("nvme", SPDK_LOG_NVME) diff --git a/src/spdk/lib/nvme/nvme_ctrlr.c b/src/spdk/lib/nvme/nvme_ctrlr.c new file mode 100644 index 000000000..ced02e9bb --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr.c @@ -0,0 +1,3639 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvme_internal.h" +#include "nvme_io_msg.h" + +#include "spdk/env.h" +#include "spdk/string.h" + +struct nvme_active_ns_ctx; + +static void nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr); +static int nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_async_event_request *aer); +static void nvme_ctrlr_identify_active_ns_async(struct nvme_active_ns_ctx *ctx); +static int nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns); +static int nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns); + +static int +nvme_ctrlr_get_cc(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cc_register *cc) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + &cc->raw); +} + +static int +nvme_ctrlr_get_csts(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_csts_register *csts) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, csts.raw), + &csts->raw); +} + +int +nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap) +{ + return nvme_transport_ctrlr_get_reg_8(ctrlr, offsetof(struct spdk_nvme_registers, cap.raw), + &cap->raw); +} + +int +nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, vs.raw), + &vs->raw); +} + +static int +nvme_ctrlr_set_cc(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cc_register *cc) +{ + return nvme_transport_ctrlr_set_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + cc->raw); +} + +int +nvme_ctrlr_get_cmbsz(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cmbsz_register *cmbsz) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), + &cmbsz->raw); +} + +/* When the field in spdk_nvme_ctrlr_opts are changed and you change this function, please + * also update the nvme_ctrl_opts_init function in nvme_ctrlr.c + */ +void +spdk_nvme_ctrlr_get_default_ctrlr_opts(struct spdk_nvme_ctrlr_opts *opts, size_t opts_size) +{ + char host_id_str[SPDK_UUID_STRING_LEN]; + + assert(opts); + + opts->opts_size = opts_size; + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= opts_size + + 
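/* Only the fields that fit within the caller-provided opts_size are initialized, so callers built against an older, smaller spdk_nvme_ctrlr_opts still get valid defaults. */ +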
if (FIELD_OK(num_io_queues)) { + opts->num_io_queues = DEFAULT_MAX_IO_QUEUES; + } + + if (FIELD_OK(use_cmb_sqs)) { + opts->use_cmb_sqs = true; + } + + if (FIELD_OK(no_shn_notification)) { + opts->no_shn_notification = false; + } + + if (FIELD_OK(arb_mechanism)) { + opts->arb_mechanism = SPDK_NVME_CC_AMS_RR; + } + + if (FIELD_OK(arbitration_burst)) { + opts->arbitration_burst = 0; + } + + if (FIELD_OK(low_priority_weight)) { + opts->low_priority_weight = 0; + } + + if (FIELD_OK(medium_priority_weight)) { + opts->medium_priority_weight = 0; + } + + if (FIELD_OK(high_priority_weight)) { + opts->high_priority_weight = 0; + } + + if (FIELD_OK(keep_alive_timeout_ms)) { + opts->keep_alive_timeout_ms = MIN_KEEP_ALIVE_TIMEOUT_IN_MS; + } + + if (FIELD_OK(transport_retry_count)) { + opts->transport_retry_count = SPDK_NVME_DEFAULT_RETRY_COUNT; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = DEFAULT_IO_QUEUE_SIZE; + } + + if (nvme_driver_init() == 0) { + if (FIELD_OK(hostnqn)) { + spdk_uuid_fmt_lower(host_id_str, sizeof(host_id_str), + &g_spdk_nvme_driver->default_extended_host_id); + snprintf(opts->hostnqn, sizeof(opts->hostnqn), "2014-08.org.nvmexpress:uuid:%s", host_id_str); + } + + if (FIELD_OK(extended_host_id)) { + memcpy(opts->extended_host_id, &g_spdk_nvme_driver->default_extended_host_id, + sizeof(opts->extended_host_id)); + } + + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = DEFAULT_IO_QUEUE_REQUESTS; + } + + if (FIELD_OK(src_addr)) { + memset(opts->src_addr, 0, sizeof(opts->src_addr)); + } + + if (FIELD_OK(src_svcid)) { + memset(opts->src_svcid, 0, sizeof(opts->src_svcid)); + } + + if (FIELD_OK(host_id)) { + memset(opts->host_id, 0, sizeof(opts->host_id)); + } + + if (FIELD_OK(command_set)) { + opts->command_set = SPDK_NVME_CC_CSS_NVM; + } + + if (FIELD_OK(admin_timeout_ms)) { + opts->admin_timeout_ms = NVME_MAX_ADMIN_TIMEOUT_IN_SECS * 1000; + } + + if (FIELD_OK(header_digest)) { + opts->header_digest = false; + } + + if (FIELD_OK(data_digest)) { + opts->data_digest = false; + } + + if (FIELD_OK(disable_error_logging)) { + opts->disable_error_logging = false; + } + + if (FIELD_OK(transport_ack_timeout)) { + opts->transport_ack_timeout = SPDK_NVME_DEFAULT_TRANSPORT_ACK_TIMEOUT; + } + + if (FIELD_OK(admin_queue_size)) { + opts->admin_queue_size = DEFAULT_ADMIN_QUEUE_SIZE; + } +#undef FIELD_OK +} + +/** + * This function will be called when the process allocates the IO qpair. + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_ctrlr_proc_add_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + TAILQ_INSERT_TAIL(&active_proc->allocated_io_qpairs, qpair, per_process_tailq); + qpair->active_proc = active_proc; + } +} + +/** + * This function will be called when the process frees the IO qpair. + * Note: the ctrlr_lock must be held when calling this function. 
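+ * The qpair is removed from the per-process allocated_io_qpairs list if it is found there.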
+ */ +static void +nvme_ctrlr_proc_remove_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_qpair *active_qpair, *tmp_qpair; + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (!active_proc) { + return; + } + + TAILQ_FOREACH_SAFE(active_qpair, &active_proc->allocated_io_qpairs, + per_process_tailq, tmp_qpair) { + if (active_qpair == qpair) { + TAILQ_REMOVE(&active_proc->allocated_io_qpairs, + active_qpair, per_process_tailq); + + break; + } + } +} + +void +spdk_nvme_ctrlr_get_default_io_qpair_opts(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_io_qpair_opts *opts, + size_t opts_size) +{ + assert(ctrlr); + + assert(opts); + + memset(opts, 0, opts_size); + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_io_qpair_opts, field) + sizeof(opts->field) <= opts_size + + if (FIELD_OK(qprio)) { + opts->qprio = SPDK_NVME_QPRIO_URGENT; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = ctrlr->opts.io_queue_size; + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = ctrlr->opts.io_queue_requests; + } + + if (FIELD_OK(delay_cmd_submit)) { + opts->delay_cmd_submit = false; + } + + if (FIELD_OK(sq.vaddr)) { + opts->sq.vaddr = NULL; + } + + if (FIELD_OK(sq.paddr)) { + opts->sq.paddr = 0; + } + + if (FIELD_OK(sq.buffer_size)) { + opts->sq.buffer_size = 0; + } + + if (FIELD_OK(cq.vaddr)) { + opts->cq.vaddr = NULL; + } + + if (FIELD_OK(cq.paddr)) { + opts->cq.paddr = 0; + } + + if (FIELD_OK(cq.buffer_size)) { + opts->cq.buffer_size = 0; + } + + if (FIELD_OK(create_only)) { + opts->create_only = false; + } + +#undef FIELD_OK +} + +static struct spdk_nvme_qpair * +nvme_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_io_qpair_opts *opts) +{ + uint32_t qid; + struct spdk_nvme_qpair *qpair; + union spdk_nvme_cc_register cc; + + if (!ctrlr) { + return NULL; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc failed\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + if (opts->qprio & ~SPDK_NVME_CREATE_IO_SQ_QPRIO_MASK) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + /* + * Only value SPDK_NVME_QPRIO_URGENT(0) is valid for the + * default round robin arbitration method. + */ + if ((cc.bits.ams == SPDK_NVME_CC_AMS_RR) && (opts->qprio != SPDK_NVME_QPRIO_URGENT)) { + SPDK_ERRLOG("invalid queue priority for default round robin arbitration method\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + /* + * Get the first available I/O queue ID. 
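+ * Queue ID 0 is reserved for the admin queue, so the search starts at bit 1.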
+ */ + qid = spdk_bit_array_find_first_set(ctrlr->free_io_qids, 1); + if (qid > ctrlr->opts.num_io_queues) { + SPDK_ERRLOG("No free I/O queue IDs\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + qpair = nvme_transport_ctrlr_create_io_qpair(ctrlr, qid, opts); + if (qpair == NULL) { + SPDK_ERRLOG("nvme_transport_ctrlr_create_io_qpair() failed\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + spdk_bit_array_clear(ctrlr->free_io_qids, qid); + TAILQ_INSERT_TAIL(&ctrlr->active_io_qpairs, qpair, tailq); + + nvme_ctrlr_proc_add_io_qpair(qpair); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return qpair; +} + +int +spdk_nvme_ctrlr_connect_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + int rc; + + if (nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTED) { + return -EISCONN; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (ctrlr->quirks & NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC) { + spdk_delay_us(100); + } + + return rc; +} + +void +spdk_nvme_ctrlr_disconnect_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +struct spdk_nvme_qpair * +spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_io_qpair_opts *user_opts, + size_t opts_size) +{ + + struct spdk_nvme_qpair *qpair; + struct spdk_nvme_io_qpair_opts opts; + int rc; + + /* + * Get the default options, then overwrite them with the user-provided options + * up to opts_size. + * + * This allows for extensions of the opts structure without breaking + * ABI compatibility. 
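+ * Fields beyond the caller-provided opts_size keep their default values.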
+ */ + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + if (user_opts) { + memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size)); + + /* If user passes buffers, make sure they're big enough for the requested queue size */ + if (opts.sq.vaddr) { + if (opts.sq.buffer_size < (opts.io_queue_size * sizeof(struct spdk_nvme_cmd))) { + SPDK_ERRLOG("sq buffer size %lx is too small for sq size %lx\n", + opts.sq.buffer_size, (opts.io_queue_size * sizeof(struct spdk_nvme_cmd))); + return NULL; + } + } + if (opts.cq.vaddr) { + if (opts.cq.buffer_size < (opts.io_queue_size * sizeof(struct spdk_nvme_cpl))) { + SPDK_ERRLOG("cq buffer size %lx is too small for cq size %lx\n", + opts.cq.buffer_size, (opts.io_queue_size * sizeof(struct spdk_nvme_cpl))); + return NULL; + } + } + } + + qpair = nvme_ctrlr_create_io_qpair(ctrlr, &opts); + + if (qpair == NULL || opts.create_only == true) { + return qpair; + } + + rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair); + if (rc != 0) { + SPDK_ERRLOG("nvme_transport_ctrlr_connect_io_qpair() failed\n"); + nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair); + return NULL; + } + + return qpair; +} + +int +spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr; + enum nvme_qpair_state qpair_state; + int rc; + + assert(qpair != NULL); + assert(nvme_qpair_is_admin_queue(qpair) == false); + assert(qpair->ctrlr != NULL); + + ctrlr = qpair->ctrlr; + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + qpair_state = nvme_qpair_get_state(qpair); + + if (ctrlr->is_removed) { + rc = -ENODEV; + goto out; + } + + if (ctrlr->is_resetting || qpair_state == NVME_QPAIR_DISCONNECTING) { + rc = -EAGAIN; + goto out; + } + + if (ctrlr->is_failed || qpair_state == NVME_QPAIR_DESTROYING) { + rc = -ENXIO; + goto out; + } + + if (qpair_state != NVME_QPAIR_DISCONNECTED) { + rc = 0; + goto out; + } + + rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair); + if (rc) { + rc = -EAGAIN; + goto out; + } + +out: + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +spdk_nvme_qp_failure_reason +spdk_nvme_ctrlr_get_admin_qp_failure_reason(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->adminq->transport_failure_reason; +} + +/* + * This internal function will attempt to take the controller + * lock before calling disconnect on a controller qpair. + * Functions already holding the controller lock should + * call nvme_transport_ctrlr_disconnect_qpair directly. + */ +void +nvme_ctrlr_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + assert(ctrlr != NULL); + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +int +spdk_nvme_ctrlr_free_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr; + + if (qpair == NULL) { + return 0; + } + + ctrlr = qpair->ctrlr; + + if (qpair->in_completion_context) { + /* + * There are many cases where it is convenient to delete an io qpair in the context + * of that qpair's completion routine. To handle this properly, set a flag here + * so that the completion routine will perform an actual delete after the context + * unwinds. + */ + qpair->delete_after_completion_context = 1; + return 0; + } + + if (qpair->poll_group && qpair->poll_group->in_completion_context) { + /* Same as above, but in a poll group. 
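The counter tells the poll group how many qpairs still have a delete pending once its completion processing unwinds.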
*/ + qpair->poll_group->num_qpairs_to_delete++; + qpair->delete_after_completion_context = 1; + return 0; + } + + if (qpair->poll_group) { + spdk_nvme_poll_group_remove(qpair->poll_group->group, qpair); + } + + /* Do not retry. */ + nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); + + /* In the multi-process case, a process may call this function on a foreign + * I/O qpair (i.e. one that this process did not create) when that qpairs process + * exits unexpectedly. In that case, we must not try to abort any reqs associated + * with that qpair, since the callbacks will also be foreign to this process. + */ + if (qpair->active_proc == nvme_ctrlr_get_current_process(ctrlr)) { + nvme_qpair_abort_reqs(qpair, 1); + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_proc_remove_io_qpair(qpair); + + TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq); + spdk_bit_array_set(ctrlr->free_io_qids, qpair->id); + + if (nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair)) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -1; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return 0; +} + +static void +nvme_ctrlr_construct_intel_support_log_page_list(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_intel_log_page_directory *log_page_directory) +{ + if (log_page_directory == NULL) { + return; + } + + if (ctrlr->cdata.vid != SPDK_PCI_VID_INTEL) { + return; + } + + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY] = true; + + if (log_page_directory->read_latency_log_len || + (ctrlr->quirks & NVME_INTEL_QUIRK_READ_LATENCY)) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY] = true; + } + if (log_page_directory->write_latency_log_len || + (ctrlr->quirks & NVME_INTEL_QUIRK_WRITE_LATENCY)) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY] = true; + } + if (log_page_directory->temperature_statistics_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_TEMPERATURE] = true; + } + if (log_page_directory->smart_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_SMART] = true; + } + if (log_page_directory->marketing_description_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_MARKETING_DESCRIPTION] = true; + } +} + +static int nvme_ctrlr_set_intel_support_log_pages(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + struct nvme_completion_poll_status *status; + struct spdk_nvme_intel_log_page_directory *log_page_directory; + + log_page_directory = spdk_zmalloc(sizeof(struct spdk_nvme_intel_log_page_directory), + 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (log_page_directory == NULL) { + SPDK_ERRLOG("could not allocate log_page_directory\n"); + return -ENXIO; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + spdk_free(log_page_directory); + return -ENOMEM; + } + + rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY, + SPDK_NVME_GLOBAL_NS_TAG, log_page_directory, + sizeof(struct spdk_nvme_intel_log_page_directory), + 0, nvme_completion_poll_cb, status); + if (rc != 0) { + spdk_free(log_page_directory); + free(status); + return rc; + } + + if (nvme_wait_for_completion_timeout(ctrlr->adminq, status, + ctrlr->opts.admin_timeout_ms / 1000)) { + spdk_free(log_page_directory); + SPDK_WARNLOG("Intel log pages not supported on Intel drive!\n"); + if (!status->timed_out) { + free(status); + } + return 0; + } + + nvme_ctrlr_construct_intel_support_log_page_list(ctrlr, log_page_directory); + spdk_free(log_page_directory); + 
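/* The log page command completed in time, so the status tracker can be freed here. */ +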
free(status); + return 0; +} + +static int +nvme_ctrlr_set_supported_log_pages(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + + memset(ctrlr->log_page_supported, 0, sizeof(ctrlr->log_page_supported)); + /* Mandatory pages */ + ctrlr->log_page_supported[SPDK_NVME_LOG_ERROR] = true; + ctrlr->log_page_supported[SPDK_NVME_LOG_HEALTH_INFORMATION] = true; + ctrlr->log_page_supported[SPDK_NVME_LOG_FIRMWARE_SLOT] = true; + if (ctrlr->cdata.lpa.celp) { + ctrlr->log_page_supported[SPDK_NVME_LOG_COMMAND_EFFECTS_LOG] = true; + } + if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL && !(ctrlr->quirks & NVME_INTEL_QUIRK_NO_LOG_PAGES)) { + rc = nvme_ctrlr_set_intel_support_log_pages(ctrlr); + } + + return rc; +} + +static void +nvme_ctrlr_set_intel_supported_features(struct spdk_nvme_ctrlr *ctrlr) +{ + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_MAX_LBA] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_NATIVE_MAX_LBA] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_POWER_GOVERNOR_SETTING] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_SMBUS_ADDRESS] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LED_PATTERN] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_RESET_TIMED_WORKLOAD_COUNTERS] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING] = true; +} + +static void +nvme_ctrlr_set_arbitration_feature(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t cdw11; + struct nvme_completion_poll_status *status; + + if (ctrlr->opts.arbitration_burst == 0) { + return; + } + + if (ctrlr->opts.arbitration_burst > 7) { + SPDK_WARNLOG("Valid arbitration burst values are from 0-7\n"); + return; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return; + } + + cdw11 = ctrlr->opts.arbitration_burst; + + if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_WRR_SUPPORTED) { + cdw11 |= (uint32_t)ctrlr->opts.low_priority_weight << 8; + cdw11 |= (uint32_t)ctrlr->opts.medium_priority_weight << 16; + cdw11 |= (uint32_t)ctrlr->opts.high_priority_weight << 24; + } + + if (spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ARBITRATION, + cdw11, 0, NULL, 0, + nvme_completion_poll_cb, status) < 0) { + SPDK_ERRLOG("Set arbitration feature failed\n"); + free(status); + return; + } + + if (nvme_wait_for_completion_timeout(ctrlr->adminq, status, + ctrlr->opts.admin_timeout_ms / 1000)) { + SPDK_ERRLOG("Timed out setting arbitration feature\n"); + } + + if (!status->timed_out) { + free(status); + } +} + +static void +nvme_ctrlr_set_supported_features(struct spdk_nvme_ctrlr *ctrlr) +{ + memset(ctrlr->feature_supported, 0, sizeof(ctrlr->feature_supported)); + /* Mandatory features */ + ctrlr->feature_supported[SPDK_NVME_FEAT_ARBITRATION] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_POWER_MANAGEMENT] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_ERROR_RECOVERY] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_NUMBER_OF_QUEUES] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_COALESCING] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_WRITE_ATOMICITY] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION] = true; + /* Optional features */ + if (ctrlr->cdata.vwc.present) { + ctrlr->feature_supported[SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE] = true; + } + if (ctrlr->cdata.apsta.supported) { +
ctrlr->feature_supported[SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION] = true; + } + if (ctrlr->cdata.hmpre) { + ctrlr->feature_supported[SPDK_NVME_FEAT_HOST_MEM_BUFFER] = true; + } + if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL) { + nvme_ctrlr_set_intel_supported_features(ctrlr); + } + + nvme_ctrlr_set_arbitration_feature(ctrlr); +} + +bool +spdk_nvme_ctrlr_is_failed(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->is_failed; +} + +void +nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove) +{ + /* + * Set the flag here and leave the work failure of qpairs to + * spdk_nvme_qpair_process_completions(). + */ + if (hot_remove) { + ctrlr->is_removed = true; + } + ctrlr->is_failed = true; + nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq); + SPDK_ERRLOG("ctrlr %s in failed state.\n", ctrlr->trid.traddr); +} + +/** + * This public API function will try to take the controller lock. + * Any private functions being called from a thread already holding + * the ctrlr lock should call nvme_ctrlr_fail directly. + */ +void +spdk_nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_ctrlr_fail(ctrlr, false); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +static void +nvme_ctrlr_shutdown(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + uint32_t ms_waited = 0; + uint32_t shutdown_timeout_ms; + + if (ctrlr->is_removed) { + return; + } + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("ctrlr %s get_cc() failed\n", ctrlr->trid.traddr); + return; + } + + cc.bits.shn = SPDK_NVME_SHN_NORMAL; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("ctrlr %s set_cc() failed\n", ctrlr->trid.traddr); + return; + } + + /* + * The NVMe specification defines RTD3E to be the time between + * setting SHN = 1 until the controller will set SHST = 10b. + * If the device doesn't report RTD3 entry latency, or if it + * reports RTD3 entry latency less than 10 seconds, pick + * 10 seconds as a reasonable amount of time to + * wait before proceeding. 
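+ * RTD3E is reported in microseconds; below it is rounded up to milliseconds and clamped to a minimum of 10 seconds.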
+ */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RTD3E = %" PRIu32 " us\n", ctrlr->cdata.rtd3e); + shutdown_timeout_ms = (ctrlr->cdata.rtd3e + 999) / 1000; + shutdown_timeout_ms = spdk_max(shutdown_timeout_ms, 10000); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "shutdown timeout = %" PRIu32 " ms\n", shutdown_timeout_ms); + + do { + if (nvme_ctrlr_get_csts(ctrlr, &csts)) { + SPDK_ERRLOG("ctrlr %s get_csts() failed\n", ctrlr->trid.traddr); + return; + } + + if (csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ctrlr %s shutdown complete in %u milliseconds\n", + ctrlr->trid.traddr, ms_waited); + return; + } + + nvme_delay(1000); + ms_waited++; + } while (ms_waited < shutdown_timeout_ms); + + SPDK_ERRLOG("ctrlr %s did not shutdown within %u milliseconds\n", + ctrlr->trid.traddr, shutdown_timeout_ms); + if (ctrlr->quirks & NVME_QUIRK_SHST_COMPLETE) { + SPDK_ERRLOG("likely due to shutdown handling in the VMWare emulated NVMe SSD\n"); + } +} + +static int +nvme_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + int rc; + + rc = nvme_transport_ctrlr_enable(ctrlr); + if (rc != 0) { + SPDK_ERRLOG("transport ctrlr_enable failed\n"); + return rc; + } + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc() failed\n"); + return -EIO; + } + + if (cc.bits.en != 0) { + SPDK_ERRLOG("called with CC.EN = 1\n"); + return -EINVAL; + } + + cc.bits.en = 1; + cc.bits.css = 0; + cc.bits.shn = 0; + cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ + cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ + + /* Page size is 2 ^ (12 + mps). */ + cc.bits.mps = spdk_u32log2(ctrlr->page_size) - 12; + + if (ctrlr->cap.bits.css == 0) { + SPDK_INFOLOG(SPDK_LOG_NVME, + "Drive reports no command sets supported. Assuming NVM is supported.\n"); + ctrlr->cap.bits.css = SPDK_NVME_CAP_CSS_NVM; + } + + if (!(ctrlr->cap.bits.css & (1u << ctrlr->opts.command_set))) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested I/O command set %u but supported mask is 0x%x\n", + ctrlr->opts.command_set, ctrlr->cap.bits.css); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Falling back to NVM. 
Assuming NVM is supported.\n"); + ctrlr->opts.command_set = SPDK_NVME_CC_CSS_NVM; + } + + cc.bits.css = ctrlr->opts.command_set; + + switch (ctrlr->opts.arb_mechanism) { + case SPDK_NVME_CC_AMS_RR: + break; + case SPDK_NVME_CC_AMS_WRR: + if (SPDK_NVME_CAP_AMS_WRR & ctrlr->cap.bits.ams) { + break; + } + return -EINVAL; + case SPDK_NVME_CC_AMS_VS: + if (SPDK_NVME_CAP_AMS_VS & ctrlr->cap.bits.ams) { + break; + } + return -EINVAL; + default: + return -EINVAL; + } + + cc.bits.ams = ctrlr->opts.arb_mechanism; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + + return 0; +} + +static int +nvme_ctrlr_disable(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc() failed\n"); + return -EIO; + } + + if (cc.bits.en == 0) { + return 0; + } + + cc.bits.en = 0; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + + return 0; +} + +#ifdef DEBUG +static const char * +nvme_ctrlr_state_string(enum nvme_ctrlr_state state) +{ + switch (state) { + case NVME_CTRLR_STATE_INIT_DELAY: + return "delay init"; + case NVME_CTRLR_STATE_INIT: + return "init"; + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1: + return "disable and wait for CSTS.RDY = 1"; + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0: + return "disable and wait for CSTS.RDY = 0"; + case NVME_CTRLR_STATE_ENABLE: + return "enable controller by writing CC.EN = 1"; + case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1: + return "wait for CSTS.RDY = 1"; + case NVME_CTRLR_STATE_RESET_ADMIN_QUEUE: + return "reset admin queue"; + case NVME_CTRLR_STATE_IDENTIFY: + return "identify controller"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY: + return "wait for identify controller"; + case NVME_CTRLR_STATE_SET_NUM_QUEUES: + return "set number of queues"; + case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES: + return "wait for set number of queues"; + case NVME_CTRLR_STATE_CONSTRUCT_NS: + return "construct namespaces"; + case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS: + return "identify active ns"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS: + return "wait for identify active ns"; + case NVME_CTRLR_STATE_IDENTIFY_NS: + return "identify ns"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS: + return "wait for identify ns"; + case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS: + return "identify namespace id descriptors"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS: + return "wait for identify namespace id descriptors"; + case NVME_CTRLR_STATE_CONFIGURE_AER: + return "configure AER"; + case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER: + return "wait for configure aer"; + case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES: + return "set supported log pages"; + case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES: + return "set supported features"; + case NVME_CTRLR_STATE_SET_DB_BUF_CFG: + return "set doorbell buffer config"; + case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG: + return "wait for doorbell buffer config"; + case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT: + return "set keep alive timeout"; + case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT: + return "wait for set keep alive timeout"; + case NVME_CTRLR_STATE_SET_HOST_ID: + return "set host ID"; + case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID: + return "wait for set host ID"; + case NVME_CTRLR_STATE_READY: + return "ready"; + case NVME_CTRLR_STATE_ERROR: + return "error"; + } + return "unknown"; +}; +#endif /* DEBUG */ + +static void +nvme_ctrlr_set_state(struct spdk_nvme_ctrlr *ctrlr, enum 
nvme_ctrlr_state state, + uint64_t timeout_in_ms) +{ + uint64_t ticks_per_ms, timeout_in_ticks, now_ticks; + + ctrlr->state = state; + if (timeout_in_ms == NVME_TIMEOUT_INFINITE) { + goto inf; + } + + ticks_per_ms = spdk_get_ticks_hz() / 1000; + if (timeout_in_ms > UINT64_MAX / ticks_per_ms) { + SPDK_ERRLOG("Specified timeout would cause integer overflow. Defaulting to no timeout.\n"); + goto inf; + } + + now_ticks = spdk_get_ticks(); + timeout_in_ticks = timeout_in_ms * ticks_per_ms; + if (timeout_in_ticks > UINT64_MAX - now_ticks) { + SPDK_ERRLOG("Specified timeout would cause integer overflow. Defaulting to no timeout.\n"); + goto inf; + } + + ctrlr->state_timeout_tsc = timeout_in_ticks + now_ticks; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (timeout %" PRIu64 " ms)\n", + nvme_ctrlr_state_string(ctrlr->state), timeout_in_ms); + return; +inf: + SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (no timeout)\n", + nvme_ctrlr_state_string(ctrlr->state)); + ctrlr->state_timeout_tsc = NVME_TIMEOUT_INFINITE; +} + +static void +nvme_ctrlr_free_doorbell_buffer(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->shadow_doorbell) { + spdk_free(ctrlr->shadow_doorbell); + ctrlr->shadow_doorbell = NULL; + } + + if (ctrlr->eventidx) { + spdk_free(ctrlr->eventidx); + ctrlr->eventidx = NULL; + } +} + +static void +nvme_ctrlr_set_doorbell_buffer_config_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_WARNLOG("Doorbell buffer config failed\n"); + } else { + SPDK_INFOLOG(SPDK_LOG_NVME, "NVMe controller: %s doorbell buffer config enabled\n", + ctrlr->trid.traddr); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_set_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + uint64_t prp1, prp2, len; + + if (!ctrlr->cdata.oacs.doorbell_buffer_config) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + /* only 1 page size for doorbell buffer */ + ctrlr->shadow_doorbell = spdk_zmalloc(ctrlr->page_size, ctrlr->page_size, + NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE); + if (ctrlr->shadow_doorbell == NULL) { + rc = -ENOMEM; + goto error; + } + + len = ctrlr->page_size; + prp1 = spdk_vtophys(ctrlr->shadow_doorbell, &len); + if (prp1 == SPDK_VTOPHYS_ERROR || len != ctrlr->page_size) { + rc = -EFAULT; + goto error; + } + + ctrlr->eventidx = spdk_zmalloc(ctrlr->page_size, ctrlr->page_size, + NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE); + if (ctrlr->eventidx == NULL) { + rc = -ENOMEM; + goto error; + } + + len = ctrlr->page_size; + prp2 = spdk_vtophys(ctrlr->eventidx, &len); + if (prp2 == SPDK_VTOPHYS_ERROR || len != ctrlr->page_size) { + rc = -EFAULT; + goto error; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_doorbell_buffer_config(ctrlr, prp1, prp2, + nvme_ctrlr_set_doorbell_buffer_config_done, ctrlr); + if (rc != 0) { + goto error; + } + + return 0; + +error: + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + nvme_ctrlr_free_doorbell_buffer(ctrlr); + return rc; +} + +static void 
+nvme_ctrlr_abort_queued_aborts(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_request *req, *tmp; + struct spdk_nvme_cpl cpl = {}; + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + STAILQ_FOREACH_SAFE(req, &ctrlr->queued_aborts, stailq, tmp) { + STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); + + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, &cpl); + nvme_free_request(req); + } +} + +int +spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + struct spdk_nvme_qpair *qpair; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->is_resetting || ctrlr->is_removed) { + /* + * Controller is already resetting or has been removed. Return + * immediately since there is no need to kick off another + * reset in these cases. + */ + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return ctrlr->is_resetting ? 0 : -ENXIO; + } + + ctrlr->is_resetting = true; + ctrlr->is_failed = false; + + SPDK_NOTICELOG("resetting controller\n"); + + /* Abort all of the queued abort requests */ + nvme_ctrlr_abort_queued_aborts(ctrlr); + + nvme_transport_admin_qpair_abort_aers(ctrlr->adminq); + + /* Disable all queues before disabling the controller hardware. */ + TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + } + + ctrlr->adminq->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq); + if (nvme_transport_ctrlr_connect_qpair(ctrlr, ctrlr->adminq) != 0) { + SPDK_ERRLOG("Controller reinitialization failed.\n"); + rc = -1; + goto out; + } + + /* Doorbell buffer config is invalid during reset */ + nvme_ctrlr_free_doorbell_buffer(ctrlr); + + /* Set the state back to INIT to cause a full hardware reset. */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE); + + nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_ENABLED); + while (ctrlr->state != NVME_CTRLR_STATE_READY) { + if (nvme_ctrlr_process_init(ctrlr) != 0) { + SPDK_ERRLOG("controller reinitialization failed\n"); + rc = -1; + break; + } + } + + /* + * For PCIe controllers, the memory locations of the transport qpairs + * don't change when the controller is reset. They simply need to be + * re-enabled with admin commands to the controller. For fabric + * controllers we need to disconnect and reconnect the qpair on its + * own thread outside of the context of the reset. + */ + if (rc == 0 && ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + /* Reinitialize qpairs */ + TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { + if (nvme_transport_ctrlr_connect_qpair(ctrlr, qpair) != 0) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + rc = -1; + continue; + } + } + } + +out: + if (rc) { + nvme_ctrlr_fail(ctrlr, false); + } + ctrlr->is_resetting = false; + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (!ctrlr->cdata.oaes.ns_attribute_notices) { + /* + * If the controller doesn't support ns_attribute_notices and + * namespace attributes change (e.g. the number of namespaces), + * we need to update the rest of the system while handling the device reset.
+ */ + nvme_io_msg_ctrlr_update(ctrlr); + } + + return rc; +} + +int +spdk_nvme_ctrlr_set_trid(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_transport_id *trid) +{ + int rc = 0; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->is_failed == false) { + rc = -EPERM; + goto out; + } + + if (trid->trtype != ctrlr->trid.trtype) { + rc = -EINVAL; + goto out; + } + + if (strncmp(trid->subnqn, ctrlr->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { + rc = -EINVAL; + goto out; + } + + ctrlr->trid = *trid; + +out: + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +static void +nvme_ctrlr_identify_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("nvme_identify_controller failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + /* + * Use MDTS to ensure our default max_xfer_size doesn't exceed what the + * controller supports. + */ + ctrlr->max_xfer_size = nvme_transport_ctrlr_get_max_xfer_size(ctrlr); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_xfer_size %u\n", ctrlr->max_xfer_size); + if (ctrlr->cdata.mdts > 0) { + ctrlr->max_xfer_size = spdk_min(ctrlr->max_xfer_size, + ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts))); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "MDTS max_xfer_size %u\n", ctrlr->max_xfer_size); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cdata.cntlid); + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + ctrlr->cntlid = ctrlr->cdata.cntlid; + } else { + /* + * Fabrics controllers should already have CNTLID from the Connect command. + * + * If CNTLID from Connect doesn't match CNTLID in the Identify Controller data, + * trust the one from Connect. + */ + if (ctrlr->cntlid != ctrlr->cdata.cntlid) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, + "Identify CNTLID 0x%04" PRIx16 " != Connect CNTLID 0x%04" PRIx16 "\n", + ctrlr->cdata.cntlid, ctrlr->cntlid); + } + } + + if (ctrlr->cdata.sgls.supported) { + assert(ctrlr->cdata.sgls.supported != 0x3); + ctrlr->flags |= SPDK_NVME_CTRLR_SGL_SUPPORTED; + if (ctrlr->cdata.sgls.supported == 0x2) { + ctrlr->flags |= SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT; + } + /* + * Use MSDBD to ensure our max_sges doesn't exceed what the + * controller supports. + */ + ctrlr->max_sges = nvme_transport_ctrlr_get_max_sges(ctrlr); + if (ctrlr->cdata.nvmf_specific.msdbd != 0) { + ctrlr->max_sges = spdk_min(ctrlr->cdata.nvmf_specific.msdbd, ctrlr->max_sges); + } else { + /* A value 0 indicates no limit. 
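The transport-reported max_sges is used as-is in that case.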
*/ + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_sges %u\n", ctrlr->max_sges); + } + + if (ctrlr->cdata.oacs.security && !(ctrlr->quirks & NVME_QUIRK_OACS_SECURITY)) { + ctrlr->flags |= SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "fuses compare and write: %d\n", ctrlr->cdata.fuses.compare_and_write); + if (ctrlr->cdata.fuses.compare_and_write) { + ctrlr->flags |= SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_NUM_QUEUES, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_identify(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0, + &ctrlr->cdata, sizeof(ctrlr->cdata), + nvme_ctrlr_identify_done, ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +enum nvme_active_ns_state { + NVME_ACTIVE_NS_STATE_IDLE, + NVME_ACTIVE_NS_STATE_PROCESSING, + NVME_ACTIVE_NS_STATE_DONE, + NVME_ACTIVE_NS_STATE_ERROR +}; + +typedef void (*nvme_active_ns_ctx_deleter)(struct nvme_active_ns_ctx *); + +struct nvme_active_ns_ctx { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t page; + uint32_t num_pages; + uint32_t next_nsid; + uint32_t *new_ns_list; + nvme_active_ns_ctx_deleter deleter; + + enum nvme_active_ns_state state; +}; + +static struct nvme_active_ns_ctx * +nvme_active_ns_ctx_create(struct spdk_nvme_ctrlr *ctrlr, nvme_active_ns_ctx_deleter deleter) +{ + struct nvme_active_ns_ctx *ctx; + uint32_t num_pages = 0; + uint32_t *new_ns_list = NULL; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + SPDK_ERRLOG("Failed to allocate nvme_active_ns_ctx!\n"); + return NULL; + } + + if (ctrlr->num_ns) { + /* The allocated size must be a multiple of sizeof(struct spdk_nvme_ns_list) */ + num_pages = (ctrlr->num_ns * sizeof(new_ns_list[0]) - 1) / sizeof(struct spdk_nvme_ns_list) + 1; + new_ns_list = spdk_zmalloc(num_pages * sizeof(struct spdk_nvme_ns_list), ctrlr->page_size, + NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE); + if (!new_ns_list) { + SPDK_ERRLOG("Failed to allocate active_ns_list!\n"); + free(ctx); + return NULL; + } + } + + ctx->num_pages = num_pages; + ctx->new_ns_list = new_ns_list; + ctx->ctrlr = ctrlr; + ctx->deleter = deleter; + + return ctx; +} + +static void +nvme_active_ns_ctx_destroy(struct nvme_active_ns_ctx *ctx) +{ + spdk_free(ctx->new_ns_list); + free(ctx); +} + +static void +nvme_ctrlr_identify_active_ns_swap(struct spdk_nvme_ctrlr *ctrlr, uint32_t **new_ns_list) +{ + spdk_free(ctrlr->active_ns_list); + ctrlr->active_ns_list = *new_ns_list; + *new_ns_list = NULL; +} + +static void +nvme_ctrlr_identify_active_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_active_ns_ctx *ctx = arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + ctx->state = NVME_ACTIVE_NS_STATE_ERROR; + goto out; + } + + ctx->next_nsid = ctx->new_ns_list[1024 * ctx->page + 1023]; + if (ctx->next_nsid == 0 || ++ctx->page == ctx->num_pages) { + ctx->state = NVME_ACTIVE_NS_STATE_DONE; + goto out; + } + + nvme_ctrlr_identify_active_ns_async(ctx); + return; + +out: + if (ctx->deleter) { + ctx->deleter(ctx); + } +} + +static void +nvme_ctrlr_identify_active_ns_async(struct nvme_active_ns_ctx *ctx) +{ + struct spdk_nvme_ctrlr *ctrlr = ctx->ctrlr; + uint32_t i; + int rc; + + if (ctrlr->num_ns == 0) { + ctx->state = NVME_ACTIVE_NS_STATE_DONE; + 
goto out; + } + + /* + * If controller doesn't support active ns list CNS 0x02 dummy up + * an active ns list, i.e. all namespaces report as active + */ + if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 1, 0) || ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS) { + for (i = 0; i < ctrlr->num_ns; i++) { + ctx->new_ns_list[i] = i + 1; + } + + ctx->state = NVME_ACTIVE_NS_STATE_DONE; + goto out; + } + + ctx->state = NVME_ACTIVE_NS_STATE_PROCESSING; + rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST, 0, ctx->next_nsid, + &ctx->new_ns_list[1024 * ctx->page], sizeof(struct spdk_nvme_ns_list), + nvme_ctrlr_identify_active_ns_async_done, ctx); + if (rc != 0) { + ctx->state = NVME_ACTIVE_NS_STATE_ERROR; + goto out; + } + + return; + +out: + if (ctx->deleter) { + ctx->deleter(ctx); + } +} + +static void +_nvme_active_ns_ctx_deleter(struct nvme_active_ns_ctx *ctx) +{ + struct spdk_nvme_ctrlr *ctrlr = ctx->ctrlr; + + if (ctx->state == NVME_ACTIVE_NS_STATE_ERROR) { + nvme_ctrlr_destruct_namespaces(ctrlr); + nvme_active_ns_ctx_destroy(ctx); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + assert(ctx->state == NVME_ACTIVE_NS_STATE_DONE); + nvme_ctrlr_identify_active_ns_swap(ctrlr, &ctx->new_ns_list); + nvme_active_ns_ctx_destroy(ctx); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_NS, ctrlr->opts.admin_timeout_ms); +} + +static void +_nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_active_ns_ctx *ctx; + + ctx = nvme_active_ns_ctx_create(ctrlr, _nvme_active_ns_ctx_deleter); + if (!ctx) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS, + ctrlr->opts.admin_timeout_ms); + nvme_ctrlr_identify_active_ns_async(ctx); +} + +int +nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_active_ns_ctx *ctx; + int rc; + + ctx = nvme_active_ns_ctx_create(ctrlr, NULL); + if (!ctx) { + return -ENOMEM; + } + + nvme_ctrlr_identify_active_ns_async(ctx); + while (ctx->state == NVME_ACTIVE_NS_STATE_PROCESSING) { + rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + if (rc < 0) { + ctx->state = NVME_ACTIVE_NS_STATE_ERROR; + break; + } + } + + if (ctx->state == NVME_ACTIVE_NS_STATE_ERROR) { + nvme_active_ns_ctx_destroy(ctx); + return -ENXIO; + } + + assert(ctx->state == NVME_ACTIVE_NS_STATE_DONE); + nvme_ctrlr_identify_active_ns_swap(ctrlr, &ctx->new_ns_list); + nvme_active_ns_ctx_destroy(ctx); + + return 0; +} + +static void +nvme_ctrlr_identify_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg; + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + uint32_t nsid; + int rc; + + if (spdk_nvme_cpl_is_error(cpl)) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } else { + nvme_ns_set_identify_data(ns); + } + + /* move on to the next active NS */ + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ID_DESCS, + ctrlr->opts.admin_timeout_ms); + return; + } + ns->ctrlr = ctrlr; + ns->id = nsid; + + rc = nvme_ctrlr_identify_ns_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } +} + +static int +nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + struct 
spdk_nvme_ns_data *nsdata; + + nsdata = &ctrlr->nsdata[ns->id - 1]; + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS, + ctrlr->opts.admin_timeout_ms); + return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id, + nsdata, sizeof(*nsdata), + nvme_ctrlr_identify_ns_async_done, ns); +} + +static int +nvme_ctrlr_identify_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t nsid; + struct spdk_nvme_ns *ns; + int rc; + + nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + /* No active NS, move on to the next state */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + ns->ctrlr = ctrlr; + ns->id = nsid; + + rc = nvme_ctrlr_identify_ns_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } + + return rc; +} + +static void +nvme_ctrlr_identify_id_desc_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg; + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + uint32_t nsid; + int rc; + + if (spdk_nvme_cpl_is_error(cpl)) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return; + } + + /* move on to the next active NS */ + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return; + } + + rc = nvme_ctrlr_identify_id_desc_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } +} + +static int +nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS, + ctrlr->opts.admin_timeout_ms); + return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, + 0, ns->id, ns->id_desc_list, sizeof(ns->id_desc_list), + nvme_ctrlr_identify_id_desc_async_done, ns); +} + +static int +nvme_ctrlr_identify_id_desc_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t nsid; + struct spdk_nvme_ns *ns; + int rc; + + if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + (ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + /* No active NS, move on to the next state */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + rc = nvme_ctrlr_identify_id_desc_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } + + return rc; +} + +static void +nvme_ctrlr_update_nvmf_ioccsz(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA || + ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP || + ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_FC) { + if (ctrlr->cdata.nvmf_specific.ioccsz < 4) { + SPDK_ERRLOG("Incorrect IOCCSZ %u, the minimum value should be 4\n", + ctrlr->cdata.nvmf_specific.ioccsz); + ctrlr->cdata.nvmf_specific.ioccsz = 4; + assert(0); + } + 
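+		/*
+		 * IOCCSZ is reported in 16-byte units, so the spec minimum of 4 equals a
+		 * single 64-byte SQE with no room for in-capsule data. Subtracting
+		 * sizeof(struct spdk_nvme_cmd) below yields the usable in-capsule data size.
+		 */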
ctrlr->ioccsz_bytes = ctrlr->cdata.nvmf_specific.ioccsz * 16 - sizeof(struct spdk_nvme_cmd); + ctrlr->icdoff = ctrlr->cdata.nvmf_specific.icdoff; + } +} + +static void +nvme_ctrlr_set_num_queues_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + uint32_t cq_allocated, sq_allocated, min_allocated, i; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("Set Features - Number of Queues failed!\n"); + ctrlr->opts.num_io_queues = 0; + } else { + /* + * Data in cdw0 is 0-based. + * Lower 16-bits indicate number of submission queues allocated. + * Upper 16-bits indicate number of completion queues allocated. + */ + sq_allocated = (cpl->cdw0 & 0xFFFF) + 1; + cq_allocated = (cpl->cdw0 >> 16) + 1; + + /* + * For 1:1 queue mapping, set number of allocated queues to be minimum of + * submission and completion queues. + */ + min_allocated = spdk_min(sq_allocated, cq_allocated); + + /* Set number of queues to be minimum of requested and actually allocated. */ + ctrlr->opts.num_io_queues = spdk_min(min_allocated, ctrlr->opts.num_io_queues); + } + + ctrlr->free_io_qids = spdk_bit_array_create(ctrlr->opts.num_io_queues + 1); + if (ctrlr->free_io_qids == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + /* Initialize list of free I/O queue IDs. QID 0 is the admin queue. */ + spdk_bit_array_clear(ctrlr->free_io_qids, 0); + for (i = 1; i <= ctrlr->opts.num_io_queues; i++) { + spdk_bit_array_set(ctrlr->free_io_qids, i); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONSTRUCT_NS, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_set_num_queues(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->opts.num_io_queues > SPDK_NVME_MAX_IO_QUEUES) { + SPDK_NOTICELOG("Limiting requested num_io_queues %u to max %d\n", + ctrlr->opts.num_io_queues, SPDK_NVME_MAX_IO_QUEUES); + ctrlr->opts.num_io_queues = SPDK_NVME_MAX_IO_QUEUES; + } else if (ctrlr->opts.num_io_queues < 1) { + SPDK_NOTICELOG("Requested num_io_queues 0, increasing to 1\n"); + ctrlr->opts.num_io_queues = 1; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->opts.num_io_queues, + nvme_ctrlr_set_num_queues_done, ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_set_keep_alive_timeout_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + uint32_t keep_alive_interval_ms; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + if ((cpl->status.sct == SPDK_NVME_SCT_GENERIC) && + (cpl->status.sc == SPDK_NVME_SC_INVALID_FIELD)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Keep alive timeout Get Feature is not supported\n"); + } else { + SPDK_ERRLOG("Keep alive timeout Get Feature failed: SC %x SCT %x\n", + cpl->status.sc, cpl->status.sct); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + } else { + if (ctrlr->opts.keep_alive_timeout_ms != cpl->cdw0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller adjusted keep alive timeout to %u ms\n", + cpl->cdw0); + } + + ctrlr->opts.keep_alive_timeout_ms = cpl->cdw0; + } + + keep_alive_interval_ms = ctrlr->opts.keep_alive_timeout_ms / 2; + if (keep_alive_interval_ms == 0) { + keep_alive_interval_ms = 1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, 
"Sending keep alive every %u ms\n", keep_alive_interval_ms); + + ctrlr->keep_alive_interval_ticks = (keep_alive_interval_ms * spdk_get_ticks_hz()) / UINT64_C(1000); + + /* Schedule the first Keep Alive to be sent as soon as possible. */ + ctrlr->next_keep_alive_tick = spdk_get_ticks(); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_set_keep_alive_timeout(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->opts.keep_alive_timeout_ms == 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + if (ctrlr->cdata.kas == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller KAS is 0 - not enabling Keep Alive\n"); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); + + /* Retrieve actual keep alive timeout, since the controller may have adjusted it. */ + rc = spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, 0, NULL, 0, + nvme_ctrlr_set_keep_alive_timeout_done, ctrlr); + if (rc != 0) { + SPDK_ERRLOG("Keep alive timeout Get Feature failed: %d\n", rc); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_set_host_id_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + /* + * Treat Set Features - Host ID failure as non-fatal, since the Host ID feature + * is optional. + */ + SPDK_WARNLOG("Set Features - Host ID failed: SC 0x%x SCT 0x%x\n", + cpl->status.sc, cpl->status.sct); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Set Features - Host ID was successful\n"); + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_set_host_id(struct spdk_nvme_ctrlr *ctrlr) +{ + uint8_t *host_id; + uint32_t host_id_size; + int rc; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + /* + * NVMe-oF sends the host ID during Connect and doesn't allow + * Set Features - Host Identifier after Connect, so we don't need to do anything here. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "NVMe-oF transport - not sending Set Features - Host ID\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); + return 0; + } + + if (ctrlr->cdata.ctratt.host_id_exhid_supported) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 128-bit extended host identifier\n"); + host_id = ctrlr->opts.extended_host_id; + host_id_size = sizeof(ctrlr->opts.extended_host_id); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 64-bit host identifier\n"); + host_id = ctrlr->opts.host_id; + host_id_size = sizeof(ctrlr->opts.host_id); + } + + /* If the user specified an all-zeroes host identifier, don't send the command. 
*/ + if (spdk_mem_all_zero(host_id, host_id_size)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, + "User did not specify host ID - not sending Set Features - Host ID\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); + return 0; + } + + SPDK_LOGDUMP(SPDK_LOG_NVME, "host_id", host_id, host_id_size); + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_HOST_ID, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_set_host_id(ctrlr, host_id, host_id_size, nvme_ctrlr_set_host_id_done, ctrlr); + if (rc != 0) { + SPDK_ERRLOG("Set Features - Host ID failed: %d\n", rc); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->ns) { + uint32_t i, num_ns = ctrlr->num_ns; + + for (i = 0; i < num_ns; i++) { + nvme_ns_destruct(&ctrlr->ns[i]); + } + + spdk_free(ctrlr->ns); + ctrlr->ns = NULL; + ctrlr->num_ns = 0; + } + + if (ctrlr->nsdata) { + spdk_free(ctrlr->nsdata); + ctrlr->nsdata = NULL; + } + + spdk_free(ctrlr->active_ns_list); + ctrlr->active_ns_list = NULL; +} + +static void +nvme_ctrlr_update_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t i, nn = ctrlr->cdata.nn; + struct spdk_nvme_ns_data *nsdata; + bool ns_is_active; + + for (i = 0; i < nn; i++) { + struct spdk_nvme_ns *ns = &ctrlr->ns[i]; + uint32_t nsid = i + 1; + + nsdata = &ctrlr->nsdata[nsid - 1]; + ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); + + if (nsdata->ncap && ns_is_active) { + if (nvme_ns_update(ns) != 0) { + SPDK_ERRLOG("Failed to update active NS %u\n", nsid); + continue; + } + } + + if ((nsdata->ncap == 0) && ns_is_active) { + if (nvme_ns_construct(ns, nsid, ctrlr) != 0) { + continue; + } + } + + if (nsdata->ncap && !ns_is_active) { + nvme_ns_destruct(ns); + } + } +} + +static int +nvme_ctrlr_construct_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + uint32_t nn = ctrlr->cdata.nn; + + /* ctrlr->num_ns may be 0 (startup) or a different number of namespaces (reset), + * so check if we need to reallocate. + */ + if (nn != ctrlr->num_ns) { + nvme_ctrlr_destruct_namespaces(ctrlr); + + if (nn == 0) { + SPDK_WARNLOG("controller has 0 namespaces\n"); + return 0; + } + + ctrlr->ns = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (ctrlr->ns == NULL) { + rc = -ENOMEM; + goto fail; + } + + ctrlr->nsdata = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns_data), 64, + NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_SHARE | SPDK_MALLOC_DMA); + if (ctrlr->nsdata == NULL) { + rc = -ENOMEM; + goto fail; + } + + ctrlr->num_ns = nn; + } + + return 0; + +fail: + nvme_ctrlr_destruct_namespaces(ctrlr); + return rc; +} + +static void +nvme_ctrlr_async_event_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_async_event_request *aer = arg; + struct spdk_nvme_ctrlr *ctrlr = aer->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + union spdk_nvme_async_event_completion event; + int rc; + + if (cpl->status.sct == SPDK_NVME_SCT_GENERIC && + cpl->status.sc == SPDK_NVME_SC_ABORTED_SQ_DELETION) { + /* + * This is simulated when controller is being shut down, to + * effectively abort outstanding asynchronous event requests + * and make sure all memory is freed. Do not repost the + * request in this case. 
+ */ + return; + } + + if (cpl->status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC && + cpl->status.sc == SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED) { + /* + * SPDK will only send as many AERs as the device says it supports, + * so this status code indicates an out-of-spec device. Do not repost + * the request in this case. + */ + SPDK_ERRLOG("Controller appears out-of-spec for asynchronous event request\n" + "handling. Do not repost this AER.\n"); + return; + } + + event.raw = cpl->cdw0; + if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && + (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { + rc = nvme_ctrlr_identify_active_ns(ctrlr); + if (rc) { + return; + } + nvme_ctrlr_update_namespaces(ctrlr); + nvme_io_msg_ctrlr_update(ctrlr); + } + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc && active_proc->aer_cb_fn) { + active_proc->aer_cb_fn(active_proc->aer_cb_arg, cpl); + } + + /* If the ctrlr was removed or in the destruct state, we should not send aer again */ + if (ctrlr->is_removed || ctrlr->is_destructed) { + return; + } + + /* + * Repost another asynchronous event request to replace the one + * that just completed. + */ + if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer)) { + /* + * We can't do anything to recover from a failure here, + * so just print a warning message and leave the AER unsubmitted. + */ + SPDK_ERRLOG("resubmitting AER failed!\n"); + } +} + +static int +nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_async_event_request *aer) +{ + struct nvme_request *req; + + aer->ctrlr = ctrlr; + req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_async_event_cb, aer); + aer->req = req; + if (req == NULL) { + return -1; + } + + req->cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static void +nvme_ctrlr_configure_aer_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_async_event_request *aer; + int rc; + uint32_t i; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_NOTICELOG("nvme_ctrlr_configure_aer failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, + ctrlr->opts.admin_timeout_ms); + return; + } + + /* aerl is a zero-based value, so we need to add 1 here. 
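+	 * For example, an aerl of 3 means the controller allows up to 4 outstanding
+	 * AERs; the value is additionally capped to NVME_MAX_ASYNC_EVENTS below.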
*/ + ctrlr->num_aers = spdk_min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl + 1)); + + for (i = 0; i < ctrlr->num_aers; i++) { + aer = &ctrlr->aer[i]; + rc = nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); + if (rc) { + SPDK_ERRLOG("nvme_ctrlr_construct_and_submit_aer failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_configure_aer(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_feat_async_event_configuration config; + int rc; + + config.raw = 0; + config.bits.crit_warn.bits.available_spare = 1; + config.bits.crit_warn.bits.temperature = 1; + config.bits.crit_warn.bits.device_reliability = 1; + config.bits.crit_warn.bits.read_only = 1; + config.bits.crit_warn.bits.volatile_memory_backup = 1; + + if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 2, 0)) { + if (ctrlr->cdata.oaes.ns_attribute_notices) { + config.bits.ns_attr_notice = 1; + } + if (ctrlr->cdata.oaes.fw_activation_notices) { + config.bits.fw_activation_notice = 1; + } + } + if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 3, 0) && ctrlr->cdata.lpa.telemetry) { + config.bits.telemetry_log_notice = 1; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_set_async_event_config(ctrlr, config, + nvme_ctrlr_configure_aer_done, + ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +struct spdk_nvme_ctrlr_process * +nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, pid_t pid) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) { + if (active_proc->pid == pid) { + return active_proc; + } + } + + return NULL; +} + +struct spdk_nvme_ctrlr_process * +nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr) +{ + return nvme_ctrlr_get_process(ctrlr, getpid()); +} + +/** + * This function will be called when a process is using the controller. + * 1. For the primary process, it is called when constructing the controller. + * 2. For the secondary process, it is called at probing the controller. + * Note: will check whether the process is already added for the same process. + */ +int +nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle) +{ + struct spdk_nvme_ctrlr_process *ctrlr_proc; + pid_t pid = getpid(); + + /* Check whether the process is already added or not */ + if (nvme_ctrlr_get_process(ctrlr, pid)) { + return 0; + } + + /* Initialize the per process properties for this ctrlr */ + ctrlr_proc = spdk_zmalloc(sizeof(struct spdk_nvme_ctrlr_process), + 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (ctrlr_proc == NULL) { + SPDK_ERRLOG("failed to allocate memory to track the process props\n"); + + return -1; + } + + ctrlr_proc->is_primary = spdk_process_is_primary(); + ctrlr_proc->pid = pid; + STAILQ_INIT(&ctrlr_proc->active_reqs); + ctrlr_proc->devhandle = devhandle; + ctrlr_proc->ref = 0; + TAILQ_INIT(&ctrlr_proc->allocated_io_qpairs); + + TAILQ_INSERT_TAIL(&ctrlr->active_procs, ctrlr_proc, tailq); + + return 0; +} + +/** + * This function will be called when the process detaches the controller. + * Note: the ctrlr_lock must be held when calling this function. 
+ */ +static void +nvme_ctrlr_remove_process(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_ctrlr_process *proc) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + assert(STAILQ_EMPTY(&proc->active_reqs)); + + TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + TAILQ_REMOVE(&ctrlr->active_procs, proc, tailq); + + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + spdk_pci_device_detach(proc->devhandle); + } + + spdk_free(proc); +} + +/** + * This function will be called when the process exited unexpectedly + * in order to free any incomplete nvme request, allocated IO qpairs + * and allocated memory. + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_ctrlr_cleanup_process(struct spdk_nvme_ctrlr_process *proc) +{ + struct nvme_request *req, *tmp_req; + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { + STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); + + assert(req->pid == proc->pid); + + nvme_free_request(req); + } + + TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) { + TAILQ_REMOVE(&proc->allocated_io_qpairs, qpair, per_process_tailq); + + /* + * The process may have been killed while some qpairs were in their + * completion context. Clear that flag here to allow these IO + * qpairs to be deleted. + */ + qpair->in_completion_context = 0; + + qpair->no_deletion_notification_needed = 1; + + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + spdk_free(proc); +} + +/** + * This function will be called when destructing the controller. + * 1. There is no more admin request on this controller. + * 2. Clean up any left resource allocation when its associated process is gone. + */ +void +nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc, *tmp; + + /* Free all the processes' properties and make sure no pending admin IOs */ + TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) { + TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq); + + assert(STAILQ_EMPTY(&active_proc->active_reqs)); + + spdk_free(active_proc); + } +} + +/** + * This function will be called when any other process attaches or + * detaches the controller in order to cleanup those unexpectedly + * terminated processes. + * Note: the ctrlr_lock must be held when calling this function. 
+ */ +static int +nvme_ctrlr_remove_inactive_proc(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc, *tmp; + int active_proc_count = 0; + + TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) { + if ((kill(active_proc->pid, 0) == -1) && (errno == ESRCH)) { + SPDK_ERRLOG("process %d terminated unexpected\n", active_proc->pid); + + TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq); + + nvme_ctrlr_cleanup_process(active_proc); + } else { + active_proc_count++; + } + } + + return active_proc_count; +} + +void +nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_remove_inactive_proc(ctrlr); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->ref++; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +void +nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + int proc_count; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + proc_count = nvme_ctrlr_remove_inactive_proc(ctrlr); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->ref--; + assert(active_proc->ref >= 0); + + /* + * The last active process will be removed at the end of + * the destruction of the controller. + */ + if (active_proc->ref == 0 && proc_count != 1) { + nvme_ctrlr_remove_process(ctrlr, active_proc); + } + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +int +nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + int ref = 0; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_remove_inactive_proc(ctrlr); + + TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) { + ref += active_proc->ref; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return ref; +} + +/** + * Get the PCI device handle which is only visible to its associated process. + */ +struct spdk_pci_device * +nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_pci_device *devhandle = NULL; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + devhandle = active_proc->devhandle; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return devhandle; +} + +/** + * This function will be called repeatedly during initialization until the controller is ready. + */ +int +nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + uint32_t ready_timeout_in_ms; + int rc = 0; + + /* + * May need to avoid accessing any register on the target controller + * for a while. Return early without touching the FSM. + * Check sleep_timeout_tsc > 0 for unit test. + */ + if ((ctrlr->sleep_timeout_tsc > 0) && + (spdk_get_ticks() <= ctrlr->sleep_timeout_tsc)) { + return 0; + } + ctrlr->sleep_timeout_tsc = 0; + + if (nvme_ctrlr_get_cc(ctrlr, &cc) || + nvme_ctrlr_get_csts(ctrlr, &csts)) { + if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE) { + /* While a device is resetting, it may be unable to service MMIO reads + * temporarily. Allow for this case. 
+ */ + SPDK_ERRLOG("Get registers failed while waiting for CSTS.RDY == 0\n"); + goto init_timeout; + } + SPDK_ERRLOG("Failed to read CC and CSTS in state %d\n", ctrlr->state); + return -EIO; + } + + ready_timeout_in_ms = 500 * ctrlr->cap.bits.to; + + /* + * Check if the current initialization step is done or has timed out. + */ + switch (ctrlr->state) { + case NVME_CTRLR_STATE_INIT_DELAY: + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, ready_timeout_in_ms); + if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_INIT) { + /* + * Controller may need some delay before it's enabled. + * + * This is a workaround for an issue where the PCIe-attached NVMe controller + * is not ready after VFIO reset. We delay the initialization rather than the + * enabling itself, because this is required only for the very first enabling + * - directly after a VFIO reset. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Adding 2 second delay before initializing the controller\n"); + ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2000 * spdk_get_ticks_hz() / 1000); + } + break; + + case NVME_CTRLR_STATE_INIT: + /* Begin the hardware initialization by making sure the controller is disabled. */ + if (cc.bits.en) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1\n"); + /* + * Controller is currently enabled. We need to disable it to cause a reset. + * + * If CC.EN = 1 && CSTS.RDY = 0, the controller is in the process of becoming ready. + * Wait for the ready bit to be 1 before disabling the controller. + */ + if (csts.bits.rdy == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 0 - waiting for reset to complete\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1, ready_timeout_in_ms); + return 0; + } + + /* CC.EN = 1 && CSTS.RDY == 1, so we can immediately disable the controller. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n"); + cc.bits.en = 0; + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + + /* + * Wait 2.5 seconds before accessing PCI registers. + * Not using sleep() to avoid blocking other controller's initialization. + */ + if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Applying quirk: delay 2.5 seconds before reading registers\n"); + ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2500 * spdk_get_ticks_hz() / 1000); + } + return 0; + } else { + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 1 - waiting for shutdown to complete\n"); + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1: + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - disabling controller\n"); + /* CC.EN = 1 && CSTS.RDY = 1, so we can set CC.EN = 0 now. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n"); + cc.bits.en = 0; + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0: + if (csts.bits.rdy == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 0\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE, ready_timeout_in_ms); + /* + * Delay 100us before setting CC.EN = 1. 
Some NVMe SSDs miss CC.EN getting + * set to 1 if it is too soon after CSTS.RDY is reported as 0. + */ + spdk_delay_us(100); + return 0; + } + break; + + case NVME_CTRLR_STATE_ENABLE: + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 1\n"); + rc = nvme_ctrlr_enable(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1, ready_timeout_in_ms); + return rc; + + case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1: + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - controller is ready\n"); + /* + * The controller has been enabled. + * Perform the rest of initialization serially. + */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_RESET_ADMIN_QUEUE, + ctrlr->opts.admin_timeout_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_RESET_ADMIN_QUEUE: + nvme_transport_qpair_reset(ctrlr->adminq); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_IDENTIFY: + rc = nvme_ctrlr_identify(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_NUM_QUEUES: + nvme_ctrlr_update_nvmf_ioccsz(ctrlr); + rc = nvme_ctrlr_set_num_queues(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_CONSTRUCT_NS: + rc = nvme_ctrlr_construct_namespaces(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS: + _nvme_ctrlr_identify_active_ns(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_IDENTIFY_NS: + rc = nvme_ctrlr_identify_namespaces(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS: + rc = nvme_ctrlr_identify_id_desc_namespaces(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_CONFIGURE_AER: + rc = nvme_ctrlr_configure_aer(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES: + rc = nvme_ctrlr_set_supported_log_pages(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES: + nvme_ctrlr_set_supported_features(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_DB_BUF_CFG, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_SET_DB_BUF_CFG: + rc = nvme_ctrlr_set_doorbell_buffer_config(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT: + rc = nvme_ctrlr_set_keep_alive_timeout(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_HOST_ID: + rc = nvme_ctrlr_set_host_id(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_READY: + 
SPDK_DEBUGLOG(SPDK_LOG_NVME, "Ctrlr already in ready state\n"); + return 0; + + case NVME_CTRLR_STATE_ERROR: + SPDK_ERRLOG("Ctrlr %s is in error state\n", ctrlr->trid.traddr); + return -1; + + default: + assert(0); + return -1; + } + +init_timeout: + if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE && + spdk_get_ticks() > ctrlr->state_timeout_tsc) { + SPDK_ERRLOG("Initialization timed out in state %d\n", ctrlr->state); + return -1; + } + + return rc; +} + +int +nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx) +{ + pthread_mutexattr_t attr; + int rc = 0; + + if (pthread_mutexattr_init(&attr)) { + return -1; + } + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE) || +#ifndef __FreeBSD__ + pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) || + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) || +#endif + pthread_mutex_init(mtx, &attr)) { + rc = -1; + } + pthread_mutexattr_destroy(&attr); + return rc; +} + +int +nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT_DELAY, NVME_TIMEOUT_INFINITE); + } else { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE); + } + + if (ctrlr->opts.admin_queue_size > SPDK_NVME_ADMIN_QUEUE_MAX_ENTRIES) { + SPDK_ERRLOG("admin_queue_size %u exceeds max defined by NVMe spec, use max value\n", + ctrlr->opts.admin_queue_size); + ctrlr->opts.admin_queue_size = SPDK_NVME_ADMIN_QUEUE_MAX_ENTRIES; + } + + if (ctrlr->opts.admin_queue_size < SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES) { + SPDK_ERRLOG("admin_queue_size %u is less than minimum defined by NVMe spec, use min value\n", + ctrlr->opts.admin_queue_size); + ctrlr->opts.admin_queue_size = SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES; + } + + ctrlr->flags = 0; + ctrlr->free_io_qids = NULL; + ctrlr->is_resetting = false; + ctrlr->is_failed = false; + ctrlr->is_destructed = false; + + TAILQ_INIT(&ctrlr->active_io_qpairs); + STAILQ_INIT(&ctrlr->queued_aborts); + ctrlr->outstanding_aborts = 0; + + rc = nvme_robust_mutex_init_recursive_shared(&ctrlr->ctrlr_lock); + if (rc != 0) { + return rc; + } + + TAILQ_INIT(&ctrlr->active_procs); + + return rc; +} + +/* This function should be called once at ctrlr initialization to set up constant properties. */ +void +nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap, + const union spdk_nvme_vs_register *vs) +{ + ctrlr->cap = *cap; + ctrlr->vs = *vs; + + if (ctrlr->cap.bits.ams & SPDK_NVME_CAP_AMS_WRR) { + ctrlr->flags |= SPDK_NVME_CTRLR_WRR_SUPPORTED; + } + + ctrlr->min_page_size = 1u << (12 + ctrlr->cap.bits.mpsmin); + + /* For now, always select page_size == min_page_size. 
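+	 * With MPSMIN == 0 this works out to 4 KiB (1u << 12), the most common case.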
*/ + ctrlr->page_size = ctrlr->min_page_size; + + ctrlr->opts.io_queue_size = spdk_max(ctrlr->opts.io_queue_size, SPDK_NVME_IO_QUEUE_MIN_ENTRIES); + ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, MAX_IO_QUEUE_ENTRIES); + ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, ctrlr->cap.bits.mqes + 1u); + + ctrlr->opts.io_queue_requests = spdk_max(ctrlr->opts.io_queue_requests, ctrlr->opts.io_queue_size); +} + +void +nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr) +{ + pthread_mutex_destroy(&ctrlr->ctrlr_lock); +} + +void +nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_qpair *qpair, *tmp; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Prepare to destruct SSD: %s\n", ctrlr->trid.traddr); + + ctrlr->is_destructed = true; + + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + + nvme_ctrlr_abort_queued_aborts(ctrlr); + nvme_transport_admin_qpair_abort_aers(ctrlr->adminq); + + TAILQ_FOREACH_SAFE(qpair, &ctrlr->active_io_qpairs, tailq, tmp) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + nvme_ctrlr_free_doorbell_buffer(ctrlr); + + if (ctrlr->opts.no_shn_notification) { + SPDK_INFOLOG(SPDK_LOG_NVME, "Disable SSD: %s without shutdown notification\n", + ctrlr->trid.traddr); + nvme_ctrlr_disable(ctrlr); + } else { + nvme_ctrlr_shutdown(ctrlr); + } + + nvme_ctrlr_destruct_namespaces(ctrlr); + + spdk_bit_array_free(&ctrlr->free_io_qids); + + nvme_transport_ctrlr_destruct(ctrlr); +} + +int +nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req) +{ + return nvme_qpair_submit_request(ctrlr->adminq, req); +} + +static void +nvme_keep_alive_completion(void *cb_ctx, const struct spdk_nvme_cpl *cpl) +{ + /* Do nothing */ +} + +/* + * Check if we need to send a Keep Alive command. + * Caller must hold ctrlr->ctrlr_lock. 
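+ * The interval is half of the (possibly controller-adjusted) keep alive timeout,
+ * as computed in nvme_ctrlr_set_keep_alive_timeout_done(), which leaves headroom
+ * before the controller-side timeout expires.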
+ */ +static void +nvme_ctrlr_keep_alive(struct spdk_nvme_ctrlr *ctrlr) +{ + uint64_t now; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + now = spdk_get_ticks(); + if (now < ctrlr->next_keep_alive_tick) { + return; + } + + req = nvme_allocate_request_null(ctrlr->adminq, nvme_keep_alive_completion, NULL); + if (req == NULL) { + return; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_KEEP_ALIVE; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + if (rc != 0) { + SPDK_ERRLOG("Submitting Keep Alive failed\n"); + } + + ctrlr->next_keep_alive_tick = now + ctrlr->keep_alive_interval_ticks; +} + +int32_t +spdk_nvme_ctrlr_process_admin_completions(struct spdk_nvme_ctrlr *ctrlr) +{ + int32_t num_completions; + int32_t rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->keep_alive_interval_ticks) { + nvme_ctrlr_keep_alive(ctrlr); + } + + rc = nvme_io_msg_process(ctrlr); + if (rc < 0) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; + } + num_completions = rc; + + rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (rc < 0) { + num_completions = rc; + } else { + num_completions += rc; + } + + return num_completions; +} + +const struct spdk_nvme_ctrlr_data * +spdk_nvme_ctrlr_get_data(struct spdk_nvme_ctrlr *ctrlr) +{ + return &ctrlr->cdata; +} + +union spdk_nvme_csts_register spdk_nvme_ctrlr_get_regs_csts(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_csts_register csts; + + if (nvme_ctrlr_get_csts(ctrlr, &csts)) { + csts.raw = 0xFFFFFFFFu; + } + return csts; +} + +union spdk_nvme_cap_register spdk_nvme_ctrlr_get_regs_cap(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->cap; +} + +union spdk_nvme_vs_register spdk_nvme_ctrlr_get_regs_vs(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->vs; +} + +union spdk_nvme_cmbsz_register spdk_nvme_ctrlr_get_regs_cmbsz(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cmbsz_register cmbsz; + + if (nvme_ctrlr_get_cmbsz(ctrlr, &cmbsz)) { + cmbsz.raw = 0; + } + + return cmbsz; +} + +uint32_t +spdk_nvme_ctrlr_get_num_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->num_ns; +} + +static int32_t +nvme_ctrlr_active_ns_idx(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + int32_t result = -1; + + if (ctrlr->active_ns_list == NULL || nsid == 0 || nsid > ctrlr->num_ns) { + return result; + } + + int32_t lower = 0; + int32_t upper = ctrlr->num_ns - 1; + int32_t mid; + + while (lower <= upper) { + mid = lower + (upper - lower) / 2; + if (ctrlr->active_ns_list[mid] == nsid) { + result = mid; + break; + } else { + if (ctrlr->active_ns_list[mid] != 0 && ctrlr->active_ns_list[mid] < nsid) { + lower = mid + 1; + } else { + upper = mid - 1; + } + + } + } + + return result; +} + +bool +spdk_nvme_ctrlr_is_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + return nvme_ctrlr_active_ns_idx(ctrlr, nsid) != -1; +} + +uint32_t +spdk_nvme_ctrlr_get_first_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->active_ns_list ? 
ctrlr->active_ns_list[0] : 0; +} + +uint32_t +spdk_nvme_ctrlr_get_next_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t prev_nsid) +{ + int32_t nsid_idx = nvme_ctrlr_active_ns_idx(ctrlr, prev_nsid); + if (ctrlr->active_ns_list && nsid_idx >= 0 && (uint32_t)nsid_idx < ctrlr->num_ns - 1) { + return ctrlr->active_ns_list[nsid_idx + 1]; + } + return 0; +} + +struct spdk_nvme_ns * +spdk_nvme_ctrlr_get_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + if (nsid < 1 || nsid > ctrlr->num_ns) { + return NULL; + } + + return &ctrlr->ns[nsid - 1]; +} + +struct spdk_pci_device * +spdk_nvme_ctrlr_get_pci_device(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr == NULL) { + return NULL; + } + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + return NULL; + } + + return nvme_ctrlr_proc_get_devhandle(ctrlr); +} + +uint32_t +spdk_nvme_ctrlr_get_max_xfer_size(const struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->max_xfer_size; +} + +void +spdk_nvme_ctrlr_register_aer_callback(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_aer_cb aer_cb_fn, + void *aer_cb_arg) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->aer_cb_fn = aer_cb_fn; + active_proc->aer_cb_arg = aer_cb_arg; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +void +spdk_nvme_ctrlr_register_timeout_callback(struct spdk_nvme_ctrlr *ctrlr, + uint64_t timeout_us, spdk_nvme_timeout_cb cb_fn, void *cb_arg) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->timeout_ticks = timeout_us * spdk_get_ticks_hz() / 1000000ULL; + active_proc->timeout_cb_fn = cb_fn; + active_proc->timeout_cb_arg = cb_arg; + } + + ctrlr->timeout_enabled = true; + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +bool +spdk_nvme_ctrlr_is_log_page_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page) +{ + /* No bounds check necessary, since log_page is uint8_t and log_page_supported has 256 entries */ + SPDK_STATIC_ASSERT(sizeof(ctrlr->log_page_supported) == 256, "log_page_supported size mismatch"); + return ctrlr->log_page_supported[log_page]; +} + +bool +spdk_nvme_ctrlr_is_feature_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature_code) +{ + /* No bounds check necessary, since feature_code is uint8_t and feature_supported has 256 entries */ + SPDK_STATIC_ASSERT(sizeof(ctrlr->feature_supported) == 256, "feature_supported size mismatch"); + return ctrlr->feature_supported[feature_code]; +} + +int +spdk_nvme_ctrlr_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload) +{ + struct nvme_completion_poll_status *status; + int res; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_attach_ns(ctrlr, nsid, payload, + nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_attach_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + return nvme_ns_construct(ns, nsid, ctrlr); +} + +int 
+spdk_nvme_ctrlr_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload) +{ + struct nvme_completion_poll_status *status; + int res; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_detach_ns(ctrlr, nsid, payload, + nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_detach_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + /* Inactive NS */ + nvme_ns_destruct(ns); + + return 0; +} + +uint32_t +spdk_nvme_ctrlr_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload) +{ + struct nvme_completion_poll_status *status; + int res; + uint32_t nsid; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return 0; + } + + res = nvme_ctrlr_cmd_create_ns(ctrlr, payload, nvme_completion_poll_cb, status); + if (res) { + free(status); + return 0; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_create_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return 0; + } + + nsid = status->cpl.cdw0; + ns = &ctrlr->ns[nsid - 1]; + free(status); + /* Inactive NS */ + res = nvme_ns_construct(ns, nsid, ctrlr); + if (res) { + return 0; + } + + /* Return the namespace ID that was created */ + return nsid; +} + +int +spdk_nvme_ctrlr_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + struct nvme_completion_poll_status *status; + int res; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_delete_ns(ctrlr, nsid, nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_delete_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + nvme_ns_destruct(ns); + + return 0; +} + +int +spdk_nvme_ctrlr_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_format *format) +{ + struct nvme_completion_poll_status *status; + int res; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_format(ctrlr, nsid, format, nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_format failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + return spdk_nvme_ctrlr_reset(ctrlr); +} + +int +spdk_nvme_ctrlr_update_firmware(struct spdk_nvme_ctrlr *ctrlr, void *payload, uint32_t size, + int slot, enum spdk_nvme_fw_commit_action commit_action, struct spdk_nvme_status *completion_status) +{ + struct spdk_nvme_fw_commit fw_commit; + 
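+	/*
+	 * The image is transferred with repeated Firmware Image Download commands in
+	 * min_page_size chunks, then activated with a single Firmware Commit below.
+	 */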
struct nvme_completion_poll_status *status; + int res; + unsigned int size_remaining; + unsigned int offset; + unsigned int transfer; + void *p; + + if (!completion_status) { + return -EINVAL; + } + memset(completion_status, 0, sizeof(struct spdk_nvme_status)); + if (size % 4) { + SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid size!\n"); + return -1; + } + + /* Current support only for SPDK_NVME_FW_COMMIT_REPLACE_IMG + * and SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG + */ + if ((commit_action != SPDK_NVME_FW_COMMIT_REPLACE_IMG) && + (commit_action != SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid command!\n"); + return -1; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + /* Firmware download */ + size_remaining = size; + offset = 0; + p = payload; + + while (size_remaining > 0) { + transfer = spdk_min(size_remaining, ctrlr->min_page_size); + + memset(status, 0, sizeof(*status)); + res = nvme_ctrlr_cmd_fw_image_download(ctrlr, transfer, offset, p, + nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_fw_image_download failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + p += transfer; + offset += transfer; + size_remaining -= transfer; + } + + /* Firmware commit */ + memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit)); + fw_commit.fs = slot; + fw_commit.ca = commit_action; + + memset(status, 0, sizeof(*status)); + res = nvme_ctrlr_cmd_fw_commit(ctrlr, &fw_commit, nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + + res = nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock); + + memcpy(completion_status, &status->cpl.status, sizeof(struct spdk_nvme_status)); + + if (!status->timed_out) { + free(status); + } + + if (res) { + if (completion_status->sct != SPDK_NVME_SCT_COMMAND_SPECIFIC || + completion_status->sc != SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET) { + if (completion_status->sct == SPDK_NVME_SCT_COMMAND_SPECIFIC && + completion_status->sc == SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET) { + SPDK_NOTICELOG("firmware activation requires conventional reset to be performed. 
\n");
+			} else {
+				SPDK_ERRLOG("nvme_ctrlr_cmd_fw_commit failed!\n");
+			}
+			return -ENXIO;
+		}
+	}
+
+	return spdk_nvme_ctrlr_reset(ctrlr);
+}
+
+int
+spdk_nvme_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+	int rc, size;
+	union spdk_nvme_cmbsz_register cmbsz;
+
+	cmbsz = spdk_nvme_ctrlr_get_regs_cmbsz(ctrlr);
+
+	if (cmbsz.bits.rds == 0 || cmbsz.bits.wds == 0) {
+		return -ENOTSUP;
+	}
+
+	size = cmbsz.bits.sz * (0x1000 << (cmbsz.bits.szu * 4));
+
+	nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+	rc = nvme_transport_ctrlr_reserve_cmb(ctrlr);
+	nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+	if (rc < 0) {
+		return rc;
+	}
+
+	return size;
+}
+
+void *
+spdk_nvme_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size)
+{
+	void *buf;
+
+	nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+	buf = nvme_transport_ctrlr_map_cmb(ctrlr, size);
+	nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+	return buf;
+}
+
+void
+spdk_nvme_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+	nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+	nvme_transport_ctrlr_unmap_cmb(ctrlr);
+	nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+bool
+spdk_nvme_ctrlr_is_discovery(struct spdk_nvme_ctrlr *ctrlr)
+{
+	assert(ctrlr);
+
+	return !strncmp(ctrlr->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN,
+			strlen(SPDK_NVMF_DISCOVERY_NQN));
+}
+
+int
+spdk_nvme_ctrlr_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+				 uint16_t spsp, uint8_t nssf, void *payload, size_t size)
+{
+	struct nvme_completion_poll_status *status;
+	int res;
+
+	status = calloc(1, sizeof(*status));
+	if (!status) {
+		SPDK_ERRLOG("Failed to allocate status tracker\n");
+		return -ENOMEM;
+	}
+
+	res = spdk_nvme_ctrlr_cmd_security_receive(ctrlr, secp, spsp, nssf, payload, size,
+			nvme_completion_poll_cb, status);
+	if (res) {
+		free(status);
+		return res;
+	}
+	if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+		SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_security_receive failed!\n");
+		if (!status->timed_out) {
+			free(status);
+		}
+		return -ENXIO;
+	}
+	free(status);
+
+	return 0;
+}
+
+int
+spdk_nvme_ctrlr_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+			      uint16_t spsp, uint8_t nssf, void *payload, size_t size)
+{
+	struct nvme_completion_poll_status *status;
+	int res;
+
+	status = calloc(1, sizeof(*status));
+	if (!status) {
+		SPDK_ERRLOG("Failed to allocate status tracker\n");
+		return -ENOMEM;
+	}
+
+	res = spdk_nvme_ctrlr_cmd_security_send(ctrlr, secp, spsp, nssf, payload, size,
+						nvme_completion_poll_cb,
+						status);
+	if (res) {
+		free(status);
+		return res;
+	}
+	if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+		SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_security_send failed!\n");
+		if (!status->timed_out) {
+			free(status);
+		}
+		return -ENXIO;
+	}
+
+	free(status);
+
+	return 0;
+}
+
+uint64_t
+spdk_nvme_ctrlr_get_flags(struct spdk_nvme_ctrlr *ctrlr)
+{
+	return ctrlr->flags;
+}
+
+const struct spdk_nvme_transport_id *
+spdk_nvme_ctrlr_get_transport_id(struct spdk_nvme_ctrlr *ctrlr)
+{
+	return &ctrlr->trid;
+}
+
+/* FIXME need to specify max number of iovs */
+int
+spdk_nvme_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
+		   uint32_t len, size_t mps,
+		   void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len))
+{
+	uint64_t prp1, prp2;
+	void *vva;
+	uint32_t i;
+	uint32_t residue_len, nents;
+	uint64_t *prp_list;
+	int iovcnt;
+
+	prp1 = cmd->dptr.prp.prp1;
+	prp2 = cmd->dptr.prp.prp2;
+
+	/* PRP1 may start with an unaligned page address */
+	residue_len = 
mps - (prp1 % mps); + residue_len = spdk_min(len, residue_len); + + vva = gpa_to_vva(prv, prp1, residue_len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("GPA to VVA failed\n"); + return -1; + } + iovs[0].iov_base = vva; + iovs[0].iov_len = residue_len; + len -= residue_len; + + if (len) { + if (spdk_unlikely(prp2 == 0)) { + SPDK_ERRLOG("no PRP2, %d remaining\n", len); + return -1; + } + + if (len <= mps) { + /* 2 PRP used */ + iovcnt = 2; + vva = gpa_to_vva(prv, prp2, len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("no VVA for %#lx, len%#x\n", + prp2, len); + return -1; + } + iovs[1].iov_base = vva; + iovs[1].iov_len = len; + } else { + /* PRP list used */ + nents = (len + mps - 1) / mps; + vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list)); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("no VVA for %#lx, nents=%#x\n", + prp2, nents); + return -1; + } + prp_list = vva; + i = 0; + while (len != 0) { + residue_len = spdk_min(len, mps); + vva = gpa_to_vva(prv, prp_list[i], residue_len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("no VVA for %#lx, residue_len=%#x\n", + prp_list[i], residue_len); + return -1; + } + iovs[i + 1].iov_base = vva; + iovs[i + 1].iov_len = residue_len; + len -= residue_len; + i++; + } + iovcnt = i + 1; + } + } else { + /* 1 PRP used */ + iovcnt = 1; + } + + return iovcnt; +} diff --git a/src/spdk/lib/nvme/nvme_ctrlr_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c new file mode 100644 index 000000000..9b16c8d6f --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c @@ -0,0 +1,966 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "nvme_internal.h" + +int +spdk_nvme_ctrlr_io_cmd_raw_no_payload_build(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + return -EINVAL; + } + + memset(&payload, 0, sizeof(payload)); + req = nvme_allocate_request(qpair, &payload, 0, 0, cb_fn, cb_arg); + + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_io_raw(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + + req = nvme_allocate_request_contig(qpair, buf, len, cb_fn, cb_arg); + + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_io_raw_with_md(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, void *md_buf, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct nvme_payload payload; + uint32_t md_len = 0; + + payload = NVME_PAYLOAD_CONTIG(buf, md_buf); + + /* Caculate metadata length */ + if (md_buf) { + struct spdk_nvme_ns *ns = &ctrlr->ns[cmd->nsid - 1]; + + assert(ns->sector_size != 0); + md_len = len / ns->sector_size * ns->md_size; + } + + req = nvme_allocate_request(qpair, &payload, len, md_len, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_admin_raw(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_contig(ctrlr->adminq, buf, len, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, uint8_t cns, uint16_t cntid, uint32_t nsid, + void *payload, size_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, + cb_fn, cb_arg, false); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_IDENTIFY; + cmd->cdw10_bits.identify.cns = cns; + cmd->cdw10_bits.identify.cntid = cntid; + cmd->nsid = nsid; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +int +nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ctrlr_list), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = 
SPDK_NVME_OPC_NS_ATTACHMENT; + cmd->nsid = nsid; + cmd->cdw10_bits.ns_attach.sel = SPDK_NVME_NS_CTRLR_ATTACH; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ctrlr_list), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_ATTACHMENT; + cmd->nsid = nsid; + cmd->cdw10_bits.ns_attach.sel = SPDK_NVME_NS_CTRLR_DETACH; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ns_data), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT; + cmd->cdw10_bits.ns_manage.sel = SPDK_NVME_NS_MANAGEMENT_CREATE; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT; + cmd->cdw10_bits.ns_manage.sel = SPDK_NVME_NS_MANAGEMENT_DELETE; + cmd->nsid = nsid; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, uint64_t prp1, uint64_t prp2, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG; + cmd->dptr.prp.prp1 = prp1; + cmd->dptr.prp.prp2 = prp2; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, struct spdk_nvme_format *format, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FORMAT_NVM; + cmd->nsid = nsid; + 
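+	/* struct spdk_nvme_format fits in a single 32-bit dword (LBAF, MSET, PI, PIL, SES),
+	 * so it is copied directly into CDW10 of the Format NVM command below.
+	 */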
memcpy(&cmd->cdw10, format, sizeof(uint32_t)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_set_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, uint32_t cdw12, void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SET_FEATURES; + cmd->cdw10_bits.set_features.fid = feature; + cmd->cdw11 = cdw11; + cmd->cdw12 = cdw12; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_FEATURES; + cmd->cdw10_bits.get_features.fid = feature; + cmd->cdw11 = cdw11; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t ns_id) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_FEATURES; + cmd->cdw10_bits.get_features.fid = feature; + cmd->cdw11 = cdw11; + cmd->nsid = ns_id; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int spdk_nvme_ctrlr_cmd_set_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, uint32_t cdw12, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t ns_id) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SET_FEATURES; + cmd->cdw10_bits.set_features.fid = feature; + cmd->cdw11 = cdw11; + cmd->cdw12 = cdw12; + cmd->nsid = ns_id; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr, + uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + union spdk_nvme_feat_number_of_queues feat_num_queues; + + feat_num_queues.raw = 0; + feat_num_queues.bits.nsqr 
= num_queues - 1; + feat_num_queues.bits.ncqr = num_queues - 1; + + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, feat_num_queues.raw, + 0, + NULL, 0, cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, 0, NULL, 0, + cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr, + union spdk_nvme_feat_async_event_configuration config, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + uint32_t cdw11; + + cdw11 = config.raw; + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, cdw11, 0, + NULL, 0, + cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + union spdk_nvme_feat_host_identifier feat_host_identifier; + + feat_host_identifier.raw = 0; + if (host_id_size == 16) { + /* 128-bit extended host identifier */ + feat_host_identifier.bits.exhid = 1; + } else if (host_id_size == 8) { + /* 64-bit host identifier */ + feat_host_identifier.bits.exhid = 0; + } else { + SPDK_ERRLOG("Invalid host ID size %u\n", host_id_size); + return -EINVAL; + } + + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_HOST_IDENTIFIER, + feat_host_identifier.raw, 0, + host_id, host_id_size, cb_fn, cb_arg); +} + +int +spdk_nvme_ctrlr_cmd_get_log_page_ext(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page, + uint32_t nsid, void *payload, uint32_t payload_size, + uint64_t offset, uint32_t cdw10, + uint32_t cdw11, uint32_t cdw14, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint32_t numd, numdl, numdu; + uint32_t lpol, lpou; + int rc; + + if (payload_size == 0) { + return -EINVAL; + } + + if (offset & 3) { + return -EINVAL; + } + + numd = payload_size / sizeof(uint32_t) - 1u; + numdl = numd & 0xFFFFu; + numdu = (numd >> 16) & 0xFFFFu; + + lpol = (uint32_t)offset; + lpou = (uint32_t)(offset >> 32); + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (offset && !ctrlr->cdata.lpa.edlp) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -EINVAL; + } + + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_LOG_PAGE; + cmd->nsid = nsid; + cmd->cdw10 = cdw10; + cmd->cdw10_bits.get_log_page.numdl = numdl; + cmd->cdw10_bits.get_log_page.lid = log_page; + + cmd->cdw11 = cdw11; + cmd->cdw11_bits.get_log_page.numdu = numdu; + cmd->cdw12 = lpol; + cmd->cdw13 = lpou; + cmd->cdw14 = cdw14; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_log_page(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page, + uint32_t nsid, void *payload, uint32_t payload_size, + uint64_t offset, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return spdk_nvme_ctrlr_cmd_get_log_page_ext(ctrlr, log_page, nsid, payload, + payload_size, offset, 0, 0, 0, cb_fn, cb_arg); +} + +static void +nvme_ctrlr_retry_queued_abort(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_request *next, *tmp; + int rc; + + if (ctrlr->is_resetting || ctrlr->is_destructed) { + return; + } + + STAILQ_FOREACH_SAFE(next, &ctrlr->queued_aborts, stailq, tmp) { + 
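+		/* The current element is always the queue head at this point: dequeue it and
+		 * resubmit. If resubmission fails, complete the abort with an internal device
+		 * error and DNR set so its callback still fires; on success, stop after the
+		 * first resubmitted abort.
+		 */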
STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); + ctrlr->outstanding_aborts++; + rc = nvme_ctrlr_submit_admin_request(ctrlr, next); + if (rc < 0) { + SPDK_ERRLOG("Failed to submit queued abort.\n"); + memset(&next->cpl, 0, sizeof(next->cpl)); + next->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + next->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + next->cpl.status.dnr = 1; + nvme_complete_request(next->cb_fn, next->cb_arg, next->qpair, next, &next->cpl); + nvme_free_request(next); + } else { + /* If the first abort succeeds, stop iterating. */ + break; + } + } +} + +static int +_nvme_ctrlr_submit_abort_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req) +{ + /* ACL is a 0's based value. */ + if (ctrlr->outstanding_aborts >= ctrlr->cdata.acl + 1U) { + STAILQ_INSERT_TAIL(&ctrlr->queued_aborts, req, stailq); + return 0; + } else { + ctrlr->outstanding_aborts++; + return nvme_ctrlr_submit_admin_request(ctrlr, req); + } +} + +static void +nvme_ctrlr_cmd_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = ctx; + struct spdk_nvme_ctrlr *ctrlr; + + ctrlr = req->qpair->ctrlr; + + ctrlr->outstanding_aborts--; + nvme_ctrlr_retry_queued_abort(ctrlr); + + req->user_cb_fn(req->user_cb_arg, cpl); +} + +int +spdk_nvme_ctrlr_cmd_abort(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + uint16_t cid, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + int rc; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_cmd_abort_cpl, NULL); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + req->cb_arg = req; + req->user_cb_fn = cb_fn; + req->user_cb_arg = cb_arg; + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_ABORT; + cmd->cdw10_bits.abort.sqid = qpair->id; + cmd->cdw10_bits.abort.cid = cid; + + rc = _nvme_ctrlr_submit_abort_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +static void +nvme_complete_abort_request(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = ctx; + struct nvme_request *parent = req->parent; + struct spdk_nvme_ctrlr *ctrlr; + + ctrlr = req->qpair->ctrlr; + + ctrlr->outstanding_aborts--; + nvme_ctrlr_retry_queued_abort(ctrlr); + + nvme_request_remove_child(parent, req); + + if (!spdk_nvme_cpl_is_abort_success(cpl)) { + parent->parent_status.cdw0 |= 1U; + } + + if (parent->num_children == 0) { + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } +} + +static int +nvme_request_add_abort(struct nvme_request *req, void *arg) +{ + struct nvme_request *parent = arg; + struct nvme_request *child; + void *cmd_cb_arg; + + cmd_cb_arg = parent->user_cb_arg; + + if (req->cb_arg != cmd_cb_arg && + (req->parent == NULL || req->parent->cb_arg != cmd_cb_arg)) { + return 0; + } + + child = nvme_allocate_request_null(parent->qpair->ctrlr->adminq, + nvme_complete_abort_request, NULL); + if (child == NULL) { + return -ENOMEM; + } + + child->cb_arg = child; + + child->cmd.opc = SPDK_NVME_OPC_ABORT; + /* Copy SQID from the parent. 
*/ + child->cmd.cdw10_bits.abort.sqid = parent->cmd.cdw10_bits.abort.sqid; + child->cmd.cdw10_bits.abort.cid = req->cmd.cid; + + child->parent = parent; + + TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); + parent->num_children++; + + return 0; +} + +int +spdk_nvme_ctrlr_cmd_abort_ext(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + void *cmd_cb_arg, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + int rc = 0; + struct nvme_request *parent, *child, *tmp; + bool child_failed = false; + int aborted = 0; + + if (cmd_cb_arg == NULL) { + return -EINVAL; + } + + pthread_mutex_lock(&ctrlr->ctrlr_lock); + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + parent = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (parent == NULL) { + pthread_mutex_unlock(&ctrlr->ctrlr_lock); + + return -ENOMEM; + } + + TAILQ_INIT(&parent->children); + parent->num_children = 0; + + parent->cmd.opc = SPDK_NVME_OPC_ABORT; + memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl)); + + /* Hold SQID that the requests to abort are associated with. + * This will be copied to the children. + * + * CID is not set here because the parent is not submitted directly + * and CID is not determined until request to abort is found. + */ + parent->cmd.cdw10_bits.abort.sqid = qpair->id; + + /* This is used to find request to abort. */ + parent->user_cb_arg = cmd_cb_arg; + + /* Add an abort request for each outstanding request which has cmd_cb_arg + * as its callback context. + */ + rc = nvme_transport_qpair_iterate_requests(qpair, nvme_request_add_abort, parent); + if (rc != 0) { + /* Free abort requests already added. */ + child_failed = true; + } + + TAILQ_FOREACH_SAFE(child, &parent->children, child_tailq, tmp) { + if (spdk_likely(!child_failed)) { + rc = _nvme_ctrlr_submit_abort_request(ctrlr, child); + if (spdk_unlikely(rc != 0)) { + child_failed = true; + } + } else { + /* Free remaining abort requests. */ + nvme_request_remove_child(parent, child); + nvme_free_request(child); + } + } + + if (spdk_likely(!child_failed)) { + /* There is no error so far. Abort requests were submitted successfully + * or there was no outstanding request to abort. + * + * Hence abort queued requests which has cmd_cb_arg as its callback + * context next. + */ + aborted = nvme_qpair_abort_queued_reqs(qpair, cmd_cb_arg); + if (parent->num_children == 0) { + /* There was no outstanding request to abort. */ + if (aborted > 0) { + /* The queued requests were successfully aborted. Hence + * complete the parent request with success synchronously. + */ + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } else { + /* There was no queued request to abort. */ + rc = -ENOENT; + } + } + } else { + /* Failed to add or submit abort request. */ + if (parent->num_children != 0) { + /* Return success since we must wait for those children + * to complete but set the parent request to failure. 
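+		 * Bit 0 of CDW0 set to 1 follows the NVMe Abort completion convention
+		 * ("command not aborted"), matching what nvme_complete_abort_request()
+		 * reports when a child abort fails.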
+ */ + parent->parent_status.cdw0 |= 1U; + rc = 0; + } + } + + if (rc != 0) { + nvme_free_request(parent); + } + + pthread_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_fw_commit *fw_commit, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FIRMWARE_COMMIT; + memcpy(&cmd->cdw10, fw_commit, sizeof(uint32_t)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; + +} + +int +nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr, + uint32_t size, uint32_t offset, void *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, size, cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; + cmd->cdw10 = (size >> 2) - 1; + cmd->cdw11 = offset >> 2; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, + cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SECURITY_RECEIVE; + cmd->cdw10_bits.sec_send_recv.nssf = nssf; + cmd->cdw10_bits.sec_send_recv.spsp0 = (uint8_t)spsp; + cmd->cdw10_bits.sec_send_recv.spsp1 = (uint8_t)(spsp >> 8); + cmd->cdw10_bits.sec_send_recv.secp = secp; + cmd->cdw11 = payload_size; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SECURITY_SEND; + cmd->cdw10_bits.sec_send_recv.nssf = nssf; + cmd->cdw10_bits.sec_send_recv.spsp0 = (uint8_t)spsp; + cmd->cdw10_bits.sec_send_recv.spsp1 = (uint8_t)(spsp >> 8); + cmd->cdw10_bits.sec_send_recv.secp = secp; + cmd->cdw11 = payload_size; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +nvme_ctrlr_cmd_sanitize(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_sanitize *sanitize, uint32_t cdw11, + 
spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SANITIZE; + cmd->nsid = nsid; + cmd->cdw11 = cdw11; + memcpy(&cmd->cdw10, sanitize, sizeof(cmd->cdw10)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} diff --git a/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c new file mode 100644 index 000000000..2eba219ce --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c @@ -0,0 +1,88 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/nvme_ocssd.h" +#include "nvme_internal.h" + +bool +spdk_nvme_ctrlr_is_ocssd_supported(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->quirks & NVME_QUIRK_OCSSD) { + /* TODO: There isn't a standardized way to identify Open-Channel SSD + * different verdors may have different conditions. + */ + + /* + * Current QEMU OpenChannel Device needs to check nsdata->vs[0]. + * Here check nsdata->vs[0] of the first namespace. 
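+	 * A value of 0x1 in nsdata->vs[0] is what this device reports for an
+	 * Open-Channel namespace, which is what the check below relies on.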
+ */ + if (ctrlr->cdata.vid == SPDK_PCI_VID_CNEXLABS) { + if (ctrlr->num_ns && ctrlr->nsdata[0].vendor_specific[0] == 0x1) { + return true; + } + } + } + return false; +} + + +int +spdk_nvme_ocssd_ctrlr_cmd_geometry(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + if (!payload || (payload_size != sizeof(struct spdk_ocssd_geometry_data))) { + return -EINVAL; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_GEOMETRY; + cmd->nsid = nsid; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} diff --git a/src/spdk/lib/nvme/nvme_cuse.c b/src/spdk/lib/nvme/nvme_cuse.c new file mode 100644 index 000000000..9a5ee1f0d --- /dev/null +++ b/src/spdk/lib/nvme/nvme_cuse.c @@ -0,0 +1,1115 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define FUSE_USE_VERSION 31 + +#include <fuse3/cuse_lowlevel.h> + +#include <linux/nvme_ioctl.h> +#include <linux/fs.h> + +#include "nvme_internal.h" +#include "nvme_io_msg.h" +#include "nvme_cuse.h" + +struct cuse_device { + bool is_started; + + char dev_name[128]; + uint32_t index; + int claim_fd; + char lock_name[64]; + + struct spdk_nvme_ctrlr *ctrlr; /**< NVMe controller */ + uint32_t nsid; /**< NVMe name space id, or 0 */ + + pthread_t tid; + struct fuse_session *session; + + struct cuse_device *ctrlr_device; + struct cuse_device *ns_devices; /**< Array of cuse ns devices */ + + TAILQ_ENTRY(cuse_device) tailq; +}; + +static pthread_mutex_t g_cuse_mtx = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, cuse_device) g_ctrlr_ctx_head = TAILQ_HEAD_INITIALIZER(g_ctrlr_ctx_head); +static struct spdk_bit_array *g_ctrlr_started; + +struct cuse_io_ctx { + struct spdk_nvme_cmd nvme_cmd; + enum spdk_nvme_data_transfer data_transfer; + + uint64_t lba; + uint32_t lba_count; + + void *data; + int data_len; + + fuse_req_t req; +}; + +static void +cuse_io_ctx_free(struct cuse_io_ctx *ctx) +{ + spdk_free(ctx->data); + free(ctx); +} + +#define FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, val) \ + if (out_bufsz == 0) { \ + struct iovec out_iov; \ + out_iov.iov_base = (void *)arg; \ + out_iov.iov_len = sizeof(val); \ + fuse_reply_ioctl_retry(req, NULL, 0, &out_iov, 1); \ + return; \ + } + +static void +cuse_nvme_admin_cmd_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct cuse_io_ctx *ctx = arg; + struct iovec out_iov[2]; + struct spdk_nvme_cpl _cpl; + + if (ctx->data_transfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, NULL, 0); + } else { + memcpy(&_cpl, cpl, sizeof(struct spdk_nvme_cpl)); + + out_iov[0].iov_base = &_cpl.cdw0; + out_iov[0].iov_len = sizeof(_cpl.cdw0); + + if (ctx->data_len > 0) { + out_iov[1].iov_base = ctx->data; + out_iov[1].iov_len = ctx->data_len; + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, out_iov, 2); + } else { + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, out_iov, 1); + } + } + + cuse_io_ctx_free(ctx); +} + +static void +cuse_nvme_admin_cmd_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + struct cuse_io_ctx *ctx = arg; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &ctx->nvme_cmd, ctx->data, ctx->data_len, + cuse_nvme_admin_cmd_cb, (void *)ctx); + if (rc < 0) { + fuse_reply_err(ctx->req, EINVAL); + cuse_io_ctx_free(ctx); + } +} + +static void +cuse_nvme_admin_cmd_send(fuse_req_t req, struct nvme_admin_cmd *admin_cmd, + const void *data) +{ + struct cuse_io_ctx *ctx; + struct cuse_device *cuse_device = fuse_req_userdata(req); + int rv; + + ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); + if (!ctx) { + SPDK_ERRLOG("Cannot allocate memory for cuse_io_ctx\n"); + fuse_reply_err(req, ENOMEM); + return; + } + + ctx->req = req; + ctx->data_transfer = spdk_nvme_opc_get_data_transfer(admin_cmd->opcode); + + memset(&ctx->nvme_cmd, 0, sizeof(ctx->nvme_cmd)); + ctx->nvme_cmd.opc = admin_cmd->opcode; + ctx->nvme_cmd.nsid = admin_cmd->nsid; + ctx->nvme_cmd.cdw10 = admin_cmd->cdw10; + ctx->nvme_cmd.cdw11 = admin_cmd->cdw11; + ctx->nvme_cmd.cdw12 = admin_cmd->cdw12; + ctx->nvme_cmd.cdw13 = admin_cmd->cdw13; + ctx->nvme_cmd.cdw14 = admin_cmd->cdw14; + ctx->nvme_cmd.cdw15 = admin_cmd->cdw15; + + ctx->data_len = admin_cmd->data_len; + + if (ctx->data_len > 0) { + ctx->data = spdk_malloc(ctx->data_len, 0, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->data) { + 
SPDK_ERRLOG("Cannot allocate memory for data\n"); + fuse_reply_err(req, ENOMEM); + free(ctx); + return; + } + if (data != NULL) { + memcpy(ctx->data, data, ctx->data_len); + } + } + + rv = nvme_io_msg_send(cuse_device->ctrlr, 0, cuse_nvme_admin_cmd_execute, ctx); + if (rv) { + SPDK_ERRLOG("Cannot send io msg to the controller\n"); + fuse_reply_err(req, -rv); + cuse_io_ctx_free(ctx); + return; + } +} + +static void +cuse_nvme_admin_cmd(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + struct nvme_admin_cmd *admin_cmd; + struct iovec in_iov[2], out_iov[2]; + + in_iov[0].iov_base = (void *)arg; + in_iov[0].iov_len = sizeof(*admin_cmd); + if (in_bufsz == 0) { + fuse_reply_ioctl_retry(req, in_iov, 1, NULL, 0); + return; + } + + admin_cmd = (struct nvme_admin_cmd *)in_buf; + + switch (spdk_nvme_opc_get_data_transfer(admin_cmd->opcode)) { + case SPDK_NVME_DATA_NONE: + SPDK_ERRLOG("SPDK_NVME_DATA_NONE not implemented\n"); + fuse_reply_err(req, EINVAL); + return; + case SPDK_NVME_DATA_HOST_TO_CONTROLLER: + if (admin_cmd->addr != 0) { + in_iov[1].iov_base = (void *)admin_cmd->addr; + in_iov[1].iov_len = admin_cmd->data_len; + if (in_bufsz == sizeof(*admin_cmd)) { + fuse_reply_ioctl_retry(req, in_iov, 2, NULL, 0); + return; + } + cuse_nvme_admin_cmd_send(req, admin_cmd, in_buf + sizeof(*admin_cmd)); + } else { + cuse_nvme_admin_cmd_send(req, admin_cmd, NULL); + } + return; + case SPDK_NVME_DATA_CONTROLLER_TO_HOST: + if (out_bufsz == 0) { + out_iov[0].iov_base = &((struct nvme_admin_cmd *)arg)->result; + out_iov[0].iov_len = sizeof(uint32_t); + if (admin_cmd->data_len > 0) { + out_iov[1].iov_base = (void *)admin_cmd->addr; + out_iov[1].iov_len = admin_cmd->data_len; + fuse_reply_ioctl_retry(req, in_iov, 1, out_iov, 2); + } else { + fuse_reply_ioctl_retry(req, in_iov, 1, out_iov, 1); + } + return; + } + + cuse_nvme_admin_cmd_send(req, admin_cmd, NULL); + + return; + case SPDK_NVME_DATA_BIDIRECTIONAL: + fuse_reply_err(req, EINVAL); + return; + } +} + +static void +cuse_nvme_reset_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + fuse_req_t req = arg; + + rc = spdk_nvme_ctrlr_reset(ctrlr); + if (rc) { + fuse_reply_err(req, rc); + return; + } + + fuse_reply_ioctl_iov(req, 0, NULL, 0); +} + +static void +cuse_nvme_reset(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + int rv; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + if (cuse_device->nsid) { + SPDK_ERRLOG("Namespace reset not supported\n"); + fuse_reply_err(req, EINVAL); + return; + } + + rv = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_reset_execute, (void *)req); + if (rv) { + SPDK_ERRLOG("Cannot send reset\n"); + fuse_reply_err(req, EINVAL); + } +} + +/***************************************************************************** + * Namespace IO requests + */ + +static void +cuse_nvme_submit_io_write_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref; + + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, NULL, 0); + + cuse_io_ctx_free(ctx); +} + +static void +cuse_nvme_submit_io_write_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + struct cuse_io_ctx *ctx = arg; + struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + + rc = spdk_nvme_ns_cmd_write(ns, ctrlr->external_io_msgs_qpair, ctx->data, + ctx->lba, /* 
LBA start */ + ctx->lba_count, /* number of LBAs */ + cuse_nvme_submit_io_write_done, ctx, 0); + + if (rc != 0) { + SPDK_ERRLOG("write failed: rc = %d\n", rc); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + return; + } +} + +static void +cuse_nvme_submit_io_write(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + const struct nvme_user_io *user_io = in_buf; + struct cuse_io_ctx *ctx; + struct spdk_nvme_ns *ns; + uint32_t block_size; + int rc; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); + if (!ctx) { + SPDK_ERRLOG("Cannot allocate memory for context\n"); + fuse_reply_err(req, ENOMEM); + return; + } + + ctx->req = req; + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + block_size = spdk_nvme_ns_get_sector_size(ns); + + ctx->lba = user_io->slba; + ctx->lba_count = user_io->nblocks + 1; + ctx->data_len = ctx->lba_count * block_size; + + ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (ctx->data == NULL) { + SPDK_ERRLOG("Write buffer allocation failed\n"); + fuse_reply_err(ctx->req, ENOMEM); + free(ctx); + return; + } + + memcpy(ctx->data, in_buf + sizeof(*user_io), ctx->data_len); + + rc = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_submit_io_write_cb, + ctx); + if (rc < 0) { + SPDK_ERRLOG("Cannot send write io\n"); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + } +} + +static void +cuse_nvme_submit_io_read_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref; + struct iovec iov; + + iov.iov_base = ctx->data; + iov.iov_len = ctx->data_len; + + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, &iov, 1); + + cuse_io_ctx_free(ctx); +} + +static void +cuse_nvme_submit_io_read_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + struct cuse_io_ctx *ctx = arg; + struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + + rc = spdk_nvme_ns_cmd_read(ns, ctrlr->external_io_msgs_qpair, ctx->data, + ctx->lba, /* LBA start */ + ctx->lba_count, /* number of LBAs */ + cuse_nvme_submit_io_read_done, ctx, 0); + + if (rc != 0) { + SPDK_ERRLOG("read failed: rc = %d\n", rc); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + return; + } +} + +static void +cuse_nvme_submit_io_read(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + int rc; + struct cuse_io_ctx *ctx; + const struct nvme_user_io *user_io = in_buf; + struct cuse_device *cuse_device = fuse_req_userdata(req); + struct spdk_nvme_ns *ns; + uint32_t block_size; + + ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); + if (!ctx) { + SPDK_ERRLOG("Cannot allocate memory for context\n"); + fuse_reply_err(req, ENOMEM); + return; + } + + ctx->req = req; + ctx->lba = user_io->slba; + ctx->lba_count = user_io->nblocks; + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + block_size = spdk_nvme_ns_get_sector_size(ns); + + ctx->data_len = ctx->lba_count * block_size; + ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (ctx->data == NULL) { + SPDK_ERRLOG("Read buffer allocation failed\n"); + fuse_reply_err(ctx->req, ENOMEM); + free(ctx); + return; + } + + rc = nvme_io_msg_send(cuse_device->ctrlr, 
cuse_device->nsid, cuse_nvme_submit_io_read_cb, ctx); + if (rc < 0) { + SPDK_ERRLOG("Cannot send read io\n"); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + } +} + + +static void +cuse_nvme_submit_io(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + const struct nvme_user_io *user_io; + struct iovec in_iov[2], out_iov; + + in_iov[0].iov_base = (void *)arg; + in_iov[0].iov_len = sizeof(*user_io); + if (in_bufsz == 0) { + fuse_reply_ioctl_retry(req, in_iov, 1, NULL, 0); + return; + } + + user_io = in_buf; + + switch (user_io->opcode) { + case SPDK_NVME_OPC_READ: + out_iov.iov_base = (void *)user_io->addr; + out_iov.iov_len = (user_io->nblocks + 1) * 512; + if (out_bufsz == 0) { + fuse_reply_ioctl_retry(req, in_iov, 1, &out_iov, 1); + return; + } + + cuse_nvme_submit_io_read(req, cmd, arg, fi, flags, in_buf, + in_bufsz, out_bufsz); + break; + case SPDK_NVME_OPC_WRITE: + in_iov[1].iov_base = (void *)user_io->addr; + in_iov[1].iov_len = (user_io->nblocks + 1) * 512; + if (in_bufsz == sizeof(*user_io)) { + fuse_reply_ioctl_retry(req, in_iov, 2, NULL, 0); + return; + } + + cuse_nvme_submit_io_write(req, cmd, arg, fi, flags, in_buf, + in_bufsz, out_bufsz); + + break; + default: + SPDK_ERRLOG("SUBMIT_IO: opc:%d not valid\n", user_io->opcode); + fuse_reply_err(req, EINVAL); + return; + } + +} + +/***************************************************************************** + * Other namespace IOCTLs + */ +static void +cuse_blkgetsize64(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + uint64_t size; + struct spdk_nvme_ns *ns; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size); + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + size = spdk_nvme_ns_get_num_sectors(ns); + fuse_reply_ioctl(req, 0, &size, sizeof(size)); +} + +static void +cuse_blkpbszget(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + int pbsz; + struct spdk_nvme_ns *ns; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, pbsz); + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + pbsz = spdk_nvme_ns_get_sector_size(ns); + fuse_reply_ioctl(req, 0, &pbsz, sizeof(pbsz)); +} + +static void +cuse_blkgetsize(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + long size; + struct spdk_nvme_ns *ns; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size); + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + + /* return size in 512 bytes blocks */ + size = spdk_nvme_ns_get_num_sectors(ns) * 512 / spdk_nvme_ns_get_sector_size(ns); + fuse_reply_ioctl(req, 0, &size, sizeof(size)); +} + +static void +cuse_getid(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + struct cuse_device *cuse_device = fuse_req_userdata(req); + + fuse_reply_ioctl(req, cuse_device->nsid, NULL, 0); +} + +static void +cuse_ctrlr_ioctl(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) 
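+/* Controller-level ioctl dispatch: only NVME_IOCTL_ADMIN_CMD and NVME_IOCTL_RESET are
+ * handled; 32-bit compat ioctls get ENOSYS and anything else is rejected with EINVAL.
+ */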
+{ + if (flags & FUSE_IOCTL_COMPAT) { + fuse_reply_err(req, ENOSYS); + return; + } + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + cuse_nvme_admin_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case NVME_IOCTL_RESET: + cuse_nvme_reset(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + default: + SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd); + fuse_reply_err(req, EINVAL); + } +} + +static void +cuse_ns_ioctl(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + if (flags & FUSE_IOCTL_COMPAT) { + fuse_reply_err(req, ENOSYS); + return; + } + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + cuse_nvme_admin_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case NVME_IOCTL_SUBMIT_IO: + cuse_nvme_submit_io(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case NVME_IOCTL_ID: + cuse_getid(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case BLKPBSZGET: + cuse_blkpbszget(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case BLKGETSIZE: + /* Returns the device size as a number of 512-byte blocks (returns pointer to long) */ + cuse_blkgetsize(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case BLKGETSIZE64: + /* Returns the device size in sectors (returns pointer to uint64_t) */ + cuse_blkgetsize64(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + default: + SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd); + fuse_reply_err(req, EINVAL); + } +} + +/***************************************************************************** + * CUSE threads initialization. + */ + +static void cuse_open(fuse_req_t req, struct fuse_file_info *fi) +{ + fuse_reply_open(req, fi); +} + +static const struct cuse_lowlevel_ops cuse_ctrlr_clop = { + .open = cuse_open, + .ioctl = cuse_ctrlr_ioctl, +}; + +static const struct cuse_lowlevel_ops cuse_ns_clop = { + .open = cuse_open, + .ioctl = cuse_ns_ioctl, +}; + +static void * +cuse_thread(void *arg) +{ + struct cuse_device *cuse_device = arg; + char *cuse_argv[] = { "cuse", "-f" }; + int cuse_argc = SPDK_COUNTOF(cuse_argv); + char devname_arg[128 + 8]; + const char *dev_info_argv[] = { devname_arg }; + struct cuse_info ci; + int multithreaded; + int rc; + struct fuse_buf buf = { .mem = NULL }; + struct pollfd fds; + int timeout_msecs = 500; + + spdk_unaffinitize_thread(); + + snprintf(devname_arg, sizeof(devname_arg), "DEVNAME=%s", cuse_device->dev_name); + + memset(&ci, 0, sizeof(ci)); + ci.dev_info_argc = 1; + ci.dev_info_argv = dev_info_argv; + ci.flags = CUSE_UNRESTRICTED_IOCTL; + + if (cuse_device->nsid) { + cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ns_clop, + &multithreaded, cuse_device); + } else { + cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ctrlr_clop, + &multithreaded, cuse_device); + } + if (!cuse_device->session) { + SPDK_ERRLOG("Cannot create cuse session\n"); + goto err; + } + + SPDK_NOTICELOG("fuse session for device %s created\n", cuse_device->dev_name); + + /* Receive and process fuse requests */ + fds.fd = fuse_session_fd(cuse_device->session); + fds.events = POLLIN; + while (!fuse_session_exited(cuse_device->session)) { + rc = poll(&fds, 1, timeout_msecs); + if (rc <= 0) { + continue; + } + rc = fuse_session_receive_buf(cuse_device->session, &buf); + if (rc > 0) { + fuse_session_process_buf(cuse_device->session, &buf); + } + } + 
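+	/* The session has exited: release the last receive buffer and tear the CUSE
+	 * session down before the thread exits.
+	 */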
free(buf.mem); + fuse_session_reset(cuse_device->session); + cuse_lowlevel_teardown(cuse_device->session); +err: + pthread_exit(NULL); +} + +/***************************************************************************** + * CUSE devices management + */ + +static int +cuse_nvme_ns_start(struct cuse_device *ctrlr_device, uint32_t nsid) +{ + struct cuse_device *ns_device; + int rv; + + ns_device = &ctrlr_device->ns_devices[nsid - 1]; + if (ns_device->is_started) { + return 0; + } + + ns_device->ctrlr = ctrlr_device->ctrlr; + ns_device->ctrlr_device = ctrlr_device; + ns_device->nsid = nsid; + rv = snprintf(ns_device->dev_name, sizeof(ns_device->dev_name), "%sn%d", + ctrlr_device->dev_name, ns_device->nsid); + if (rv < 0) { + SPDK_ERRLOG("Device name too long.\n"); + free(ns_device); + return -ENAMETOOLONG; + } + + rv = pthread_create(&ns_device->tid, NULL, cuse_thread, ns_device); + if (rv != 0) { + SPDK_ERRLOG("pthread_create failed\n"); + return -rv; + } + + ns_device->is_started = true; + + return 0; +} + +static void +cuse_nvme_ns_stop(struct cuse_device *ctrlr_device, uint32_t nsid) +{ + struct cuse_device *ns_device; + + ns_device = &ctrlr_device->ns_devices[nsid - 1]; + if (!ns_device->is_started) { + return; + } + + fuse_session_exit(ns_device->session); + pthread_join(ns_device->tid, NULL); + ns_device->is_started = false; +} + +static int +nvme_cuse_claim(struct cuse_device *ctrlr_device, uint32_t index) +{ + int dev_fd; + int pid; + void *dev_map; + struct flock cusedev_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + + snprintf(ctrlr_device->lock_name, sizeof(ctrlr_device->lock_name), + "/tmp/spdk_nvme_cuse_lock_%" PRIu32, index); + + dev_fd = open(ctrlr_device->lock_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (dev_fd == -1) { + SPDK_ERRLOG("could not open %s\n", ctrlr_device->lock_name); + return -errno; + } + + if (ftruncate(dev_fd, sizeof(int)) != 0) { + SPDK_ERRLOG("could not truncate %s\n", ctrlr_device->lock_name); + close(dev_fd); + return -errno; + } + + dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED, dev_fd, 0); + if (dev_map == MAP_FAILED) { + SPDK_ERRLOG("could not mmap dev %s (%d)\n", ctrlr_device->lock_name, errno); + close(dev_fd); + return -errno; + } + + if (fcntl(dev_fd, F_SETLK, &cusedev_lock) != 0) { + pid = *(int *)dev_map; + SPDK_ERRLOG("Cannot create lock on device %s, probably" + " process %d has claimed it\n", ctrlr_device->lock_name, pid); + munmap(dev_map, sizeof(int)); + close(dev_fd); + /* F_SETLK returns unspecified errnos, normalize them */ + return -EACCES; + } + + *(int *)dev_map = (int)getpid(); + munmap(dev_map, sizeof(int)); + ctrlr_device->claim_fd = dev_fd; + ctrlr_device->index = index; + /* Keep dev_fd open to maintain the lock. 
*/ + return 0; +} + +static void +nvme_cuse_unclaim(struct cuse_device *ctrlr_device) +{ + close(ctrlr_device->claim_fd); + ctrlr_device->claim_fd = -1; + unlink(ctrlr_device->lock_name); +} + +static void +cuse_nvme_ctrlr_stop(struct cuse_device *ctrlr_device) +{ + uint32_t i; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr_device->ctrlr); + + for (i = 1; i <= num_ns; i++) { + cuse_nvme_ns_stop(ctrlr_device, i); + } + + fuse_session_exit(ctrlr_device->session); + pthread_join(ctrlr_device->tid, NULL); + TAILQ_REMOVE(&g_ctrlr_ctx_head, ctrlr_device, tailq); + spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index); + if (spdk_bit_array_count_set(g_ctrlr_started) == 0) { + spdk_bit_array_free(&g_ctrlr_started); + } + nvme_cuse_unclaim(ctrlr_device); + free(ctrlr_device->ns_devices); + free(ctrlr_device); +} + +static int +cuse_nvme_ctrlr_update_namespaces(struct cuse_device *ctrlr_device) +{ + uint32_t nsid; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr_device->ctrlr); + + for (nsid = 1; nsid <= num_ns; nsid++) { + if (!spdk_nvme_ctrlr_is_active_ns(ctrlr_device->ctrlr, nsid)) { + cuse_nvme_ns_stop(ctrlr_device, nsid); + continue; + } + + if (cuse_nvme_ns_start(ctrlr_device, nsid) < 0) { + SPDK_ERRLOG("Cannot start CUSE namespace device."); + return -1; + } + } + + return 0; +} + +static int +nvme_cuse_start(struct spdk_nvme_ctrlr *ctrlr) +{ + int rv = 0; + struct cuse_device *ctrlr_device; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + + SPDK_NOTICELOG("Creating cuse device for controller\n"); + + if (g_ctrlr_started == NULL) { + g_ctrlr_started = spdk_bit_array_create(128); + if (g_ctrlr_started == NULL) { + SPDK_ERRLOG("Cannot create bit array\n"); + return -ENOMEM; + } + } + + ctrlr_device = (struct cuse_device *)calloc(1, sizeof(struct cuse_device)); + if (!ctrlr_device) { + SPDK_ERRLOG("Cannot allocate memory for ctrlr_device."); + rv = -ENOMEM; + goto err2; + } + + ctrlr_device->ctrlr = ctrlr; + + /* Check if device already exists, if not increment index until success */ + ctrlr_device->index = 0; + while (1) { + ctrlr_device->index = spdk_bit_array_find_first_clear(g_ctrlr_started, ctrlr_device->index); + if (ctrlr_device->index == UINT32_MAX) { + SPDK_ERRLOG("Too many registered controllers\n"); + goto err2; + } + + if (nvme_cuse_claim(ctrlr_device, ctrlr_device->index) == 0) { + break; + } + ctrlr_device->index++; + } + spdk_bit_array_set(g_ctrlr_started, ctrlr_device->index); + snprintf(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name), "spdk/nvme%d", + ctrlr_device->index); + + rv = pthread_create(&ctrlr_device->tid, NULL, cuse_thread, ctrlr_device); + if (rv != 0) { + SPDK_ERRLOG("pthread_create failed\n"); + rv = -rv; + goto err3; + } + TAILQ_INSERT_TAIL(&g_ctrlr_ctx_head, ctrlr_device, tailq); + + ctrlr_device->ns_devices = (struct cuse_device *)calloc(num_ns, sizeof(struct cuse_device)); + /* Start all active namespaces */ + if (cuse_nvme_ctrlr_update_namespaces(ctrlr_device) < 0) { + SPDK_ERRLOG("Cannot start CUSE namespace devices."); + cuse_nvme_ctrlr_stop(ctrlr_device); + rv = -1; + goto err3; + } + + return 0; + +err3: + spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index); +err2: + free(ctrlr_device); + if (spdk_bit_array_count_set(g_ctrlr_started) == 0) { + spdk_bit_array_free(&g_ctrlr_started); + } + return rv; +} + +static struct cuse_device * +nvme_cuse_get_cuse_ctrlr_device(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device = NULL; + + TAILQ_FOREACH(ctrlr_device, &g_ctrlr_ctx_head, tailq) { + if 
(ctrlr_device->ctrlr == ctrlr) { + break; + } + } + + return ctrlr_device; +} + +static struct cuse_device * +nvme_cuse_get_cuse_ns_device(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + struct cuse_device *ctrlr_device = NULL; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + + if (nsid < 1 || nsid > num_ns) { + return NULL; + } + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + return NULL; + } + + if (!ctrlr_device->ns_devices[nsid - 1].is_started) { + return NULL; + } + + return &ctrlr_device->ns_devices[nsid - 1]; +} + +static void +nvme_cuse_stop(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + SPDK_ERRLOG("Cannot find associated CUSE device\n"); + pthread_mutex_unlock(&g_cuse_mtx); + return; + } + + cuse_nvme_ctrlr_stop(ctrlr_device); + + pthread_mutex_unlock(&g_cuse_mtx); +} + +static void +nvme_cuse_update(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + pthread_mutex_unlock(&g_cuse_mtx); + return; + } + + cuse_nvme_ctrlr_update_namespaces(ctrlr_device); + + pthread_mutex_unlock(&g_cuse_mtx); +} + +static struct nvme_io_msg_producer cuse_nvme_io_msg_producer = { + .name = "cuse", + .stop = nvme_cuse_stop, + .update = nvme_cuse_update, +}; + +int +spdk_nvme_cuse_register(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + rc = nvme_io_msg_ctrlr_register(ctrlr, &cuse_nvme_io_msg_producer); + if (rc) { + return rc; + } + + pthread_mutex_lock(&g_cuse_mtx); + + rc = nvme_cuse_start(ctrlr); + if (rc) { + nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer); + } + + pthread_mutex_unlock(&g_cuse_mtx); + + return rc; +} + +int +spdk_nvme_cuse_unregister(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + SPDK_ERRLOG("Cannot find associated CUSE device\n"); + pthread_mutex_unlock(&g_cuse_mtx); + return -ENODEV; + } + + cuse_nvme_ctrlr_stop(ctrlr_device); + + pthread_mutex_unlock(&g_cuse_mtx); + + nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer); + + return 0; +} + +void +spdk_nvme_cuse_update_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_cuse_update(ctrlr); +} + +int +spdk_nvme_cuse_get_ctrlr_name(struct spdk_nvme_ctrlr *ctrlr, char *name, size_t *size) +{ + struct cuse_device *ctrlr_device; + size_t req_len; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + pthread_mutex_unlock(&g_cuse_mtx); + return -ENODEV; + } + + req_len = strnlen(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name)); + if (*size < req_len) { + *size = req_len; + pthread_mutex_unlock(&g_cuse_mtx); + return -ENOSPC; + } + snprintf(name, req_len + 1, "%s", ctrlr_device->dev_name); + + pthread_mutex_unlock(&g_cuse_mtx); + + return 0; +} + +int +spdk_nvme_cuse_get_ns_name(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, char *name, size_t *size) +{ + struct cuse_device *ns_device; + size_t req_len; + + pthread_mutex_lock(&g_cuse_mtx); + + ns_device = nvme_cuse_get_cuse_ns_device(ctrlr, nsid); + if (!ns_device) { + pthread_mutex_unlock(&g_cuse_mtx); + return -ENODEV; + } + + req_len = strnlen(ns_device->dev_name, sizeof(ns_device->dev_name)); + if (*size < req_len) { + 
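+		/* Caller's buffer is too small: report the required length back through
+		 * *size and fail with -ENOSPC so the caller can retry with a larger buffer.
+		 */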
*size = req_len; + pthread_mutex_unlock(&g_cuse_mtx); + return -ENOSPC; + } + snprintf(name, req_len + 1, "%s", ns_device->dev_name); + + pthread_mutex_unlock(&g_cuse_mtx); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_cuse.h b/src/spdk/lib/nvme/nvme_cuse.h new file mode 100644 index 000000000..92b475190 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_cuse.h @@ -0,0 +1,42 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVME_CUSE_H__ +#define __NVME_CUSE_H__ + +#include "spdk/nvme.h" + +int nvme_cuse_register(struct spdk_nvme_ctrlr *ctrlr, const char *dev_path); +void nvme_cuse_unregister(struct spdk_nvme_ctrlr *ctrlr); + +#endif /* __NVME_CUSE_H__ */ diff --git a/src/spdk/lib/nvme/nvme_fabric.c b/src/spdk/lib/nvme/nvme_fabric.c new file mode 100644 index 000000000..9fff20873 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_fabric.c @@ -0,0 +1,475 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
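For context, the exported CUSE API above follows a query-then-copy convention: spdk_nvme_cuse_get_ctrlr_name() and spdk_nvme_cuse_get_ns_name() return -ENOSPC and write the required length back through *size when the caller's buffer is too small. Below is a minimal caller-side sketch, not part of the patch itself, assuming the public declarations from spdk/nvme.h, an already-attached controller, and abbreviated error handling; the helper name is hypothetical.

#include <stdio.h>
#include "spdk/nvme.h"

static int
expose_ctrlr_via_cuse(struct spdk_nvme_ctrlr *ctrlr)
{
	char name[128];
	size_t size = sizeof(name);
	int rc;

	/* Spawns the CUSE thread and creates the character devices. */
	rc = spdk_nvme_cuse_register(ctrlr);
	if (rc != 0) {
		return rc;
	}

	/* On -ENOSPC, size now holds the required length and the call can be retried. */
	rc = spdk_nvme_cuse_get_ctrlr_name(ctrlr, name, &size);
	if (rc == 0) {
		printf("CUSE controller node: /dev/%s\n", name);	/* e.g. /dev/spdk/nvme0 */
	}

	return rc;
}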
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over Fabrics transport-independent functions + */ + +#include "nvme_internal.h" + +#include "spdk/endian.h" +#include "spdk/string.h" + +static int +nvme_fabric_prop_set_cmd(struct spdk_nvme_ctrlr *ctrlr, + uint32_t offset, uint8_t size, uint64_t value) +{ + struct spdk_nvmf_fabric_prop_set_cmd cmd = {}; + struct nvme_completion_poll_status *status; + int rc; + + assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8); + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; + cmd.ofst = offset; + cmd.attrib.size = size; + cmd.value.u64 = value; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd, + NULL, 0, + nvme_completion_poll_cb, status); + if (rc < 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + SPDK_ERRLOG("Property Set failed\n"); + return -1; + } + free(status); + + return 0; +} + +static int +nvme_fabric_prop_get_cmd(struct spdk_nvme_ctrlr *ctrlr, + uint32_t offset, uint8_t size, uint64_t *value) +{ + struct spdk_nvmf_fabric_prop_set_cmd cmd = {}; + struct nvme_completion_poll_status *status; + struct spdk_nvmf_fabric_prop_get_rsp *response; + int rc; + + assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8); + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; + cmd.ofst = offset; + cmd.attrib.size = size; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd, + NULL, 0, nvme_completion_poll_cb, + status); + if (rc < 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + SPDK_ERRLOG("Property Get failed\n"); + return -1; + } + + response = (struct spdk_nvmf_fabric_prop_get_rsp *)&status->cpl; + + if (size == SPDK_NVMF_PROP_SIZE_4) { + *value = response->value.u32.low; + } else { + *value = response->value.u64; + } + + free(status); + + return 0; +} + +int +nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, value); +} + +int +nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value); +} + +int +nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr 
*ctrlr, uint32_t offset, uint32_t *value) +{ + uint64_t tmp_value; + int rc; + rc = nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, &tmp_value); + + if (!rc) { + *value = (uint32_t)tmp_value; + } + return rc; +} + +int +nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + return nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value); +} + +static void +nvme_fabric_discover_probe(struct spdk_nvmf_discovery_log_page_entry *entry, + struct spdk_nvme_probe_ctx *probe_ctx, + int discover_priority) +{ + struct spdk_nvme_transport_id trid; + uint8_t *end; + size_t len; + + memset(&trid, 0, sizeof(trid)); + + if (entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + SPDK_WARNLOG("Skipping unsupported discovery service referral\n"); + return; + } else if (entry->subtype != SPDK_NVMF_SUBTYPE_NVME) { + SPDK_WARNLOG("Skipping unknown subtype %u\n", entry->subtype); + return; + } + + trid.trtype = entry->trtype; + spdk_nvme_transport_id_populate_trstring(&trid, spdk_nvme_transport_id_trtype_str(entry->trtype)); + if (!spdk_nvme_transport_available_by_name(trid.trstring)) { + SPDK_WARNLOG("NVMe transport type %u not available; skipping probe\n", + trid.trtype); + return; + } + + snprintf(trid.trstring, sizeof(trid.trstring), "%s", probe_ctx->trid.trstring); + trid.adrfam = entry->adrfam; + + /* Ensure that subnqn is null terminated. */ + end = memchr(entry->subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1); + if (!end) { + SPDK_ERRLOG("Discovery entry SUBNQN is not null terminated\n"); + return; + } + len = end - entry->subnqn; + memcpy(trid.subnqn, entry->subnqn, len); + trid.subnqn[len] = '\0'; + + /* Convert traddr to a null terminated string. */ + len = spdk_strlen_pad(entry->traddr, sizeof(entry->traddr), ' '); + memcpy(trid.traddr, entry->traddr, len); + if (spdk_str_chomp(trid.traddr) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRADDR\n"); + } + + /* Convert trsvcid to a null terminated string. 
*/ + len = spdk_strlen_pad(entry->trsvcid, sizeof(entry->trsvcid), ' '); + memcpy(trid.trsvcid, entry->trsvcid, len); + if (spdk_str_chomp(trid.trsvcid) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRSVCID\n"); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "subnqn=%s, trtype=%u, traddr=%s, trsvcid=%s\n", + trid.subnqn, trid.trtype, + trid.traddr, trid.trsvcid); + + /* Copy the priority from the discovery ctrlr */ + trid.priority = discover_priority; + + nvme_ctrlr_probe(&trid, probe_ctx, NULL); +} + +static int +nvme_fabric_get_discovery_log_page(struct spdk_nvme_ctrlr *ctrlr, + void *log_page, uint32_t size, uint64_t offset) +{ + struct nvme_completion_poll_status *status; + int rc; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_DISCOVERY, 0, log_page, size, offset, + nvme_completion_poll_cb, status); + if (rc < 0) { + free(status); + return -1; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + free(status); + + return 0; +} + +int +nvme_fabric_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + struct spdk_nvme_ctrlr_opts discovery_opts; + struct spdk_nvme_ctrlr *discovery_ctrlr; + union spdk_nvme_cc_register cc; + int rc; + struct nvme_completion_poll_status *status; + + if (strcmp(probe_ctx->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN) != 0) { + /* It is not a discovery_ctrlr info and try to directly connect it */ + rc = nvme_ctrlr_probe(&probe_ctx->trid, probe_ctx, NULL); + return rc; + } + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&discovery_opts, sizeof(discovery_opts)); + /* For discovery_ctrlr set the timeout to 0 */ + discovery_opts.keep_alive_timeout_ms = 0; + + discovery_ctrlr = nvme_transport_ctrlr_construct(&probe_ctx->trid, &discovery_opts, NULL); + if (discovery_ctrlr == NULL) { + return -1; + } + nvme_qpair_set_state(discovery_ctrlr->adminq, NVME_QPAIR_ENABLED); + + /* TODO: this should be using the normal NVMe controller initialization process +1 */ + cc.raw = 0; + cc.bits.en = 1; + cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ + cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ + rc = nvme_transport_ctrlr_set_reg_4(discovery_ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + cc.raw); + if (rc < 0) { + SPDK_ERRLOG("Failed to set cc\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + return -1; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + return -ENOMEM; + } + + /* get the cdata info */ + rc = nvme_ctrlr_cmd_identify(discovery_ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0, + &discovery_ctrlr->cdata, sizeof(discovery_ctrlr->cdata), + nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to identify cdata\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + free(status); + return rc; + } + + if (nvme_wait_for_completion(discovery_ctrlr->adminq, status)) { + SPDK_ERRLOG("nvme_identify_controller failed!\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + + free(status); + + /* Direct attach through spdk_nvme_connect() API */ + if (direct_connect == true) { + /* Set the ready state to skip the normal init process */ + discovery_ctrlr->state = NVME_CTRLR_STATE_READY; + nvme_ctrlr_connected(probe_ctx, discovery_ctrlr); + 
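As an aside, the register accessors defined earlier in this file are thin wrappers: a 4-byte or 8-byte controller register access on a fabrics transport becomes a Property Get/Set capsule, with the register addressed by its byte offset into struct spdk_nvme_registers, exactly as the CC write in nvme_fabric_ctrlr_scan() does. A small sketch of how driver-internal code might poll readiness through them; this uses the internal API (nvme_internal.h) and the helper name is hypothetical.

#include "nvme_internal.h"

/* Hypothetical helper: read CSTS via the fabrics Property Get wrapper and
 * report whether the controller has become ready. */
static bool
fabric_ctrlr_is_ready(struct spdk_nvme_ctrlr *ctrlr)
{
	union spdk_nvme_csts_register csts;

	if (nvme_fabric_ctrlr_get_reg_4(ctrlr,
					offsetof(struct spdk_nvme_registers, csts.raw),
					&csts.raw) != 0) {
		return false;
	}

	return csts.bits.rdy == 1;
}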
nvme_ctrlr_add_process(discovery_ctrlr, 0); + return 0; + } + + rc = nvme_fabric_ctrlr_discover(discovery_ctrlr, probe_ctx); + nvme_ctrlr_destruct(discovery_ctrlr); + return rc; +} + +int +nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_probe_ctx *probe_ctx) +{ + struct spdk_nvmf_discovery_log_page *log_page; + struct spdk_nvmf_discovery_log_page_entry *log_page_entry; + char buffer[4096]; + int rc; + uint64_t i, numrec, buffer_max_entries_first, buffer_max_entries, log_page_offset = 0; + uint64_t remaining_num_rec = 0; + uint16_t recfmt; + + memset(buffer, 0x0, 4096); + buffer_max_entries_first = (sizeof(buffer) - offsetof(struct spdk_nvmf_discovery_log_page, + entries[0])) / + sizeof(struct spdk_nvmf_discovery_log_page_entry); + buffer_max_entries = sizeof(buffer) / sizeof(struct spdk_nvmf_discovery_log_page_entry); + do { + rc = nvme_fabric_get_discovery_log_page(ctrlr, buffer, sizeof(buffer), log_page_offset); + if (rc < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Get Log Page - Discovery error\n"); + return rc; + } + + if (!remaining_num_rec) { + log_page = (struct spdk_nvmf_discovery_log_page *)buffer; + recfmt = from_le16(&log_page->recfmt); + if (recfmt != 0) { + SPDK_ERRLOG("Unrecognized discovery log record format %" PRIu16 "\n", recfmt); + return -EPROTO; + } + remaining_num_rec = log_page->numrec; + log_page_offset = offsetof(struct spdk_nvmf_discovery_log_page, entries[0]); + log_page_entry = &log_page->entries[0]; + numrec = spdk_min(remaining_num_rec, buffer_max_entries_first); + } else { + numrec = spdk_min(remaining_num_rec, buffer_max_entries); + log_page_entry = (struct spdk_nvmf_discovery_log_page_entry *)buffer; + } + + for (i = 0; i < numrec; i++) { + nvme_fabric_discover_probe(log_page_entry++, probe_ctx, ctrlr->trid.priority); + } + remaining_num_rec -= numrec; + log_page_offset += numrec * sizeof(struct spdk_nvmf_discovery_log_page_entry); + } while (remaining_num_rec != 0); + + return 0; +} + +int +nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries) +{ + struct nvme_completion_poll_status *status; + struct spdk_nvmf_fabric_connect_rsp *rsp; + struct spdk_nvmf_fabric_connect_cmd cmd; + struct spdk_nvmf_fabric_connect_data *nvmf_data; + struct spdk_nvme_ctrlr *ctrlr; + int rc; + + if (num_entries == 0 || num_entries > SPDK_NVME_IO_QUEUE_MAX_ENTRIES) { + return -EINVAL; + } + + ctrlr = qpair->ctrlr; + if (!ctrlr) { + return -EINVAL; + } + + nvmf_data = spdk_zmalloc(sizeof(*nvmf_data), 0, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!nvmf_data) { + SPDK_ERRLOG("nvmf_data allocation error\n"); + return -ENOMEM; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + spdk_free(nvmf_data); + return -ENOMEM; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; + cmd.qid = qpair->id; + cmd.sqsize = num_entries - 1; + cmd.kato = ctrlr->opts.keep_alive_timeout_ms; + + if (nvme_qpair_is_admin_queue(qpair)) { + nvmf_data->cntlid = 0xFFFF; + } else { + nvmf_data->cntlid = ctrlr->cntlid; + } + + SPDK_STATIC_ASSERT(sizeof(nvmf_data->hostid) == sizeof(ctrlr->opts.extended_host_id), + "host ID size mismatch"); + memcpy(nvmf_data->hostid, ctrlr->opts.extended_host_id, sizeof(nvmf_data->hostid)); + snprintf(nvmf_data->hostnqn, sizeof(nvmf_data->hostnqn), "%s", ctrlr->opts.hostnqn); + snprintf(nvmf_data->subnqn, sizeof(nvmf_data->subnqn), "%s", ctrlr->trid.subnqn); + + rc = 
spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, + (struct spdk_nvme_cmd *)&cmd, + nvmf_data, sizeof(*nvmf_data), + nvme_completion_poll_cb, status); + if (rc < 0) { + SPDK_ERRLOG("Connect command failed\n"); + spdk_free(nvmf_data); + free(status); + return rc; + } + + if (nvme_wait_for_completion(qpair, status)) { + SPDK_ERRLOG("Connect command failed\n"); + spdk_free(nvmf_data); + if (!status->timed_out) { + free(status); + } + return -EIO; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + rsp = (struct spdk_nvmf_fabric_connect_rsp *)&status->cpl; + ctrlr->cntlid = rsp->status_code_specific.success.cntlid; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cntlid); + } + + spdk_free(nvmf_data); + free(status); + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_internal.h b/src/spdk/lib/nvme/nvme_internal.h new file mode 100644 index 000000000..98fec279d --- /dev/null +++ b/src/spdk/lib/nvme/nvme_internal.h @@ -0,0 +1,1233 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVME_INTERNAL_H__ +#define __NVME_INTERNAL_H__ + +#include "spdk/config.h" +#include "spdk/likely.h" +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" + +#if defined(__i386__) || defined(__x86_64__) +#include <x86intrin.h> +#endif + +#include "spdk/queue.h" +#include "spdk/barrier.h" +#include "spdk/bit_array.h" +#include "spdk/mmio.h" +#include "spdk/pci_ids.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/nvme_intel.h" +#include "spdk/nvmf_spec.h" +#include "spdk/uuid.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" + +extern pid_t g_spdk_nvme_pid; + +/* + * Some Intel devices support vendor-unique read latency log page even + * though the log page directory says otherwise. + */ +#define NVME_INTEL_QUIRK_READ_LATENCY 0x1 + +/* + * Some Intel devices support vendor-unique write latency log page even + * though the log page directory says otherwise. 
+ */ +#define NVME_INTEL_QUIRK_WRITE_LATENCY 0x2 + +/* + * The controller needs a delay before starts checking the device + * readiness, which is done by reading the NVME_CSTS_RDY bit. + */ +#define NVME_QUIRK_DELAY_BEFORE_CHK_RDY 0x4 + +/* + * The controller performs best when I/O is split on particular + * LBA boundaries. + */ +#define NVME_INTEL_QUIRK_STRIPING 0x8 + +/* + * The controller needs a delay after allocating an I/O queue pair + * before it is ready to accept I/O commands. + */ +#define NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC 0x10 + +/* + * Earlier NVMe devices do not indicate whether unmapped blocks + * will read all zeroes or not. This define indicates that the + * device does in fact read all zeroes after an unmap event + */ +#define NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE 0x20 + +/* + * The controller doesn't handle Identify value others than 0 or 1 correctly. + */ +#define NVME_QUIRK_IDENTIFY_CNS 0x40 + +/* + * The controller supports Open Channel command set if matching additional + * condition, like the first byte (value 0x1) in the vendor specific + * bits of the namespace identify structure is set. + */ +#define NVME_QUIRK_OCSSD 0x80 + +/* + * The controller has an Intel vendor ID but does not support Intel vendor-specific + * log pages. This is primarily for QEMU emulated SSDs which report an Intel vendor + * ID but do not support these log pages. + */ +#define NVME_INTEL_QUIRK_NO_LOG_PAGES 0x100 + +/* + * The controller does not set SHST_COMPLETE in a reasonable amount of time. This + * is primarily seen in virtual VMWare NVMe SSDs. This quirk merely adds an additional + * error message that on VMWare NVMe SSDs, the shutdown timeout may be expected. + */ +#define NVME_QUIRK_SHST_COMPLETE 0x200 + +/* + * The controller requires an extra delay before starting the initialization process + * during attach. + */ +#define NVME_QUIRK_DELAY_BEFORE_INIT 0x400 + +/* + * Some SSDs exhibit poor performance with the default SPDK NVMe IO queue size. + * This quirk will increase the default to 1024 which matches other operating + * systems, at the cost of some extra memory usage. Users can still override + * the increased default by changing the spdk_nvme_io_qpair_opts when allocating + * a new queue pair. + */ +#define NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE 0x800 + +/** + * The maximum access width to PCI memory space is 8 Bytes, don't use AVX2 or + * SSE instructions to optimize the memory access(memcpy or memset) larger than + * 8 Bytes. + */ +#define NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH 0x1000 + +/** + * The SSD does not support OPAL even through it sets the security bit in OACS. + */ +#define NVME_QUIRK_OACS_SECURITY 0x2000 + +#define NVME_MAX_ASYNC_EVENTS (8) + +#define NVME_MAX_ADMIN_TIMEOUT_IN_SECS (30) + +/* Maximum log page size to fetch for AERs. */ +#define NVME_MAX_AER_LOG_SIZE (4096) + +/* + * NVME_MAX_IO_QUEUES in nvme_spec.h defines the 64K spec-limit, but this + * define specifies the maximum number of queues this driver will actually + * try to configure, if available. 
+ */ +#define DEFAULT_MAX_IO_QUEUES (1024) +#define DEFAULT_ADMIN_QUEUE_SIZE (32) +#define DEFAULT_IO_QUEUE_SIZE (256) +#define DEFAULT_IO_QUEUE_SIZE_FOR_QUIRK (1024) /* Matches Linux kernel driver */ + +#define DEFAULT_IO_QUEUE_REQUESTS (512) + +#define SPDK_NVME_DEFAULT_RETRY_COUNT (4) + +#define SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED (0) +#define SPDK_NVME_DEFAULT_TRANSPORT_ACK_TIMEOUT SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED + +#define MIN_KEEP_ALIVE_TIMEOUT_IN_MS (10000) + +/* We want to fit submission and completion rings each in a single 2MB + * hugepage to ensure physical address contiguity. + */ +#define MAX_IO_QUEUE_ENTRIES (VALUE_2MB / spdk_max( \ + sizeof(struct spdk_nvme_cmd), \ + sizeof(struct spdk_nvme_cpl))) + +enum nvme_payload_type { + NVME_PAYLOAD_TYPE_INVALID = 0, + + /** nvme_request::u.payload.contig_buffer is valid for this request */ + NVME_PAYLOAD_TYPE_CONTIG, + + /** nvme_request::u.sgl is valid for this request */ + NVME_PAYLOAD_TYPE_SGL, +}; + +/** + * Descriptor for a request data payload. + */ +struct nvme_payload { + /** + * Functions for retrieving physical addresses for scattered payloads. + */ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn; + + /** + * If reset_sgl_fn == NULL, this is a contig payload, and contig_or_cb_arg contains the + * virtual memory address of a single virtually contiguous buffer. + * + * If reset_sgl_fn != NULL, this is a SGL payload, and contig_or_cb_arg contains the + * cb_arg that will be passed to the SGL callback functions. + */ + void *contig_or_cb_arg; + + /** Virtual memory address of a single virtually contiguous metadata buffer */ + void *md; +}; + +#define NVME_PAYLOAD_CONTIG(contig_, md_) \ + (struct nvme_payload) { \ + .reset_sgl_fn = NULL, \ + .next_sge_fn = NULL, \ + .contig_or_cb_arg = (contig_), \ + .md = (md_), \ + } + +#define NVME_PAYLOAD_SGL(reset_sgl_fn_, next_sge_fn_, cb_arg_, md_) \ + (struct nvme_payload) { \ + .reset_sgl_fn = (reset_sgl_fn_), \ + .next_sge_fn = (next_sge_fn_), \ + .contig_or_cb_arg = (cb_arg_), \ + .md = (md_), \ + } + +static inline enum nvme_payload_type +nvme_payload_type(const struct nvme_payload *payload) { + return payload->reset_sgl_fn ? NVME_PAYLOAD_TYPE_SGL : NVME_PAYLOAD_TYPE_CONTIG; +} + +struct nvme_error_cmd { + bool do_not_submit; + uint64_t timeout_tsc; + uint32_t err_count; + uint8_t opc; + struct spdk_nvme_status status; + TAILQ_ENTRY(nvme_error_cmd) link; +}; + +struct nvme_request { + struct spdk_nvme_cmd cmd; + + uint8_t retries; + + uint8_t timed_out : 1; + + /** + * True if the request is in the queued_req list. + */ + uint8_t queued : 1; + uint8_t reserved : 6; + + /** + * Number of children requests still outstanding for this + * request which was split into multiple child requests. + */ + uint16_t num_children; + + /** + * Offset in bytes from the beginning of payload for this request. + * This is used for I/O commands that are split into multiple requests. + */ + uint32_t payload_offset; + uint32_t md_offset; + + uint32_t payload_size; + + /** + * Timeout ticks for error injection requests, can be extended in future + * to support per-request timeout feature. + */ + uint64_t timeout_tsc; + + /** + * Data payload for this request's command. + */ + struct nvme_payload payload; + + spdk_nvme_cmd_cb cb_fn; + void *cb_arg; + STAILQ_ENTRY(nvme_request) stailq; + + struct spdk_nvme_qpair *qpair; + + /* + * The value of spdk_get_ticks() when the request was submitted to the hardware. + * Only set if ctrlr->timeout_enabled is true. 
+ */ + uint64_t submit_tick; + + /** + * The active admin request can be moved to a per process pending + * list based on the saved pid to tell which process it belongs + * to. The cpl saves the original completion information which + * is used in the completion callback. + * NOTE: these below two fields are only used for admin request. + */ + pid_t pid; + struct spdk_nvme_cpl cpl; + + uint32_t md_size; + + /** + * The following members should not be reordered with members + * above. These members are only needed when splitting + * requests which is done rarely, and the driver is careful + * to not touch the following fields until a split operation is + * needed, to avoid touching an extra cacheline. + */ + + /** + * Points to the outstanding child requests for a parent request. + * Only valid if a request was split into multiple children + * requests, and is not initialized for non-split requests. + */ + TAILQ_HEAD(, nvme_request) children; + + /** + * Linked-list pointers for a child request in its parent's list. + */ + TAILQ_ENTRY(nvme_request) child_tailq; + + /** + * Points to a parent request if part of a split request, + * NULL otherwise. + */ + struct nvme_request *parent; + + /** + * Completion status for a parent request. Initialized to all 0's + * (SUCCESS) before child requests are submitted. If a child + * request completes with error, the error status is copied here, + * to ensure that the parent request is also completed with error + * status once all child requests are completed. + */ + struct spdk_nvme_cpl parent_status; + + /** + * The user_cb_fn and user_cb_arg fields are used for holding the original + * callback data when using nvme_allocate_request_user_copy. + */ + spdk_nvme_cmd_cb user_cb_fn; + void *user_cb_arg; + void *user_buffer; +}; + +struct nvme_completion_poll_status { + struct spdk_nvme_cpl cpl; + bool done; + /* This flag indicates that the request has been timed out and the memory + must be freed in a completion callback */ + bool timed_out; +}; + +struct nvme_async_event_request { + struct spdk_nvme_ctrlr *ctrlr; + struct nvme_request *req; + struct spdk_nvme_cpl cpl; +}; + +enum nvme_qpair_state { + NVME_QPAIR_DISCONNECTED, + NVME_QPAIR_DISCONNECTING, + NVME_QPAIR_CONNECTING, + NVME_QPAIR_CONNECTED, + NVME_QPAIR_ENABLING, + NVME_QPAIR_ENABLED, + NVME_QPAIR_DESTROYING, +}; + +struct spdk_nvme_qpair { + struct spdk_nvme_ctrlr *ctrlr; + + uint16_t id; + + uint8_t qprio; + + uint8_t state : 3; + + /* + * Members for handling IO qpair deletion inside of a completion context. + * These are specifically defined as single bits, so that they do not + * push this data structure out to another cacheline. + */ + uint8_t in_completion_context : 1; + uint8_t delete_after_completion_context: 1; + + /* + * Set when no deletion notification is needed. For example, the process + * which allocated this qpair exited unexpectedly. 
+ */ + uint8_t no_deletion_notification_needed: 1; + + uint8_t first_fused_submitted: 1; + + enum spdk_nvme_transport_type trtype; + + STAILQ_HEAD(, nvme_request) free_req; + STAILQ_HEAD(, nvme_request) queued_req; + STAILQ_HEAD(, nvme_request) aborting_queued_req; + + /* List entry for spdk_nvme_transport_poll_group::qpairs */ + STAILQ_ENTRY(spdk_nvme_qpair) poll_group_stailq; + + /** Commands opcode in this list will return error */ + TAILQ_HEAD(, nvme_error_cmd) err_cmd_head; + /** Requests in this list will return error */ + STAILQ_HEAD(, nvme_request) err_req_head; + + /* List entry for spdk_nvme_ctrlr::active_io_qpairs */ + TAILQ_ENTRY(spdk_nvme_qpair) tailq; + + /* List entry for spdk_nvme_ctrlr_process::allocated_io_qpairs */ + TAILQ_ENTRY(spdk_nvme_qpair) per_process_tailq; + + struct spdk_nvme_ctrlr_process *active_proc; + + struct spdk_nvme_transport_poll_group *poll_group; + + void *poll_group_tailq_head; + + void *req_buf; + + const struct spdk_nvme_transport *transport; + + uint8_t transport_failure_reason: 2; +}; + +struct spdk_nvme_poll_group { + void *ctx; + STAILQ_HEAD(, spdk_nvme_transport_poll_group) tgroups; +}; + +struct spdk_nvme_transport_poll_group { + struct spdk_nvme_poll_group *group; + const struct spdk_nvme_transport *transport; + STAILQ_HEAD(, spdk_nvme_qpair) connected_qpairs; + STAILQ_HEAD(, spdk_nvme_qpair) disconnected_qpairs; + STAILQ_ENTRY(spdk_nvme_transport_poll_group) link; + bool in_completion_context; + uint64_t num_qpairs_to_delete; +}; + +struct spdk_nvme_ns { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t sector_size; + + /* + * Size of data transferred as part of each block, + * including metadata if FLBAS indicates the metadata is transferred + * as part of the data buffer at the end of each LBA. + */ + uint32_t extended_lba_size; + + uint32_t md_size; + uint32_t pi_type; + uint32_t sectors_per_max_io; + uint32_t sectors_per_stripe; + uint32_t id; + uint16_t flags; + + /* Namespace Identification Descriptor List (CNS = 03h) */ + uint8_t id_desc_list[4096]; +}; + +/** + * State of struct spdk_nvme_ctrlr (in particular, during initialization). + */ +enum nvme_ctrlr_state { + /** + * Wait before initializing the controller. + */ + NVME_CTRLR_STATE_INIT_DELAY, + + /** + * Controller has not been initialized yet. + */ + NVME_CTRLR_STATE_INIT, + + /** + * Waiting for CSTS.RDY to transition from 0 to 1 so that CC.EN may be set to 0. + */ + NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1, + + /** + * Waiting for CSTS.RDY to transition from 1 to 0 so that CC.EN may be set to 1. + */ + NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, + + /** + * Enable the controller by writing CC.EN to 1 + */ + NVME_CTRLR_STATE_ENABLE, + + /** + * Waiting for CSTS.RDY to transition from 0 to 1 after enabling the controller. + */ + NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1, + + /** + * Reset the Admin queue of the controller. + */ + NVME_CTRLR_STATE_RESET_ADMIN_QUEUE, + + /** + * Identify Controller command will be sent to then controller. + */ + NVME_CTRLR_STATE_IDENTIFY, + + /** + * Waiting for Identify Controller command be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY, + + /** + * Set Number of Queues of the controller. + */ + NVME_CTRLR_STATE_SET_NUM_QUEUES, + + /** + * Waiting for Set Num of Queues command to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES, + + /** + * Construct Namespace data structures of the controller. + */ + NVME_CTRLR_STATE_CONSTRUCT_NS, + + /** + * Get active Namespace list of the controller. 
+ */ + NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS, + + /** + * Waiting for the Identify Active Namespace commands to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS, + + /** + * Get Identify Namespace Data structure for each NS. + */ + NVME_CTRLR_STATE_IDENTIFY_NS, + + /** + * Waiting for the Identify Namespace commands to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS, + + /** + * Get Identify Namespace Identification Descriptors. + */ + NVME_CTRLR_STATE_IDENTIFY_ID_DESCS, + + /** + * Waiting for the Identify Namespace Identification + * Descriptors to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS, + + /** + * Configure AER of the controller. + */ + NVME_CTRLR_STATE_CONFIGURE_AER, + + /** + * Waiting for the Configure AER to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER, + + /** + * Set supported log pages of the controller. + */ + NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, + + /** + * Set supported features of the controller. + */ + NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES, + + /** + * Set Doorbell Buffer Config of the controller. + */ + NVME_CTRLR_STATE_SET_DB_BUF_CFG, + + /** + * Waiting for Doorbell Buffer Config to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG, + + /** + * Set Keep Alive Timeout of the controller. + */ + NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + + /** + * Waiting for Set Keep Alive Timeout to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT, + + /** + * Set Host ID of the controller. + */ + NVME_CTRLR_STATE_SET_HOST_ID, + + /** + * Waiting for Set Host ID to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_HOST_ID, + + /** + * Controller initialization has completed and the controller is ready. + */ + NVME_CTRLR_STATE_READY, + + /** + * Controller inilialization has an error. + */ + NVME_CTRLR_STATE_ERROR +}; + +#define NVME_TIMEOUT_INFINITE 0 + +/* + * Used to track properties for all processes accessing the controller. + */ +struct spdk_nvme_ctrlr_process { + /** Whether it is the primary process */ + bool is_primary; + + /** Process ID */ + pid_t pid; + + /** Active admin requests to be completed */ + STAILQ_HEAD(, nvme_request) active_reqs; + + TAILQ_ENTRY(spdk_nvme_ctrlr_process) tailq; + + /** Per process PCI device handle */ + struct spdk_pci_device *devhandle; + + /** Reference to track the number of attachment to this controller. */ + int ref; + + /** Allocated IO qpairs */ + TAILQ_HEAD(, spdk_nvme_qpair) allocated_io_qpairs; + + spdk_nvme_aer_cb aer_cb_fn; + void *aer_cb_arg; + + /** + * A function pointer to timeout callback function + */ + spdk_nvme_timeout_cb timeout_cb_fn; + void *timeout_cb_arg; + uint64_t timeout_ticks; +}; + +/* + * One of these per allocated PCI device. + */ +struct spdk_nvme_ctrlr { + /* Hot data (accessed in I/O path) starts here. */ + + /** Array of namespaces indexed by nsid - 1 */ + struct spdk_nvme_ns *ns; + + uint32_t num_ns; + + bool is_removed; + + bool is_resetting; + + bool is_failed; + + bool is_destructed; + + bool timeout_enabled; + + uint16_t max_sges; + + uint16_t cntlid; + + /** Controller support flags */ + uint64_t flags; + + /** NVMEoF in-capsule data size in bytes */ + uint32_t ioccsz_bytes; + + /** NVMEoF in-capsule data offset in 16 byte units */ + uint16_t icdoff; + + /* Cold data (not accessed in normal I/O path) is after this point. 
*/ + + struct spdk_nvme_transport_id trid; + + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + + enum nvme_ctrlr_state state; + uint64_t state_timeout_tsc; + + uint64_t next_keep_alive_tick; + uint64_t keep_alive_interval_ticks; + + TAILQ_ENTRY(spdk_nvme_ctrlr) tailq; + + /** All the log pages supported */ + bool log_page_supported[256]; + + /** All the features supported */ + bool feature_supported[256]; + + /** maximum i/o size in bytes */ + uint32_t max_xfer_size; + + /** minimum page size supported by this controller in bytes */ + uint32_t min_page_size; + + /** selected memory page size for this controller in bytes */ + uint32_t page_size; + + uint32_t num_aers; + struct nvme_async_event_request aer[NVME_MAX_ASYNC_EVENTS]; + + /** guards access to the controller itself, including admin queues */ + pthread_mutex_t ctrlr_lock; + + struct spdk_nvme_qpair *adminq; + + /** shadow doorbell buffer */ + uint32_t *shadow_doorbell; + /** eventidx buffer */ + uint32_t *eventidx; + + /** + * Identify Controller data. + */ + struct spdk_nvme_ctrlr_data cdata; + + /** + * Keep track of active namespaces + */ + uint32_t *active_ns_list; + + /** + * Array of Identify Namespace data. + * + * Stored separately from ns since nsdata should not normally be accessed during I/O. + */ + struct spdk_nvme_ns_data *nsdata; + + struct spdk_bit_array *free_io_qids; + TAILQ_HEAD(, spdk_nvme_qpair) active_io_qpairs; + + struct spdk_nvme_ctrlr_opts opts; + + uint64_t quirks; + + /* Extra sleep time during controller initialization */ + uint64_t sleep_timeout_tsc; + + /** Track all the processes manage this controller */ + TAILQ_HEAD(, spdk_nvme_ctrlr_process) active_procs; + + + STAILQ_HEAD(, nvme_request) queued_aborts; + uint32_t outstanding_aborts; + + /* CB to notify the user when the ctrlr is removed/failed. */ + spdk_nvme_remove_cb remove_cb; + void *cb_ctx; + + struct spdk_nvme_qpair *external_io_msgs_qpair; + pthread_mutex_t external_io_msgs_lock; + struct spdk_ring *external_io_msgs; + + STAILQ_HEAD(, nvme_io_msg_producer) io_producers; +}; + +struct spdk_nvme_probe_ctx { + struct spdk_nvme_transport_id trid; + void *cb_ctx; + spdk_nvme_probe_cb probe_cb; + spdk_nvme_attach_cb attach_cb; + spdk_nvme_remove_cb remove_cb; + TAILQ_HEAD(, spdk_nvme_ctrlr) init_ctrlrs; +}; + +struct nvme_driver { + pthread_mutex_t lock; + + /** Multi-process shared attached controller list */ + TAILQ_HEAD(, spdk_nvme_ctrlr) shared_attached_ctrlrs; + + bool initialized; + struct spdk_uuid default_extended_host_id; + + /** netlink socket fd for hotplug messages */ + int hotplug_fd; +}; + +extern struct nvme_driver *g_spdk_nvme_driver; + +int nvme_driver_init(void); + +#define nvme_delay usleep + +static inline bool +nvme_qpair_is_admin_queue(struct spdk_nvme_qpair *qpair) +{ + return qpair->id == 0; +} + +static inline bool +nvme_qpair_is_io_queue(struct spdk_nvme_qpair *qpair) +{ + return qpair->id != 0; +} + +static inline int +nvme_robust_mutex_lock(pthread_mutex_t *mtx) +{ + int rc = pthread_mutex_lock(mtx); + +#ifndef __FreeBSD__ + if (rc == EOWNERDEAD) { + rc = pthread_mutex_consistent(mtx); + } +#endif + + return rc; +} + +static inline int +nvme_robust_mutex_unlock(pthread_mutex_t *mtx) +{ + return pthread_mutex_unlock(mtx); +} + +/* Poll group management functions. 
*/ +int nvme_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair); +int nvme_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair); + +/* Admin functions */ +int nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, + uint8_t cns, uint16_t cntid, uint32_t nsid, + void *payload, size_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr, + uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, + void *cb_arg); +int nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr, + union spdk_nvme_feat_async_event_configuration config, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, + uint64_t prp1, uint64_t prp2, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn, + void *cb_arg); +int nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_format *format, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_fw_commit *fw_commit, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr, + uint32_t size, uint32_t offset, void *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_sanitize(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_sanitize *sanitize, uint32_t cdw11, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +void nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl); +int nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status); +int nvme_wait_for_completion_robust_lock(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + pthread_mutex_t *robust_mutex); +int nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + uint64_t timeout_in_secs); + +struct spdk_nvme_ctrlr_process *nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, + pid_t pid); +struct spdk_nvme_ctrlr_process *nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle); +void nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr); +struct spdk_pci_device *nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr); + +int nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_probe_ctx *probe_ctx, void *devhandle); + +int nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove); +int nvme_ctrlr_reset(struct 
spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_connected(struct spdk_nvme_probe_ctx *probe_ctx, + struct spdk_nvme_ctrlr *ctrlr); + +int nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req); +int nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap); +int nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs); +int nvme_ctrlr_get_cmbsz(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cmbsz_register *cmbsz); +void nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap, + const union spdk_nvme_vs_register *vs); +void nvme_ctrlr_disconnect_qpair(struct spdk_nvme_qpair *qpair); +int nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id, + struct spdk_nvme_ctrlr *ctrlr, + enum spdk_nvme_qprio qprio, + uint32_t num_requests); +void nvme_qpair_deinit(struct spdk_nvme_qpair *qpair); +void nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair); +int nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req); +void nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); +uint32_t nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg); +void nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests); + +int nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ns_set_identify_data(struct spdk_nvme_ns *ns); +int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, + struct spdk_nvme_ctrlr *ctrlr); +void nvme_ns_destruct(struct spdk_nvme_ns *ns); +int nvme_ns_update(struct spdk_nvme_ns *ns); + +int nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value); +int nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value); +int nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value); +int nvme_fabric_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, bool direct_connect); +int nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value); +int nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_probe_ctx *probe_ctx); +int nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries); + +static inline struct nvme_request * +nvme_allocate_request(struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_size, uint32_t md_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + + req = STAILQ_FIRST(&qpair->free_req); + if (req == NULL) { + return req; + } + + STAILQ_REMOVE_HEAD(&qpair->free_req, stailq); + + /* + * Only memset/zero fields that need it. All other fields + * will be initialized appropriately either later in this + * function, or before they are needed later in the + * submission patch. For example, the children + * TAILQ_ENTRY and following members are + * only used as part of I/O splitting so we avoid + * memsetting them until it is actually needed. + * They will be initialized in nvme_request_add_child() + * if the request is split. 
+ */ + memset(req, 0, offsetof(struct nvme_request, payload_size)); + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->payload = *payload; + req->payload_size = payload_size; + req->md_size = md_size; + req->pid = g_spdk_nvme_pid; + req->submit_tick = 0; + + return req; +} + +static inline struct nvme_request * +nvme_allocate_request_contig(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + return nvme_allocate_request(qpair, &payload, payload_size, 0, cb_fn, cb_arg); +} + +static inline struct nvme_request * +nvme_allocate_request_null(struct spdk_nvme_qpair *qpair, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return nvme_allocate_request_contig(qpair, NULL, 0, cb_fn, cb_arg); +} + +struct nvme_request *nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, bool host_to_controller); + +static inline void +nvme_complete_request(spdk_nvme_cmd_cb cb_fn, void *cb_arg, struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_cpl err_cpl; + struct nvme_error_cmd *cmd; + + /* error injection at completion path, + * only inject for successful completed commands + */ + if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head) && + !spdk_nvme_cpl_is_error(cpl))) { + TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) { + + if (cmd->do_not_submit) { + continue; + } + + if ((cmd->opc == req->cmd.opc) && cmd->err_count) { + + err_cpl = *cpl; + err_cpl.status.sct = cmd->status.sct; + err_cpl.status.sc = cmd->status.sc; + + cpl = &err_cpl; + cmd->err_count--; + break; + } + } + } + + if (cb_fn) { + cb_fn(cb_arg, cpl); + } +} + +static inline void +nvme_free_request(struct nvme_request *req) +{ + assert(req != NULL); + assert(req->num_children == 0); + assert(req->qpair != NULL); + + STAILQ_INSERT_HEAD(&req->qpair->free_req, req, stailq); +} + +static inline void +nvme_qpair_set_state(struct spdk_nvme_qpair *qpair, enum nvme_qpair_state state) +{ + qpair->state = state; +} + +static inline enum nvme_qpair_state +nvme_qpair_get_state(struct spdk_nvme_qpair *qpair) { + return qpair->state; +} + +static inline void +nvme_qpair_free_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + assert(req != NULL); + assert(req->num_children == 0); + + STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq); +} + +static inline void +nvme_request_remove_child(struct nvme_request *parent, struct nvme_request *child) +{ + assert(parent != NULL); + assert(child != NULL); + assert(child->parent == parent); + assert(parent->num_children != 0); + + parent->num_children--; + child->parent = NULL; + TAILQ_REMOVE(&parent->children, child, child_tailq); +} + +static inline void +nvme_cb_complete_child(void *child_arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *child = child_arg; + struct nvme_request *parent = child->parent; + + nvme_request_remove_child(parent, child); + + if (spdk_nvme_cpl_is_error(cpl)) { + memcpy(&parent->parent_status, cpl, sizeof(*cpl)); + } + + if (parent->num_children == 0) { + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } +} + +static inline void +nvme_request_add_child(struct nvme_request *parent, struct nvme_request *child) +{ + assert(parent->num_children != UINT16_MAX); + + if (parent->num_children == 0) { + /* + 
* Defer initialization of the children TAILQ since it falls + * on a separate cacheline. This ensures we do not touch this + * cacheline except on request splitting cases, which are + * relatively rare. + */ + TAILQ_INIT(&parent->children); + parent->parent = NULL; + memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl)); + } + + parent->num_children++; + TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); + child->parent = parent; + child->cb_fn = nvme_cb_complete_child; + child->cb_arg = child; +} + +static inline void +nvme_request_free_children(struct nvme_request *req) +{ + struct nvme_request *child, *tmp; + + if (req->num_children == 0) { + return; + } + + /* free all child nvme_request */ + TAILQ_FOREACH_SAFE(child, &req->children, child_tailq, tmp) { + nvme_request_remove_child(req, child); + nvme_request_free_children(child); + nvme_free_request(child); + } +} + +int nvme_request_check_timeout(struct nvme_request *req, uint16_t cid, + struct spdk_nvme_ctrlr_process *active_proc, uint64_t now_tick); +uint64_t nvme_get_quirks(const struct spdk_pci_id *id); + +int nvme_robust_mutex_init_shared(pthread_mutex_t *mtx); +int nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx); + +bool nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl); + +struct spdk_nvme_ctrlr *nvme_get_ctrlr_by_trid_unsafe( + const struct spdk_nvme_transport_id *trid); + +const struct spdk_nvme_transport *nvme_get_transport(const char *transport_name); +const struct spdk_nvme_transport *nvme_get_first_transport(void); +const struct spdk_nvme_transport *nvme_get_next_transport(const struct spdk_nvme_transport + *transport); + +/* Transport specific functions */ +struct spdk_nvme_ctrlr *nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle); +int nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); +int nvme_transport_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, bool direct_connect); +int nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr); +int nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value); +int nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value); +int nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value); +int nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value); +uint32_t nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr); +uint16_t nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr); +struct spdk_nvme_qpair *nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, const struct spdk_nvme_io_qpair_opts *opts); +int nvme_transport_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr); +void *nvme_transport_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size); +int nvme_transport_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr); +int nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); +int nvme_transport_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); +void nvme_transport_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); +void nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); +int nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair); +int nvme_transport_qpair_submit_request(struct spdk_nvme_qpair 
*qpair, struct nvme_request *req); +int32_t nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, + uint32_t max_completions); +void nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair); +int nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg); + +struct spdk_nvme_transport_poll_group *nvme_transport_poll_group_create( + const struct spdk_nvme_transport *transport); +int nvme_transport_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair); +int nvme_transport_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair); +int nvme_transport_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair); +int nvme_transport_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair); +int64_t nvme_transport_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb); +int nvme_transport_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup); +/* + * Below ref related functions must be called with the global + * driver lock held for the multi-process condition. + * Within these functions, the per ctrlr ctrlr_lock is also + * acquired for the multi-thread condition. + */ +void nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr); + +static inline bool +_is_page_aligned(uint64_t address, uint64_t page_size) +{ + return (address & (page_size - 1)) == 0; +} + +#endif /* __NVME_INTERNAL_H__ */ diff --git a/src/spdk/lib/nvme/nvme_io_msg.c b/src/spdk/lib/nvme/nvme_io_msg.c new file mode 100644 index 000000000..fb5aec3d4 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_io_msg.c @@ -0,0 +1,216 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
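The nvme_transport_poll_group_*() declarations above are the per-transport half of the poll-group machinery. A hedged sketch of the caller-facing flow follows, assuming the public spdk_nvme_poll_group_*() wrappers from spdk/nvme.h and an I/O qpair allocated with opts.create_only set so it can be added to a group before being connected; teardown is omitted and error paths are abbreviated.

#include "spdk/nvme.h"

static void
on_disconnected_qpair(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	/* Decide here whether to reconnect or free the disconnected qpair. */
}

static int
poll_group_sketch(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_poll_group *group;
	int rc;

	group = spdk_nvme_poll_group_create(NULL);
	if (group == NULL) {
		return -ENOMEM;
	}

	rc = spdk_nvme_poll_group_add(group, qpair);
	if (rc != 0) {
		return rc;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
	if (rc != 0) {
		return rc;
	}

	/* Usually run from a poller; 0 asks each qpair to drain all completions. */
	spdk_nvme_poll_group_process_completions(group, 0, on_disconnected_qpair);
	return 0;
}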
+ */ + +#include "nvme_internal.h" +#include "nvme_io_msg.h" + +#define SPDK_NVME_MSG_IO_PROCESS_SIZE 8 + +/** + * Send message to IO queue. + */ +int +nvme_io_msg_send(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_io_msg_fn fn, + void *arg) +{ + int rc; + struct spdk_nvme_io_msg *io; + + /* Protect requests ring against preemptive producers */ + pthread_mutex_lock(&ctrlr->external_io_msgs_lock); + + io = (struct spdk_nvme_io_msg *)calloc(1, sizeof(struct spdk_nvme_io_msg)); + if (!io) { + SPDK_ERRLOG("IO msg allocation failed."); + pthread_mutex_unlock(&ctrlr->external_io_msgs_lock); + return -ENOMEM; + } + + io->ctrlr = ctrlr; + io->nsid = nsid; + io->fn = fn; + io->arg = arg; + + rc = spdk_ring_enqueue(ctrlr->external_io_msgs, (void **)&io, 1, NULL); + if (rc != 1) { + assert(false); + free(io); + pthread_mutex_unlock(&ctrlr->external_io_msgs_lock); + return -ENOMEM; + } + + pthread_mutex_unlock(&ctrlr->external_io_msgs_lock); + + return 0; +} + +int +nvme_io_msg_process(struct spdk_nvme_ctrlr *ctrlr) +{ + int i; + int count; + struct spdk_nvme_io_msg *io; + void *requests[SPDK_NVME_MSG_IO_PROCESS_SIZE]; + + if (!ctrlr->external_io_msgs || !ctrlr->external_io_msgs_qpair) { + /* Not ready or pending reset */ + return 0; + } + + spdk_nvme_qpair_process_completions(ctrlr->external_io_msgs_qpair, 0); + + count = spdk_ring_dequeue(ctrlr->external_io_msgs, requests, + SPDK_NVME_MSG_IO_PROCESS_SIZE); + if (count == 0) { + return 0; + } + + for (i = 0; i < count; i++) { + io = requests[i]; + + assert(io != NULL); + + io->fn(io->ctrlr, io->nsid, io->arg); + free(io); + } + + return count; +} + +static bool +nvme_io_msg_is_producer_registered(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer) +{ + struct nvme_io_msg_producer *tmp; + + STAILQ_FOREACH(tmp, &ctrlr->io_producers, link) { + if (tmp == io_msg_producer) { + return true; + } + } + return false; +} + +int +nvme_io_msg_ctrlr_register(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer) +{ + if (io_msg_producer == NULL) { + SPDK_ERRLOG("io_msg_producer cannot be NULL\n"); + return -EINVAL; + } + + if (nvme_io_msg_is_producer_registered(ctrlr, io_msg_producer)) { + return -EEXIST; + } + + if (!STAILQ_EMPTY(&ctrlr->io_producers) || ctrlr->is_resetting) { + /* There are registered producers - IO messaging already started */ + STAILQ_INSERT_TAIL(&ctrlr->io_producers, io_msg_producer, link); + return 0; + } + + pthread_mutex_init(&ctrlr->external_io_msgs_lock, NULL); + + /** + * Initialize ring and qpair for controller + */ + ctrlr->external_io_msgs = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY); + if (!ctrlr->external_io_msgs) { + SPDK_ERRLOG("Unable to allocate memory for message ring\n"); + return -ENOMEM; + } + + ctrlr->external_io_msgs_qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0); + if (ctrlr->external_io_msgs_qpair == NULL) { + SPDK_ERRLOG("spdk_nvme_ctrlr_alloc_io_qpair() failed\n"); + spdk_ring_free(ctrlr->external_io_msgs); + ctrlr->external_io_msgs = NULL; + return -ENOMEM; + } + + STAILQ_INSERT_TAIL(&ctrlr->io_producers, io_msg_producer, link); + + return 0; +} + +void +nvme_io_msg_ctrlr_update(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_io_msg_producer *io_msg_producer; + + /* Update all producers */ + STAILQ_FOREACH(io_msg_producer, &ctrlr->io_producers, link) { + io_msg_producer->update(ctrlr); + } +} + +void +nvme_io_msg_ctrlr_detach(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_io_msg_producer *io_msg_producer, *tmp; + + /* 
Stop all producers */ + STAILQ_FOREACH_SAFE(io_msg_producer, &ctrlr->io_producers, link, tmp) { + io_msg_producer->stop(ctrlr); + STAILQ_REMOVE(&ctrlr->io_producers, io_msg_producer, nvme_io_msg_producer, link); + } + + if (ctrlr->external_io_msgs) { + spdk_ring_free(ctrlr->external_io_msgs); + ctrlr->external_io_msgs = NULL; + } + + if (ctrlr->external_io_msgs_qpair) { + spdk_nvme_ctrlr_free_io_qpair(ctrlr->external_io_msgs_qpair); + ctrlr->external_io_msgs_qpair = NULL; + } + + pthread_mutex_destroy(&ctrlr->external_io_msgs_lock); +} + +void +nvme_io_msg_ctrlr_unregister(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer) +{ + assert(io_msg_producer != NULL); + + if (!nvme_io_msg_is_producer_registered(ctrlr, io_msg_producer)) { + return; + } + + STAILQ_REMOVE(&ctrlr->io_producers, io_msg_producer, nvme_io_msg_producer, link); + if (STAILQ_EMPTY(&ctrlr->io_producers)) { + nvme_io_msg_ctrlr_detach(ctrlr); + } +} diff --git a/src/spdk/lib/nvme/nvme_io_msg.h b/src/spdk/lib/nvme/nvme_io_msg.h new file mode 100644 index 000000000..9c18261d5 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_io_msg.h @@ -0,0 +1,90 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * SPDK cuse + */ + + +#ifndef SPDK_NVME_IO_MSG_H_ +#define SPDK_NVME_IO_MSG_H_ + +typedef void (*spdk_nvme_io_msg_fn)(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + void *arg); + +struct spdk_nvme_io_msg { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t nsid; + + spdk_nvme_io_msg_fn fn; + void *arg; +}; + +struct nvme_io_msg_producer { + const char *name; + void (*update)(struct spdk_nvme_ctrlr *ctrlr); + void (*stop)(struct spdk_nvme_ctrlr *ctrlr); + STAILQ_ENTRY(nvme_io_msg_producer) link; +}; + +int nvme_io_msg_send(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_io_msg_fn fn, + void *arg); + +/** + * Process IO message sent to controller from external module. 
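+ *
+ * Editor's illustration (not part of the upstream header): a typical external
+ * producer registers itself once and then arranges for a single thread to
+ * poll the controller. Only nvme_io_msg_ctrlr_register(), nvme_io_msg_send()
+ * and nvme_io_msg_process() are real interfaces from this file; every name
+ * prefixed with my_, and the polling loop itself, are hypothetical.
+ *
+ *   static void my_update(struct spdk_nvme_ctrlr *ctrlr) { }
+ *   static void my_stop(struct spdk_nvme_ctrlr *ctrlr) { }
+ *
+ *   static struct nvme_io_msg_producer g_my_producer = {
+ *           .name = "my_producer",
+ *           .update = my_update,
+ *           .stop = my_stop,
+ *   };
+ *
+ *   // The first registration on a controller creates the message ring and a
+ *   // dedicated I/O qpair (see nvme_io_msg_ctrlr_register() in nvme_io_msg.c).
+ *   rc = nvme_io_msg_ctrlr_register(ctrlr, &g_my_producer);
+ *
+ *   // From any thread: queue work against namespace 1.
+ *   rc = nvme_io_msg_send(ctrlr, 1, my_io_fn, my_arg);
+ *
+ *   // From the thread that owns the controller: drain and execute messages.
+ *   while (my_keep_running) {
+ *           nvme_io_msg_process(ctrlr);
+ *   }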
+ *
+ * This call processes requests from the ring, submitting I/O to the
+ * internally allocated qpair or executing admin commands in its own context.
+ * It is non-blocking and is intended to be polled by an SPDK thread so that
+ * NVMe requests sent to the controller by an external module complete in a
+ * safe environment.
+ *
+ * The caller must ensure that each controller is polled by only one thread at
+ * a time.
+ *
+ * This function may be called at any point while the controller is attached to
+ * the SPDK NVMe driver.
+ *
+ * \param ctrlr Opaque handle to NVMe controller.
+ *
+ * \return number of processed external IO messages.
+ */
+int nvme_io_msg_process(struct spdk_nvme_ctrlr *ctrlr);
+
+int nvme_io_msg_ctrlr_register(struct spdk_nvme_ctrlr *ctrlr,
+		struct nvme_io_msg_producer *io_msg_producer);
+void nvme_io_msg_ctrlr_unregister(struct spdk_nvme_ctrlr *ctrlr,
+		struct nvme_io_msg_producer *io_msg_producer);
+void nvme_io_msg_ctrlr_detach(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_io_msg_ctrlr_update(struct spdk_nvme_ctrlr *ctrlr);
+
+#endif /* SPDK_NVME_IO_MSG_H_ */ diff --git a/src/spdk/lib/nvme/nvme_ns.c b/src/spdk/lib/nvme/nvme_ns.c new file mode 100644 index 000000000..5d424e5c7 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns.c @@ -0,0 +1,401 @@ +/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+static inline struct spdk_nvme_ns_data *
+_nvme_ns_get_data(struct spdk_nvme_ns *ns)
+{
+	return &ns->ctrlr->nsdata[ns->id - 1];
+}
+
+/**
+ * Update Namespace flags based on Identify Controller
+ * and Identify Namespace. It can also be used for
+ * Namespace Attribute Notice events and Namespace
+ * operations such as Attach/Detach.
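+ *
+ * Editor's worked example (hypothetical format values): with lbads = 12 the
+ * sector size is 1 << 12 = 4096 bytes. If that format also carries 8 bytes
+ * of metadata and flbas.extended is set, the extended LBA size becomes
+ * 4096 + 8 = 4104 bytes, so a controller with a 128 KiB maximum transfer
+ * size ends up with sectors_per_max_io = 131072 / 4104 = 31, not 32.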
+ */ +void +nvme_ns_set_identify_data(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ns_data *nsdata; + + nsdata = _nvme_ns_get_data(ns); + + ns->flags = 0x0000; + + ns->sector_size = 1 << nsdata->lbaf[nsdata->flbas.format].lbads; + ns->extended_lba_size = ns->sector_size; + + ns->md_size = nsdata->lbaf[nsdata->flbas.format].ms; + if (nsdata->flbas.extended) { + ns->flags |= SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED; + ns->extended_lba_size += ns->md_size; + } + + ns->sectors_per_max_io = spdk_nvme_ns_get_max_io_xfer_size(ns) / ns->extended_lba_size; + + if (nsdata->noiob) { + ns->sectors_per_stripe = nsdata->noiob; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u optimal IO boundary %" PRIu32 " blocks\n", + ns->id, ns->sectors_per_stripe); + } else if (ns->ctrlr->quirks & NVME_INTEL_QUIRK_STRIPING && + ns->ctrlr->cdata.vs[3] != 0) { + ns->sectors_per_stripe = (1ULL << ns->ctrlr->cdata.vs[3]) * ns->ctrlr->min_page_size / + ns->sector_size; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u stripe size quirk %" PRIu32 " blocks\n", + ns->id, ns->sectors_per_stripe); + } else { + ns->sectors_per_stripe = 0; + } + + if (ns->ctrlr->cdata.oncs.dsm) { + ns->flags |= SPDK_NVME_NS_DEALLOCATE_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.compare) { + ns->flags |= SPDK_NVME_NS_COMPARE_SUPPORTED; + } + + if (ns->ctrlr->cdata.vwc.present) { + ns->flags |= SPDK_NVME_NS_FLUSH_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.write_zeroes) { + ns->flags |= SPDK_NVME_NS_WRITE_ZEROES_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.write_unc) { + ns->flags |= SPDK_NVME_NS_WRITE_UNCORRECTABLE_SUPPORTED; + } + + if (nsdata->nsrescap.raw) { + ns->flags |= SPDK_NVME_NS_RESERVATION_SUPPORTED; + } + + ns->pi_type = SPDK_NVME_FMT_NVM_PROTECTION_DISABLE; + if (nsdata->lbaf[nsdata->flbas.format].ms && nsdata->dps.pit) { + ns->flags |= SPDK_NVME_NS_DPS_PI_SUPPORTED; + ns->pi_type = nsdata->dps.pit; + } +} + +static int +nvme_ctrlr_identify_ns(struct spdk_nvme_ns *ns) +{ + struct nvme_completion_poll_status *status; + struct spdk_nvme_ns_data *nsdata; + int rc; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + nsdata = _nvme_ns_get_data(ns); + rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id, + nsdata, sizeof(*nsdata), + nvme_completion_poll_cb, status); + if (rc != 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, status, + &ns->ctrlr->ctrlr_lock)) { + if (!status->timed_out) { + free(status); + } + /* This can occur if the namespace is not active. Simply zero the + * namespace data and continue. 
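+	 *
+	 * Editor's note: once the data is zeroed, spdk_nvme_ns_is_active() reports
+	 * the namespace as inactive because NCAP reads back as 0, so callers that
+	 * walk the namespace list simply skip it. Illustrative sketch only (the
+	 * accessors are the public SPDK API; the loop and total_sectors are
+	 * hypothetical):
+	 *
+	 *   for (uint32_t nsid = 1; nsid <= spdk_nvme_ctrlr_get_num_ns(ctrlr); nsid++) {
+	 *           struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+	 *
+	 *           if (ns == NULL || !spdk_nvme_ns_is_active(ns)) {
+	 *                   continue;   // zeroed by the error path described above
+	 *           }
+	 *           total_sectors += spdk_nvme_ns_get_num_sectors(ns);
+	 *   }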
*/ + nvme_ns_destruct(ns); + return 0; + } + free(status); + + nvme_ns_set_identify_data(ns); + + return 0; +} + +static int +nvme_ctrlr_identify_id_desc(struct spdk_nvme_ns *ns) +{ + struct nvme_completion_poll_status *status; + int rc; + + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + + if (ns->ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + (ns->ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); + return 0; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Attempting to retrieve NS ID Descriptor List\n"); + rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, 0, ns->id, + ns->id_desc_list, sizeof(ns->id_desc_list), + nvme_completion_poll_cb, status); + if (rc < 0) { + free(status); + return rc; + } + + rc = nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, status, &ns->ctrlr->ctrlr_lock); + if (rc != 0) { + SPDK_WARNLOG("Failed to retrieve NS ID Descriptor List\n"); + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + } + + if (!status->timed_out) { + free(status); + } + + return rc; +} + +uint32_t +spdk_nvme_ns_get_id(struct spdk_nvme_ns *ns) +{ + return ns->id; +} + +bool +spdk_nvme_ns_is_active(struct spdk_nvme_ns *ns) +{ + const struct spdk_nvme_ns_data *nsdata = NULL; + + /* + * According to the spec, valid NS has non-zero id. + */ + if (ns->id == 0) { + return false; + } + + nsdata = _nvme_ns_get_data(ns); + + /* + * According to the spec, Identify Namespace will return a zero-filled structure for + * inactive namespace IDs. + * Check NCAP since it must be nonzero for an active namespace. + */ + return nsdata->ncap != 0; +} + +struct spdk_nvme_ctrlr * +spdk_nvme_ns_get_ctrlr(struct spdk_nvme_ns *ns) +{ + return ns->ctrlr; +} + +uint32_t +spdk_nvme_ns_get_max_io_xfer_size(struct spdk_nvme_ns *ns) +{ + return ns->ctrlr->max_xfer_size; +} + +uint32_t +spdk_nvme_ns_get_sector_size(struct spdk_nvme_ns *ns) +{ + return ns->sector_size; +} + +uint32_t +spdk_nvme_ns_get_extended_sector_size(struct spdk_nvme_ns *ns) +{ + return ns->extended_lba_size; +} + +uint64_t +spdk_nvme_ns_get_num_sectors(struct spdk_nvme_ns *ns) +{ + return _nvme_ns_get_data(ns)->nsze; +} + +uint64_t +spdk_nvme_ns_get_size(struct spdk_nvme_ns *ns) +{ + return spdk_nvme_ns_get_num_sectors(ns) * spdk_nvme_ns_get_sector_size(ns); +} + +uint32_t +spdk_nvme_ns_get_flags(struct spdk_nvme_ns *ns) +{ + return ns->flags; +} + +enum spdk_nvme_pi_type +spdk_nvme_ns_get_pi_type(struct spdk_nvme_ns *ns) { + return ns->pi_type; +} + +bool +spdk_nvme_ns_supports_extended_lba(struct spdk_nvme_ns *ns) +{ + return (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) ? true : false; +} + +bool +spdk_nvme_ns_supports_compare(struct spdk_nvme_ns *ns) +{ + return (ns->flags & SPDK_NVME_NS_COMPARE_SUPPORTED) ? 
true : false; +} + +uint32_t +spdk_nvme_ns_get_md_size(struct spdk_nvme_ns *ns) +{ + return ns->md_size; +} + +const struct spdk_nvme_ns_data * +spdk_nvme_ns_get_data(struct spdk_nvme_ns *ns) +{ + return _nvme_ns_get_data(ns); +} + +enum spdk_nvme_dealloc_logical_block_read_value spdk_nvme_ns_get_dealloc_logical_block_read_value( + struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + const struct spdk_nvme_ns_data *data = spdk_nvme_ns_get_data(ns); + + if (ctrlr->quirks & NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE) { + return SPDK_NVME_DEALLOC_READ_00; + } else { + return data->dlfeat.bits.read_value; + } +} + +uint32_t +spdk_nvme_ns_get_optimal_io_boundary(struct spdk_nvme_ns *ns) +{ + return ns->sectors_per_stripe; +} + +static const void * +nvme_ns_find_id_desc(const struct spdk_nvme_ns *ns, enum spdk_nvme_nidt type, size_t *length) +{ + const struct spdk_nvme_ns_id_desc *desc; + size_t offset; + + offset = 0; + while (offset + 4 < sizeof(ns->id_desc_list)) { + desc = (const struct spdk_nvme_ns_id_desc *)&ns->id_desc_list[offset]; + + if (desc->nidl == 0) { + /* End of list */ + return NULL; + } + + /* + * Check if this descriptor fits within the list. + * 4 is the fixed-size descriptor header (not counted in NIDL). + */ + if (offset + desc->nidl + 4 > sizeof(ns->id_desc_list)) { + /* Descriptor longer than remaining space in list (invalid) */ + return NULL; + } + + if (desc->nidt == type) { + *length = desc->nidl; + return &desc->nid[0]; + } + + offset += 4 + desc->nidl; + } + + return NULL; +} + +const struct spdk_uuid * +spdk_nvme_ns_get_uuid(const struct spdk_nvme_ns *ns) +{ + const struct spdk_uuid *uuid; + size_t uuid_size; + + uuid = nvme_ns_find_id_desc(ns, SPDK_NVME_NIDT_UUID, &uuid_size); + if (uuid == NULL || uuid_size != sizeof(*uuid)) { + return NULL; + } + + return uuid; +} + +int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, + struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + assert(id > 0); + + ns->ctrlr = ctrlr; + ns->id = id; + + rc = nvme_ctrlr_identify_ns(ns); + if (rc != 0) { + return rc; + } + + return nvme_ctrlr_identify_id_desc(ns); +} + +void nvme_ns_destruct(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ns_data *nsdata; + + if (!ns->id) { + return; + } + + nsdata = _nvme_ns_get_data(ns); + memset(nsdata, 0, sizeof(*nsdata)); + ns->sector_size = 0; + ns->extended_lba_size = 0; + ns->md_size = 0; + ns->pi_type = 0; + ns->sectors_per_max_io = 0; + ns->sectors_per_stripe = 0; + ns->flags = 0; +} + +int nvme_ns_update(struct spdk_nvme_ns *ns) +{ + return nvme_ctrlr_identify_ns(ns); +} diff --git a/src/spdk/lib/nvme/nvme_ns_cmd.c b/src/spdk/lib/nvme/nvme_ns_cmd.c new file mode 100644 index 000000000..eaa825fa8 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns_cmd.c @@ -0,0 +1,1074 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+static inline struct nvme_request *_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns,
+		struct spdk_nvme_qpair *qpair,
+		const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset,
+		uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn,
+		void *cb_arg, uint32_t opc, uint32_t io_flags,
+		uint16_t apptag_mask, uint16_t apptag, bool check_sgl);
+
+
+static bool
+nvme_ns_check_request_length(uint32_t lba_count, uint32_t sectors_per_max_io,
+			     uint32_t sectors_per_stripe, uint32_t qdepth)
+{
+	uint32_t child_per_io = UINT32_MAX;
+
+	/* After a namespace is destroyed (e.g. by hotplug), all of the fields associated
+	 * with the namespace are cleared to zero. In that case this function returns true,
+	 * and the calling submission function returns -EINVAL to its caller.
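+	 *
+	 * Editor's worked example (hypothetical numbers): with no striping,
+	 * sectors_per_max_io = 256 and lba_count = 65536, the I/O would fan out into
+	 * (65536 + 255) / 256 = 256 child requests. If the queue was created with
+	 * io_queue_requests = 128, then 256 >= 128 and the submission is rejected up
+	 * front instead of failing later when the children cannot all be allocated.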
+ */ + if (sectors_per_stripe > 0) { + child_per_io = (lba_count + sectors_per_stripe - 1) / sectors_per_stripe; + } else if (sectors_per_max_io > 0) { + child_per_io = (lba_count + sectors_per_max_io - 1) / sectors_per_max_io; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "checking maximum i/o length %d\n", child_per_io); + + return child_per_io >= qdepth; +} + +static struct nvme_request * +_nvme_add_child_request(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, + struct nvme_request *parent, bool check_sgl) +{ + struct nvme_request *child; + + child = _nvme_ns_cmd_rw(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, cb_fn, + cb_arg, opc, io_flags, apptag_mask, apptag, check_sgl); + if (child == NULL) { + nvme_request_free_children(parent); + nvme_free_request(parent); + return NULL; + } + + nvme_request_add_child(parent, child); + return child; +} + +static struct nvme_request * +_nvme_ns_cmd_split_request(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint32_t sectors_per_max_io, uint32_t sector_mask, + uint16_t apptag_mask, uint16_t apptag) +{ + uint32_t sector_size; + uint32_t md_size = ns->md_size; + uint32_t remaining_lba_count = lba_count; + struct nvme_request *child; + + sector_size = ns->extended_lba_size; + + if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) && + (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) && + (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) && + (md_size == 8)) { + sector_size -= 8; + } + + while (remaining_lba_count > 0) { + lba_count = sectors_per_max_io - (lba & sector_mask); + lba_count = spdk_min(remaining_lba_count, lba_count); + + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, + io_flags, apptag_mask, apptag, req, true); + if (child == NULL) { + return NULL; + } + + remaining_lba_count -= lba_count; + lba += lba_count; + payload_offset += lba_count * sector_size; + md_offset += lba_count * md_size; + } + + return req; +} + +static inline bool +_is_io_flags_valid(uint32_t io_flags) +{ + if (io_flags & ~SPDK_NVME_IO_FLAGS_VALID_MASK) { + /* Invalid io_flags */ + SPDK_ERRLOG("Invalid io_flags 0x%x\n", io_flags); + return false; + } + + return true; +} + +static void +_nvme_ns_cmd_setup_request(struct spdk_nvme_ns *ns, struct nvme_request *req, + uint32_t opc, uint64_t lba, uint32_t lba_count, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct spdk_nvme_cmd *cmd; + + assert(_is_io_flags_valid(io_flags)); + + cmd = &req->cmd; + cmd->opc = opc; + cmd->nsid = ns->id; + + *(uint64_t *)&cmd->cdw10 = lba; + + if (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) { + switch (ns->pi_type) { + case SPDK_NVME_FMT_NVM_PROTECTION_TYPE1: + case SPDK_NVME_FMT_NVM_PROTECTION_TYPE2: + cmd->cdw14 = (uint32_t)lba; + break; + } + } + + cmd->fuse = (io_flags & SPDK_NVME_IO_FLAGS_FUSE_MASK); + + cmd->cdw12 = lba_count - 1; + cmd->cdw12 |= (io_flags & SPDK_NVME_IO_FLAGS_CDW12_MASK); + + cmd->cdw15 = apptag_mask; + cmd->cdw15 = (cmd->cdw15 << 16 | apptag); +} + +static struct nvme_request * +_nvme_ns_cmd_split_request_prp(struct 
spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint16_t apptag_mask, uint16_t apptag) +{ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn; + void *sgl_cb_arg = req->payload.contig_or_cb_arg; + bool start_valid, end_valid, last_sge, child_equals_parent; + uint64_t child_lba = lba; + uint32_t req_current_length = 0; + uint32_t child_length = 0; + uint32_t sge_length; + uint32_t page_size = qpair->ctrlr->page_size; + uintptr_t address; + + reset_sgl_fn(sgl_cb_arg, payload_offset); + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + while (req_current_length < req->payload_size) { + + if (sge_length == 0) { + continue; + } else if (req_current_length + sge_length > req->payload_size) { + sge_length = req->payload_size - req_current_length; + } + + /* + * The start of the SGE is invalid if the start address is not page aligned, + * unless it is the first SGE in the child request. + */ + start_valid = child_length == 0 || _is_page_aligned(address, page_size); + + /* Boolean for whether this is the last SGE in the parent request. */ + last_sge = (req_current_length + sge_length == req->payload_size); + + /* + * The end of the SGE is invalid if the end address is not page aligned, + * unless it is the last SGE in the parent request. + */ + end_valid = last_sge || _is_page_aligned(address + sge_length, page_size); + + /* + * This child request equals the parent request, meaning that no splitting + * was required for the parent request (the one passed into this function). + * In this case, we do not create a child request at all - we just send + * the original request as a single request at the end of this function. + */ + child_equals_parent = (child_length + sge_length == req->payload_size); + + if (start_valid) { + /* + * The start of the SGE is valid, so advance the length parameters, + * to include this SGE with previous SGEs for this child request + * (if any). If it is not valid, we do not advance the length + * parameters nor get the next SGE, because we must send what has + * been collected before this SGE as a child request. + */ + child_length += sge_length; + req_current_length += sge_length; + if (req_current_length < req->payload_size) { + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + } + /* + * If the next SGE is not page aligned, we will need to create a child + * request for what we have so far, and then start a new child request for + * the next SGE. + */ + start_valid = _is_page_aligned(address, page_size); + } + + if (start_valid && end_valid && !last_sge) { + continue; + } + + /* + * We need to create a split here. Send what we have accumulated so far as a child + * request. Checking if child_equals_parent allows us to *not* create a child request + * when no splitting is required - in that case we will fall-through and just create + * a single request with no children for the entire I/O. 
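+		 *
+		 * Editor's worked example (hypothetical 4 KiB page, 512-byte blocks): a
+		 * 16-block parent built from a page-aligned 6144-byte SGE followed by a
+		 * page-aligned 2048-byte SGE splits into children of 12 and 4 blocks. The
+		 * first element ends mid-page and is not the last SGE, so end_valid is
+		 * false and the 6144 bytes gathered so far are flushed as a child; the
+		 * second element then forms its own child. Two page-aligned 4096-byte
+		 * SGEs, by contrast, pass every check, child_equals_parent holds on the
+		 * last element, and the fall-through below sends a single request.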
+ */ + if (!child_equals_parent) { + struct nvme_request *child; + uint32_t child_lba_count; + + if ((child_length % ns->extended_lba_size) != 0) { + SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n", + child_length, ns->extended_lba_size); + return NULL; + } + child_lba_count = child_length / ns->extended_lba_size; + /* + * Note the last parameter is set to "false" - this tells the recursive + * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting + * since we have already verified it here. + */ + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + child_lba, child_lba_count, + cb_fn, cb_arg, opc, io_flags, + apptag_mask, apptag, req, false); + if (child == NULL) { + return NULL; + } + payload_offset += child_length; + md_offset += child_lba_count * ns->md_size; + child_lba += child_lba_count; + child_length = 0; + } + } + + if (child_length == req->payload_size) { + /* No splitting was required, so setup the whole payload as one request. */ + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + } + + return req; +} + +static struct nvme_request * +_nvme_ns_cmd_split_request_sgl(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint16_t apptag_mask, uint16_t apptag) +{ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn; + void *sgl_cb_arg = req->payload.contig_or_cb_arg; + uint64_t child_lba = lba; + uint32_t req_current_length = 0; + uint32_t child_length = 0; + uint32_t sge_length; + uint16_t max_sges, num_sges; + uintptr_t address; + + max_sges = ns->ctrlr->max_sges; + + reset_sgl_fn(sgl_cb_arg, payload_offset); + num_sges = 0; + + while (req_current_length < req->payload_size) { + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + + if (req_current_length + sge_length > req->payload_size) { + sge_length = req->payload_size - req_current_length; + } + + child_length += sge_length; + req_current_length += sge_length; + num_sges++; + + if (num_sges < max_sges && req_current_length < req->payload_size) { + continue; + } + + /* + * We need to create a split here. Send what we have accumulated so far as a child + * request. Checking if the child equals the full payload allows us to *not* + * create a child request when no splitting is required - in that case we will + * fall-through and just create a single request with no children for the entire I/O. + */ + if (child_length != req->payload_size) { + struct nvme_request *child; + uint32_t child_lba_count; + + if ((child_length % ns->extended_lba_size) != 0) { + SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n", + child_length, ns->extended_lba_size); + return NULL; + } + child_lba_count = child_length / ns->extended_lba_size; + /* + * Note the last parameter is set to "false" - this tells the recursive + * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting + * since we have already verified it here. 
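+			 *
+			 * Editor's worked example (hypothetical values): with max_sges = 16
+			 * and a payload built from 40 page-sized (4096-byte) elements over
+			 * 512-byte blocks, the loop cuts a child after every 16 elements:
+			 * two children of 128 blocks each and a final child of 64 blocks.
+			 * Every cut lands on a block boundary, so the
+			 * child_length % extended_lba_size guard above never fires.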
+ */ + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + child_lba, child_lba_count, + cb_fn, cb_arg, opc, io_flags, + apptag_mask, apptag, req, false); + if (child == NULL) { + return NULL; + } + payload_offset += child_length; + md_offset += child_lba_count * ns->md_size; + child_lba += child_lba_count; + child_length = 0; + num_sges = 0; + } + } + + if (child_length == req->payload_size) { + /* No splitting was required, so setup the whole payload as one request. */ + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + } + + return req; +} + +static inline struct nvme_request * +_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, bool check_sgl) +{ + struct nvme_request *req; + uint32_t sector_size; + uint32_t sectors_per_max_io; + uint32_t sectors_per_stripe; + + sector_size = ns->extended_lba_size; + sectors_per_max_io = ns->sectors_per_max_io; + sectors_per_stripe = ns->sectors_per_stripe; + + if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) && + (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) && + (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) && + (ns->md_size == 8)) { + sector_size -= 8; + } + + req = nvme_allocate_request(qpair, payload, lba_count * sector_size, lba_count * ns->md_size, + cb_fn, cb_arg); + if (req == NULL) { + return NULL; + } + + req->payload_offset = payload_offset; + req->md_offset = md_offset; + + /* + * Intel DC P3*00 NVMe controllers benefit from driver-assisted striping. + * If this controller defines a stripe boundary and this I/O spans a stripe + * boundary, split the request into multiple requests and submit each + * separately to hardware. 
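+	 *
+	 * Editor's worked example (hypothetical 256-block stripe): a 100-block I/O
+	 * starting at LBA 200 gives (200 & 255) + 100 = 300 > 256, so it crosses a
+	 * stripe boundary. _nvme_ns_cmd_split_request() then emits a 56-block child
+	 * at LBA 200 (up to the boundary at LBA 256) followed by a 44-block child
+	 * at LBA 256, and each child is submitted to the controller separately.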
+ */ + if (sectors_per_stripe > 0 && + (((lba & (sectors_per_stripe - 1)) + lba_count) > sectors_per_stripe)) { + + return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, + cb_fn, + cb_arg, opc, + io_flags, req, sectors_per_stripe, sectors_per_stripe - 1, apptag_mask, apptag); + } else if (lba_count > sectors_per_max_io) { + return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, + cb_fn, + cb_arg, opc, + io_flags, req, sectors_per_max_io, 0, apptag_mask, apptag); + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL && check_sgl) { + if (ns->ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) { + return _nvme_ns_cmd_split_request_sgl(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, io_flags, + req, apptag_mask, apptag); + } else { + return _nvme_ns_cmd_split_request_prp(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, io_flags, + req, apptag_mask, apptag); + } + } + + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + return req; +} + +int +spdk_nvme_ns_cmd_compare(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, 0, + 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_compare_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, + void *metadata, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, + apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } 
else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_comparev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, 0, + 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_read_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + void *metadata, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, + apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, 0, 0, true); + if (req != NULL) { + 
return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_readv_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + 
io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_writev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint64_t *tmp_lba; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (lba_count == 0 || lba_count > UINT16_MAX + 1) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_WRITE_ZEROES; + cmd->nsid = ns->id; + + tmp_lba = (uint64_t *)&cmd->cdw10; + *tmp_lba = lba; + cmd->cdw12 = lba_count - 1; + cmd->fuse = (io_flags & SPDK_NVME_IO_FLAGS_FUSE_MASK); + cmd->cdw12 |= (io_flags & SPDK_NVME_IO_FLAGS_CDW12_MASK); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_write_uncorrectable(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint64_t *tmp_lba; + + if (lba_count == 0 || lba_count > UINT16_MAX + 1) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_WRITE_UNCORRECTABLE; + cmd->nsid = ns->id; + + tmp_lba = (uint64_t *)&cmd->cdw10; + *tmp_lba = lba; + cmd->cdw12 = lba_count - 1; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_dataset_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint32_t type, + const struct spdk_nvme_dsm_range *ranges, uint16_t num_ranges, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (num_ranges == 0 || num_ranges > SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES) { + return -EINVAL; + } + + if (ranges == NULL) { + return -EINVAL; + } + + req = nvme_allocate_request_user_copy(qpair, (void *)ranges, + num_ranges * sizeof(struct spdk_nvme_dsm_range), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DATASET_MANAGEMENT; + cmd->nsid = 
ns->id; + + cmd->cdw10_bits.dsm.nr = num_ranges - 1; + cmd->cdw11 = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FLUSH; + cmd->nsid = ns->id; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_register(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_register_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_register_action action, + enum spdk_nvme_reservation_register_cptpl cptpl, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_register_data), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_REGISTER; + cmd->nsid = ns->id; + + cmd->cdw10_bits.resv_register.rrega = action; + cmd->cdw10_bits.resv_register.iekey = ignore_key; + cmd->cdw10_bits.resv_register.cptpl = cptpl; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_release(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_key_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_release_action action, + enum spdk_nvme_reservation_type type, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_key_data), cb_fn, + cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_RELEASE; + cmd->nsid = ns->id; + + cmd->cdw10_bits.resv_release.rrela = action; + cmd->cdw10_bits.resv_release.iekey = ignore_key; + cmd->cdw10_bits.resv_release.rtype = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_acquire(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_acquire_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_acquire_action action, + enum spdk_nvme_reservation_type type, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_acquire_data), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_ACQUIRE; + cmd->nsid = ns->id; + + cmd->cdw10_bits.resv_acquire.racqa = action; + cmd->cdw10_bits.resv_acquire.iekey = ignore_key; + cmd->cdw10_bits.resv_acquire.rtype = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_report(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *payload, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + uint32_t num_dwords; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (len % 4) { + return -EINVAL; + } + num_dwords = len / 4; + + req = nvme_allocate_request_user_copy(qpair, payload, len, cb_fn, cb_arg, false); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = 
SPDK_NVME_OPC_RESERVATION_REPORT; + cmd->nsid = ns->id; + + cmd->cdw10 = num_dwords; + + return nvme_qpair_submit_request(qpair, req); +} diff --git a/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c new file mode 100644 index 000000000..f60aa6789 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c @@ -0,0 +1,233 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/nvme_ocssd.h" +#include "nvme_internal.h" + +int +spdk_nvme_ocssd_ns_cmd_vector_reset(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + uint64_t *lba_list, uint32_t num_lbas, + struct spdk_ocssd_chunk_information_entry *chunk_info, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (!lba_list || (num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_VECTOR_RESET; + cmd->nsid = ns->id; + + if (chunk_info != NULL) { + cmd->mptr = spdk_vtophys(chunk_info, NULL); + } + + /* + * Dword 10 and 11 store a pointer to the list of logical block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. 
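+	 *
+	 * Editor's note: in the multi-entry case DW10/11 receive
+	 * spdk_vtophys(lba_list), so the caller must pass a DMA-able list that
+	 * stays valid until the command completes. Caller-side sketch
+	 * (illustrative only; chunk0_slba..chunk3_slba, reset_done_cb and the
+	 * 4-entry count are hypothetical):
+	 *
+	 *   uint64_t *lbas = spdk_dma_zmalloc(4 * sizeof(uint64_t), 8, NULL);
+	 *
+	 *   lbas[0] = chunk0_slba; lbas[1] = chunk1_slba;
+	 *   lbas[2] = chunk2_slba; lbas[3] = chunk3_slba;
+	 *   rc = spdk_nvme_ocssd_ns_cmd_vector_reset(ns, qpair, lbas, 4, NULL,
+	 *                                            reset_done_cb, lbas);
+	 *   // free lbas with spdk_dma_free() in reset_done_cb() on completion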
+ */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list, NULL); + } + + cmd->cdw12 = num_lbas - 1; + + return nvme_qpair_submit_request(qpair, req); +} + +static int +_nvme_ocssd_ns_cmd_vector_rw_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + enum spdk_ocssd_io_opcode opc, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + struct nvme_payload payload; + uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY; + + if (io_flags & ~valid_flags) { + return -EINVAL; + } + + if (!buffer || !lba_list || (num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = nvme_allocate_request(qpair, &payload, num_lbas * ns->sector_size, num_lbas * ns->md_size, + cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = opc; + cmd->nsid = ns->id; + + /* + * Dword 10 and 11 store a pointer to the list of logical block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. + */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list, NULL); + } + + cmd->cdw12 = num_lbas - 1; + cmd->cdw12 |= io_flags; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_write_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_write(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_read_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_read(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_copy(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + uint64_t *dst_lba_list, + uint64_t *src_lba_list, + uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY; + + if (io_flags & ~valid_flags) { + return -EINVAL; + } + + if (!dst_lba_list || !src_lba_list || 
(num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_VECTOR_COPY; + cmd->nsid = ns->id; + + /* + * Dword 10 and 11 store a pointer to the list of source logical + * block addresses. + * Dword 14 and 15 store a pointer to the list of destination logical + * block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. + */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *src_lba_list; + *(uint64_t *)&cmd->cdw14 = *dst_lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(src_lba_list, NULL); + *(uint64_t *)&cmd->cdw14 = spdk_vtophys(dst_lba_list, NULL); + } + + cmd->cdw12 = num_lbas - 1; + cmd->cdw12 |= io_flags; + + return nvme_qpair_submit_request(qpair, req); +} diff --git a/src/spdk/lib/nvme/nvme_opal.c b/src/spdk/lib/nvme/nvme_opal.c new file mode 100644 index 000000000..e0a3aa7fa --- /dev/null +++ b/src/spdk/lib/nvme/nvme_opal.c @@ -0,0 +1,2566 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "spdk/opal.h" +#include "spdk_internal/log.h" +#include "spdk/util.h" + +#include "nvme_opal_internal.h" + +static void +opal_nvme_security_recv_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct opal_session *sess = arg; + struct spdk_opal_dev *dev = sess->dev; + void *response = sess->resp; + struct spdk_opal_compacket *header = response; + int ret; + + if (spdk_nvme_cpl_is_error(cpl)) { + sess->sess_cb(sess, -EIO, sess->cb_arg); + return; + } + + if (!header->outstanding_data && !header->min_transfer) { + sess->sess_cb(sess, 0, sess->cb_arg); + return; + } + + memset(response, 0, IO_BUFFER_LENGTH); + ret = spdk_nvme_ctrlr_cmd_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, + dev->comid, 0, sess->resp, IO_BUFFER_LENGTH, + opal_nvme_security_recv_done, sess); + if (ret) { + sess->sess_cb(sess, ret, sess->cb_arg); + } +} + +static void +opal_nvme_security_send_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct opal_session *sess = arg; + struct spdk_opal_dev *dev = sess->dev; + int ret; + + if (spdk_nvme_cpl_is_error(cpl)) { + sess->sess_cb(sess, -EIO, sess->cb_arg); + return; + } + + ret = spdk_nvme_ctrlr_cmd_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, + dev->comid, 0, sess->resp, IO_BUFFER_LENGTH, + opal_nvme_security_recv_done, sess); + if (ret) { + sess->sess_cb(sess, ret, sess->cb_arg); + } +} + +static int +opal_nvme_security_send(struct spdk_opal_dev *dev, struct opal_session *sess, + opal_sess_cb sess_cb, void *cb_arg) +{ + sess->sess_cb = sess_cb; + sess->cb_arg = cb_arg; + + return spdk_nvme_ctrlr_cmd_security_send(dev->ctrlr, SPDK_SCSI_SECP_TCG, dev->comid, + 0, sess->cmd, IO_BUFFER_LENGTH, + opal_nvme_security_send_done, sess); +} + +static void +opal_send_recv_done(struct opal_session *sess, int status, void *ctx) +{ + sess->status = status; + sess->done = true; +} + +static int +opal_send_recv(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int ret; + + sess->done = false; + ret = opal_nvme_security_send(dev, sess, opal_send_recv_done, NULL); + if (ret) { + return ret; + } + + while (!sess->done) { + spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr); + } + + return sess->status; +} + +static struct opal_session * +opal_alloc_session(struct spdk_opal_dev *dev) +{ + struct opal_session *sess; + + sess = calloc(1, sizeof(*sess)); + if (!sess) { + return NULL; + } + sess->dev = dev; + + return sess; +} + +static void +opal_add_token_u8(int *err, struct opal_session *sess, uint8_t token) +{ + if (*err) { + return; + } + if (sess->cmd_pos >= IO_BUFFER_LENGTH - 1) { + SPDK_ERRLOG("Error adding u8: end of buffer.\n"); + *err = -ERANGE; + return; + } + sess->cmd[sess->cmd_pos++] = token; +} + +static void +opal_add_short_atom_header(struct opal_session *sess, bool bytestring, + bool has_sign, size_t len) +{ + uint8_t atom; + int err = 0; + + atom = SPDK_SHORT_ATOM_ID; + atom |= bytestring ? SPDK_SHORT_ATOM_BYTESTRING_FLAG : 0; + atom |= has_sign ? SPDK_SHORT_ATOM_SIGN_FLAG : 0; + atom |= len & SPDK_SHORT_ATOM_LEN_MASK; + + opal_add_token_u8(&err, sess, atom); +} + +static void +opal_add_medium_atom_header(struct opal_session *sess, bool bytestring, + bool has_sign, size_t len) +{ + uint8_t header; + + header = SPDK_MEDIUM_ATOM_ID; + header |= bytestring ? SPDK_MEDIUM_ATOM_BYTESTRING_FLAG : 0; + header |= has_sign ? 
SPDK_MEDIUM_ATOM_SIGN_FLAG : 0; + header |= (len >> 8) & SPDK_MEDIUM_ATOM_LEN_MASK; + sess->cmd[sess->cmd_pos++] = header; + sess->cmd[sess->cmd_pos++] = len; +} + +static void +opal_add_token_bytestring(int *err, struct opal_session *sess, + const uint8_t *bytestring, size_t len) +{ + size_t header_len = 1; + bool is_short_atom = true; + + if (*err) { + return; + } + + if (len & ~SPDK_SHORT_ATOM_LEN_MASK) { + header_len = 2; + is_short_atom = false; + } + + if (len >= IO_BUFFER_LENGTH - sess->cmd_pos - header_len) { + SPDK_ERRLOG("Error adding bytestring: end of buffer.\n"); + *err = -ERANGE; + return; + } + + if (is_short_atom) { + opal_add_short_atom_header(sess, true, false, len); + } else { + opal_add_medium_atom_header(sess, true, false, len); + } + + memcpy(&sess->cmd[sess->cmd_pos], bytestring, len); + sess->cmd_pos += len; +} + +static void +opal_add_token_u64(int *err, struct opal_session *sess, uint64_t number) +{ + int startat = 0; + + if (*err) { + return; + } + + /* add header first */ + if (number <= SPDK_TINY_ATOM_DATA_MASK) { + sess->cmd[sess->cmd_pos++] = (uint8_t) number & SPDK_TINY_ATOM_DATA_MASK; + } else { + if (number < 0x100) { + sess->cmd[sess->cmd_pos++] = 0x81; /* short atom, 1 byte length */ + startat = 0; + } else if (number < 0x10000) { + sess->cmd[sess->cmd_pos++] = 0x82; /* short atom, 2 byte length */ + startat = 1; + } else if (number < 0x100000000) { + sess->cmd[sess->cmd_pos++] = 0x84; /* short atom, 4 byte length */ + startat = 3; + } else { + sess->cmd[sess->cmd_pos++] = 0x88; /* short atom, 8 byte length */ + startat = 7; + } + + /* add number value */ + for (int i = startat; i > -1; i--) { + sess->cmd[sess->cmd_pos++] = (uint8_t)((number >> (i * 8)) & 0xff); + } + } +} + +static void +opal_add_tokens(int *err, struct opal_session *sess, int num, ...) 
+{ + int i; + va_list args_ptr; + enum spdk_opal_token tmp; + + va_start(args_ptr, num); + + for (i = 0; i < num; i++) { + tmp = va_arg(args_ptr, enum spdk_opal_token); + opal_add_token_u8(err, sess, tmp); + if (*err != 0) { break; } + } + + va_end(args_ptr); +} + +static int +opal_cmd_finalize(struct opal_session *sess, uint32_t hsn, uint32_t tsn, bool eod) +{ + struct spdk_opal_header *hdr; + int err = 0; + + if (eod) { + opal_add_tokens(&err, sess, 6, SPDK_OPAL_ENDOFDATA, + SPDK_OPAL_STARTLIST, + 0, 0, 0, + SPDK_OPAL_ENDLIST); + } + + if (err) { + SPDK_ERRLOG("Error finalizing command.\n"); + return -EFAULT; + } + + hdr = (struct spdk_opal_header *)sess->cmd; + + to_be32(&hdr->packet.session_tsn, tsn); + to_be32(&hdr->packet.session_hsn, hsn); + + to_be32(&hdr->sub_packet.length, sess->cmd_pos - sizeof(*hdr)); + while (sess->cmd_pos % 4) { + if (sess->cmd_pos >= IO_BUFFER_LENGTH) { + SPDK_ERRLOG("Error: Buffer overrun\n"); + return -ERANGE; + } + sess->cmd[sess->cmd_pos++] = 0; + } + to_be32(&hdr->packet.length, sess->cmd_pos - sizeof(hdr->com_packet) - + sizeof(hdr->packet)); + to_be32(&hdr->com_packet.length, sess->cmd_pos - sizeof(hdr->com_packet)); + + return 0; +} + +static size_t +opal_response_parse_tiny(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = 1; + token->width = OPAL_WIDTH_TINY; + + if (pos[0] & SPDK_TINY_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + token->type = OPAL_DTA_TOKENID_UINT; + token->stored.unsigned_num = pos[0] & SPDK_TINY_ATOM_DATA_MASK; + } + + return token->len; +} + +static int +opal_response_parse_short(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = (pos[0] & SPDK_SHORT_ATOM_LEN_MASK) + 1; /* plus 1-byte header */ + token->width = OPAL_WIDTH_SHORT; + + if (pos[0] & SPDK_SHORT_ATOM_BYTESTRING_FLAG) { + token->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SPDK_SHORT_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + uint64_t u_integer = 0; + size_t i, b = 0; + + token->type = OPAL_DTA_TOKENID_UINT; + if (token->len > 9) { + SPDK_ERRLOG("uint64 with more than 8 bytes\n"); + return -EINVAL; + } + for (i = token->len - 1; i > 0; i--) { + u_integer |= ((uint64_t)pos[i] << (8 * b)); + b++; + } + token->stored.unsigned_num = u_integer; + } + + return token->len; +} + +static size_t +opal_response_parse_medium(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = (((pos[0] & SPDK_MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; /* plus 2-byte header */ + token->width = OPAL_WIDTH_MEDIUM; + + if (pos[0] & SPDK_MEDIUM_ATOM_BYTESTRING_FLAG) { + token->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SPDK_MEDIUM_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + token->type = OPAL_DTA_TOKENID_UINT; + } + + return token->len; +} + +static size_t +opal_response_parse_long(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; /* plus 4-byte header */ + token->width = OPAL_WIDTH_LONG; + + if (pos[0] & SPDK_LONG_ATOM_BYTESTRING_FLAG) { + token->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SPDK_LONG_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + token->type = OPAL_DTA_TOKENID_UINT; + } + + return token->len; +} + +static size_t +opal_response_parse_token(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + 
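The helpers above (opal_add_token_u64() and opal_response_parse_short()) implement the TCG short-atom integer encoding: a header byte of 0x80 | payload-length followed by the value in big-endian order, with values small enough for a tiny atom emitted as a single byte instead. A minimal standalone round-trip sketch, separate from the diffed file and with illustrative names only:

#include <stdint.h>
#include <stddef.h>

/* Encode an unsigned value as a TCG short atom, mirroring the
 * 0x81/0x82/0x84/0x88 cases of opal_add_token_u64() (the real helper
 * emits a one-byte tiny atom for small values instead). */
static size_t
short_atom_encode(uint8_t *buf, uint64_t v)
{
	int len = (v < 0x100) ? 1 : (v < 0x10000) ? 2 :
		  (v < 0x100000000ULL) ? 4 : 8;
	size_t pos = 0;

	buf[pos++] = 0x80 | len;	/* short atom, unsigned, not a bytestring */
	for (int i = len - 1; i >= 0; i--) {
		buf[pos++] = (uint8_t)(v >> (i * 8));	/* big-endian payload */
	}
	return pos;	/* e.g. 0x1234 encodes as 0x82 0x12 0x34 */
}

/* Decode it back the way opal_response_parse_short() does: the low four
 * bits of the header byte give the payload length. */
static uint64_t
short_atom_decode(const uint8_t *buf)
{
	size_t len = buf[0] & 0x0F;
	uint64_t v = 0;

	for (size_t i = 1; i <= len; i++) {
		v = (v << 8) | buf[i];
	}
	return v;
}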
token->len = 1; + token->type = OPAL_DTA_TOKENID_TOKEN; + token->width = OPAL_WIDTH_TOKEN; + + return token->len; +} + +static int +opal_response_parse(const uint8_t *buf, size_t length, + struct spdk_opal_resp_parsed *resp) +{ + const struct spdk_opal_header *hdr; + struct spdk_opal_resp_token *token_iter; + int num_entries = 0; + int total; + size_t token_length; + const uint8_t *pos; + uint32_t clen, plen, slen; + + if (!buf || !resp) { + return -EINVAL; + } + + hdr = (struct spdk_opal_header *)buf; + pos = buf + sizeof(*hdr); + + clen = from_be32(&hdr->com_packet.length); + plen = from_be32(&hdr->packet.length); + slen = from_be32(&hdr->sub_packet.length); + SPDK_DEBUGLOG(SPDK_LOG_OPAL, "Response size: cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + SPDK_ERRLOG("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); + return -EINVAL; + } + + if (pos > buf + length) { + SPDK_ERRLOG("Pointer out of range\n"); + return -EFAULT; + } + + token_iter = resp->resp_tokens; + total = slen; + + while (total > 0) { + if (pos[0] <= SPDK_TINY_ATOM_TYPE_MAX) { /* tiny atom */ + token_length = opal_response_parse_tiny(token_iter, pos); + } else if (pos[0] <= SPDK_SHORT_ATOM_TYPE_MAX) { /* short atom */ + token_length = opal_response_parse_short(token_iter, pos); + } else if (pos[0] <= SPDK_MEDIUM_ATOM_TYPE_MAX) { /* medium atom */ + token_length = opal_response_parse_medium(token_iter, pos); + } else if (pos[0] <= SPDK_LONG_ATOM_TYPE_MAX) { /* long atom */ + token_length = opal_response_parse_long(token_iter, pos); + } else { /* TOKEN */ + token_length = opal_response_parse_token(token_iter, pos); + } + + if (token_length <= 0) { + SPDK_ERRLOG("Parse response failure.\n"); + return -EINVAL; + } + + pos += token_length; + total -= token_length; + token_iter++; + num_entries++; + + if (total < 0) { + SPDK_ERRLOG("Length not matching.\n"); + return -EINVAL; + } + } + + if (num_entries == 0) { + SPDK_ERRLOG("Couldn't parse response.\n"); + return -EINVAL; + } + resp->num = num_entries; + + return 0; +} + +static inline bool +opal_response_token_matches(const struct spdk_opal_resp_token *token, + uint8_t match) +{ + if (!token || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) { + return false; + } + return true; +} + +static const struct spdk_opal_resp_token * +opal_response_get_token(const struct spdk_opal_resp_parsed *resp, int index) +{ + const struct spdk_opal_resp_token *token; + + if (index >= resp->num) { + SPDK_ERRLOG("Token number doesn't exist: %d, resp: %d\n", + index, resp->num); + return NULL; + } + + token = &resp->resp_tokens[index]; + if (token->len == 0) { + SPDK_ERRLOG("Token length must be non-zero\n"); + return NULL; + } + + return token; +} + +static uint64_t +opal_response_get_u64(const struct spdk_opal_resp_parsed *resp, int index) +{ + if (!resp) { + SPDK_ERRLOG("Response is NULL\n"); + return 0; + } + + if (resp->resp_tokens[index].type != OPAL_DTA_TOKENID_UINT) { + SPDK_ERRLOG("Token is not unsigned int: %d\n", + resp->resp_tokens[index].type); + return 0; + } + + if (!(resp->resp_tokens[index].width == OPAL_WIDTH_TINY || + resp->resp_tokens[index].width == OPAL_WIDTH_SHORT)) { + SPDK_ERRLOG("Atom is not short or tiny: %d\n", + resp->resp_tokens[index].width); + return 0; + } + + return resp->resp_tokens[index].stored.unsigned_num; +} + +static uint16_t +opal_response_get_u16(const struct spdk_opal_resp_parsed *resp, int index) +{ + uint64_t i = 
opal_response_get_u64(resp, index); + if (i > 0xffffull) { + SPDK_ERRLOG("parse reponse u16 failed. Overflow\n"); + return 0; + } + return (uint16_t) i; +} + +static uint8_t +opal_response_get_u8(const struct spdk_opal_resp_parsed *resp, int index) +{ + uint64_t i = opal_response_get_u64(resp, index); + if (i > 0xffull) { + SPDK_ERRLOG("parse reponse u8 failed. Overflow\n"); + return 0; + } + return (uint8_t) i; +} + +static size_t +opal_response_get_string(const struct spdk_opal_resp_parsed *resp, int n, + const char **store) +{ + uint8_t header_len; + struct spdk_opal_resp_token token; + *store = NULL; + if (!resp) { + SPDK_ERRLOG("Response is NULL\n"); + return 0; + } + + if (n > resp->num) { + SPDK_ERRLOG("Response has %d tokens. Can't access %d\n", + resp->num, n); + return 0; + } + + token = resp->resp_tokens[n]; + if (token.type != OPAL_DTA_TOKENID_BYTESTRING) { + SPDK_ERRLOG("Token is not a byte string!\n"); + return 0; + } + + switch (token.width) { + case OPAL_WIDTH_SHORT: + header_len = 1; + break; + case OPAL_WIDTH_MEDIUM: + header_len = 2; + break; + case OPAL_WIDTH_LONG: + header_len = 4; + break; + default: + SPDK_ERRLOG("Can't get string from this Token\n"); + return 0; + } + + *store = token.pos + header_len; + return token.len - header_len; +} + +static int +opal_response_status(const struct spdk_opal_resp_parsed *resp) +{ + const struct spdk_opal_resp_token *tok; + + /* if we get an EOS token, just return 0 */ + tok = opal_response_get_token(resp, 0); + if (opal_response_token_matches(tok, SPDK_OPAL_ENDOFSESSION)) { + return 0; + } + + if (resp->num < 5) { + return SPDK_DTAERROR_NO_METHOD_STATUS; + } + + tok = opal_response_get_token(resp, resp->num - 5); /* the first token should be STARTLIST */ + if (!opal_response_token_matches(tok, SPDK_OPAL_STARTLIST)) { + return SPDK_DTAERROR_NO_METHOD_STATUS; + } + + tok = opal_response_get_token(resp, resp->num - 1); /* the last token should be ENDLIST */ + if (!opal_response_token_matches(tok, SPDK_OPAL_ENDLIST)) { + return SPDK_DTAERROR_NO_METHOD_STATUS; + } + + /* The second and third values in the status list are reserved, and are + defined in core spec to be 0x00 and 0x00 and SHOULD be ignored by the host. */ + return (int)opal_response_get_u64(resp, + resp->num - 4); /* We only need the first value in the status list. */ +} + +static int +opal_parse_and_check_status(struct opal_session *sess) +{ + int error; + + error = opal_response_parse(sess->resp, IO_BUFFER_LENGTH, &sess->parsed_resp); + if (error) { + SPDK_ERRLOG("Couldn't parse response.\n"); + return error; + } + return opal_response_status(&sess->parsed_resp); +} + +static inline void +opal_clear_cmd(struct opal_session *sess) +{ + sess->cmd_pos = sizeof(struct spdk_opal_header); + memset(sess->cmd, 0, IO_BUFFER_LENGTH); +} + +static inline void +opal_set_comid(struct opal_session *sess, uint16_t comid) +{ + struct spdk_opal_header *hdr = (struct spdk_opal_header *)sess->cmd; + + hdr->com_packet.comid[0] = comid >> 8; + hdr->com_packet.comid[1] = comid; + hdr->com_packet.extended_comid[0] = 0; + hdr->com_packet.extended_comid[1] = 0; +} + +static inline int +opal_init_key(struct spdk_opal_key *opal_key, const char *passwd) +{ + int len; + + if (passwd == NULL || passwd[0] == '\0') { + SPDK_ERRLOG("Password is empty. Create key failed\n"); + return -EINVAL; + } + + len = strlen(passwd); + + if (len >= OPAL_KEY_MAX) { + SPDK_ERRLOG("Password too long. 
Create key failed\n"); + return -EINVAL; + } + + opal_key->key_len = len; + memcpy(opal_key->key, passwd, opal_key->key_len); + + return 0; +} + +static void +opal_build_locking_range(uint8_t *buffer, uint8_t locking_range) +{ + memcpy(buffer, spdk_opal_uid[UID_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH); + + /* global */ + if (locking_range == 0) { + return; + } + + /* non-global */ + buffer[5] = LOCKING_RANGE_NON_GLOBAL; + buffer[7] = locking_range; +} + +static void +opal_check_tper(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_tper_feat *tper = data; + + dev->feat_info.tper = *tper; +} + +/* + * check single user mode + */ +static bool +opal_check_sum(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_single_user_mode_feat *sum = data; + uint32_t num_locking_objects = from_be32(&sum->num_locking_objects); + + if (num_locking_objects == 0) { + SPDK_NOTICELOG("Need at least one locking object.\n"); + return false; + } + + dev->feat_info.single_user = *sum; + + return true; +} + +static void +opal_check_lock(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_locking_feat *lock = data; + + dev->feat_info.locking = *lock; +} + +static void +opal_check_geometry(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_geo_feat *geo = data; + + dev->feat_info.geo = *geo; +} + +static void +opal_check_datastore(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_datastore_feat *datastore = data; + + dev->feat_info.datastore = *datastore; +} + +static uint16_t +opal_get_comid_v100(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_v100_feat *v100 = data; + uint16_t base_comid = from_be16(&v100->base_comid); + + dev->feat_info.v100 = *v100; + + return base_comid; +} + +static uint16_t +opal_get_comid_v200(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_v200_feat *v200 = data; + uint16_t base_comid = from_be16(&v200->base_comid); + + dev->feat_info.v200 = *v200; + + return base_comid; +} + +static int +opal_discovery0_end(struct spdk_opal_dev *dev, void *payload, uint32_t payload_size) +{ + bool supported = false, single_user = false; + const struct spdk_opal_d0_hdr *hdr = (struct spdk_opal_d0_hdr *)payload; + struct spdk_opal_d0_feat_hdr *feat_hdr; + const uint8_t *epos = payload, *cpos = payload; + uint16_t comid = 0; + uint32_t hlen = from_be32(&(hdr->length)); + + if (hlen > payload_size - sizeof(*hdr)) { + SPDK_ERRLOG("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, payload_size); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ + cpos += sizeof(*hdr); /* current position on buffer */ + + while (cpos < epos) { + feat_hdr = (struct spdk_opal_d0_feat_hdr *)cpos; + uint16_t feat_code = from_be16(&feat_hdr->code); + + switch (feat_code) { + case FEATURECODE_TPER: + opal_check_tper(dev, cpos); + break; + case FEATURECODE_SINGLEUSER: + single_user = opal_check_sum(dev, cpos); + break; + case FEATURECODE_GEOMETRY: + opal_check_geometry(dev, cpos); + break; + case FEATURECODE_LOCKING: + opal_check_lock(dev, cpos); + break; + case FEATURECODE_DATASTORE: + opal_check_datastore(dev, cpos); + break; + case FEATURECODE_OPALV100: + comid = opal_get_comid_v100(dev, cpos); + supported = true; + break; + case FEATURECODE_OPALV200: + comid = opal_get_comid_v200(dev, cpos); + supported = true; + break; + default: + SPDK_INFOLOG(SPDK_LOG_OPAL, "Unknow feature code: %d\n", feat_code); + } + cpos += 
feat_hdr->length + sizeof(*feat_hdr); + } + + if (supported == false) { + SPDK_ERRLOG("Opal Not Supported.\n"); + return -ENOTSUP; + } + + if (single_user == false) { + SPDK_INFOLOG(SPDK_LOG_OPAL, "Single User Mode Not Supported\n"); + } + + dev->comid = comid; + return 0; +} + +static int +opal_discovery0(struct spdk_opal_dev *dev, void *payload, uint32_t payload_size) +{ + int ret; + + ret = spdk_nvme_ctrlr_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, LV0_DISCOVERY_COMID, + 0, payload, payload_size); + if (ret) { + return ret; + } + + return opal_discovery0_end(dev, payload, payload_size); +} + +static int +opal_end_session(struct spdk_opal_dev *dev, struct opal_session *sess, uint16_t comid) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, comid); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDOFSESSION); + + if (err < 0) { + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, false); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + sess->hsn = 0; + sess->tsn = 0; + + return opal_parse_and_check_status(sess); +} + +void +spdk_opal_dev_destruct(struct spdk_opal_dev *dev) +{ + free(dev); +} + +static int +opal_start_session_done(struct opal_session *sess) +{ + uint32_t hsn, tsn; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + hsn = opal_response_get_u64(&sess->parsed_resp, 4); + tsn = opal_response_get_u64(&sess->parsed_resp, 5); + + if (hsn == 0 && tsn == 0) { + SPDK_ERRLOG("Couldn't authenticate session\n"); + return -EPERM; + } + + sess->hsn = hsn; + sess->tsn = tsn; + + return 0; +} + +static int +opal_start_generic_session(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum opal_uid_enum auth, + enum opal_uid_enum sp_type, + const char *key, + uint8_t key_len) +{ + uint32_t hsn; + int err = 0; + int ret; + + if (key == NULL && auth != UID_ANYBODY) { + return OPAL_INVAL_PARAM; + } + + opal_clear_cmd(sess); + + opal_set_comid(sess, dev->comid); + hsn = GENERIC_HOST_SESSION_NUM; + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_SMUID], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[STARTSESSION_METHOD], + OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST); + opal_add_token_u64(&err, sess, hsn); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[sp_type], OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_TRUE); /* Write */ + + switch (auth) { + case UID_ANYBODY: + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST); + break; + case UID_ADMIN1: + case UID_SID: + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTNAME); + opal_add_token_u8(&err, sess, 0); /* HostChallenge */ + opal_add_token_bytestring(&err, sess, key, key_len); + opal_add_tokens(&err, sess, 3, /* number of token */ + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + 3);/* HostSignAuth */ + opal_add_token_bytestring(&err, sess, spdk_opal_uid[auth], + OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDNAME); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST); + break; + default: + SPDK_ERRLOG("Cannot start Admin SP session with auth %d\n", auth); + return -EINVAL; + } + + if (err) { + SPDK_ERRLOG("Error building start adminsp session command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return 
opal_start_session_done(sess); +} + +static int +opal_get_msid_cpin_pin_done(struct opal_session *sess, + struct spdk_opal_key *opal_key) +{ + const char *msid_pin; + size_t strlen; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + strlen = opal_response_get_string(&sess->parsed_resp, 4, &msid_pin); + if (!msid_pin) { + SPDK_ERRLOG("Couldn't extract PIN from response\n"); + return -EINVAL; + } + + opal_key->key_len = strlen; + memcpy(opal_key->key, msid_pin, opal_key->key_len); + + SPDK_DEBUGLOG(SPDK_LOG_OPAL, "MSID = %p\n", opal_key->key); + return 0; +} + +static int +opal_get_msid_cpin_pin(struct spdk_opal_dev *dev, struct opal_session *sess, + struct spdk_opal_key *opal_key) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_C_PIN_MSID], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_PIN, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_PIN, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building Get MSID CPIN PIN command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_msid_cpin_pin_done(sess, opal_key); +} + +static int +opal_build_generic_pw_cmd(struct opal_session *sess, uint8_t *key, size_t key_len, + uint8_t *cpin_uid, struct spdk_opal_dev *dev) +{ + int err = 0; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, cpin_uid, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 6, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_PIN); + opal_add_token_bytestring(&err, sess, key, key_len); + opal_add_tokens(&err, sess, 4, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + if (err) { + return err; + } + + return opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); +} + +static int +opal_get_locking_sp_lifecycle_done(struct opal_session *sess) +{ + uint8_t lifecycle; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + lifecycle = opal_response_get_u64(&sess->parsed_resp, 4); + if (lifecycle != OPAL_MANUFACTURED_INACTIVE) { /* status before activate */ + SPDK_ERRLOG("Couldn't determine the status of the Lifecycle state\n"); + return -EINVAL; + } + + return 0; +} + +static int +opal_get_locking_sp_lifecycle(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + 
SPDK_OPAL_LIFECYCLE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_LIFECYCLE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building GET Lifecycle Status command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_locking_sp_lifecycle_done(sess); +} + +static int +opal_activate(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[ACTIVATE_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building Activate LockingSP command.\n"); + return err; + } + + /* TODO: Single User Mode for activatation */ + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_start_auth_session(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum spdk_opal_user user, + struct spdk_opal_key *opal_key) +{ + uint8_t uid_user[OPAL_UID_LENGTH]; + int err = 0; + int ret; + uint32_t hsn = GENERIC_HOST_SESSION_NUM; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + if (user != OPAL_ADMIN1) { + memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH); + uid_user[7] = user; + } else { + memcpy(uid_user, spdk_opal_uid[UID_ADMIN1], OPAL_UID_LENGTH); + } + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_SMUID], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[STARTSESSION_METHOD], + OPAL_UID_LENGTH); + + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST); + opal_add_token_u64(&err, sess, hsn); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP], + OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 3, SPDK_OPAL_TRUE, SPDK_OPAL_STARTNAME, + 0); /* True for a Read-Write session */ + opal_add_token_bytestring(&err, sess, opal_key->key, opal_key->key_len); + opal_add_tokens(&err, sess, 3, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME, 3); /* HostSignAuth */ + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building STARTSESSION command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_start_session_done(sess); +} + +static int +opal_lock_unlock_range(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range, + enum spdk_opal_lock_state l_state) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + uint8_t read_locked, write_locked; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + switch (l_state) { + case OPAL_READONLY: + read_locked = 0; + write_locked = 1; + break; + case OPAL_READWRITE: + read_locked = 0; + 
write_locked = 0; + break; + case OPAL_RWLOCK: + read_locked = 1; + write_locked = 1; + break; + default: + SPDK_ERRLOG("Tried to set an invalid locking state.\n"); + return -EINVAL; + } + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 15, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKED, + read_locked, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKED, + write_locked, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building SET command.\n"); + return err; + } + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int opal_generic_locking_range_enable_disable(struct spdk_opal_dev *dev, + struct opal_session *sess, + uint8_t *uid, bool read_lock_enabled, bool write_lock_enabled) +{ + int err = 0; + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 23, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKENABLED, + read_lock_enabled, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKENABLED, + write_lock_enabled, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKED, + 0, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKED, + 0, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + if (err) { + SPDK_ERRLOG("Error building locking range enable/disable command.\n"); + } + return err; +} + +static int +opal_setup_locking_range(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range, + uint64_t range_start, uint64_t range_length, + bool read_lock_enabled, bool write_lock_enabled) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + if (locking_range == 0) { + err = opal_generic_locking_range_enable_disable(dev, sess, uid_locking_range, + read_lock_enabled, write_lock_enabled); + } else { + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 6, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_RANGESTART); + opal_add_token_u64(&err, sess, range_start); + opal_add_tokens(&err, sess, 3, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_RANGELENGTH); + opal_add_token_u64(&err, sess, range_length); + opal_add_tokens(&err, sess, 3, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKENABLED); + opal_add_token_u64(&err, sess, read_lock_enabled); + opal_add_tokens(&err, sess, 3, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKENABLED); + 
opal_add_token_u64(&err, sess, write_lock_enabled); + opal_add_tokens(&err, sess, 4, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + } + if (err) { + SPDK_ERRLOG("Error building Setup Locking range command.\n"); + return err; + + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_get_max_ranges_done(struct opal_session *sess) +{ + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + /* "MaxRanges" is token 4 of response */ + return opal_response_get_u16(&sess->parsed_resp, 4); +} + +static int +opal_get_max_ranges(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKING_INFO_TABLE], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_MAXRANGES, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_MAXRANGES, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building GET Lifecycle Status command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_max_ranges_done(sess); +} + +static int +opal_get_locking_range_info_done(struct opal_session *sess, + struct spdk_opal_locking_range_info *info) +{ + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + info->range_start = opal_response_get_u64(&sess->parsed_resp, 4); + info->range_length = opal_response_get_u64(&sess->parsed_resp, 8); + info->read_lock_enabled = opal_response_get_u8(&sess->parsed_resp, 12); + info->write_lock_enabled = opal_response_get_u8(&sess->parsed_resp, 16); + info->read_locked = opal_response_get_u8(&sess->parsed_resp, 20); + info->write_locked = opal_response_get_u8(&sess->parsed_resp, 24); + + return 0; +} + +static int +opal_get_locking_range_info(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum spdk_opal_locking_range locking_range_id) +{ + int err = 0; + int ret; + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + struct spdk_opal_locking_range_info *info; + + opal_build_locking_range(uid_locking_range, locking_range_id); + + assert(locking_range_id < SPDK_OPAL_MAX_LOCKING_RANGE); + info = &dev->locking_ranges[locking_range_id]; + memset(info, 0, sizeof(*info)); + info->locking_range_id = locking_range_id; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_RANGESTART, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_WRITELOCKED, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + 
SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building get locking range info command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_locking_range_info_done(sess, info); +} + +static int +opal_enable_user(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_user user) +{ + int err = 0; + int ret; + uint8_t uid_user[OPAL_UID_LENGTH]; + + memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH); + uid_user[7] = user; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 11, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_AUTH_ENABLE, + SPDK_OPAL_TRUE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building enable user command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_add_user_to_locking_range(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum spdk_opal_user user, + enum spdk_opal_locking_range locking_range, + enum spdk_opal_lock_state l_state) +{ + int err = 0; + int ret; + uint8_t uid_user[OPAL_UID_LENGTH]; + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + + memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH); + uid_user[7] = user; + + switch (l_state) { + case OPAL_READONLY: + memcpy(uid_locking_range, spdk_opal_uid[UID_LOCKINGRANGE_ACE_RDLOCKED], OPAL_UID_LENGTH); + break; + case OPAL_READWRITE: + memcpy(uid_locking_range, spdk_opal_uid[UID_LOCKINGRANGE_ACE_WRLOCKED], OPAL_UID_LENGTH); + break; + default: + SPDK_ERRLOG("locking state should only be OPAL_READONLY or OPAL_READWRITE\n"); + return -EINVAL; + } + + uid_locking_range[7] = locking_range; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 8, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_BOOLEAN_EXPR, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH / 2); + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH / 2); + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_BOOLEAN_ACE], OPAL_UID_LENGTH / 2); + opal_add_tokens(&err, sess, 7, + SPDK_OPAL_TRUE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + 
SPDK_OPAL_ENDLIST); + if (err) { + SPDK_ERRLOG("Error building add user to locking range command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_new_user_passwd(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_user user, + struct spdk_opal_key *opal_key) +{ + uint8_t uid_cpin[OPAL_UID_LENGTH]; + int ret; + + if (user == OPAL_ADMIN1) { + memcpy(uid_cpin, spdk_opal_uid[UID_C_PIN_ADMIN1], OPAL_UID_LENGTH); + } else { + memcpy(uid_cpin, spdk_opal_uid[UID_C_PIN_USER1], OPAL_UID_LENGTH); + uid_cpin[7] = user; + } + + ret = opal_build_generic_pw_cmd(sess, opal_key->key, opal_key->key_len, uid_cpin, dev); + if (ret != 0) { + SPDK_ERRLOG("Error building set password command\n"); + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_set_sid_cpin_pin(struct spdk_opal_dev *dev, struct opal_session *sess, char *new_passwd) +{ + uint8_t cpin_uid[OPAL_UID_LENGTH]; + struct spdk_opal_key opal_key = {}; + int ret; + + ret = opal_init_key(&opal_key, new_passwd); + if (ret != 0) { + return ret; + } + + memcpy(cpin_uid, spdk_opal_uid[UID_C_PIN_SID], OPAL_UID_LENGTH); + + if (opal_build_generic_pw_cmd(sess, opal_key.key, opal_key.key_len, cpin_uid, dev)) { + SPDK_ERRLOG("Error building Set SID cpin\n"); + return -ERANGE; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +int +spdk_opal_cmd_take_ownership(struct spdk_opal_dev *dev, char *new_passwd) +{ + int ret; + struct spdk_opal_key opal_key = {}; + struct opal_session *sess; + + assert(dev != NULL); + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_ANYBODY, UID_ADMINSP, NULL, 0); + if (ret) { + SPDK_ERRLOG("start admin SP session error %d\n", ret); + goto end; + } + + ret = opal_get_msid_cpin_pin(dev, sess, &opal_key); + if (ret) { + SPDK_ERRLOG("get msid error %d\n", ret); + opal_end_session(dev, sess, dev->comid); + goto end; + } + + ret = opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + goto end; + } + + /* reuse the session structure */ + memset(sess, 0, sizeof(*sess)); + sess->dev = dev; + ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("start admin SP session error %d\n", ret); + goto end; + } + memset(&opal_key, 0, sizeof(struct spdk_opal_key)); + + ret = opal_set_sid_cpin_pin(dev, sess, new_passwd); + if (ret) { + SPDK_ERRLOG("set cpin error %d\n", ret); + opal_end_session(dev, sess, dev->comid); + goto end; + } + + ret = opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + +end: + free(sess); + return ret; +} + +struct spdk_opal_dev * + spdk_opal_dev_construct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_opal_dev *dev; + void *payload; + + dev = calloc(1, sizeof(*dev)); + if (!dev) { + SPDK_ERRLOG("Memory allocation failed\n"); + return NULL; + } + + dev->ctrlr = ctrlr; + + payload = calloc(1, IO_BUFFER_LENGTH); + if (!payload) { + free(dev); + return NULL; + } + + if (opal_discovery0(dev, payload, IO_BUFFER_LENGTH)) { + SPDK_INFOLOG(SPDK_LOG_OPAL, "Opal is not supported on this device\n"); + 
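The two public entry points above, spdk_opal_cmd_take_ownership() and spdk_opal_dev_construct(), are the usual starting point for a caller. A hedged usage sketch, separate from the diffed file; how the controller was probed/attached and the error value returned are assumptions, and error handling is trimmed:

#include "spdk/stdinc.h"
#include "spdk/nvme.h"
#include "spdk/opal.h"

/* Take SID ownership of a freshly attached controller.
 * spdk_opal_dev_construct() runs Level 0 discovery and returns NULL if the
 * drive does not report Opal support. */
static int
example_take_ownership(struct spdk_nvme_ctrlr *ctrlr, char *new_passwd)
{
	struct spdk_opal_dev *opal_dev;
	int rc;

	opal_dev = spdk_opal_dev_construct(ctrlr);
	if (opal_dev == NULL) {
		return -ENOTSUP;
	}

	/* Reads the MSID PIN from the C_PIN_MSID table, authenticates as SID
	 * with it, and then sets the SID credential to new_passwd. */
	rc = spdk_opal_cmd_take_ownership(opal_dev, new_passwd);

	spdk_opal_dev_destruct(opal_dev);
	return rc;
}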
free(dev); + free(payload); + return NULL; + } + + free(payload); + return dev; +} + +static int +opal_build_revert_tper_cmd(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_ADMINSP], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[REVERT_METHOD], + OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST); + if (err) { + SPDK_ERRLOG("Error building REVERT TPER command.\n"); + return -ERANGE; + } + + return opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); +} + +static int +opal_gen_new_active_key(struct spdk_opal_dev *dev, struct opal_session *sess, + struct spdk_opal_key *active_key) +{ + uint8_t uid_data[OPAL_UID_LENGTH] = {0}; + int err = 0; + int length; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + if (active_key->key_len == 0) { + SPDK_ERRLOG("Error finding previous data to generate new active key\n"); + return -EINVAL; + } + + length = spdk_min(active_key->key_len, OPAL_UID_LENGTH); + memcpy(uid_data, active_key->key, length); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_data, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GENKEY_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building new key generation command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_get_active_key_done(struct opal_session *sess, struct spdk_opal_key *active_key) +{ + const char *key; + size_t str_len; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + str_len = opal_response_get_string(&sess->parsed_resp, 4, &key); + if (!key) { + SPDK_ERRLOG("Couldn't extract active key from response\n"); + return -EINVAL; + } + + active_key->key_len = str_len; + memcpy(active_key->key, key, active_key->key_len); + + SPDK_DEBUGLOG(SPDK_LOG_OPAL, "active key = %p\n", active_key->key); + return 0; +} + +static int +opal_get_active_key(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range, + struct spdk_opal_key *active_key) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], + OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 12, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_ACTIVEKEY, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_ACTIVEKEY, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building get active key command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = 
opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_active_key_done(sess, active_key); +} + +static int +opal_erase_locking_range(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[ERASE_METHOD], + OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building erase locking range.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +int +spdk_opal_cmd_revert_tper(struct spdk_opal_dev *dev, const char *passwd) +{ + int ret; + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret) { + SPDK_ERRLOG("Init key failed\n"); + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("Error on starting admin SP session with error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_build_revert_tper_cmd(dev, sess); + if (ret) { + opal_end_session(dev, sess, dev->comid); + SPDK_ERRLOG("Build revert tper command with error %d\n", ret); + goto end; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + opal_end_session(dev, sess, dev->comid); + SPDK_ERRLOG("Error on reverting TPer with error %d\n", ret); + goto end; + } + + ret = opal_parse_and_check_status(sess); + if (ret) { + opal_end_session(dev, sess, dev->comid); + SPDK_ERRLOG("Error on reverting TPer with error %d\n", ret); + } + /* No opal_end_session() required here for successful case */ + +end: + free(sess); + return ret; +} + +int +spdk_opal_cmd_activate_locking_sp(struct spdk_opal_dev *dev, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("Error on starting admin SP session with error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_get_locking_sp_lifecycle(dev, sess); + if (ret) { + SPDK_ERRLOG("Error on getting SP lifecycle with error %d\n", ret); + goto end; + } + + ret = opal_activate(dev, sess); + if (ret) { + SPDK_ERRLOG("Error on activation with error %d\n", ret); + } + +end: + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("Error on ending session with error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_lock_unlock(struct spdk_opal_dev *dev, enum spdk_opal_user user, + enum spdk_opal_lock_state flag, enum spdk_opal_locking_range locking_range, + const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + 
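Taken together, the admin commands above and the per-range commands that follow are normally driven in a fixed order: activate the Locking SP (only valid from the Manufactured-Inactive state checked above), describe a locking range, then toggle its lock state. A hedged end-to-end sketch, separate from the diffed file; the range number, the LBA extent, and reusing one password for both the SID and Admin1 authorities are illustrative assumptions, and error handling is trimmed:

#include "spdk/stdinc.h"
#include "spdk/opal.h"

static int
example_provision_and_lock(struct spdk_opal_dev *opal_dev, const char *passwd)
{
	enum spdk_opal_locking_range range_id = 1;	/* range 1; named constants live in spdk/opal_spec.h */
	int rc;

	/* Activate the Locking SP with the password set at take-ownership time. */
	rc = spdk_opal_cmd_activate_locking_sp(opal_dev, passwd);
	if (rc != 0) {
		return rc;
	}

	/* Describe the range: start LBA 0, 1024 blocks, read and write locking enabled. */
	rc = spdk_opal_cmd_setup_locking_range(opal_dev, OPAL_ADMIN1, range_id, 0, 1024, passwd);
	if (rc != 0) {
		return rc;
	}

	/* Lock the range for both reads and writes ... */
	rc = spdk_opal_cmd_lock_unlock(opal_dev, OPAL_ADMIN1, OPAL_RWLOCK, range_id, passwd);
	if (rc != 0) {
		return rc;
	}

	/* ... and unlock it again for normal I/O. */
	return spdk_opal_cmd_lock_unlock(opal_dev, OPAL_ADMIN1, OPAL_READWRITE, range_id, passwd);
}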
ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_lock_unlock_range(dev, sess, locking_range, flag); + if (ret) { + SPDK_ERRLOG("lock unlock range error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_setup_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user, + enum spdk_opal_locking_range locking_range_id, uint64_t range_start, + uint64_t range_length, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_setup_locking_range(dev, sess, locking_range_id, range_start, range_length, true, + true); + if (ret) { + SPDK_ERRLOG("setup locking range error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_get_max_ranges(struct spdk_opal_dev *dev, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + if (dev->max_ranges) { + return dev->max_ranges; + } + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, OPAL_ADMIN1, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_get_max_ranges(dev, sess); + if (ret > 0) { + dev->max_ranges = ret; + } + + ret = opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + + return (ret == 0 ? 
dev->max_ranges : ret); +} + +int +spdk_opal_cmd_get_locking_range_info(struct spdk_opal_dev *dev, const char *passwd, + enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user_id, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_get_locking_range_info(dev, sess, locking_range_id); + if (ret) { + SPDK_ERRLOG("get locking range info error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_enable_user(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_ADMIN1, UID_LOCKINGSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("start locking SP session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_enable_user(dev, sess, user_id); + if (ret) { + SPDK_ERRLOG("enable user error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_add_user_to_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id, + enum spdk_opal_lock_state lock_flag, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_ADMIN1, UID_LOCKINGSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("start locking SP session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_add_user_to_locking_range(dev, sess, user_id, locking_range_id, lock_flag); + if (ret) { + SPDK_ERRLOG("add user to locking range error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_set_new_passwd(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + const char *new_passwd, const char *old_passwd, bool new_user) +{ + struct opal_session *sess; + struct spdk_opal_key old_key = {}; + struct spdk_opal_key new_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&old_key, old_passwd); + if (ret != 0) { + return ret; + } + + ret = opal_init_key(&new_key, new_passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, new_user ? 
OPAL_ADMIN1 : user_id, + &old_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_new_user_passwd(dev, sess, user_id, &new_key); + if (ret) { + SPDK_ERRLOG("set new passwd error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_erase_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id, const char *password) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, password); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user_id, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_erase_locking_range(dev, sess, locking_range_id); + if (ret) { + SPDK_ERRLOG("get active key error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_secure_erase_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id, const char *password) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + struct spdk_opal_key *active_key; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, password); + if (ret != 0) { + return ret; + } + + active_key = calloc(1, sizeof(*active_key)); + if (!active_key) { + return -ENOMEM; + } + + sess = opal_alloc_session(dev); + if (!sess) { + free(active_key); + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user_id, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(active_key); + free(sess); + return ret; + } + + ret = opal_get_active_key(dev, sess, locking_range_id, active_key); + if (ret) { + SPDK_ERRLOG("get active key error %d\n", ret); + goto end; + } + + ret = opal_gen_new_active_key(dev, sess, active_key); + if (ret) { + SPDK_ERRLOG("generate new active key error %d\n", ret); + goto end; + } + memset(active_key, 0, sizeof(struct spdk_opal_key)); + +end: + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + free(active_key); + free(sess); + return ret; +} + +struct spdk_opal_d0_features_info * +spdk_opal_get_d0_features_info(struct spdk_opal_dev *dev) +{ + return &dev->feat_info; +} + +bool +spdk_opal_supported(struct spdk_opal_dev *dev) +{ + return false; +} + +struct spdk_opal_locking_range_info * +spdk_opal_get_locking_range_info(struct spdk_opal_dev *dev, enum spdk_opal_locking_range id) +{ + assert(id < SPDK_OPAL_MAX_LOCKING_RANGE); + return &dev->locking_ranges[id]; +} + +void +spdk_opal_free_locking_range_info(struct spdk_opal_dev *dev, enum spdk_opal_locking_range id) +{ + struct spdk_opal_locking_range_info *info; + + assert(id < SPDK_OPAL_MAX_LOCKING_RANGE); + info = &dev->locking_ranges[id]; + memset(info, 0, sizeof(*info)); +} + +/* Log component for opal submodule */ +SPDK_LOG_REGISTER_COMPONENT("opal", SPDK_LOG_OPAL) diff --git a/src/spdk/lib/nvme/nvme_opal_internal.h b/src/spdk/lib/nvme/nvme_opal_internal.h new file mode 100644 index 000000000..11815d435 --- /dev/null +++ 
b/src/spdk/lib/nvme/nvme_opal_internal.h @@ -0,0 +1,272 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_OPAL_INTERNAL_H +#define SPDK_OPAL_INTERNAL_H + +#include "spdk/opal_spec.h" +#include "spdk/opal.h" +#include "spdk/scsi_spec.h" + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 +#define OPAL_KEY_MAX 256 +#define OPAL_UID_LENGTH 8 + +#define GENERIC_HOST_SESSION_NUM 0x69 + +#define OPAL_INVAL_PARAM 12 + +#define SPDK_DTAERROR_NO_METHOD_STATUS 0x89 + +enum opal_token_type { + OPAL_DTA_TOKENID_BYTESTRING = 0xE0, + OPAL_DTA_TOKENID_SINT = 0xE1, + OPAL_DTA_TOKENID_UINT = 0xE2, + OPAL_DTA_TOKENID_TOKEN = 0xE3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0, +}; + +enum opal_atom_width { + OPAL_WIDTH_TINY, /* 1 byte in length */ + OPAL_WIDTH_SHORT, /* a 1-byte header and contain up to 15 bytes of data */ + OPAL_WIDTH_MEDIUM, /* a 2-byte header and contain up to 2047 bytes of data */ + OPAL_WIDTH_LONG, /* a 4-byte header and which contain up to 16,777,215 bytes of data */ + OPAL_WIDTH_TOKEN +}; + +enum opal_uid_enum { + /* users */ + UID_SMUID, + UID_THISSP, + UID_ADMINSP, + UID_LOCKINGSP, + UID_ANYBODY, + UID_SID, + UID_ADMIN1, + UID_USER1, + UID_USER2, + + /* tables */ + UID_LOCKINGRANGE_GLOBAL, + UID_LOCKINGRANGE_ACE_RDLOCKED, + UID_LOCKINGRANGE_ACE_WRLOCKED, + UID_MBRCONTROL, + UID_MBR, + UID_AUTHORITY_TABLE, + UID_C_PIN_TABLE, + UID_LOCKING_INFO_TABLE, + UID_PSID, + + /* C_PIN_TABLE object ID's */ + UID_C_PIN_MSID, + UID_C_PIN_SID, + UID_C_PIN_ADMIN1, + UID_C_PIN_USER1, + + /* half UID's (only first 4 bytes used) */ + UID_HALF_AUTHORITY_OBJ_REF, + UID_HALF_BOOLEAN_ACE, +}; + +/* enum for indexing the spdk_opal_method array */ +enum opal_method_enum { + PROPERTIES_METHOD, + STARTSESSION_METHOD, + REVERT_METHOD, + ACTIVATE_METHOD, + NEXT_METHOD, + GETACL_METHOD, + GENKEY_METHOD, + REVERTSP_METHOD, + GET_METHOD, + SET_METHOD, + AUTHENTICATE_METHOD, + RANDOM_METHOD, + ERASE_METHOD, +}; + +struct spdk_opal_key { + 
uint8_t key_len; + uint8_t key[OPAL_KEY_MAX]; +}; + +const uint8_t spdk_opal_uid[][OPAL_UID_LENGTH] = { + /* users */ + [UID_SMUID] = /* Session Manager UID */ + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff }, + [UID_THISSP] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }, + [UID_ADMINSP] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 }, + [UID_LOCKINGSP] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 }, + [UID_ANYBODY] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 }, + [UID_SID] = /* Security Identifier UID */ + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 }, + [UID_ADMIN1] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 }, + [UID_USER1] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 }, + [UID_USER2] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 }, + + /* tables */ + [UID_LOCKINGRANGE_GLOBAL] = + { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [UID_LOCKINGRANGE_ACE_RDLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, + [UID_LOCKINGRANGE_ACE_WRLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 }, + [UID_MBRCONTROL] = + { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 }, + [UID_MBR] = + { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 }, + [UID_AUTHORITY_TABLE] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00}, + [UID_C_PIN_TABLE] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00}, + [UID_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, + [UID_PSID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 }, + + /* C_PIN_TABLE object ID's */ + [UID_C_PIN_MSID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, + [UID_C_PIN_SID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01}, + [UID_C_PIN_ADMIN1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, + [UID_C_PIN_USER1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x03, 0x00, 0x01}, + + /* half UID's (only first 4 bytes used) */ + [UID_HALF_AUTHORITY_OBJ_REF] = + { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, + [UID_HALF_BOOLEAN_ACE] = + { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff }, +}; + +/* + * TCG Storage SSC Methods. 
+ */ +const uint8_t spdk_opal_method[][OPAL_UID_LENGTH] = { + [PROPERTIES_METHOD] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 }, + [STARTSESSION_METHOD] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 }, + [REVERT_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 }, + [ACTIVATE_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 }, + [NEXT_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 }, + [GETACL_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d }, + [GENKEY_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 }, + [REVERTSP_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 }, + [GET_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 }, + [SET_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 }, + [AUTHENTICATE_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c }, + [RANDOM_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, + [ERASE_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, +}; + +/* + * Response token + */ +struct spdk_opal_resp_token { + const uint8_t *pos; + uint8_t _padding[7]; + union { + uint64_t unsigned_num; + int64_t signed_num; + } stored; + size_t len; /* header + data */ + enum opal_token_type type; + enum opal_atom_width width; +}; + +struct spdk_opal_resp_parsed { + int num; + struct spdk_opal_resp_token resp_tokens[MAX_TOKS]; +}; + +/* header of a response */ +struct spdk_opal_header { + struct spdk_opal_compacket com_packet; + struct spdk_opal_packet packet; + struct spdk_opal_data_subpacket sub_packet; +}; + +struct opal_session; +struct spdk_opal_dev; + +typedef void (*opal_sess_cb)(struct opal_session *sess, int status, void *ctx); + +struct opal_session { + uint32_t hsn; + uint32_t tsn; + size_t cmd_pos; + uint8_t cmd[IO_BUFFER_LENGTH]; + uint8_t resp[IO_BUFFER_LENGTH]; + struct spdk_opal_resp_parsed parsed_resp; + + opal_sess_cb sess_cb; + void *cb_arg; + bool done; + int status; + struct spdk_opal_dev *dev; +}; + +struct spdk_opal_dev { + struct spdk_nvme_ctrlr *ctrlr; + + uint16_t comid; + + struct spdk_opal_d0_features_info feat_info; + + uint8_t max_ranges; /* max locking range number */ + struct spdk_opal_locking_range_info locking_ranges[SPDK_OPAL_MAX_LOCKING_RANGE]; +}; + +#endif diff --git a/src/spdk/lib/nvme/nvme_pcie.c b/src/spdk/lib/nvme/nvme_pcie.c new file mode 100644 index 000000000..132e34cdc --- /dev/null +++ b/src/spdk/lib/nvme/nvme_pcie.c @@ -0,0 +1,2604 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2017, IBM Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over PCIe transport + */ + +#include "spdk/stdinc.h" +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "nvme_internal.h" +#include "nvme_uevent.h" + +/* + * Number of completion queue entries to process before ringing the + * completion queue doorbell. + */ +#define NVME_MIN_COMPLETIONS (1) +#define NVME_MAX_COMPLETIONS (128) + +/* + * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL + * segment. + */ +#define NVME_MAX_SGL_DESCRIPTORS (250) + +#define NVME_MAX_PRP_LIST_ENTRIES (503) + +struct nvme_pcie_enum_ctx { + struct spdk_nvme_probe_ctx *probe_ctx; + struct spdk_pci_addr pci_addr; + bool has_pci_addr; +}; + +/* PCIe transport extensions for spdk_nvme_ctrlr */ +struct nvme_pcie_ctrlr { + struct spdk_nvme_ctrlr ctrlr; + + /** NVMe MMIO register space */ + volatile struct spdk_nvme_registers *regs; + + /** NVMe MMIO register size */ + uint64_t regs_size; + + struct { + /* BAR mapping address which contains controller memory buffer */ + void *bar_va; + + /* BAR physical address which contains controller memory buffer */ + uint64_t bar_pa; + + /* Controller memory buffer size in Bytes */ + uint64_t size; + + /* Current offset of controller memory buffer, relative to start of BAR virt addr */ + uint64_t current_offset; + + void *mem_register_addr; + size_t mem_register_size; + } cmb; + + /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */ + uint32_t doorbell_stride_u32; + + /* Opaque handle to associated PCI device. */ + struct spdk_pci_device *devhandle; + + /* Flag to indicate the MMIO register has been remapped */ + bool is_remapped; +}; + +struct nvme_tracker { + TAILQ_ENTRY(nvme_tracker) tq_list; + + struct nvme_request *req; + uint16_t cid; + + uint16_t rsvd0; + uint32_t rsvd1; + + spdk_nvme_cmd_cb cb_fn; + void *cb_arg; + + uint64_t prp_sgl_bus_addr; + + /* Don't move, metadata SGL is always contiguous with Data Block SGL */ + struct spdk_nvme_sgl_descriptor meta_sgl; + union { + uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES]; + struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS]; + } u; +}; +/* + * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary + * and so that there is no padding required to meet alignment requirements. 
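A minimal stand-alone check of the sizing rule stated in this comment; the struct below is a simplified stand-in whose field sizes are illustrative, chosen only so the arithmetic mirrors the real tracker (72 header bytes + 503 eight-byte PRP slots = 4096):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct toy_tracker {                            /* stand-in, not the real struct nvme_tracker */
        uint8_t  header[72];                    /* list links, req pointer, cid, callbacks, meta SGL, ... */
        uint64_t prp[503];                      /* one slot per NVME_MAX_PRP_LIST_ENTRIES */
};

static_assert(sizeof(struct toy_tracker) == 4096, "tracker must be exactly 4 KiB");
static_assert(offsetof(struct toy_tracker, prp) % 8 == 0, "PRP list must be qword aligned");

Because trackers are also allocated with 4 KiB alignment (see the spdk_zmalloc() call in nvme_pcie_qpair_construct() below), plain array indexing into tr[] keeps every embedded PRP list inside a single page.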
+ */ +SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K"); +SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned"); +SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, meta_sgl) & 7) == 0, "SGL must be Qword aligned"); + +struct nvme_pcie_poll_group { + struct spdk_nvme_transport_poll_group group; +}; + +/* PCIe transport extensions for spdk_nvme_qpair */ +struct nvme_pcie_qpair { + /* Submission queue tail doorbell */ + volatile uint32_t *sq_tdbl; + + /* Completion queue head doorbell */ + volatile uint32_t *cq_hdbl; + + /* Submission queue */ + struct spdk_nvme_cmd *cmd; + + /* Completion queue */ + struct spdk_nvme_cpl *cpl; + + TAILQ_HEAD(, nvme_tracker) free_tr; + TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr; + + /* Array of trackers indexed by command ID. */ + struct nvme_tracker *tr; + + uint16_t num_entries; + + uint8_t retry_count; + + uint16_t max_completions_cap; + + uint16_t last_sq_tail; + uint16_t sq_tail; + uint16_t cq_head; + uint16_t sq_head; + + struct { + uint8_t phase : 1; + uint8_t delay_cmd_submit : 1; + uint8_t has_shadow_doorbell : 1; + } flags; + + /* + * Base qpair structure. + * This is located after the hot data in this structure so that the important parts of + * nvme_pcie_qpair are in the same cache line. + */ + struct spdk_nvme_qpair qpair; + + struct { + /* Submission queue shadow tail doorbell */ + volatile uint32_t *sq_tdbl; + + /* Completion queue shadow head doorbell */ + volatile uint32_t *cq_hdbl; + + /* Submission queue event index */ + volatile uint32_t *sq_eventidx; + + /* Completion queue event index */ + volatile uint32_t *cq_eventidx; + } shadow_doorbell; + + /* + * Fields below this point should not be touched on the normal I/O path. 
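The flags.phase bit declared above drives completion detection. A minimal sketch of the intended use, with toy_ names standing in for the real polling logic and consistent with the initialization in nvme_pcie_qpair_reset() further down:

#include <stdbool.h>
#include <stdint.h>

/* A completion entry is new only while its phase bit matches the expected phase. */
static inline bool
toy_cpl_is_new(uint8_t cpl_phase_bit, uint8_t expected_phase)
{
        return cpl_phase_bit == expected_phase;
}

/* Advance the CQ head; the expected phase flips each time the queue wraps. */
static inline void
toy_advance_cq_head(uint16_t *cq_head, uint8_t *expected_phase, uint16_t num_entries)
{
        if (++(*cq_head) == num_entries) {
                *cq_head = 0;
                *expected_phase ^= 1;
        }
}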
+ */ + + bool sq_in_cmb; + + uint64_t cmd_bus_addr; + uint64_t cpl_bus_addr; + + struct spdk_nvme_cmd *sq_vaddr; + struct spdk_nvme_cpl *cq_vaddr; +}; + +static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, + struct spdk_pci_addr *pci_addr); +static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair, + const struct spdk_nvme_io_qpair_opts *opts); +static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair); + +__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL; +static uint16_t g_signal_lock; +static bool g_sigset = false; + +static void +nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx) +{ + void *map_address; + uint16_t flag = 0; + + if (!__atomic_compare_exchange_n(&g_signal_lock, &flag, 1, false, __ATOMIC_ACQUIRE, + __ATOMIC_RELAXED)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "request g_signal_lock failed\n"); + return; + } + + assert(g_thread_mmio_ctrlr != NULL); + + if (!g_thread_mmio_ctrlr->is_remapped) { + map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (map_address == MAP_FAILED) { + SPDK_ERRLOG("mmap failed\n"); + __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE); + return; + } + memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers)); + g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address; + g_thread_mmio_ctrlr->is_remapped = true; + } + __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE); +} + +static void +nvme_pcie_ctrlr_setup_signal(void) +{ + struct sigaction sa; + + sa.sa_sigaction = nvme_sigbus_fault_sighandler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + sigaction(SIGBUS, &sa, NULL); +} + +static inline struct nvme_pcie_ctrlr * +nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE); + return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr); +} + +static int +_nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) +{ + struct spdk_nvme_ctrlr *ctrlr, *tmp; + struct spdk_uevent event; + struct spdk_pci_addr pci_addr; + + if (g_spdk_nvme_driver->hotplug_fd < 0) { + return 0; + } + + while (nvme_get_uevent(g_spdk_nvme_driver->hotplug_fd, &event) > 0) { + if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO || + event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) { + if (event.action == SPDK_NVME_UEVENT_ADD) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n", + event.traddr); + if (spdk_process_is_primary()) { + if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) { + nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr); + } + } + } else if (event.action == SPDK_NVME_UEVENT_REMOVE) { + struct spdk_nvme_transport_id trid; + + memset(&trid, 0, sizeof(trid)); + spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE); + snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr); + + ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid); + if (ctrlr == NULL) { + return 0; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n", + event.traddr); + + nvme_ctrlr_fail(ctrlr, true); + + /* get the user app to clean up and stop I/O */ + if (ctrlr->remove_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + } + + /* Initiate removal of physically hotremoved PCI controllers. Even after + * they're hotremoved from the system, SPDK might still report them via RPC. 
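For context, the remove_cb invoked by the hotplug monitor above is the hot-remove callback an application registers through the public probe API. A hedged usage sketch — the app_* names are hypothetical, and the callback signatures are those declared in spdk/nvme.h:

#include "spdk/nvme.h"

static bool
app_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
             struct spdk_nvme_ctrlr_opts *opts)
{
        return true;    /* attach to every controller found */
}

static void
app_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
              struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
        /* remember ctrlr, allocate I/O qpairs, start submitting */
}

static void
app_remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
{
        /* the device was hot-removed: quiesce I/O, then detach ctrlr */
}

/* During application start-up (error handling omitted):
 *     spdk_nvme_probe(NULL, NULL, app_probe_cb, app_attach_cb, app_remove_cb);
 */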
+ */ + TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) { + bool do_remove = false; + struct nvme_pcie_ctrlr *pctrlr; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + continue; + } + + pctrlr = nvme_pcie_ctrlr(ctrlr); + if (spdk_pci_device_is_removed(pctrlr->devhandle)) { + do_remove = true; + } + + if (do_remove) { + nvme_ctrlr_fail(ctrlr, true); + if (ctrlr->remove_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + return 0; +} + +static inline struct nvme_pcie_qpair * +nvme_pcie_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE); + return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair); +} + +static volatile void * +nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + return (volatile void *)((uintptr_t)pctrlr->regs + offset); +} + +static int +nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 4); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value); + g_thread_mmio_ctrlr = NULL; + return 0; +} + +static int +nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 8); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value); + g_thread_mmio_ctrlr = NULL; + return 0; +} + +static int +nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 4); + assert(value != NULL); + g_thread_mmio_ctrlr = pctrlr; + *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset)); + g_thread_mmio_ctrlr = NULL; + if (~(*value) == 0) { + return -1; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 8); + assert(value != NULL); + g_thread_mmio_ctrlr = pctrlr; + *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset)); + g_thread_mmio_ctrlr = NULL; + if (~(*value) == 0) { + return -1; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) +{ + return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq), + value); +} + +static int +nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) +{ + return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq), + value); +} + +static int +nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa) +{ + return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw), + aqa->raw); +} + +static int +nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc) +{ + return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw), + &cmbloc->raw); +} + +static int 
+nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz) +{ + return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), + &cmbsz->raw); +} + +static uint32_t +nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* + * For commands requiring more than 2 PRP entries, one PRP will be + * embedded in the command (prp1), and the rest of the PRP entries + * will be in a list pointed to by the command (prp2). This means + * that real max number of PRP entries we support is 506+1, which + * results in a max xfer size of 506*ctrlr->page_size. + */ + return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size; +} + +static uint16_t +nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + return NVME_MAX_SGL_DESCRIPTORS; +} + +static void +nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc; + void *addr = NULL; + uint32_t bir; + union spdk_nvme_cmbsz_register cmbsz; + union spdk_nvme_cmbloc_register cmbloc; + uint64_t size, unit_size, offset, bar_size = 0, bar_phys_addr = 0; + + if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || + nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get registers failed\n"); + goto exit; + } + + if (!cmbsz.bits.sz) { + goto exit; + } + + bir = cmbloc.bits.bir; + /* Values 0 2 3 4 5 are valid for BAR */ + if (bir > 5 || bir == 1) { + goto exit; + } + + /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */ + unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu); + /* controller memory buffer size in Bytes */ + size = unit_size * cmbsz.bits.sz; + /* controller memory buffer offset from BAR in Bytes */ + offset = unit_size * cmbloc.bits.ofst; + + rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr, + &bar_phys_addr, &bar_size); + if ((rc != 0) || addr == NULL) { + goto exit; + } + + if (offset > bar_size) { + goto exit; + } + + if (size > bar_size - offset) { + goto exit; + } + + pctrlr->cmb.bar_va = addr; + pctrlr->cmb.bar_pa = bar_phys_addr; + pctrlr->cmb.size = size; + pctrlr->cmb.current_offset = offset; + + if (!cmbsz.bits.sqs) { + pctrlr->ctrlr.opts.use_cmb_sqs = false; + } + + return; +exit: + pctrlr->ctrlr.opts.use_cmb_sqs = false; + return; +} + +static int +nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc = 0; + union spdk_nvme_cmbloc_register cmbloc; + void *addr = pctrlr->cmb.bar_va; + + if (addr) { + if (pctrlr->cmb.mem_register_addr) { + spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size); + } + + if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get_cmbloc() failed\n"); + return -EIO; + } + rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr); + } + return rc; +} + +static int +nvme_pcie_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + if (pctrlr->cmb.bar_va == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); + return -ENOTSUP; + } + + if (ctrlr->opts.use_cmb_sqs) { + SPDK_ERRLOG("CMB is already in use for submission queues.\n"); + return -ENOTSUP; + } + + return 0; +} + +static void * +nvme_pcie_ctrlr_map_io_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + union spdk_nvme_cmbsz_register cmbsz; + union spdk_nvme_cmbloc_register cmbloc; + uint64_t mem_register_start, mem_register_end; + int rc; + + if (pctrlr->cmb.mem_register_addr != NULL) { + *size = pctrlr->cmb.mem_register_size; + return 
pctrlr->cmb.mem_register_addr; + } + + *size = 0; + + if (pctrlr->cmb.bar_va == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); + return NULL; + } + + if (ctrlr->opts.use_cmb_sqs) { + SPDK_ERRLOG("CMB is already in use for submission queues.\n"); + return NULL; + } + + if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || + nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get registers failed\n"); + return NULL; + } + + /* If only SQS is supported */ + if (!(cmbsz.bits.wds || cmbsz.bits.rds)) { + return NULL; + } + + /* If CMB is less than 4MiB in size then abort CMB mapping */ + if (pctrlr->cmb.size < (1ULL << 22)) { + return NULL; + } + + mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset + + VALUE_2MB - 1); + mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset + + pctrlr->cmb.size); + pctrlr->cmb.mem_register_addr = (void *)mem_register_start; + pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start; + + rc = spdk_mem_register((void *)mem_register_start, mem_register_end - mem_register_start); + if (rc) { + SPDK_ERRLOG("spdk_mem_register() failed\n"); + return NULL; + } + + pctrlr->cmb.mem_register_addr = (void *)mem_register_start; + pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start; + + *size = pctrlr->cmb.mem_register_size; + return pctrlr->cmb.mem_register_addr; +} + +static int +nvme_pcie_ctrlr_unmap_io_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + int rc; + + if (pctrlr->cmb.mem_register_addr == NULL) { + return 0; + } + + rc = spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size); + + if (rc == 0) { + pctrlr->cmb.mem_register_addr = NULL; + pctrlr->cmb.mem_register_size = 0; + } + + return rc; +} + +static int +nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc; + void *addr = NULL; + uint64_t phys_addr = 0, size = 0; + + rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr, + &phys_addr, &size); + + if ((addr == NULL) || (rc != 0)) { + SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n", + rc, addr); + return -1; + } + + pctrlr->regs = (volatile struct spdk_nvme_registers *)addr; + pctrlr->regs_size = size; + nvme_pcie_ctrlr_map_cmb(pctrlr); + + return 0; +} + +static int +nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc = 0; + void *addr = (void *)pctrlr->regs; + + if (pctrlr->ctrlr.is_removed) { + return rc; + } + + rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr); + if (rc != 0) { + SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc); + return -1; + } + + if (addr) { + /* NOTE: addr may have been remapped here. We're relying on DPDK to call + * munmap internally. 
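The 2 MiB rounding performed in nvme_pcie_ctrlr_map_io_cmb() above can be summarized by a small stand-alone helper; TOY_VALUE_2MB/TOY_2MB_FLOOR stand in for the VALUE_2MB/_2MB_PAGE macros, and spdk_mem_register() is assumed to accept only 2 MiB-aligned, 2 MiB-granular ranges:

#include <stddef.h>
#include <stdint.h>

#define TOY_VALUE_2MB           (1ULL << 21)
#define TOY_2MB_FLOOR(x)        ((x) & ~(TOY_VALUE_2MB - 1))

/* Shrink [bar_va + offset, bar_va + offset + size) inward to whole 2 MiB pages:
 * round the start up and the end down, and register only that window. The
 * caller above rejects CMBs smaller than 4 MiB, so the window is never empty. */
static inline void
toy_cmb_io_window(uintptr_t bar_va, uint64_t offset, uint64_t size,
                  uintptr_t *start, size_t *len)
{
        uintptr_t end;

        *start = TOY_2MB_FLOOR(bar_va + offset + TOY_VALUE_2MB - 1);
        end = TOY_2MB_FLOOR(bar_va + offset + size);
        *len = end - *start;
}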
+ */ + rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr); + } + return rc; +} + +static int +nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries) +{ + struct nvme_pcie_qpair *pqpair; + int rc; + + pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair == NULL) { + return -ENOMEM; + } + + pqpair->num_entries = num_entries; + pqpair->flags.delay_cmd_submit = 0; + + ctrlr->adminq = &pqpair->qpair; + + rc = nvme_qpair_init(ctrlr->adminq, + 0, /* qpair ID */ + ctrlr, + SPDK_NVME_QPRIO_URGENT, + num_entries); + if (rc != 0) { + return rc; + } + + return nvme_pcie_qpair_construct(ctrlr->adminq, NULL); +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +static int +pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) +{ + struct spdk_nvme_transport_id trid = {}; + struct nvme_pcie_enum_ctx *enum_ctx = ctx; + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_pci_addr pci_addr; + + pci_addr = spdk_pci_device_get_addr(pci_dev); + + spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE); + spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr); + + ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid); + if (!spdk_process_is_primary()) { + if (!ctrlr) { + SPDK_ERRLOG("Controller must be constructed in the primary process first.\n"); + return -1; + } + + return nvme_ctrlr_add_process(ctrlr, pci_dev); + } + + /* check whether user passes the pci_addr */ + if (enum_ctx->has_pci_addr && + (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) { + return 1; + } + + return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev); +} + +static int +nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + struct nvme_pcie_enum_ctx enum_ctx = {}; + + enum_ctx.probe_ctx = probe_ctx; + + if (strlen(probe_ctx->trid.traddr) != 0) { + if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) { + return -1; + } + enum_ctx.has_pci_addr = true; + } + + /* Only the primary process can monitor hotplug. 
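The traddr filter in pcie_nvme_enum_cb() above reduces to a parse-and-compare of PCI "domain:bus:dev.func" strings. A small sketch using the same env helpers (the toy_ name is illustrative):

#include <stdbool.h>
#include "spdk/env.h"

static bool
toy_traddr_matches(const char *traddr, struct spdk_pci_device *pci_dev)
{
        struct spdk_pci_addr want, have;

        if (spdk_pci_addr_parse(&want, traddr) != 0) {
                return false;           /* not a valid BDF string */
        }
        have = spdk_pci_device_get_addr(pci_dev);
        return spdk_pci_addr_compare(&want, &have) == 0;
}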
*/ + if (spdk_process_is_primary()) { + _nvme_pcie_hotplug_monitor(probe_ctx); + } + + if (enum_ctx.has_pci_addr == false) { + return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), + pcie_nvme_enum_cb, &enum_ctx); + } else { + return spdk_pci_device_attach(spdk_pci_nvme_get_driver(), + pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr); + } +} + +static int +nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr) +{ + struct nvme_pcie_enum_ctx enum_ctx; + + enum_ctx.probe_ctx = probe_ctx; + enum_ctx.has_pci_addr = true; + enum_ctx.pci_addr = *pci_addr; + + return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx); +} + +static struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + struct spdk_pci_device *pci_dev = devhandle; + struct nvme_pcie_ctrlr *pctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + uint16_t cmd_reg; + int rc; + struct spdk_pci_id pci_id; + + rc = spdk_pci_device_claim(pci_dev); + if (rc < 0) { + SPDK_ERRLOG("could not claim device %s (%s)\n", + trid->traddr, spdk_strerror(-rc)); + return NULL; + } + + pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pctrlr == NULL) { + spdk_pci_device_unclaim(pci_dev); + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + pctrlr->is_remapped = false; + pctrlr->ctrlr.is_removed = false; + pctrlr->devhandle = devhandle; + pctrlr->ctrlr.opts = *opts; + pctrlr->ctrlr.trid = *trid; + + rc = nvme_ctrlr_construct(&pctrlr->ctrlr); + if (rc != 0) { + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + rc = nvme_pcie_ctrlr_allocate_bars(pctrlr); + if (rc != 0) { + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + /* Enable PCI busmaster and disable INTx */ + spdk_pci_device_cfg_read16(pci_dev, &cmd_reg, 4); + cmd_reg |= 0x404; + spdk_pci_device_cfg_write16(pci_dev, cmd_reg, 4); + + if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs); + + /* Doorbell stride is 2 ^ (dstrd + 2), + * but we want multiples of 4, so drop the + 2 */ + pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd; + + pci_id = spdk_pci_device_get_id(pci_dev); + pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id); + + rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr, pctrlr->ctrlr.opts.admin_queue_size); + if (rc != 0) { + nvme_ctrlr_destruct(&pctrlr->ctrlr); + return NULL; + } + + /* Construct the primary process properties */ + rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev); + if (rc != 0) { + nvme_ctrlr_destruct(&pctrlr->ctrlr); + return NULL; + } + + if (g_sigset != true) { + nvme_pcie_ctrlr_setup_signal(); + g_sigset = true; + } + + return &pctrlr->ctrlr; +} + +static int +nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq); + union spdk_nvme_aqa_register aqa; + + if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) { + SPDK_ERRLOG("set_asq() failed\n"); + return -EIO; + } + + if 
(nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) { + SPDK_ERRLOG("set_acq() failed\n"); + return -EIO; + } + + aqa.raw = 0; + /* acqs and asqs are 0-based. */ + aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + + if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) { + SPDK_ERRLOG("set_aqa() failed\n"); + return -EIO; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr); + + if (ctrlr->adminq) { + nvme_pcie_qpair_destroy(ctrlr->adminq); + } + + nvme_ctrlr_destruct_finish(ctrlr); + + nvme_ctrlr_free_processes(ctrlr); + + nvme_pcie_ctrlr_free_bars(pctrlr); + + if (devhandle) { + spdk_pci_device_unclaim(devhandle); + spdk_pci_device_detach(devhandle); + } + + spdk_free(pctrlr); + + return 0; +} + +static void +nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr) +{ + tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp); + tr->cid = cid; + tr->req = NULL; +} + +static int +nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + uint32_t i; + + /* all head/tail vals are set to 0 */ + pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0; + + /* + * First time through the completion queue, HW will set phase + * bit on completions to 1. So set this to 1 here, indicating + * we're looking for a 1 to know which entries have completed. + * we'll toggle the bit each time when the completion queue + * rolls over. + */ + pqpair->flags.phase = 1; + for (i = 0; i < pqpair->num_entries; i++) { + pqpair->cpl[i].status.p = 0; + } + + return 0; +} + +static void * +nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment, + uint64_t *phys_addr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + uintptr_t addr; + + if (pctrlr->cmb.mem_register_addr != NULL) { + /* BAR is mapped for data */ + return NULL; + } + + addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset; + addr = (addr + (alignment - 1)) & ~(alignment - 1); + + /* CMB may only consume part of the BAR, calculate accordingly */ + if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) { + SPDK_ERRLOG("Tried to allocate past valid CMB range!\n"); + return NULL; + } + *phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va; + + pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va; + + return (void *)addr; +} + +static int +nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + uint16_t i; + volatile uint32_t *doorbell_base; + uint16_t num_trackers; + size_t page_align = sysconf(_SC_PAGESIZE); + size_t queue_align, queue_len; + uint32_t flags = SPDK_MALLOC_DMA; + uint64_t sq_paddr = 0; + uint64_t cq_paddr = 0; + + if (opts) { + pqpair->sq_vaddr = opts->sq.vaddr; + pqpair->cq_vaddr = opts->cq.vaddr; + sq_paddr = opts->sq.paddr; + cq_paddr = opts->cq.paddr; + } + + pqpair->retry_count = ctrlr->opts.transport_retry_count; + + /* + * Limit the maximum number of completions to return per call to prevent wraparound, + 
* and calculate how many trackers can be submitted at once without overflowing the + * completion queue. + */ + pqpair->max_completions_cap = pqpair->num_entries / 4; + pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS); + pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS); + num_trackers = pqpair->num_entries - pqpair->max_completions_cap; + + SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n", + pqpair->max_completions_cap, num_trackers); + + assert(num_trackers != 0); + + pqpair->sq_in_cmb = false; + + if (nvme_qpair_is_admin_queue(&pqpair->qpair)) { + flags |= SPDK_MALLOC_SHARE; + } + + /* cmd and cpl rings must be aligned on page size boundaries. */ + if (ctrlr->opts.use_cmb_sqs) { + pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd), + page_align, &pqpair->cmd_bus_addr); + if (pqpair->cmd != NULL) { + pqpair->sq_in_cmb = true; + } + } + + if (pqpair->sq_in_cmb == false) { + if (pqpair->sq_vaddr) { + pqpair->cmd = pqpair->sq_vaddr; + } else { + /* To ensure physical address contiguity we make each ring occupy + * a single hugepage only. See MAX_IO_QUEUE_ENTRIES. + */ + queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd); + queue_align = spdk_max(spdk_align32pow2(queue_len), page_align); + pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags); + if (pqpair->cmd == NULL) { + SPDK_ERRLOG("alloc qpair_cmd failed\n"); + return -ENOMEM; + } + } + if (sq_paddr) { + assert(pqpair->sq_vaddr != NULL); + pqpair->cmd_bus_addr = sq_paddr; + } else { + pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL); + if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) { + SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n"); + return -EFAULT; + } + } + } + + if (pqpair->cq_vaddr) { + pqpair->cpl = pqpair->cq_vaddr; + } else { + queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl); + queue_align = spdk_max(spdk_align32pow2(queue_len), page_align); + pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags); + if (pqpair->cpl == NULL) { + SPDK_ERRLOG("alloc qpair_cpl failed\n"); + return -ENOMEM; + } + } + if (cq_paddr) { + assert(pqpair->cq_vaddr != NULL); + pqpair->cpl_bus_addr = cq_paddr; + } else { + pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL); + if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) { + SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n"); + return -EFAULT; + } + } + + doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl; + pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32; + pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32; + + /* + * Reserve space for all of the trackers in a single allocation. + * struct nvme_tracker must be padded so that its size is already a power of 2. + * This ensures the PRP list embedded in the nvme_tracker object will not span a + * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing. 
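A worked example of the sizing computed above, using an illustrative io_queue_size of 256 and the NVME_MIN_COMPLETIONS/NVME_MAX_COMPLETIONS bounds defined at the top of this file:

#include <stdint.h>

static inline uint16_t
toy_num_trackers(uint16_t num_entries)
{
        uint16_t cap = num_entries / 4;         /* reserve a quarter of the entries as the per-poll budget */

        if (cap < 1) {                          /* NVME_MIN_COMPLETIONS */
                cap = 1;
        }
        if (cap > 128) {                        /* NVME_MAX_COMPLETIONS */
                cap = 128;
        }
        return num_entries - cap;               /* e.g. 256 - 64 = 192 trackers, so at most
                                                 * 192 commands outstanding on this qpair */
}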
+ */ + pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair->tr == NULL) { + SPDK_ERRLOG("nvme_tr failed\n"); + return -ENOMEM; + } + + TAILQ_INIT(&pqpair->free_tr); + TAILQ_INIT(&pqpair->outstanding_tr); + + for (i = 0; i < num_trackers; i++) { + tr = &pqpair->tr[i]; + nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL)); + TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); + } + + nvme_pcie_qpair_reset(qpair); + + return 0; +} + +/* Used when dst points to MMIO (i.e. CMB) in a virtual machine - in these cases we must + * not use wide instructions because QEMU will not emulate such instructions to MMIO space. + * So this function ensures we only copy 8 bytes at a time. + */ +static inline void +nvme_pcie_copy_command_mmio(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) +{ + uint64_t *dst64 = (uint64_t *)dst; + const uint64_t *src64 = (const uint64_t *)src; + uint32_t i; + + for (i = 0; i < sizeof(*dst) / 8; i++) { + dst64[i] = src64[i]; + } +} + +static inline void +nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) +{ + /* dst and src are known to be non-overlapping and 64-byte aligned. */ +#if defined(__SSE2__) + __m128i *d128 = (__m128i *)dst; + const __m128i *s128 = (const __m128i *)src; + + _mm_stream_si128(&d128[0], _mm_load_si128(&s128[0])); + _mm_stream_si128(&d128[1], _mm_load_si128(&s128[1])); + _mm_stream_si128(&d128[2], _mm_load_si128(&s128[2])); + _mm_stream_si128(&d128[3], _mm_load_si128(&s128[3])); +#else + *dst = *src; +#endif +} + +/** + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_request *active_req = req; + struct spdk_nvme_ctrlr_process *active_proc; + + /* + * The admin request is from another process. Move to the per + * process list for that process to handle it later. + */ + assert(nvme_qpair_is_admin_queue(qpair)); + assert(active_req->pid != getpid()); + + active_proc = nvme_ctrlr_get_process(ctrlr, active_req->pid); + if (active_proc) { + /* Save the original completion information */ + memcpy(&active_req->cpl, cpl, sizeof(*cpl)); + STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq); + } else { + SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n", + active_req->pid); + + nvme_free_request(active_req); + } +} + +/** + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_request *req, *tmp_req; + pid_t pid = getpid(); + struct spdk_nvme_ctrlr_process *proc; + + /* + * Check whether there is any pending admin request from + * other active processes. 
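A note on the SSE2 fast path in nvme_pcie_copy_command() above: _mm_stream_si128() issues weakly ordered non-temporal stores, so a store fence must separate writing the submission queue entry from ringing the doorbell; the spdk_wmb() in nvme_pcie_qpair_ring_sq_doorbell() below serves that purpose. A stand-alone sketch of the pairing:

#include <emmintrin.h>

/* Copy one 64-byte SQ entry with non-temporal stores, then fence so the entry
 * is globally visible before any subsequent doorbell MMIO write. */
static inline void
toy_publish_sq_entry(__m128i *dst, const __m128i *src)
{
        _mm_stream_si128(&dst[0], _mm_load_si128(&src[0]));
        _mm_stream_si128(&dst[1], _mm_load_si128(&src[1]));
        _mm_stream_si128(&dst[2], _mm_load_si128(&src[2]));
        _mm_stream_si128(&dst[3], _mm_load_si128(&src[3]));
        _mm_sfence();
}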
+ */ + assert(nvme_qpair_is_admin_queue(qpair)); + + proc = nvme_ctrlr_get_current_process(ctrlr); + if (!proc) { + SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid); + assert(proc); + return; + } + + STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { + STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); + + assert(req->pid == pid); + + nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl); + nvme_free_request(req); + } +} + +static inline int +nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) +{ + return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old); +} + +static bool +nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value, + volatile uint32_t *shadow_db, + volatile uint32_t *eventidx) +{ + uint16_t old; + + if (!shadow_db) { + return true; + } + + old = *shadow_db; + *shadow_db = value; + + /* + * Ensure that the doorbell is updated before reading the EventIdx from + * memory + */ + spdk_mb(); + + if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) { + return false; + } + + return true; +} + +static inline void +nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); + bool need_mmio = true; + + if (qpair->first_fused_submitted) { + /* This is first cmd of two fused commands - don't ring doorbell */ + qpair->first_fused_submitted = 0; + return; + } + + if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { + need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, + pqpair->sq_tail, + pqpair->shadow_doorbell.sq_tdbl, + pqpair->shadow_doorbell.sq_eventidx); + } + + if (spdk_likely(need_mmio)) { + spdk_wmb(); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail); + g_thread_mmio_ctrlr = NULL; + } +} + +static inline void +nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); + bool need_mmio = true; + + if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { + need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, + pqpair->cq_head, + pqpair->shadow_doorbell.cq_hdbl, + pqpair->shadow_doorbell.cq_eventidx); + } + + if (spdk_likely(need_mmio)) { + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head); + g_thread_mmio_ctrlr = NULL; + } +} + +static void +nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) +{ + struct nvme_request *req; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + req = tr->req; + assert(req != NULL); + + if (req->cmd.fuse == SPDK_NVME_IO_FLAGS_FUSE_FIRST) { + /* This is first cmd of two fused commands - don't ring doorbell */ + qpair->first_fused_submitted = 1; + } + + /* Don't use wide instructions to copy NVMe command, this is limited by QEMU + * virtual NVMe controller, the maximum access width is 8 Bytes for one time. + */ + if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) { + nvme_pcie_copy_command_mmio(&pqpair->cmd[pqpair->sq_tail], &req->cmd); + } else { + /* Copy the command from the tracker to the submission queue. 
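The unsigned 16-bit arithmetic in nvme_pcie_qpair_need_event() above decides whether a shadow-doorbell update must also be written to MMIO: the doorbell is rung only when the event index lies in the half-open window (old, new]. A few worked cases using a toy_ copy of the same expression:

#include <assert.h>
#include <stdint.h>

static inline int
toy_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
        return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
}

static void
toy_need_event_cases(void)
{
        assert(toy_need_event(5, 6, 4));        /* 5 lies in (4, 6]           -> MMIO write needed  */
        assert(!toy_need_event(9, 6, 4));       /* 9 lies outside (4, 6]      -> shadow update only */
        assert(toy_need_event(0, 2, 65535));    /* wrap: 0 lies in (65535, 2] -> MMIO write needed  */
}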
*/ + nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd); + } + + if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) { + pqpair->sq_tail = 0; + } + + if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) { + SPDK_ERRLOG("sq_tail is passing sq_head!\n"); + } + + if (!pqpair->flags.delay_cmd_submit) { + nvme_pcie_qpair_ring_sq_doorbell(qpair); + } +} + +static void +nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, + struct spdk_nvme_cpl *cpl, bool print_on_error) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_request *req; + bool retry, error; + bool req_from_current_proc = true; + + req = tr->req; + + assert(req != NULL); + + error = spdk_nvme_cpl_is_error(cpl); + retry = error && nvme_completion_is_retry(cpl) && + req->retries < pqpair->retry_count; + + if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) { + spdk_nvme_qpair_print_command(qpair, &req->cmd); + spdk_nvme_qpair_print_completion(qpair, cpl); + } + + assert(cpl->cid == req->cmd.cid); + + if (retry) { + req->retries++; + nvme_pcie_qpair_submit_tracker(qpair, tr); + } else { + TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list); + + /* Only check admin requests from different processes. */ + if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) { + req_from_current_proc = false; + nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl); + } else { + nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl); + } + + if (req_from_current_proc == true) { + nvme_qpair_free_request(qpair, req); + } + + tr->req = NULL; + + TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); + } +} + +static void +nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair, + struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, + bool print_on_error) +{ + struct spdk_nvme_cpl cpl; + + memset(&cpl, 0, sizeof(cpl)); + cpl.sqid = qpair->id; + cpl.cid = tr->cid; + cpl.status.sct = sct; + cpl.status.sc = sc; + cpl.status.dnr = dnr; + nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); +} + +static void +nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr, *temp, *last; + + last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head); + + /* Abort previously submitted (outstanding) trs */ + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) { + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting outstanding command\n"); + } + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); + + if (tr == last) { + break; + } + } +} + +static int +nvme_pcie_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) { + assert(tr->req != NULL); + + rc = iter_fn(tr->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static void +nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + + tr = TAILQ_FIRST(&pqpair->outstanding_tr); + while (tr != NULL) { + assert(tr->req != NULL); + if (tr->req->cmd.opc == 
SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, + SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0, + false); + tr = TAILQ_FIRST(&pqpair->outstanding_tr); + } else { + tr = TAILQ_NEXT(tr, tq_list); + } + } +} + +static void +nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + nvme_pcie_admin_qpair_abort_aers(qpair); +} + +static int +nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_pcie_admin_qpair_destroy(qpair); + } + /* + * We check sq_vaddr and cq_vaddr to see if the user specified the memory + * buffers when creating the I/O queue. + * If the user specified them, we cannot free that memory. + * Nor do we free it if it's in the CMB. + */ + if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) { + spdk_free(pqpair->cmd); + } + if (!pqpair->cq_vaddr && pqpair->cpl) { + spdk_free(pqpair->cpl); + } + if (pqpair->tr) { + spdk_free(pqpair->tr); + } + + nvme_qpair_deinit(qpair); + + spdk_free(pqpair); + + return 0; +} + +static void +nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + nvme_pcie_qpair_abort_trackers(qpair, dnr); +} + +static int +nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ; + + cmd->cdw10_bits.create_io_q.qid = io_que->id; + cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; + + cmd->cdw11_bits.create_io_cq.pc = 1; + cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ; + + cmd->cdw10_bits.create_io_q.qid = io_que->id; + cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; + cmd->cdw11_bits.create_io_sq.pc = 1; + cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio; + cmd->cdw11_bits.create_io_sq.cqid = io_que->id; + cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ; + cmd->cdw10_bits.delete_io_q.qid = qpair->id; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) 
{ + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ; + cmd->cdw10_bits.delete_io_q.qid = qpair->id; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + uint16_t qid) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_completion_poll_status *status; + int rc; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + SPDK_ERRLOG("nvme_create_io_cq failed!\n"); + if (!status->timed_out) { + free(status); + } + return -1; + } + + memset(status, 0, sizeof(*status)); + rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + SPDK_ERRLOG("nvme_create_io_sq failed!\n"); + if (status->timed_out) { + /* Request is still queued, the memory will be freed in a completion callback. + allocate a new request */ + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + } + + memset(status, 0, sizeof(*status)); + /* Attempt to delete the completion queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + /* The originall or newly allocated status structure can be freed since + * the corresponding request has been completed of failed to submit */ + free(status); + return -1; + } + nvme_wait_for_completion(ctrlr->adminq, status); + if (!status->timed_out) { + /* status can be freed regardless of nvme_wait_for_completion return value */ + free(status); + } + return -1; + } + + if (ctrlr->shadow_doorbell) { + pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) * + pctrlr->doorbell_stride_u32; + pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) * + pctrlr->doorbell_stride_u32; + pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) * + pctrlr->doorbell_stride_u32; + pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) * + pctrlr->doorbell_stride_u32; + pqpair->flags.has_shadow_doorbell = 1; + } else { + pqpair->flags.has_shadow_doorbell = 0; + } + nvme_pcie_qpair_reset(qpair); + free(status); + + return 0; +} + +static struct spdk_nvme_qpair * +nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct nvme_pcie_qpair *pqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + assert(ctrlr != NULL); + + pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair == NULL) { + return NULL; + } + + pqpair->num_entries = opts->io_queue_size; + pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit; + + qpair = &pqpair->qpair; + + rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); + if (rc != 0) { + nvme_pcie_qpair_destroy(qpair); + return NULL; + } + + rc = nvme_pcie_qpair_construct(qpair, opts); + + if (rc != 0) { + nvme_pcie_qpair_destroy(qpair); + return NULL; + } + + return qpair; 
+} + +static int +nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + if (nvme_qpair_is_admin_queue(qpair)) { + return 0; + } else { + return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); + } +} + +static void +nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ +} + +static int +nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_completion_poll_status *status; + int rc; + + assert(ctrlr != NULL); + + if (ctrlr->is_removed) { + goto free; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + /* Delete the I/O submission queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc); + free(status); + return rc; + } + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + + memset(status, 0, sizeof(*status)); + /* Delete the completion queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc); + free(status); + return rc; + } + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + free(status); + +free: + if (qpair->no_deletion_notification_needed == 0) { + /* Abort the rest of the I/O */ + nvme_pcie_qpair_abort_trackers(qpair, 1); + } + + nvme_pcie_qpair_destroy(qpair); + return 0; +} + +static void +nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) +{ + /* + * Bad vtophys translation, so abort this request and return + * immediately. + */ + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_INVALID_FIELD, + 1 /* do not retry */, true); +} + +/* + * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes. + * + * *prp_index will be updated to account for the number of PRP entries used. + */ +static inline int +nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, + uint32_t page_size) +{ + struct spdk_nvme_cmd *cmd = &tr->req->cmd; + uintptr_t page_mask = page_size - 1; + uint64_t phys_addr; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n", + *prp_index, virt_addr, (uint32_t)len); + + if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + return -EFAULT; + } + + i = *prp_index; + while (len) { + uint32_t seg_len; + + /* + * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, + * so prp_index == count is valid. 
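A worked example of the accounting this loop performs: prp1 covers from the (possibly unaligned) start of the buffer to the end of its page, and each further page consumes one PRP list entry. It assumes a 4 KiB page size; the helper is a stand-alone illustration, not part of the driver:

#include <stddef.h>
#include <stdint.h>

static inline uint32_t
toy_num_prp_entries(uintptr_t virt_addr, size_t len, uint32_t page_size)
{
        size_t first = page_size - (virt_addr & (page_size - 1));      /* bytes covered by prp1 */

        if (len <= first) {
                return 1;
        }
        /* one entry per remaining page, rounded up */
        return 1 + (uint32_t)((len - first + page_size - 1) / page_size);
}

/* e.g. a 16 KiB buffer that starts 512 bytes into a 4 KiB page needs
 * 1 + ceil((16384 - 3584) / 4096) = 1 + 4 = 5 PRP entries. */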
+ */ + if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { + SPDK_ERRLOG("out of PRP entries\n"); + return -EFAULT; + } + + phys_addr = spdk_vtophys(virt_addr, NULL); + if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) { + SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr); + return -EFAULT; + } + + if (i == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr); + cmd->dptr.prp.prp1 = phys_addr; + seg_len = page_size - ((uintptr_t)virt_addr & page_mask); + } else { + if ((phys_addr & page_mask) != 0) { + SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); + return -EFAULT; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr); + tr->u.prp[i - 1] = phys_addr; + seg_len = page_size; + } + + seg_len = spdk_min(seg_len, len); + virt_addr += seg_len; + len -= seg_len; + i++; + } + + cmd->psdt = SPDK_NVME_PSDT_PRP; + if (i <= 1) { + cmd->dptr.prp.prp2 = 0; + } else if (i == 2) { + cmd->dptr.prp.prp2 = tr->u.prp[0]; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); + } else { + cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); + } + + *prp_index = i; + return 0; +} + +static int +nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned) +{ + assert(0); + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EINVAL; +} + +/** + * Build PRP list describing physically contiguous payload buffer. + */ +static int +nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + uint32_t prp_index = 0; + int rc; + + rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, + req->payload_size, qpair->ctrlr->page_size); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + } + + return rc; +} + +/** + * Build an SGL describing a physically contiguous payload buffer. + * + * This is more efficient than using PRP because large buffers can be + * described this way. 
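nvme_pcie_prp_list_append() above follows the standard NVMe PRP rules: PRP1 may point into the middle of a page, every further entry must be page aligned, and PRP2 is either unused, the address of the second page, or a pointer to a PRP list (the i <= 1, i == 2, and i > 2 cases). A standalone, non-SPDK illustration of how many entries a transfer needs and which PRP2 form applies:

#include <stdint.h>
#include <stdio.h>

/* Count the PRP entries needed for a buffer that starts page_offset bytes
 * into a page and is len bytes long. */
static uint32_t
prp_entries_needed(uint64_t page_offset, uint64_t len, uint32_t page_size)
{
	uint64_t first_seg = page_size - page_offset;

	if (len <= first_seg) {
		return 1;
	}
	len -= first_seg;
	return 1 + (uint32_t)((len + page_size - 1) / page_size);
}

int
main(void)
{
	const uint32_t page_size = 4096;
	uint64_t lens[] = { 512, 4096, 8192, 65536 };

	for (size_t i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
		uint32_t n = prp_entries_needed(0, lens[i], page_size);

		printf("len %6llu -> %2u PRP entr%s, PRP2 = %s\n",
		       (unsigned long long)lens[i], n, n == 1 ? "y" : "ies",
		       n == 1 ? "0" : (n == 2 ? "second page" : "PRP list pointer"));
	}
	return 0;
}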
+ */ +static int +nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + void *virt_addr; + uint64_t phys_addr, mapping_length; + uint32_t length; + struct spdk_nvme_sgl_descriptor *sgl; + uint32_t nseg = 0; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + sgl = tr->u.sgl; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.subtype = 0; + + length = req->payload_size; + virt_addr = req->payload.contig_or_cb_arg + req->payload_offset; + mapping_length = length; + + while (length > 0) { + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + if (dword_aligned && ((uintptr_t)virt_addr & 3)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + phys_addr = spdk_vtophys(virt_addr, &mapping_length); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + mapping_length = spdk_min(length, mapping_length); + + length -= mapping_length; + virt_addr += mapping_length; + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + sgl->unkeyed.length = mapping_length; + sgl->address = phys_addr; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + } + + if (nseg == 1) { + /* + * The whole transfer can be described by a single SGL descriptor. + * Use the special case described by the spec where SGL1's type is Data Block. + * This means the SGL in the tracker is not used at all, so copy the first (and only) + * SGL element into SGL1. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; + req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; + } else { + /* SPDK NVMe driver supports only 1 SGL segment for now, it is enough because + * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; + req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); + } + + return 0; +} + +/** + * Build SGL list describing scattered payload buffer. + */ +static int +nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + int rc; + void *virt_addr; + uint64_t phys_addr; + uint32_t remaining_transfer_len, remaining_user_sge_len, length; + struct spdk_nvme_sgl_descriptor *sgl; + uint32_t nseg = 0; + + /* + * Build scattered payloads. 
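Both hardware-SGL builders finish with the same decision, shown here factored into a small sketch using the types and fields defined above: a single segment is inlined into SGL1 as a Data Block descriptor, while multiple segments turn SGL1 into a Last Segment descriptor pointing at the descriptor array in the tracker's DMA-able memory.

/* Sketch of the shared SGL1 tail logic from the builders above. */
static void
fill_sgl1_sketch(struct spdk_nvme_cmd *cmd, struct nvme_tracker *tr, uint32_t nseg)
{
	if (nseg == 1) {
		/* One descriptor: inline it directly into SGL1. */
		cmd->dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		cmd->dptr.sgl1.address = tr->u.sgl[0].address;
		cmd->dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* Several descriptors: SGL1 points at the tracker's list. */
		cmd->dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		cmd->dptr.sgl1.address = tr->prp_sgl_bus_addr;
		cmd->dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}
}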
+ */ + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + sgl = tr->u.sgl; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.subtype = 0; + + remaining_transfer_len = req->payload_size; + + while (remaining_transfer_len > 0) { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, + &virt_addr, &remaining_user_sge_len); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + /* Bit Bucket SGL descriptor */ + if ((uint64_t)virt_addr == UINT64_MAX) { + /* TODO: enable WRITE and COMPARE when necessary */ + if (req->cmd.opc != SPDK_NVME_OPC_READ) { + SPDK_ERRLOG("Only READ command can be supported\n"); + goto exit; + } + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + SPDK_ERRLOG("Too many SGL entries\n"); + goto exit; + } + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET; + /* If the SGL describes a destination data buffer, the length of data + * buffer shall be discarded by controller, and the length is included + * in Number of Logical Blocks (NLB) parameter. Otherwise, the length + * is not included in the NLB parameter. + */ + remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); + remaining_transfer_len -= remaining_user_sge_len; + + sgl->unkeyed.length = remaining_user_sge_len; + sgl->address = 0; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + + continue; + } + + remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); + remaining_transfer_len -= remaining_user_sge_len; + while (remaining_user_sge_len > 0) { + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + SPDK_ERRLOG("Too many SGL entries\n"); + goto exit; + } + + if (dword_aligned && ((uintptr_t)virt_addr & 3)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + goto exit; + } + + phys_addr = spdk_vtophys(virt_addr, NULL); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + goto exit; + } + + length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr)); + remaining_user_sge_len -= length; + virt_addr += length; + + if (nseg > 0 && phys_addr == + (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) { + /* extend previous entry */ + (*(sgl - 1)).unkeyed.length += length; + continue; + } + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + sgl->unkeyed.length = length; + sgl->address = phys_addr; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + } + } + + if (nseg == 1) { + /* + * The whole transfer can be described by a single SGL descriptor. + * Use the special case described by the spec where SGL1's type is Data Block. + * This means the SGL in the tracker is not used at all, so copy the first (and only) + * SGL element into SGL1. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; + req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; + } else { + /* SPDK NVMe driver supports only 1 SGL segment for now, it is enough because + * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page. 
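The "extend previous entry" branch above merges a new physical range into the previous descriptor whenever it starts exactly where the previous one ends, which keeps the descriptor count low when user SGEs happen to map to adjacent physical pages. A standalone toy version of that rule:

#include <stdint.h>
#include <stdio.h>

struct seg { uint64_t addr; uint32_t len; };

/* Append a physical range, merging it into the last descriptor when the two
 * ranges are physically contiguous. Returns the new descriptor count. */
static uint32_t
append_or_merge(struct seg *segs, uint32_t nseg, uint64_t addr, uint32_t len)
{
	if (nseg > 0 && addr == segs[nseg - 1].addr + segs[nseg - 1].len) {
		segs[nseg - 1].len += len;	/* contiguous: extend previous entry */
		return nseg;
	}
	segs[nseg].addr = addr;			/* otherwise start a new descriptor */
	segs[nseg].len = len;
	return nseg + 1;
}

int
main(void)
{
	struct seg segs[4];
	uint32_t n = 0;

	n = append_or_merge(segs, n, 0x100000, 0x1000);
	n = append_or_merge(segs, n, 0x101000, 0x1000);	/* merges with previous */
	n = append_or_merge(segs, n, 0x200000, 0x1000);	/* new descriptor */

	printf("descriptors: %u (first len 0x%x)\n", n, segs[0].len);
	return 0;
}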
+ */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; + req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); + } + + return 0; + +exit: + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; +} + +/** + * Build PRP list describing scattered payload buffer. + */ +static int +nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + int rc; + void *virt_addr; + uint32_t remaining_transfer_len, length; + uint32_t prp_index = 0; + uint32_t page_size = qpair->ctrlr->page_size; + + /* + * Build scattered payloads. + */ + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + remaining_transfer_len = req->payload_size; + while (remaining_transfer_len > 0) { + assert(req->payload.next_sge_fn != NULL); + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + length = spdk_min(remaining_transfer_len, length); + + /* + * Any incompatible sges should have been handled up in the splitting routine, + * but assert here as an additional check. + * + * All SGEs except last must end on a page boundary. + */ + assert((length == remaining_transfer_len) || + _is_page_aligned((uintptr_t)virt_addr + length, page_size)); + + rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return rc; + } + + remaining_transfer_len -= length; + } + + return 0; +} + +typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *, + bool); + +static build_req_fn const g_nvme_pcie_build_req_table[][2] = { + [NVME_PAYLOAD_TYPE_INVALID] = { + nvme_pcie_qpair_build_request_invalid, /* PRP */ + nvme_pcie_qpair_build_request_invalid /* SGL */ + }, + [NVME_PAYLOAD_TYPE_CONTIG] = { + nvme_pcie_qpair_build_contig_request, /* PRP */ + nvme_pcie_qpair_build_contig_hw_sgl_request /* SGL */ + }, + [NVME_PAYLOAD_TYPE_SGL] = { + nvme_pcie_qpair_build_prps_sgl_request, /* PRP */ + nvme_pcie_qpair_build_hw_sgl_request /* SGL */ + } +}; + +static int +nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, + bool sgl_supported, bool dword_aligned) +{ + void *md_payload; + struct nvme_request *req = tr->req; + + if (req->payload.md) { + md_payload = req->payload.md + req->md_offset; + if (dword_aligned && ((uintptr_t)md_payload & 3)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", md_payload); + goto exit; + } + + if (sgl_supported && dword_aligned) { + assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG); + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL; + tr->meta_sgl.address = spdk_vtophys(md_payload, NULL); + if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR) { + goto exit; + } + tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + tr->meta_sgl.unkeyed.length = req->md_size; + tr->meta_sgl.unkeyed.subtype = 0; + req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor); + } else { + req->cmd.mptr = spdk_vtophys(md_payload, NULL); + if (req->cmd.mptr == SPDK_VTOPHYS_ERROR) { + goto exit; + } + } + } + + return 0; + +exit: + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EINVAL; +} + +static int 
+nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + struct nvme_tracker *tr; + int rc = 0; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + enum nvme_payload_type payload_type; + bool sgl_supported; + bool dword_aligned = true; + + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + tr = TAILQ_FIRST(&pqpair->free_tr); + + if (tr == NULL) { + /* Inform the upper layer to try again later. */ + rc = -EAGAIN; + goto exit; + } + + TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */ + TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list); + tr->req = req; + tr->cb_fn = req->cb_fn; + tr->cb_arg = req->cb_arg; + req->cmd.cid = tr->cid; + + if (req->payload_size != 0) { + payload_type = nvme_payload_type(&req->payload); + /* According to the specification, PRPs shall be used for all + * Admin commands for NVMe over PCIe implementations. + */ + sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 && + !nvme_qpair_is_admin_queue(qpair); + + if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) { + dword_aligned = false; + } + rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned); + if (rc < 0) { + goto exit; + } + + rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, dword_aligned); + if (rc < 0) { + goto exit; + } + } + + nvme_pcie_qpair_submit_tracker(qpair, tr); + +exit: + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return rc; +} + +static void +nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct nvme_tracker *tr, *tmp; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. */ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) { + assert(tr->req != NULL); + + if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. + */ + break; + } + } +} + +static int32_t +nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + struct spdk_nvme_cpl *cpl, *next_cpl; + uint32_t num_completions = 0; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + uint16_t next_cq_head; + uint8_t next_phase; + bool next_is_valid = false; + + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + if (max_completions == 0 || max_completions > pqpair->max_completions_cap) { + /* + * max_completions == 0 means unlimited, but complete at most + * max_completions_cap batch of I/O at a time so that the completion + * queue doorbells don't wrap around. 
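For reference, the builder selection in nvme_pcie_qpair_submit_request() above can be read as a small pure function: admin commands always take the PRP column of the table, I/O commands take the SGL column only when the controller advertises SGL support, and dword alignment is enforced unless SGL support without the alignment requirement is present. A sketch using only names already defined in this file:

/* Sketch of the request-builder selection performed above. */
static build_req_fn
choose_builder_sketch(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
		      struct nvme_request *req, bool *dword_aligned)
{
	enum nvme_payload_type payload_type = nvme_payload_type(&req->payload);
	bool sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
			     !nvme_qpair_is_admin_queue(qpair);

	/* Alignment may be relaxed only for SGL-capable controllers that do not
	 * set SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT. */
	*dword_aligned = !(sgl_supported &&
			   !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT));

	return g_nvme_pcie_build_req_table[payload_type][sgl_supported ? 1 : 0];
}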
+ */ + max_completions = pqpair->max_completions_cap; + } + + while (1) { + cpl = &pqpair->cpl[pqpair->cq_head]; + + if (!next_is_valid && cpl->status.p != pqpair->flags.phase) { + break; + } + + if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) { + next_cq_head = pqpair->cq_head + 1; + next_phase = pqpair->flags.phase; + } else { + next_cq_head = 0; + next_phase = !pqpair->flags.phase; + } + next_cpl = &pqpair->cpl[next_cq_head]; + next_is_valid = (next_cpl->status.p == next_phase); + if (next_is_valid) { + __builtin_prefetch(&pqpair->tr[next_cpl->cid]); + } + +#ifdef __PPC64__ + /* + * This memory barrier prevents reordering of: + * - load after store from/to tr + * - load after load cpl phase and cpl cid + */ + spdk_mb(); +#elif defined(__aarch64__) + __asm volatile("dmb oshld" ::: "memory"); +#endif + + if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) { + pqpair->cq_head = 0; + pqpair->flags.phase = !pqpair->flags.phase; + } + + tr = &pqpair->tr[cpl->cid]; + /* Prefetch the req's STAILQ_ENTRY since we'll need to access it + * as part of putting the req back on the qpair's free list. + */ + __builtin_prefetch(&tr->req->stailq); + pqpair->sq_head = cpl->sqhd; + + if (tr->req) { + nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true); + } else { + SPDK_ERRLOG("cpl does not map to outstanding cmd\n"); + spdk_nvme_qpair_print_completion(qpair, cpl); + assert(0); + } + + if (++num_completions == max_completions) { + break; + } + } + + if (num_completions > 0) { + nvme_pcie_qpair_ring_cq_doorbell(qpair); + } + + if (pqpair->flags.delay_cmd_submit) { + if (pqpair->last_sq_tail != pqpair->sq_tail) { + nvme_pcie_qpair_ring_sq_doorbell(qpair); + pqpair->last_sq_tail = pqpair->sq_tail; + } + } + + if (spdk_unlikely(ctrlr->timeout_enabled)) { + /* + * User registered for timeout callback + */ + nvme_pcie_qpair_check_timeout(qpair); + } + + /* Before returning, complete any pending admin request. 
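The completion loop above relies on the NVMe phase-bit protocol: a completion entry is valid only while its phase bit matches the phase the host currently expects, and the expected phase flips every time the head wraps around the ring. A standalone miniature of that consumer logic:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RING_ENTRIES 4

struct cpl_entry { uint16_t cid; uint8_t phase; };

struct consumer {
	struct cpl_entry ring[RING_ENTRIES];
	uint16_t head;
	uint8_t phase;	/* starts at 1, like pqpair->flags.phase */
};

/* Consume one completion if a valid entry is present at the head. */
static bool
consume_one(struct consumer *c, uint16_t *cid)
{
	struct cpl_entry *e = &c->ring[c->head];

	if (e->phase != c->phase) {
		return false;		/* not yet posted by the device */
	}
	*cid = e->cid;
	if (++c->head == RING_ENTRIES) {
		c->head = 0;
		c->phase = !c->phase;	/* wrapped: expect the opposite phase */
	}
	return true;
}

int
main(void)
{
	struct consumer c = { .phase = 1 };
	uint16_t cid;

	/* The device posts two completions with the current phase. */
	c.ring[0] = (struct cpl_entry){ .cid = 7, .phase = 1 };
	c.ring[1] = (struct cpl_entry){ .cid = 9, .phase = 1 };

	while (consume_one(&c, &cid)) {
		printf("completed cid %u\n", cid);
	}
	return 0;
}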
*/ + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_pcie_qpair_complete_pending_admin_request(qpair); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return num_completions; +} + +static struct spdk_nvme_transport_poll_group * +nvme_pcie_poll_group_create(void) +{ + struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group)); + + if (group == NULL) { + SPDK_ERRLOG("Unable to allocate poll group.\n"); + return NULL; + } + + return &group->group; +} + +static int +nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int64_t +nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + int32_t local_completions = 0; + int64_t total_completions = 0; + + STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + } + + STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) { + local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair); + if (local_completions < 0) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + local_completions = 0; + } + total_completions += local_completions; + } + + return total_completions; +} + +static int +nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { + return -EBUSY; + } + + free(tgroup); + + return 0; +} + +static struct spdk_pci_id nvme_pci_driver_id[] = { + { + .class_id = SPDK_PCI_CLASS_NVME, + .vendor_id = SPDK_PCI_ANY_ID, + .device_id = SPDK_PCI_ANY_ID, + .subvendor_id = SPDK_PCI_ANY_ID, + .subdevice_id = SPDK_PCI_ANY_ID, + }, + { .vendor_id = 0, /* sentinel */ }, +}; + +SPDK_PCI_DRIVER_REGISTER("nvme", nvme_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); + +const struct spdk_nvme_transport_ops pcie_ops = { + .name = "PCIE", + .type = SPDK_NVME_TRANSPORT_PCIE, + .ctrlr_construct = nvme_pcie_ctrlr_construct, + .ctrlr_scan = nvme_pcie_ctrlr_scan, + .ctrlr_destruct = nvme_pcie_ctrlr_destruct, + .ctrlr_enable = nvme_pcie_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_pcie_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_pcie_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_pcie_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_pcie_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_pcie_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_pcie_ctrlr_get_max_sges, + + .ctrlr_reserve_cmb = nvme_pcie_ctrlr_reserve_cmb, + .ctrlr_map_cmb = nvme_pcie_ctrlr_map_io_cmb, + .ctrlr_unmap_cmb = nvme_pcie_ctrlr_unmap_io_cmb, + + .ctrlr_create_io_qpair = nvme_pcie_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_pcie_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_pcie_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_pcie_ctrlr_disconnect_qpair, + + .qpair_abort_reqs = nvme_pcie_qpair_abort_reqs, + .qpair_reset = nvme_pcie_qpair_reset, + .qpair_submit_request = 
nvme_pcie_qpair_submit_request, + .qpair_process_completions = nvme_pcie_qpair_process_completions, + .qpair_iterate_requests = nvme_pcie_qpair_iterate_requests, + .admin_qpair_abort_aers = nvme_pcie_admin_qpair_abort_aers, + + .poll_group_create = nvme_pcie_poll_group_create, + .poll_group_connect_qpair = nvme_pcie_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_pcie_poll_group_disconnect_qpair, + .poll_group_add = nvme_pcie_poll_group_add, + .poll_group_remove = nvme_pcie_poll_group_remove, + .poll_group_process_completions = nvme_pcie_poll_group_process_completions, + .poll_group_destroy = nvme_pcie_poll_group_destroy, +}; + +SPDK_NVME_TRANSPORT_REGISTER(pcie, &pcie_ops); diff --git a/src/spdk/lib/nvme/nvme_poll_group.c b/src/spdk/lib/nvme/nvme_poll_group.c new file mode 100644 index 000000000..291f55e63 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_poll_group.c @@ -0,0 +1,164 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include "nvme_internal.h" + +struct spdk_nvme_poll_group * +spdk_nvme_poll_group_create(void *ctx) +{ + struct spdk_nvme_poll_group *group; + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + return NULL; + } + + group->ctx = ctx; + STAILQ_INIT(&group->tgroups); + + return group; +} + +int +spdk_nvme_poll_group_add(struct spdk_nvme_poll_group *group, struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + const struct spdk_nvme_transport *transport; + + if (nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTED) { + return -EINVAL; + } + + STAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + break; + } + } + + /* See if a new transport has been added (dlopen style) and we need to update the poll group */ + if (!tgroup) { + transport = nvme_get_first_transport(); + while (transport != NULL) { + if (transport == qpair->transport) { + tgroup = nvme_transport_poll_group_create(transport); + if (tgroup == NULL) { + return -ENOMEM; + } + tgroup->group = group; + STAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + break; + } + transport = nvme_get_next_transport(transport); + } + } + + return tgroup ? nvme_transport_poll_group_add(tgroup, qpair) : -ENODEV; +} + +int +spdk_nvme_poll_group_remove(struct spdk_nvme_poll_group *group, struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + + STAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + return nvme_transport_poll_group_remove(tgroup, qpair); + } + } + + return -ENODEV; +} + +int +nvme_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + return nvme_transport_poll_group_connect_qpair(qpair); +} + +int +nvme_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + return nvme_transport_poll_group_disconnect_qpair(qpair); +} + +int64_t +spdk_nvme_poll_group_process_completions(struct spdk_nvme_poll_group *group, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_transport_poll_group *tgroup; + int64_t local_completions = 0, error_reason = 0, num_completions = 0; + + if (disconnected_qpair_cb == NULL) { + return -EINVAL; + } + + STAILQ_FOREACH(tgroup, &group->tgroups, link) { + local_completions = nvme_transport_poll_group_process_completions(tgroup, completions_per_qpair, + disconnected_qpair_cb); + if (local_completions < 0 && error_reason == 0) { + error_reason = local_completions; + } else { + num_completions += local_completions; + /* Just to be safe */ + assert(num_completions >= 0); + } + } + + return error_reason ? error_reason : num_completions; +} + +void * +spdk_nvme_poll_group_get_ctx(struct spdk_nvme_poll_group *group) +{ + return group->ctx; +} + +int +spdk_nvme_poll_group_destroy(struct spdk_nvme_poll_group *group) +{ + struct spdk_nvme_transport_poll_group *tgroup, *tmp_tgroup; + + STAILQ_FOREACH_SAFE(tgroup, &group->tgroups, link, tmp_tgroup) { + STAILQ_REMOVE(&group->tgroups, tgroup, spdk_nvme_transport_poll_group, link); + if (nvme_transport_poll_group_destroy(tgroup) != 0) { + STAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + return -EBUSY; + } + + } + + free(group); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_qpair.c b/src/spdk/lib/nvme/nvme_qpair.c new file mode 100644 index 000000000..a3fdc2169 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_qpair.c @@ -0,0 +1,1064 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
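A hypothetical application-side sketch of the poll group API defined above. It assumes the qpair is still disconnected when added (the add path rejects anything else) and that deferred connection is available in the release being used; spdk_nvme_ctrlr_connect_io_qpair() is mentioned only as an assumption, since it lives outside this file.

#include "spdk/nvme.h"

static void
pg_disconnected_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	/* Called for every disconnected qpair found while polling the group. */
	(void)qpair;
	(void)poll_group_ctx;
}

static int
poll_group_example(struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_poll_group *group;
	int64_t completions;
	int rc;

	group = spdk_nvme_poll_group_create(NULL /* ctx */);
	if (group == NULL) {
		return -ENOMEM;
	}

	/* The qpair must still be disconnected here; connect it afterwards,
	 * e.g. with spdk_nvme_ctrlr_connect_io_qpair() where deferred
	 * connection is supported. */
	rc = spdk_nvme_poll_group_add(group, qpair);
	if (rc != 0) {
		spdk_nvme_poll_group_destroy(group);
		return rc;
	}

	/* Typically run from a poller; 0 means no per-qpair completion limit. */
	completions = spdk_nvme_poll_group_process_completions(group, 0, pg_disconnected_cb);
	(void)completions;

	/* Teardown: destroy returns -EBUSY while qpairs remain, so remove the
	 * qpair from the group first. */
	spdk_nvme_poll_group_remove(group, qpair);
	return spdk_nvme_poll_group_destroy(group);
}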
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" +#include "spdk/nvme_ocssd.h" + +#define NVME_CMD_DPTR_STR_SIZE 256 + +static int nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req); + +struct nvme_string { + uint16_t value; + const char *str; +}; + +static const struct nvme_string admin_opcode[] = { + { SPDK_NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" }, + { SPDK_NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" }, + { SPDK_NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" }, + { SPDK_NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" }, + { SPDK_NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" }, + { SPDK_NVME_OPC_IDENTIFY, "IDENTIFY" }, + { SPDK_NVME_OPC_ABORT, "ABORT" }, + { SPDK_NVME_OPC_SET_FEATURES, "SET FEATURES" }, + { SPDK_NVME_OPC_GET_FEATURES, "GET FEATURES" }, + { SPDK_NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" }, + { SPDK_NVME_OPC_NS_MANAGEMENT, "NAMESPACE MANAGEMENT" }, + { SPDK_NVME_OPC_FIRMWARE_COMMIT, "FIRMWARE COMMIT" }, + { SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" }, + { SPDK_NVME_OPC_DEVICE_SELF_TEST, "DEVICE SELF-TEST" }, + { SPDK_NVME_OPC_NS_ATTACHMENT, "NAMESPACE ATTACHMENT" }, + { SPDK_NVME_OPC_KEEP_ALIVE, "KEEP ALIVE" }, + { SPDK_NVME_OPC_DIRECTIVE_SEND, "DIRECTIVE SEND" }, + { SPDK_NVME_OPC_DIRECTIVE_RECEIVE, "DIRECTIVE RECEIVE" }, + { SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT, "VIRTUALIZATION MANAGEMENT" }, + { SPDK_NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" }, + { SPDK_NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" }, + { SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" }, + { SPDK_NVME_OPC_FABRIC, "FABRIC" }, + { SPDK_NVME_OPC_FORMAT_NVM, "FORMAT NVM" }, + { SPDK_NVME_OPC_SECURITY_SEND, "SECURITY SEND" }, + { SPDK_NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" }, + { SPDK_NVME_OPC_SANITIZE, "SANITIZE" }, + { SPDK_NVME_OPC_GET_LBA_STATUS, "GET LBA STATUS" }, + { SPDK_OCSSD_OPC_GEOMETRY, "OCSSD / GEOMETRY" }, + { 0xFFFF, "ADMIN COMMAND" } +}; + +static const struct nvme_string fabric_opcode[] = { + { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET, "PROPERTY SET" }, + { 
SPDK_NVMF_FABRIC_COMMAND_CONNECT, "CONNECT" }, + { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET, "PROPERTY GET" }, + { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_SEND, "AUTHENTICATION SEND" }, + { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_RECV, "AUTHENTICATION RECV" }, + { 0xFFFF, "RESERVED / VENDOR SPECIFIC" } +}; + +static const struct nvme_string feat_opcode[] = { + { SPDK_NVME_FEAT_ARBITRATION, "ARBITRATION" }, + { SPDK_NVME_FEAT_POWER_MANAGEMENT, "POWER MANAGEMENT" }, + { SPDK_NVME_FEAT_LBA_RANGE_TYPE, "LBA RANGE TYPE" }, + { SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD, "TEMPERATURE THRESHOLD" }, + { SPDK_NVME_FEAT_ERROR_RECOVERY, "ERROR_RECOVERY" }, + { SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE, "VOLATILE WRITE CACHE" }, + { SPDK_NVME_FEAT_NUMBER_OF_QUEUES, "NUMBER OF QUEUES" }, + { SPDK_NVME_FEAT_INTERRUPT_COALESCING, "INTERRUPT COALESCING" }, + { SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION, "INTERRUPT VECTOR CONFIGURATION" }, + { SPDK_NVME_FEAT_WRITE_ATOMICITY, "WRITE ATOMICITY" }, + { SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, "ASYNC EVENT CONFIGURATION" }, + { SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION, "AUTONOMOUS POWER STATE TRANSITION" }, + { SPDK_NVME_FEAT_HOST_MEM_BUFFER, "HOST MEM BUFFER" }, + { SPDK_NVME_FEAT_TIMESTAMP, "TIMESTAMP" }, + { SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, "KEEP ALIVE TIMER" }, + { SPDK_NVME_FEAT_HOST_CONTROLLED_THERMAL_MANAGEMENT, "HOST CONTROLLED THERMAL MANAGEMENT" }, + { SPDK_NVME_FEAT_NON_OPERATIONAL_POWER_STATE_CONFIG, "NON OPERATIONAL POWER STATE CONFIG" }, + { SPDK_NVME_FEAT_SOFTWARE_PROGRESS_MARKER, "SOFTWARE PROGRESS MARKER" }, + { SPDK_NVME_FEAT_HOST_IDENTIFIER, "HOST IDENTIFIER" }, + { SPDK_NVME_FEAT_HOST_RESERVE_MASK, "HOST RESERVE MASK" }, + { SPDK_NVME_FEAT_HOST_RESERVE_PERSIST, "HOST RESERVE PERSIST" }, + { 0xFFFF, "RESERVED" } +}; + +static const struct nvme_string io_opcode[] = { + { SPDK_NVME_OPC_FLUSH, "FLUSH" }, + { SPDK_NVME_OPC_WRITE, "WRITE" }, + { SPDK_NVME_OPC_READ, "READ" }, + { SPDK_NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" }, + { SPDK_NVME_OPC_COMPARE, "COMPARE" }, + { SPDK_NVME_OPC_WRITE_ZEROES, "WRITE ZEROES" }, + { SPDK_NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" }, + { SPDK_NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" }, + { SPDK_NVME_OPC_RESERVATION_REPORT, "RESERVATION REPORT" }, + { SPDK_NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" }, + { SPDK_NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" }, + { SPDK_OCSSD_OPC_VECTOR_RESET, "OCSSD / VECTOR RESET" }, + { SPDK_OCSSD_OPC_VECTOR_WRITE, "OCSSD / VECTOR WRITE" }, + { SPDK_OCSSD_OPC_VECTOR_READ, "OCSSD / VECTOR READ" }, + { SPDK_OCSSD_OPC_VECTOR_COPY, "OCSSD / VECTOR COPY" }, + { 0xFFFF, "IO COMMAND" } +}; + +static const struct nvme_string sgl_type[] = { + { SPDK_NVME_SGL_TYPE_DATA_BLOCK, "DATA BLOCK" }, + { SPDK_NVME_SGL_TYPE_BIT_BUCKET, "BIT BUCKET" }, + { SPDK_NVME_SGL_TYPE_SEGMENT, "SEGMENT" }, + { SPDK_NVME_SGL_TYPE_LAST_SEGMENT, "LAST SEGMENT" }, + { SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK, "TRANSPORT DATA BLOCK" }, + { SPDK_NVME_SGL_TYPE_VENDOR_SPECIFIC, "VENDOR SPECIFIC" }, + { 0xFFFF, "RESERVED" } +}; + +static const struct nvme_string sgl_subtype[] = { + { SPDK_NVME_SGL_SUBTYPE_ADDRESS, "ADDRESS" }, + { SPDK_NVME_SGL_SUBTYPE_OFFSET, "OFFSET" }, + { SPDK_NVME_SGL_SUBTYPE_TRANSPORT, "TRANSPORT" }, + { SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY, "INVALIDATE KEY" }, + { 0xFFFF, "RESERVED" } +}; + +static const char * +nvme_get_string(const struct nvme_string *strings, uint16_t value) +{ + const struct nvme_string *entry; + + entry = strings; + + while 
(entry->value != 0xFFFF) { + if (entry->value == value) { + return entry->str; + } + entry++; + } + return entry->str; +} + +static void +nvme_get_sgl_unkeyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + + snprintf(buf, size, " len:0x%x", sgl->unkeyed.length); +} + +static void +nvme_get_sgl_keyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + + snprintf(buf, size, " len:0x%x key:0x%x", sgl->keyed.length, sgl->keyed.key); +} + +static void +nvme_get_sgl(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + int c; + + c = snprintf(buf, size, "SGL %s %s 0x%" PRIx64, nvme_get_string(sgl_type, sgl->generic.type), + nvme_get_string(sgl_subtype, sgl->generic.subtype), sgl->address); + assert(c >= 0 && (size_t)c < size); + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK) { + nvme_get_sgl_unkeyed(buf + c, size - c, cmd); + } + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { + nvme_get_sgl_keyed(buf + c, size - c, cmd); + } +} + +static void +nvme_get_prp(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + snprintf(buf, size, "PRP1 0x%" PRIx64 " PRP2 0x%" PRIx64, cmd->dptr.prp.prp1, cmd->dptr.prp.prp2); +} + +static void +nvme_get_dptr(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + if (spdk_nvme_opc_get_data_transfer(cmd->opc) != SPDK_NVME_DATA_NONE) { + switch (cmd->psdt) { + case SPDK_NVME_PSDT_PRP: + nvme_get_prp(buf, size, cmd); + break; + case SPDK_NVME_PSDT_SGL_MPTR_CONTIG: + case SPDK_NVME_PSDT_SGL_MPTR_SGL: + nvme_get_sgl(buf, size, cmd); + break; + default: + ; + } + } +} + +static void +nvme_admin_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvmf_capsule_cmd *fcmd = (void *)cmd; + char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'}; + + assert(cmd != NULL); + + nvme_get_dptr(dptr, sizeof(dptr), cmd); + + switch ((int)cmd->opc) { + case SPDK_NVME_OPC_SET_FEATURES: + case SPDK_NVME_OPC_GET_FEATURES: + SPDK_NOTICELOG("%s %s cid:%d cdw10:%08x %s\n", + nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(feat_opcode, + cmd->cdw10_bits.set_features.fid), cmd->cid, cmd->cdw10, dptr); + break; + case SPDK_NVME_OPC_FABRIC: + SPDK_NOTICELOG("%s %s qid:%d cid:%d %s\n", + nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(fabric_opcode, fcmd->fctype), qid, + fcmd->cid, dptr); + break; + default: + SPDK_NOTICELOG("%s (%02x) qid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x %s\n", + nvme_get_string(admin_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid, cmd->cdw10, + cmd->cdw11, dptr); + } +} + +static void +nvme_io_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'}; + + assert(cmd != NULL); + + nvme_get_dptr(dptr, sizeof(dptr), cmd); + + switch ((int)cmd->opc) { + case SPDK_NVME_OPC_WRITE: + case SPDK_NVME_OPC_READ: + case SPDK_NVME_OPC_WRITE_UNCORRECTABLE: + case SPDK_NVME_OPC_COMPARE: + SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d " + "lba:%llu len:%d %s\n", + nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid, + ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10, + (cmd->cdw12 & 0xFFFF) + 1, dptr); + break; + case SPDK_NVME_OPC_FLUSH: + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d\n", + nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid); + break; + default: + SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%d\n", + 
nvme_get_string(io_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid); + break; + } +} + +void +spdk_nvme_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + assert(cmd != NULL); + + if (qid == 0 || cmd->opc == SPDK_NVME_OPC_FABRIC) { + nvme_admin_qpair_print_command(qid, cmd); + } else { + nvme_io_qpair_print_command(qid, cmd); + } +} + +void +spdk_nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd *cmd) +{ + assert(qpair != NULL); + assert(cmd != NULL); + + spdk_nvme_print_command(qpair->id, cmd); +} + +static const struct nvme_string generic_status[] = { + { SPDK_NVME_SC_SUCCESS, "SUCCESS" }, + { SPDK_NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, + { SPDK_NVME_SC_INVALID_FIELD, "INVALID FIELD" }, + { SPDK_NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, + { SPDK_NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, + { SPDK_NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, + { SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, + { SPDK_NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, + { SPDK_NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, + { SPDK_NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, + { SPDK_NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, + { SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, + { SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, + { SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR, "INVALID SGL SEGMENT DESCRIPTOR" }, + { SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS, "INVALID NUMBER OF SGL DESCRIPTORS" }, + { SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" }, + { SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" }, + { SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" }, + { SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF, "INVALID CONTROLLER MEMORY BUFFER" }, + { SPDK_NVME_SC_INVALID_PRP_OFFSET, "INVALID PRP OFFSET" }, + { SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" }, + { SPDK_NVME_SC_OPERATION_DENIED, "OPERATION DENIED" }, + { SPDK_NVME_SC_INVALID_SGL_OFFSET, "INVALID SGL OFFSET" }, + { SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT, "HOSTID INCONSISTENT FORMAT" }, + { SPDK_NVME_SC_KEEP_ALIVE_EXPIRED, "KEEP ALIVE EXPIRED" }, + { SPDK_NVME_SC_KEEP_ALIVE_INVALID, "KEEP ALIVE INVALID" }, + { SPDK_NVME_SC_ABORTED_PREEMPT, "ABORTED - PREEMPT AND ABORT" }, + { SPDK_NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" }, + { SPDK_NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" }, + { SPDK_NVME_SC_SGL_DATA_BLOCK_GRANULARITY_INVALID, "DATA BLOCK GRANULARITY INVALID" }, + { SPDK_NVME_SC_COMMAND_INVALID_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" }, + { SPDK_NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, + { SPDK_NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, + { SPDK_NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, + { SPDK_NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" }, + { SPDK_NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" }, + { 0xFFFF, "GENERIC" } +}; + +static const struct nvme_string command_specific_status[] = { + { SPDK_NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, + { SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, + { SPDK_NVME_SC_INVALID_QUEUE_SIZE, "INVALID QUEUE SIZE" }, + { SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, + { SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, + { SPDK_NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, + { 
SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, + { SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, + { SPDK_NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, + { SPDK_NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, + { SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET, "FIRMWARE REQUIRES CONVENTIONAL RESET" }, + { SPDK_NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" }, + { SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE, "FEATURE ID NOT SAVEABLE" }, + { SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" }, + { SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" }, + { SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET, "FIRMWARE REQUIRES NVM RESET" }, + { SPDK_NVME_SC_FIRMWARE_REQ_RESET, "FIRMWARE REQUIRES RESET" }, + { SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION, "FIRMWARE REQUIRES MAX TIME VIOLATION" }, + { SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" }, + { SPDK_NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" }, + { SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" }, + { SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE, "NAMESPACE ID UNAVAILABLE" }, + { SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" }, + { SPDK_NVME_SC_NAMESPACE_IS_PRIVATE, "NAMESPACE IS PRIVATE" }, + { SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED, "NAMESPACE NOT ATTACHED" }, + { SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED, "THINPROVISIONING NOT SUPPORTED" }, + { SPDK_NVME_SC_CONTROLLER_LIST_INVALID, "CONTROLLER LIST INVALID" }, + { SPDK_NVME_SC_DEVICE_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" }, + { SPDK_NVME_SC_BOOT_PARTITION_WRITE_PROHIBITED, "BOOT PARTITION WRITE PROHIBITED" }, + { SPDK_NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER ID" }, + { SPDK_NVME_SC_INVALID_SECONDARY_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" }, + { SPDK_NVME_SC_INVALID_NUM_CTRLR_RESOURCES, "INVALID NUMBER OF CONTROLLER RESOURCES" }, + { SPDK_NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" }, + { SPDK_NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, + { SPDK_NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, + { SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_RANGE, "WRITE TO RO RANGE" }, + { 0xFFFF, "COMMAND SPECIFIC" } +}; + +static const struct nvme_string media_error_status[] = { + { SPDK_NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, + { SPDK_NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, + { SPDK_NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, + { SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, + { SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, + { SPDK_NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, + { SPDK_NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, + { SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" }, + { SPDK_OCSSD_SC_OFFLINE_CHUNK, "RESET OFFLINE CHUNK" }, + { SPDK_OCSSD_SC_INVALID_RESET, "INVALID RESET" }, + { SPDK_OCSSD_SC_WRITE_FAIL_WRITE_NEXT_UNIT, "WRITE FAIL WRITE NEXT UNIT" }, + { SPDK_OCSSD_SC_WRITE_FAIL_CHUNK_EARLY_CLOSE, "WRITE FAIL CHUNK EARLY CLOSE" }, + { SPDK_OCSSD_SC_OUT_OF_ORDER_WRITE, "OUT OF ORDER WRITE" }, + { SPDK_OCSSD_SC_READ_HIGH_ECC, "READ HIGH ECC" }, + { 0xFFFF, "MEDIA ERROR" } +}; + +static const struct nvme_string path_status[] = { + { SPDK_NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" }, + { SPDK_NVME_SC_CONTROLLER_PATH_ERROR, "CONTROLLER PATH ERROR" }, + { SPDK_NVME_SC_HOST_PATH_ERROR, "HOST PATH ERROR" }, + { SPDK_NVME_SC_ABORTED_BY_HOST, "ABORTED BY HOST" }, + 
{ 0xFFFF, "PATH ERROR" } +}; + +const char * +spdk_nvme_cpl_get_status_string(const struct spdk_nvme_status *status) +{ + const struct nvme_string *entry; + + switch (status->sct) { + case SPDK_NVME_SCT_GENERIC: + entry = generic_status; + break; + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + entry = command_specific_status; + break; + case SPDK_NVME_SCT_MEDIA_ERROR: + entry = media_error_status; + break; + case SPDK_NVME_SCT_PATH: + entry = path_status; + break; + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + return "VENDOR SPECIFIC"; + default: + return "RESERVED"; + } + + return nvme_get_string(entry, status->sc); +} + +void +spdk_nvme_print_completion(uint16_t qid, struct spdk_nvme_cpl *cpl) +{ + assert(cpl != NULL); + + /* Check that sqid matches qid. Note that sqid is reserved + * for fabrics so don't print an error when sqid is 0. */ + if (cpl->sqid != qid && cpl->sqid != 0) { + SPDK_ERRLOG("sqid %u doesn't match qid\n", cpl->sqid); + } + + SPDK_NOTICELOG("%s (%02x/%02x) qid:%d cid:%d cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n", + spdk_nvme_cpl_get_status_string(&cpl->status), + cpl->status.sct, cpl->status.sc, qid, cpl->cid, cpl->cdw0, + cpl->sqhd, cpl->status.p, cpl->status.m, cpl->status.dnr); +} + +void +spdk_nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cpl *cpl) +{ + spdk_nvme_print_completion(qpair->id, cpl); +} + +bool +nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl) +{ + /* + * TODO: spec is not clear how commands that are aborted due + * to TLER will be marked. So for now, it seems + * NAMESPACE_NOT_READY is the only case where we should + * look at the DNR bit. + */ + switch ((int)cpl->status.sct) { + case SPDK_NVME_SCT_GENERIC: + switch ((int)cpl->status.sc) { + case SPDK_NVME_SC_NAMESPACE_NOT_READY: + case SPDK_NVME_SC_FORMAT_IN_PROGRESS: + if (cpl->status.dnr) { + return false; + } else { + return true; + } + case SPDK_NVME_SC_INVALID_OPCODE: + case SPDK_NVME_SC_INVALID_FIELD: + case SPDK_NVME_SC_COMMAND_ID_CONFLICT: + case SPDK_NVME_SC_DATA_TRANSFER_ERROR: + case SPDK_NVME_SC_ABORTED_POWER_LOSS: + case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR: + case SPDK_NVME_SC_ABORTED_BY_REQUEST: + case SPDK_NVME_SC_ABORTED_SQ_DELETION: + case SPDK_NVME_SC_ABORTED_FAILED_FUSED: + case SPDK_NVME_SC_ABORTED_MISSING_FUSED: + case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT: + case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR: + case SPDK_NVME_SC_LBA_OUT_OF_RANGE: + case SPDK_NVME_SC_CAPACITY_EXCEEDED: + default: + return false; + } + case SPDK_NVME_SCT_PATH: + /* + * Per NVMe TP 4028 (Path and Transport Error Enhancements), retries should be + * based on the setting of the DNR bit for Internal Path Error + */ + switch ((int)cpl->status.sc) { + case SPDK_NVME_SC_INTERNAL_PATH_ERROR: + return !cpl->status.dnr; + default: + return false; + } + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + case SPDK_NVME_SCT_MEDIA_ERROR: + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + default: + return false; + } +} + +static void +nvme_qpair_manual_complete_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, uint32_t sct, uint32_t sc, + uint32_t dnr, bool print_on_error) +{ + struct spdk_nvme_cpl cpl; + bool error; + + memset(&cpl, 0, sizeof(cpl)); + cpl.sqid = qpair->id; + cpl.status.sct = sct; + cpl.status.sc = sc; + cpl.status.dnr = dnr; + + error = spdk_nvme_cpl_is_error(&cpl); + + if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) { + SPDK_NOTICELOG("Command completed manually:\n"); + spdk_nvme_qpair_print_command(qpair, &req->cmd); + spdk_nvme_qpair_print_completion(qpair, 
&cpl); + } + + nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &cpl); + nvme_free_request(req); +} + +static void +_nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->queued_req)) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting queued i/o\n"); + } + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); + } +} + +/* The callback to a request may submit the next request which is queued and + * then the same callback may abort it immediately. This repetition may cause + * infinite recursive calls. Hence move aborting requests to another list here + * and abort them later at resubmission. + */ +static void +_nvme_qpair_complete_abort_queued_reqs(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->aborting_queued_req)) { + req = STAILQ_FIRST(&qpair->aborting_queued_req); + STAILQ_REMOVE_HEAD(&qpair->aborting_queued_req, stailq); + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, 1, true); + } +} + +uint32_t +nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg) +{ + struct nvme_request *req, *tmp; + uint32_t aborting = 0; + + STAILQ_FOREACH_SAFE(req, &qpair->queued_req, stailq, tmp) { + if (req->cb_arg == cmd_cb_arg) { + STAILQ_REMOVE(&qpair->queued_req, req, nvme_request, stailq); + STAILQ_INSERT_TAIL(&qpair->aborting_queued_req, req, stailq); + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting queued i/o\n"); + } + aborting++; + } + } + + return aborting; +} + +static inline bool +nvme_qpair_check_enabled(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + /* + * Either during initial connect or reset, the qpair should follow the given state machine. + * QPAIR_DISABLED->QPAIR_CONNECTING->QPAIR_CONNECTED->QPAIR_ENABLING->QPAIR_ENABLED. In the + * reset case, once the qpair is properly connected, we need to abort any outstanding requests + * from the old transport connection and encourage the application to retry them. We also need + * to submit any queued requests that built up while we were in the connected or enabling state. + */ + if (nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTED && !qpair->ctrlr->is_resetting) { + nvme_qpair_set_state(qpair, NVME_QPAIR_ENABLING); + /* + * PCIe is special, for fabrics transports, we can abort requests before disconnect during reset + * but we have historically not disconnected pcie qpairs during reset so we have to abort requests + * here. + */ + if (qpair->ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + nvme_qpair_abort_reqs(qpair, 0); + } + nvme_qpair_set_state(qpair, NVME_QPAIR_ENABLED); + while (!STAILQ_EMPTY(&qpair->queued_req)) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + if (nvme_qpair_resubmit_request(qpair, req)) { + break; + } + } + } + + /* + * When doing a reset, we must disconnect the qpair on the proper core. + * Note, reset is the only case where we set the failure reason without + * setting the qpair state since reset is done at the generic layer on the + * controller thread and we can't disconnect I/O qpairs from the controller + * thread. 
+ */ + if (qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE && + nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED) { + /* Don't disconnect PCIe qpairs. They are a special case for reset. */ + if (qpair->ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_disconnect_qpair(qpair); + } + return false; + } + + return nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED; +} + +void +nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests) +{ + uint32_t i; + int resubmit_rc; + struct nvme_request *req; + + for (i = 0; i < num_requests; i++) { + if (qpair->ctrlr->is_resetting) { + break; + } + if ((req = STAILQ_FIRST(&qpair->queued_req)) == NULL) { + break; + } + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + resubmit_rc = nvme_qpair_resubmit_request(qpair, req); + if (spdk_unlikely(resubmit_rc != 0)) { + SPDK_ERRLOG("Unable to resubmit as many requests as we completed.\n"); + break; + } + } + + _nvme_qpair_complete_abort_queued_reqs(qpair); +} + +int32_t +spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + int32_t ret; + struct nvme_request *req, *tmp; + + if (spdk_unlikely(qpair->ctrlr->is_failed)) { + if (qpair->ctrlr->is_removed) { + nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); + nvme_qpair_abort_reqs(qpair, 1 /* Do not retry */); + } + return -ENXIO; + } + + if (spdk_unlikely(!nvme_qpair_check_enabled(qpair) && + !(nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING))) { + /* + * qpair is not enabled, likely because a controller reset is + * in progress. + */ + return -ENXIO; + } + + /* error injection for those queued error requests */ + if (spdk_unlikely(!STAILQ_EMPTY(&qpair->err_req_head))) { + STAILQ_FOREACH_SAFE(req, &qpair->err_req_head, stailq, tmp) { + if (spdk_get_ticks() - req->submit_tick > req->timeout_tsc) { + STAILQ_REMOVE(&qpair->err_req_head, req, nvme_request, stailq); + nvme_qpair_manual_complete_request(qpair, req, + req->cpl.status.sct, + req->cpl.status.sc, 0, true); + } + } + } + + qpair->in_completion_context = 1; + ret = nvme_transport_qpair_process_completions(qpair, max_completions); + if (ret < 0) { + SPDK_ERRLOG("CQ error, abort requests after transport retry counter exceeded\n"); + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_ctrlr_fail(qpair->ctrlr, false); + } + } + qpair->in_completion_context = 0; + if (qpair->delete_after_completion_context) { + /* + * A request to delete this qpair was made in the context of this completion + * routine - so it is safe to delete it now. + */ + spdk_nvme_ctrlr_free_io_qpair(qpair); + return ret; + } + + /* + * At this point, ret must represent the number of completions we reaped. + * submit as many queued requests as we completed. 
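A hypothetical caller-side view of the completion path above: submit one read and spin on spdk_nvme_qpair_process_completions() until the callback fires. It uses the public spdk_nvme_ns_cmd_read() API from spdk/nvme.h (outside this file) and assumes ns and qpair come from an attached controller and buf is a DMA-able buffer of at least one block.

#include "spdk/nvme.h"
#include <stdbool.h>

struct read_ctx { bool done; };

static void
read_done(void *arg, const struct spdk_nvme_cpl *cpl)
{
	struct read_ctx *ctx = arg;

	ctx->done = true;
	(void)cpl;
}

/* Synchronously read one block from LBA 0 by polling the qpair. */
static int
read_lba0_sync(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buf)
{
	struct read_ctx ctx = { .done = false };
	int rc;

	rc = spdk_nvme_ns_cmd_read(ns, qpair, buf, 0 /* lba */, 1 /* lba_count */,
				   read_done, &ctx, 0 /* io_flags */);
	if (rc != 0) {
		return rc;
	}

	while (!ctx.done) {
		/* 0 = reap as many completions as the transport allows per call. */
		rc = spdk_nvme_qpair_process_completions(qpair, 0);
		if (rc < 0) {
			return rc;	/* e.g. -ENXIO while the controller resets */
		}
	}
	return 0;
}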
+ */ + nvme_qpair_resubmit_requests(qpair, ret); + + return ret; +} + +spdk_nvme_qp_failure_reason +spdk_nvme_qpair_get_failure_reason(struct spdk_nvme_qpair *qpair) +{ + return qpair->transport_failure_reason; +} + +int +nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id, + struct spdk_nvme_ctrlr *ctrlr, + enum spdk_nvme_qprio qprio, + uint32_t num_requests) +{ + size_t req_size_padded; + uint32_t i; + + qpair->id = id; + qpair->qprio = qprio; + + qpair->in_completion_context = 0; + qpair->delete_after_completion_context = 0; + qpair->no_deletion_notification_needed = 0; + + qpair->ctrlr = ctrlr; + qpair->trtype = ctrlr->trid.trtype; + + STAILQ_INIT(&qpair->free_req); + STAILQ_INIT(&qpair->queued_req); + STAILQ_INIT(&qpair->aborting_queued_req); + TAILQ_INIT(&qpair->err_cmd_head); + STAILQ_INIT(&qpair->err_req_head); + + req_size_padded = (sizeof(struct nvme_request) + 63) & ~(size_t)63; + + qpair->req_buf = spdk_zmalloc(req_size_padded * num_requests, 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (qpair->req_buf == NULL) { + SPDK_ERRLOG("no memory to allocate qpair(cntlid:0x%x sqid:%d) req_buf with %d request\n", + ctrlr->cntlid, qpair->id, num_requests); + return -ENOMEM; + } + + for (i = 0; i < num_requests; i++) { + struct nvme_request *req = qpair->req_buf + i * req_size_padded; + + req->qpair = qpair; + STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq); + } + + return 0; +} + +void +nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->err_req_head)) { + req = STAILQ_FIRST(&qpair->err_req_head); + STAILQ_REMOVE_HEAD(&qpair->err_req_head, stailq); + nvme_qpair_manual_complete_request(qpair, req, + req->cpl.status.sct, + req->cpl.status.sc, 0, true); + } +} + +void +nvme_qpair_deinit(struct spdk_nvme_qpair *qpair) +{ + struct nvme_error_cmd *cmd, *entry; + + _nvme_qpair_abort_queued_reqs(qpair, 1); + _nvme_qpair_complete_abort_queued_reqs(qpair); + nvme_qpair_complete_error_reqs(qpair); + + TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) { + TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link); + spdk_free(cmd); + } + + spdk_free(qpair->req_buf); +} + +static inline int +_nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc = 0; + struct nvme_request *child_req, *tmp; + struct nvme_error_cmd *cmd; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + bool child_req_failed = false; + + nvme_qpair_check_enabled(qpair); + + if (req->num_children) { + /* + * This is a split (parent) request. Submit all of the children but not the parent + * request itself, since the parent is the original unsplit request. + */ + TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) { + if (spdk_likely(!child_req_failed)) { + rc = nvme_qpair_submit_request(qpair, child_req); + if (spdk_unlikely(rc != 0)) { + child_req_failed = true; + } + } else { /* free remaining child_reqs since one child_req fails */ + nvme_request_remove_child(req, child_req); + nvme_request_free_children(child_req); + nvme_free_request(child_req); + } + } + + if (spdk_unlikely(child_req_failed)) { + /* part of children requests have been submitted, + * return success since we must wait for those children to complete, + * but set the parent request to failure. 
+ */ + if (req->num_children) { + req->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return 0; + } + goto error; + } + + return rc; + } + + /* queue those requests which matches with opcode in err_cmd list */ + if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head))) { + TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) { + if (!cmd->do_not_submit) { + continue; + } + + if ((cmd->opc == req->cmd.opc) && cmd->err_count) { + /* add to error request list and set cpl */ + req->timeout_tsc = cmd->timeout_tsc; + req->submit_tick = spdk_get_ticks(); + req->cpl.status.sct = cmd->status.sct; + req->cpl.status.sc = cmd->status.sc; + STAILQ_INSERT_TAIL(&qpair->err_req_head, req, stailq); + cmd->err_count--; + return 0; + } + } + } + + if (spdk_unlikely(ctrlr->is_failed)) { + rc = -ENXIO; + goto error; + } + + /* assign submit_tick before submitting req to specific transport */ + if (spdk_unlikely(ctrlr->timeout_enabled)) { + if (req->submit_tick == 0) { /* req submitted for the first time */ + req->submit_tick = spdk_get_ticks(); + req->timed_out = false; + } + } else { + req->submit_tick = 0; + } + + /* Allow two cases: + * 1. NVMe qpair is enabled. + * 2. Always allow fabrics commands through - these get + * the controller out of reset state. + */ + if (spdk_likely(nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED) || + (req->cmd.opc == SPDK_NVME_OPC_FABRIC && + nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING)) { + rc = nvme_transport_qpair_submit_request(qpair, req); + } else { + /* The controller is being reset - queue this request and + * submit it later when the reset is completed. + */ + return -EAGAIN; + } + + if (spdk_likely(rc == 0)) { + req->queued = false; + return 0; + } + + if (rc == -EAGAIN) { + return -EAGAIN; + } + +error: + if (req->parent != NULL) { + nvme_request_remove_child(req->parent, req); + } + + /* The request is from queued_req list we should trigger the callback from caller */ + if (spdk_unlikely(req->queued)) { + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, true, true); + return rc; + } + + nvme_free_request(req); + + return rc; +} + +int +nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc; + + /* This prevents us from entering an infinite loop when freeing queued I/O in disconnect. */ + if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING || + nvme_qpair_get_state(qpair) == NVME_QPAIR_DESTROYING)) { + if (req->parent != NULL) { + nvme_request_remove_child(req->parent, req); + } + nvme_free_request(req); + return -ENXIO; + } + + if (spdk_unlikely(!STAILQ_EMPTY(&qpair->queued_req) && req->num_children == 0)) { + /* + * requests that have no children should be sent to the transport after all + * currently queued requests. Requests with chilren will be split and go back + * through this path. + */ + STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); + req->queued = true; + return 0; + } + + rc = _nvme_qpair_submit_request(qpair, req); + if (rc == -EAGAIN) { + STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); + req->queued = true; + rc = 0; + } + + return rc; +} + +static int +nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc; + + /* + * We should never have a request with children on the queue. + * This is necessary to preserve the 1:1 relationship between + * completions and resubmissions. 
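+ * Split parents are never placed on queued_req (only their children, or
+ * plain unsplit requests, are), so resubmitting one queued request per
+ * reaped completion keeps the accounting balanced.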
+ */ + assert(req->num_children == 0); + assert(req->queued); + rc = _nvme_qpair_submit_request(qpair, req); + if (spdk_unlikely(rc == -EAGAIN)) { + STAILQ_INSERT_HEAD(&qpair->queued_req, req, stailq); + } + + return rc; +} + +void +nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + nvme_qpair_complete_error_reqs(qpair); + _nvme_qpair_abort_queued_reqs(qpair, dnr); + _nvme_qpair_complete_abort_queued_reqs(qpair); + nvme_transport_qpair_abort_reqs(qpair, dnr); +} + +int +spdk_nvme_qpair_add_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + uint8_t opc, bool do_not_submit, + uint64_t timeout_in_us, + uint32_t err_count, + uint8_t sct, uint8_t sc) +{ + struct nvme_error_cmd *entry, *cmd = NULL; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + TAILQ_FOREACH(entry, &qpair->err_cmd_head, link) { + if (entry->opc == opc) { + cmd = entry; + break; + } + } + + if (cmd == NULL) { + cmd = spdk_zmalloc(sizeof(*cmd), 64, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!cmd) { + return -ENOMEM; + } + TAILQ_INSERT_TAIL(&qpair->err_cmd_head, cmd, link); + } + + cmd->do_not_submit = do_not_submit; + cmd->err_count = err_count; + cmd->timeout_tsc = timeout_in_us * spdk_get_ticks_hz() / 1000000ULL; + cmd->opc = opc; + cmd->status.sct = sct; + cmd->status.sc = sc; + + return 0; +} + +void +spdk_nvme_qpair_remove_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + uint8_t opc) +{ + struct nvme_error_cmd *cmd, *entry; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) { + if (cmd->opc == opc) { + TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link); + spdk_free(cmd); + return; + } + } + + return; +} diff --git a/src/spdk/lib/nvme/nvme_quirks.c b/src/spdk/lib/nvme/nvme_quirks.c new file mode 100644 index 000000000..38c8f0eae --- /dev/null +++ b/src/spdk/lib/nvme/nvme_quirks.c @@ -0,0 +1,155 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" + +struct nvme_quirk { + struct spdk_pci_id id; + uint64_t flags; +}; + +static const struct nvme_quirk nvme_quirks[] = { + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0953, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_DELAY_BEFORE_INIT | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A53, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_DELAY_BEFORE_INIT | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A54, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A55, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_MEMBLAZE, 0x0540, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa821, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa822, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VIRTUALBOX, 0x4e56, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x5845, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_IDENTIFY_CNS | + NVME_INTEL_QUIRK_NO_LOG_PAGES | + NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_CNEXLABS, 0x1f1f, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_OCSSD + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VMWARE, 0x07f0, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_SHST_COMPLETE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x2700, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_OACS_SECURITY + }, + { {0x000000, 0x0000, 0x0000, 0x0000, 0x0000}, 0} +}; + +/* Compare each field. 
SPDK_PCI_ANY_ID in s1 matches everything */ +static bool +pci_id_match(const struct spdk_pci_id *s1, const struct spdk_pci_id *s2) +{ + if ((s1->class_id == SPDK_PCI_CLASS_ANY_ID || s1->class_id == s2->class_id) && + (s1->vendor_id == SPDK_PCI_ANY_ID || s1->vendor_id == s2->vendor_id) && + (s1->device_id == SPDK_PCI_ANY_ID || s1->device_id == s2->device_id) && + (s1->subvendor_id == SPDK_PCI_ANY_ID || s1->subvendor_id == s2->subvendor_id) && + (s1->subdevice_id == SPDK_PCI_ANY_ID || s1->subdevice_id == s2->subdevice_id)) { + return true; + } + return false; +} + +uint64_t +nvme_get_quirks(const struct spdk_pci_id *id) +{ + const struct nvme_quirk *quirk = nvme_quirks; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Searching for %04x:%04x [%04x:%04x]...\n", + id->vendor_id, id->device_id, + id->subvendor_id, id->subdevice_id); + + while (quirk->id.vendor_id) { + if (pci_id_match(&quirk->id, id)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Matched quirk %04x:%04x [%04x:%04x]:\n", + quirk->id.vendor_id, quirk->id.device_id, + quirk->id.subvendor_id, quirk->id.subdevice_id); + +#define PRINT_QUIRK(quirk_flag) \ + do { \ + if (quirk->flags & (quirk_flag)) { \ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Quirk enabled: %s\n", #quirk_flag); \ + } \ + } while (0) + + PRINT_QUIRK(NVME_INTEL_QUIRK_READ_LATENCY); + PRINT_QUIRK(NVME_INTEL_QUIRK_WRITE_LATENCY); + PRINT_QUIRK(NVME_QUIRK_DELAY_BEFORE_CHK_RDY); + PRINT_QUIRK(NVME_INTEL_QUIRK_STRIPING); + PRINT_QUIRK(NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC); + PRINT_QUIRK(NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE); + PRINT_QUIRK(NVME_QUIRK_IDENTIFY_CNS); + PRINT_QUIRK(NVME_QUIRK_OCSSD); + + return quirk->flags; + } + quirk++; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "No quirks found.\n"); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_rdma.c b/src/spdk/lib/nvme/nvme_rdma.c new file mode 100644 index 000000000..84537c4a1 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_rdma.c @@ -0,0 +1,2852 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over RDMA transport + */ + +#include "spdk/stdinc.h" + +#include "spdk/assert.h" +#include "spdk/log.h" +#include "spdk/trace.h" +#include "spdk/queue.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/config.h" + +#include "nvme_internal.h" +#include "spdk_internal/rdma.h" + +#define NVME_RDMA_TIME_OUT_IN_MS 2000 +#define NVME_RDMA_RW_BUFFER_SIZE 131072 + +/* + * NVME RDMA qpair Resource Defaults + */ +#define NVME_RDMA_DEFAULT_TX_SGE 2 +#define NVME_RDMA_DEFAULT_RX_SGE 1 + +/* Max number of NVMe-oF SGL descriptors supported by the host */ +#define NVME_RDMA_MAX_SGL_DESCRIPTORS 16 + +/* number of STAILQ entries for holding pending RDMA CM events. */ +#define NVME_RDMA_NUM_CM_EVENTS 256 + +/* CM event processing timeout */ +#define NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US 1000000 + +/* The default size for a shared rdma completion queue. */ +#define DEFAULT_NVME_RDMA_CQ_SIZE 4096 + +/* + * In the special case of a stale connection we don't expose a mechanism + * for the user to retry the connection so we need to handle it internally. + */ +#define NVME_RDMA_STALE_CONN_RETRY_MAX 5 +#define NVME_RDMA_STALE_CONN_RETRY_DELAY_US 10000 + +/* + * Maximum value of transport_retry_count used by RDMA controller + */ +#define NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT 7 + +/* + * Maximum value of transport_ack_timeout used by RDMA controller + */ +#define NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT 31 + +/* + * Number of poller cycles to keep a pointer to destroyed qpairs + * in the poll group. + */ +#define NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES 50 + +/* + * The max length of keyed SGL data block (3 bytes) + */ +#define NVME_RDMA_MAX_KEYED_SGL_LENGTH ((1u << 24u) - 1) + +#define WC_PER_QPAIR(queue_depth) (queue_depth * 2) + +enum nvme_rdma_wr_type { + RDMA_WR_TYPE_RECV, + RDMA_WR_TYPE_SEND, +}; + +struct nvme_rdma_wr { + /* Using this instead of the enum allows this struct to only occupy one byte. */ + uint8_t type; +}; + +struct spdk_nvmf_cmd { + struct spdk_nvme_cmd cmd; + struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS]; +}; + +struct spdk_nvme_rdma_hooks g_nvme_hooks = {}; + +/* Mapping from virtual address to ibv_mr pointer for a protection domain */ +struct spdk_nvme_rdma_mr_map { + struct ibv_pd *pd; + struct spdk_mem_map *map; + uint64_t ref; + LIST_ENTRY(spdk_nvme_rdma_mr_map) link; +}; + +/* STAILQ wrapper for cm events. 
*/ +struct nvme_rdma_cm_event_entry { + struct rdma_cm_event *evt; + STAILQ_ENTRY(nvme_rdma_cm_event_entry) link; +}; + +/* NVMe RDMA transport extensions for spdk_nvme_ctrlr */ +struct nvme_rdma_ctrlr { + struct spdk_nvme_ctrlr ctrlr; + + struct ibv_pd *pd; + + uint16_t max_sge; + + struct rdma_event_channel *cm_channel; + + STAILQ_HEAD(, nvme_rdma_cm_event_entry) pending_cm_events; + + STAILQ_HEAD(, nvme_rdma_cm_event_entry) free_cm_events; + + struct nvme_rdma_cm_event_entry *cm_events; +}; + +struct nvme_rdma_destroyed_qpair { + struct nvme_rdma_qpair *destroyed_qpair_tracker; + uint32_t completed_cycles; + STAILQ_ENTRY(nvme_rdma_destroyed_qpair) link; +}; + +struct nvme_rdma_poller { + struct ibv_context *device; + struct ibv_cq *cq; + int required_num_wc; + int current_num_wc; + STAILQ_ENTRY(nvme_rdma_poller) link; +}; + +struct nvme_rdma_poll_group { + struct spdk_nvme_transport_poll_group group; + STAILQ_HEAD(, nvme_rdma_poller) pollers; + int num_pollers; + STAILQ_HEAD(, nvme_rdma_destroyed_qpair) destroyed_qpairs; +}; + +struct spdk_nvme_send_wr_list { + struct ibv_send_wr *first; + struct ibv_send_wr *last; +}; + +struct spdk_nvme_recv_wr_list { + struct ibv_recv_wr *first; + struct ibv_recv_wr *last; +}; + +/* Memory regions */ +union nvme_rdma_mr { + struct ibv_mr *mr; + uint64_t key; +}; + +/* NVMe RDMA qpair extensions for spdk_nvme_qpair */ +struct nvme_rdma_qpair { + struct spdk_nvme_qpair qpair; + + struct spdk_rdma_qp *rdma_qp; + struct rdma_cm_id *cm_id; + struct ibv_cq *cq; + + struct spdk_nvme_rdma_req *rdma_reqs; + + uint32_t max_send_sge; + + uint32_t max_recv_sge; + + uint16_t num_entries; + + bool delay_cmd_submit; + + bool poll_group_disconnect_in_progress; + + uint32_t num_completions; + + /* Parallel arrays of response buffers + response SGLs of size num_entries */ + struct ibv_sge *rsp_sgls; + struct spdk_nvme_rdma_rsp *rsps; + + struct ibv_recv_wr *rsp_recv_wrs; + + struct spdk_nvme_send_wr_list sends_to_post; + struct spdk_nvme_recv_wr_list recvs_to_post; + + /* Memory region describing all rsps for this qpair */ + union nvme_rdma_mr rsp_mr; + + /* + * Array of num_entries NVMe commands registered as RDMA message buffers. + * Indexed by rdma_req->id. + */ + struct spdk_nvmf_cmd *cmds; + + /* Memory region describing all cmds for this qpair */ + union nvme_rdma_mr cmd_mr; + + struct spdk_nvme_rdma_mr_map *mr_map; + + TAILQ_HEAD(, spdk_nvme_rdma_req) free_reqs; + TAILQ_HEAD(, spdk_nvme_rdma_req) outstanding_reqs; + + /* Counts of outstanding send and recv objects */ + uint16_t current_num_recvs; + uint16_t current_num_sends; + + /* Placed at the end of the struct since it is not used frequently */ + struct rdma_cm_event *evt; + + /* Used by poll group to keep the qpair around until it is ready to remove it. */ + bool defer_deletion_to_pg; +}; + +enum NVME_RDMA_COMPLETION_FLAGS { + NVME_RDMA_SEND_COMPLETED = 1u << 0, + NVME_RDMA_RECV_COMPLETED = 1u << 1, +}; + +struct spdk_nvme_rdma_req { + uint16_t id; + uint16_t completion_flags: 2; + uint16_t reserved: 14; + /* if completion of RDMA_RECV received before RDMA_SEND, we will complete nvme request + * during processing of RDMA_SEND. 
To complete the request we must know the index + * of nvme_cpl received in RDMA_RECV, so store it in this field */ + uint16_t rsp_idx; + + struct nvme_rdma_wr rdma_wr; + + struct ibv_send_wr send_wr; + + struct nvme_request *req; + + struct ibv_sge send_sgl[NVME_RDMA_DEFAULT_TX_SGE]; + + TAILQ_ENTRY(spdk_nvme_rdma_req) link; +}; + +enum nvme_rdma_key_type { + NVME_RDMA_MR_RKEY, + NVME_RDMA_MR_LKEY +}; + +struct spdk_nvme_rdma_rsp { + struct spdk_nvme_cpl cpl; + struct nvme_rdma_qpair *rqpair; + uint16_t idx; + struct nvme_rdma_wr rdma_wr; +}; + +static const char *rdma_cm_event_str[] = { + "RDMA_CM_EVENT_ADDR_RESOLVED", + "RDMA_CM_EVENT_ADDR_ERROR", + "RDMA_CM_EVENT_ROUTE_RESOLVED", + "RDMA_CM_EVENT_ROUTE_ERROR", + "RDMA_CM_EVENT_CONNECT_REQUEST", + "RDMA_CM_EVENT_CONNECT_RESPONSE", + "RDMA_CM_EVENT_CONNECT_ERROR", + "RDMA_CM_EVENT_UNREACHABLE", + "RDMA_CM_EVENT_REJECTED", + "RDMA_CM_EVENT_ESTABLISHED", + "RDMA_CM_EVENT_DISCONNECTED", + "RDMA_CM_EVENT_DEVICE_REMOVAL", + "RDMA_CM_EVENT_MULTICAST_JOIN", + "RDMA_CM_EVENT_MULTICAST_ERROR", + "RDMA_CM_EVENT_ADDR_CHANGE", + "RDMA_CM_EVENT_TIMEWAIT_EXIT" +}; + +static LIST_HEAD(, spdk_nvme_rdma_mr_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps); +static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER; +struct nvme_rdma_qpair *nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group, + uint32_t qp_num); + +static inline void * +nvme_rdma_calloc(size_t nmemb, size_t size) +{ + if (!g_nvme_hooks.get_rkey) { + return calloc(nmemb, size); + } else { + return spdk_zmalloc(nmemb * size, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + } +} + +static inline void +nvme_rdma_free(void *buf) +{ + if (!g_nvme_hooks.get_rkey) { + free(buf); + } else { + spdk_free(buf); + } +} + +static int nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); + +static inline struct nvme_rdma_qpair * +nvme_rdma_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_RDMA); + return SPDK_CONTAINEROF(qpair, struct nvme_rdma_qpair, qpair); +} + +static inline struct nvme_rdma_poll_group * +nvme_rdma_poll_group(struct spdk_nvme_transport_poll_group *group) +{ + return (SPDK_CONTAINEROF(group, struct nvme_rdma_poll_group, group)); +} + +static inline struct nvme_rdma_ctrlr * +nvme_rdma_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA); + return SPDK_CONTAINEROF(ctrlr, struct nvme_rdma_ctrlr, ctrlr); +} + +static struct spdk_nvme_rdma_req * +nvme_rdma_req_get(struct nvme_rdma_qpair *rqpair) +{ + struct spdk_nvme_rdma_req *rdma_req; + + rdma_req = TAILQ_FIRST(&rqpair->free_reqs); + if (rdma_req) { + TAILQ_REMOVE(&rqpair->free_reqs, rdma_req, link); + TAILQ_INSERT_TAIL(&rqpair->outstanding_reqs, rdma_req, link); + } + + return rdma_req; +} + +static void +nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req) +{ + rdma_req->completion_flags = 0; + rdma_req->req = NULL; + TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link); +} + +static void +nvme_rdma_req_complete(struct spdk_nvme_rdma_req *rdma_req, + struct spdk_nvme_cpl *rsp) +{ + struct nvme_request *req = rdma_req->req; + struct nvme_rdma_qpair *rqpair; + + assert(req != NULL); + + rqpair = nvme_rdma_qpair(req->qpair); + TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); + + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp); + nvme_free_request(req); +} + +static const char * 
+nvme_rdma_cm_event_str_get(uint32_t event) +{ + if (event < SPDK_COUNTOF(rdma_cm_event_str)) { + return rdma_cm_event_str[event]; + } else { + return "Undefined"; + } +} + + +static int +nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair) +{ + struct rdma_cm_event *event = rqpair->evt; + struct spdk_nvmf_rdma_accept_private_data *accept_data; + int rc = 0; + + if (event) { + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + case RDMA_CM_EVENT_ROUTE_ERROR: + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + break; + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + rc = spdk_rdma_qp_complete_connect(rqpair->rdma_qp); + /* fall through */ + case RDMA_CM_EVENT_ESTABLISHED: + accept_data = (struct spdk_nvmf_rdma_accept_private_data *)event->param.conn.private_data; + if (accept_data == NULL) { + rc = -1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested queue depth %d. Actually got queue depth %d.\n", + rqpair->num_entries, accept_data->crqsize); + rqpair->num_entries = spdk_min(rqpair->num_entries, accept_data->crqsize); + } + break; + case RDMA_CM_EVENT_DISCONNECTED: + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE; + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + break; + default: + SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); + break; + } + rqpair->evt = NULL; + rdma_ack_cm_event(event); + } + + return rc; +} + +/* + * This function must be called under the nvme controller's lock + * because it touches global controller variables. The lock is taken + * by the generic transport code before invoking a few of the functions + * in this file: nvme_rdma_ctrlr_connect_qpair, nvme_rdma_ctrlr_delete_io_qpair, + * and conditionally nvme_rdma_qpair_process_completions when it is calling + * completions on the admin qpair. When adding a new call to this function, please + * verify that it is in a situation where it falls under the lock. 
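+ *
+ * A hypothetical new caller would follow the same pattern, e.g. with the
+ * nvme_robust_mutex helpers from nvme_internal.h:
+ *
+ *   nvme_robust_mutex_lock(&rctrlr->ctrlr.ctrlr_lock);
+ *   rc = nvme_rdma_poll_events(rctrlr);
+ *   nvme_robust_mutex_unlock(&rctrlr->ctrlr.ctrlr_lock);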
+ */ +static int +nvme_rdma_poll_events(struct nvme_rdma_ctrlr *rctrlr) +{ + struct nvme_rdma_cm_event_entry *entry, *tmp; + struct nvme_rdma_qpair *event_qpair; + struct rdma_cm_event *event; + struct rdma_event_channel *channel = rctrlr->cm_channel; + + STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) { + event_qpair = nvme_rdma_qpair(entry->evt->id->context); + if (event_qpair->evt == NULL) { + event_qpair->evt = entry->evt; + STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link); + STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link); + } + } + + while (rdma_get_cm_event(channel, &event) == 0) { + event_qpair = nvme_rdma_qpair(event->id->context); + if (event_qpair->evt == NULL) { + event_qpair->evt = event; + } else { + assert(rctrlr == nvme_rdma_ctrlr(event_qpair->qpair.ctrlr)); + entry = STAILQ_FIRST(&rctrlr->free_cm_events); + if (entry == NULL) { + rdma_ack_cm_event(event); + return -ENOMEM; + } + STAILQ_REMOVE(&rctrlr->free_cm_events, entry, nvme_rdma_cm_event_entry, link); + entry->evt = event; + STAILQ_INSERT_TAIL(&rctrlr->pending_cm_events, entry, link); + } + } + + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } else { + return errno; + } +} + +static int +nvme_rdma_validate_cm_event(enum rdma_cm_event_type expected_evt_type, + struct rdma_cm_event *reaped_evt) +{ + int rc = -EBADMSG; + + if (expected_evt_type == reaped_evt->event) { + return 0; + } + + switch (expected_evt_type) { + case RDMA_CM_EVENT_ESTABLISHED: + /* + * There is an enum ib_cm_rej_reason in the kernel headers that sets 10 as + * IB_CM_REJ_STALE_CONN. I can't find the corresponding userspace but we get + * the same values here. + */ + if (reaped_evt->event == RDMA_CM_EVENT_REJECTED && reaped_evt->status == 10) { + rc = -ESTALE; + } else if (reaped_evt->event == RDMA_CM_EVENT_CONNECT_RESPONSE) { + /* + * If we are using a qpair which is not created using rdma cm API + * then we will receive RDMA_CM_EVENT_CONNECT_RESPONSE instead of + * RDMA_CM_EVENT_ESTABLISHED. + */ + return 0; + } + break; + default: + break; + } + + SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n", + nvme_rdma_cm_event_str_get(expected_evt_type), + nvme_rdma_cm_event_str_get(reaped_evt->event), reaped_evt->event, + reaped_evt->status); + return rc; +} + +static int +nvme_rdma_process_event(struct nvme_rdma_qpair *rqpair, + struct rdma_event_channel *channel, + enum rdma_cm_event_type evt) +{ + struct nvme_rdma_ctrlr *rctrlr; + uint64_t timeout_ticks; + int rc = 0, rc2; + + if (rqpair->evt != NULL) { + rc = nvme_rdma_qpair_process_cm_event(rqpair); + if (rc) { + return rc; + } + } + + timeout_ticks = (NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC + + spdk_get_ticks(); + rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); + assert(rctrlr != NULL); + + while (!rqpair->evt && spdk_get_ticks() < timeout_ticks && rc == 0) { + rc = nvme_rdma_poll_events(rctrlr); + } + + if (rc) { + return rc; + } + + if (rqpair->evt == NULL) { + return -EADDRNOTAVAIL; + } + + rc = nvme_rdma_validate_cm_event(evt, rqpair->evt); + + rc2 = nvme_rdma_qpair_process_cm_event(rqpair); + /* bad message takes precedence over the other error codes from processing the event. */ + return rc == 0 ? 
rc2 : rc; +} + +static int +nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair) +{ + int rc; + struct spdk_rdma_qp_init_attr attr = {}; + struct ibv_device_attr dev_attr; + struct nvme_rdma_ctrlr *rctrlr; + + rc = ibv_query_device(rqpair->cm_id->verbs, &dev_attr); + if (rc != 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + return -1; + } + + if (rqpair->qpair.poll_group) { + assert(!rqpair->cq); + rc = nvme_poll_group_connect_qpair(&rqpair->qpair); + if (rc) { + SPDK_ERRLOG("Unable to activate the rdmaqpair.\n"); + return -1; + } + assert(rqpair->cq); + } else { + rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0); + if (!rqpair->cq) { + SPDK_ERRLOG("Unable to create completion queue: errno %d: %s\n", errno, spdk_strerror(errno)); + return -1; + } + } + + rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); + if (g_nvme_hooks.get_ibv_pd) { + rctrlr->pd = g_nvme_hooks.get_ibv_pd(&rctrlr->ctrlr.trid, rqpair->cm_id->verbs); + } else { + rctrlr->pd = NULL; + } + + attr.pd = rctrlr->pd; + attr.send_cq = rqpair->cq; + attr.recv_cq = rqpair->cq; + attr.cap.max_send_wr = rqpair->num_entries; /* SEND operations */ + attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */ + attr.cap.max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, dev_attr.max_sge); + attr.cap.max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, dev_attr.max_sge); + + rqpair->rdma_qp = spdk_rdma_qp_create(rqpair->cm_id, &attr); + + if (!rqpair->rdma_qp) { + return -1; + } + + /* ibv_create_qp will change the values in attr.cap. Make sure we store the proper value. */ + rqpair->max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, attr.cap.max_send_sge); + rqpair->max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, attr.cap.max_recv_sge); + rqpair->current_num_recvs = 0; + rqpair->current_num_sends = 0; + + rctrlr->pd = rqpair->rdma_qp->qp->pd; + + rqpair->cm_id->context = &rqpair->qpair; + + return 0; +} + +static inline int +nvme_rdma_qpair_submit_sends(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_send_wr *bad_send_wr; + int rc; + + rc = spdk_rdma_qp_flush_send_wrs(rqpair->rdma_qp, &bad_send_wr); + + if (spdk_unlikely(rc)) { + SPDK_ERRLOG("Failed to post WRs on send queue, errno %d (%s), bad_wr %p\n", + rc, spdk_strerror(rc), bad_send_wr); + while (bad_send_wr != NULL) { + assert(rqpair->current_num_sends > 0); + rqpair->current_num_sends--; + bad_send_wr = bad_send_wr->next; + } + return rc; + } + + return 0; +} + +static inline int +nvme_rdma_qpair_submit_recvs(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_recv_wr *bad_recv_wr; + int rc = 0; + + if (rqpair->recvs_to_post.first) { + rc = ibv_post_recv(rqpair->rdma_qp->qp, rqpair->recvs_to_post.first, &bad_recv_wr); + if (spdk_unlikely(rc)) { + SPDK_ERRLOG("Failed to post WRs on receive queue, errno %d (%s), bad_wr %p\n", + rc, spdk_strerror(rc), bad_recv_wr); + while (bad_recv_wr != NULL) { + assert(rqpair->current_num_sends > 0); + rqpair->current_num_recvs--; + bad_recv_wr = bad_recv_wr->next; + } + } + + rqpair->recvs_to_post.first = NULL; + } + return rc; +} + +/* Append the given send wr structure to the qpair's outstanding sends list. */ +/* This function accepts only a single wr. 
*/ +static inline int +nvme_rdma_qpair_queue_send_wr(struct nvme_rdma_qpair *rqpair, struct ibv_send_wr *wr) +{ + assert(wr->next == NULL); + + assert(rqpair->current_num_sends < rqpair->num_entries); + + rqpair->current_num_sends++; + spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, wr); + + if (!rqpair->delay_cmd_submit) { + return nvme_rdma_qpair_submit_sends(rqpair); + } + + return 0; +} + +/* Append the given recv wr structure to the qpair's outstanding recvs list. */ +/* This function accepts only a single wr. */ +static inline int +nvme_rdma_qpair_queue_recv_wr(struct nvme_rdma_qpair *rqpair, struct ibv_recv_wr *wr) +{ + + assert(wr->next == NULL); + assert(rqpair->current_num_recvs < rqpair->num_entries); + + rqpair->current_num_recvs++; + if (rqpair->recvs_to_post.first == NULL) { + rqpair->recvs_to_post.first = wr; + } else { + rqpair->recvs_to_post.last->next = wr; + } + + rqpair->recvs_to_post.last = wr; + + if (!rqpair->delay_cmd_submit) { + return nvme_rdma_qpair_submit_recvs(rqpair); + } + + return 0; +} + +#define nvme_rdma_trace_ibv_sge(sg_list) \ + if (sg_list) { \ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "local addr %p length 0x%x lkey 0x%x\n", \ + (void *)(sg_list)->addr, (sg_list)->length, (sg_list)->lkey); \ + } + +static int +nvme_rdma_post_recv(struct nvme_rdma_qpair *rqpair, uint16_t rsp_idx) +{ + struct ibv_recv_wr *wr; + + wr = &rqpair->rsp_recv_wrs[rsp_idx]; + wr->next = NULL; + nvme_rdma_trace_ibv_sge(wr->sg_list); + return nvme_rdma_qpair_queue_recv_wr(rqpair, wr); +} + +static int +nvme_rdma_reg_mr(struct rdma_cm_id *cm_id, union nvme_rdma_mr *mr, void *mem, size_t length) +{ + if (!g_nvme_hooks.get_rkey) { + mr->mr = rdma_reg_msgs(cm_id, mem, length); + if (mr->mr == NULL) { + SPDK_ERRLOG("Unable to register mr: %s (%d)\n", + spdk_strerror(errno), errno); + return -1; + } + } else { + mr->key = g_nvme_hooks.get_rkey(cm_id->pd, mem, length); + } + + return 0; +} + +static void +nvme_rdma_dereg_mr(union nvme_rdma_mr *mr) +{ + if (!g_nvme_hooks.get_rkey) { + if (mr->mr && rdma_dereg_mr(mr->mr)) { + SPDK_ERRLOG("Unable to de-register mr\n"); + } + } else { + if (mr->key) { + g_nvme_hooks.put_rkey(mr->key); + } + } + memset(mr, 0, sizeof(*mr)); +} + +static uint32_t +nvme_rdma_mr_get_lkey(union nvme_rdma_mr *mr) +{ + uint32_t lkey; + + if (!g_nvme_hooks.get_rkey) { + lkey = mr->mr->lkey; + } else { + lkey = *((uint64_t *) mr->key); + } + + return lkey; +} + +static void +nvme_rdma_unregister_rsps(struct nvme_rdma_qpair *rqpair) +{ + nvme_rdma_dereg_mr(&rqpair->rsp_mr); +} + +static void +nvme_rdma_free_rsps(struct nvme_rdma_qpair *rqpair) +{ + nvme_rdma_free(rqpair->rsps); + rqpair->rsps = NULL; + nvme_rdma_free(rqpair->rsp_sgls); + rqpair->rsp_sgls = NULL; + nvme_rdma_free(rqpair->rsp_recv_wrs); + rqpair->rsp_recv_wrs = NULL; +} + +static int +nvme_rdma_alloc_rsps(struct nvme_rdma_qpair *rqpair) +{ + rqpair->rsps = NULL; + rqpair->rsp_recv_wrs = NULL; + + rqpair->rsp_sgls = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_sgls)); + if (!rqpair->rsp_sgls) { + SPDK_ERRLOG("Failed to allocate rsp_sgls\n"); + goto fail; + } + + rqpair->rsp_recv_wrs = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_recv_wrs)); + if (!rqpair->rsp_recv_wrs) { + SPDK_ERRLOG("Failed to allocate rsp_recv_wrs\n"); + goto fail; + } + + rqpair->rsps = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsps)); + if (!rqpair->rsps) { + SPDK_ERRLOG("can not allocate rdma rsps\n"); + goto fail; + } + + return 0; +fail: + nvme_rdma_free_rsps(rqpair); + return -ENOMEM; +} + +static int 
+nvme_rdma_register_rsps(struct nvme_rdma_qpair *rqpair) +{ + uint16_t i; + int rc; + uint32_t lkey; + + rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->rsp_mr, + rqpair->rsps, rqpair->num_entries * sizeof(*rqpair->rsps)); + + if (rc < 0) { + goto fail; + } + + lkey = nvme_rdma_mr_get_lkey(&rqpair->rsp_mr); + + for (i = 0; i < rqpair->num_entries; i++) { + struct ibv_sge *rsp_sgl = &rqpair->rsp_sgls[i]; + struct spdk_nvme_rdma_rsp *rsp = &rqpair->rsps[i]; + + rsp->rqpair = rqpair; + rsp->rdma_wr.type = RDMA_WR_TYPE_RECV; + rsp->idx = i; + rsp_sgl->addr = (uint64_t)&rqpair->rsps[i]; + rsp_sgl->length = sizeof(struct spdk_nvme_cpl); + rsp_sgl->lkey = lkey; + + rqpair->rsp_recv_wrs[i].wr_id = (uint64_t)&rsp->rdma_wr; + rqpair->rsp_recv_wrs[i].next = NULL; + rqpair->rsp_recv_wrs[i].sg_list = rsp_sgl; + rqpair->rsp_recv_wrs[i].num_sge = 1; + + rc = nvme_rdma_post_recv(rqpair, i); + if (rc) { + goto fail; + } + } + + rc = nvme_rdma_qpair_submit_recvs(rqpair); + if (rc) { + goto fail; + } + + return 0; + +fail: + nvme_rdma_unregister_rsps(rqpair); + return rc; +} + +static void +nvme_rdma_unregister_reqs(struct nvme_rdma_qpair *rqpair) +{ + nvme_rdma_dereg_mr(&rqpair->cmd_mr); +} + +static void +nvme_rdma_free_reqs(struct nvme_rdma_qpair *rqpair) +{ + if (!rqpair->rdma_reqs) { + return; + } + + nvme_rdma_free(rqpair->cmds); + rqpair->cmds = NULL; + + nvme_rdma_free(rqpair->rdma_reqs); + rqpair->rdma_reqs = NULL; +} + +static int +nvme_rdma_alloc_reqs(struct nvme_rdma_qpair *rqpair) +{ + uint16_t i; + + rqpair->rdma_reqs = nvme_rdma_calloc(rqpair->num_entries, sizeof(struct spdk_nvme_rdma_req)); + if (rqpair->rdma_reqs == NULL) { + SPDK_ERRLOG("Failed to allocate rdma_reqs\n"); + goto fail; + } + + rqpair->cmds = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->cmds)); + if (!rqpair->cmds) { + SPDK_ERRLOG("Failed to allocate RDMA cmds\n"); + goto fail; + } + + + TAILQ_INIT(&rqpair->free_reqs); + TAILQ_INIT(&rqpair->outstanding_reqs); + for (i = 0; i < rqpair->num_entries; i++) { + struct spdk_nvme_rdma_req *rdma_req; + struct spdk_nvmf_cmd *cmd; + + rdma_req = &rqpair->rdma_reqs[i]; + rdma_req->rdma_wr.type = RDMA_WR_TYPE_SEND; + cmd = &rqpair->cmds[i]; + + rdma_req->id = i; + + /* The first RDMA sgl element will always point + * at this data structure. Depending on whether + * an NVMe-oF SGL is required, the length of + * this element may change. 
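+ * For example, commands that need no extra in-capsule descriptors use just
+ * sizeof(struct spdk_nvme_cmd) (64 bytes), while a multi-descriptor SGL
+ * grows this element to sizeof(struct spdk_nvme_cmd) +
+ * N * sizeof(struct spdk_nvme_sgl_descriptor) for N descriptors.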
*/ + rdma_req->send_sgl[0].addr = (uint64_t)cmd; + rdma_req->send_wr.wr_id = (uint64_t)&rdma_req->rdma_wr; + rdma_req->send_wr.next = NULL; + rdma_req->send_wr.opcode = IBV_WR_SEND; + rdma_req->send_wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->send_wr.sg_list = rdma_req->send_sgl; + rdma_req->send_wr.imm_data = 0; + + TAILQ_INSERT_TAIL(&rqpair->free_reqs, rdma_req, link); + } + + return 0; +fail: + nvme_rdma_free_reqs(rqpair); + return -ENOMEM; +} + +static int +nvme_rdma_register_reqs(struct nvme_rdma_qpair *rqpair) +{ + int i; + int rc; + uint32_t lkey; + + rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->cmd_mr, + rqpair->cmds, rqpair->num_entries * sizeof(*rqpair->cmds)); + + if (rc < 0) { + goto fail; + } + + lkey = nvme_rdma_mr_get_lkey(&rqpair->cmd_mr); + + for (i = 0; i < rqpair->num_entries; i++) { + rqpair->rdma_reqs[i].send_sgl[0].lkey = lkey; + } + + return 0; + +fail: + nvme_rdma_unregister_reqs(rqpair); + return -ENOMEM; +} + +static int +nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_event_channel *cm_channel) +{ + int ret; + + ret = rdma_resolve_addr(rqpair->cm_id, src_addr, dst_addr, + NVME_RDMA_TIME_OUT_IN_MS); + if (ret) { + SPDK_ERRLOG("rdma_resolve_addr, %d\n", errno); + return ret; + } + + ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED); + if (ret) { + SPDK_ERRLOG("RDMA address resolution error\n"); + return -1; + } + + if (rqpair->qpair.ctrlr->opts.transport_ack_timeout != SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED) { +#ifdef SPDK_CONFIG_RDMA_SET_ACK_TIMEOUT + uint8_t timeout = rqpair->qpair.ctrlr->opts.transport_ack_timeout; + ret = rdma_set_option(rqpair->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_ACK_TIMEOUT, + &timeout, sizeof(timeout)); + if (ret) { + SPDK_NOTICELOG("Can't apply RDMA_OPTION_ID_ACK_TIMEOUT %d, ret %d\n", timeout, ret); + } +#else + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport_ack_timeout is not supported\n"); +#endif + } + + + ret = rdma_resolve_route(rqpair->cm_id, NVME_RDMA_TIME_OUT_IN_MS); + if (ret) { + SPDK_ERRLOG("rdma_resolve_route\n"); + return ret; + } + + ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED); + if (ret) { + SPDK_ERRLOG("RDMA route resolution error\n"); + return -1; + } + + return 0; +} + +static int +nvme_rdma_connect(struct nvme_rdma_qpair *rqpair) +{ + struct rdma_conn_param param = {}; + struct spdk_nvmf_rdma_request_private_data request_data = {}; + struct ibv_device_attr attr; + int ret; + struct spdk_nvme_ctrlr *ctrlr; + struct nvme_rdma_ctrlr *rctrlr; + + ret = ibv_query_device(rqpair->cm_id->verbs, &attr); + if (ret != 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + return ret; + } + + param.responder_resources = spdk_min(rqpair->num_entries, attr.max_qp_rd_atom); + + ctrlr = rqpair->qpair.ctrlr; + if (!ctrlr) { + return -1; + } + rctrlr = nvme_rdma_ctrlr(ctrlr); + assert(rctrlr != NULL); + + request_data.qid = rqpair->qpair.id; + request_data.hrqsize = rqpair->num_entries; + request_data.hsqsize = rqpair->num_entries - 1; + request_data.cntlid = ctrlr->cntlid; + + param.private_data = &request_data; + param.private_data_len = sizeof(request_data); + param.retry_count = ctrlr->opts.transport_retry_count; + param.rnr_retry_count = 7; + + /* Fields below are ignored by rdma cm if qpair has been + * created using rdma cm API. 
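+ * They do matter for a QP created outside the rdma cm API (the case that
+ * also yields RDMA_CM_EVENT_CONNECT_RESPONSE instead of ESTABLISHED), where
+ * rdma_connect() has to be told the qp_num explicitly.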
*/ + param.srq = 0; + param.qp_num = rqpair->rdma_qp->qp->qp_num; + + ret = rdma_connect(rqpair->cm_id, ¶m); + if (ret) { + SPDK_ERRLOG("nvme rdma connect error\n"); + return ret; + } + + ret = nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_ESTABLISHED); + if (ret == -ESTALE) { + SPDK_NOTICELOG("Received a stale connection notice during connection.\n"); + return -EAGAIN; + } else if (ret) { + SPDK_ERRLOG("RDMA connect error %d\n", ret); + return ret; + } else { + return 0; + } +} + +static int +nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service) +{ + struct addrinfo *res; + struct addrinfo hints; + int ret; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + ret = getaddrinfo(addr, service, &hints, &res); + if (ret) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret); + return ret; + } + + if (res->ai_addrlen > sizeof(*sa)) { + SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen); + ret = EINVAL; + } else { + memcpy(sa, res->ai_addr, res->ai_addrlen); + } + + freeaddrinfo(res); + return ret; +} + +static int +nvme_rdma_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct ibv_pd *pd = cb_ctx; + struct ibv_mr *mr; + int rc; + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + if (!g_nvme_hooks.get_rkey) { + mr = ibv_reg_mr(pd, vaddr, size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (mr == NULL) { + SPDK_ERRLOG("ibv_reg_mr() failed\n"); + return -EFAULT; + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); + } + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, + g_nvme_hooks.get_rkey(pd, vaddr, size)); + } + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: + if (!g_nvme_hooks.get_rkey) { + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL); + if (mr) { + ibv_dereg_mr(mr); + } + } + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); + break; + default: + SPDK_UNREACHABLE(); + } + + return rc; +} + +static int +nvme_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2) +{ + /* Two contiguous mappings will point to the same address which is the start of the RDMA MR. 
*/ + return addr_1 == addr_2; +} + +static int +nvme_rdma_register_mem(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_pd *pd = rqpair->rdma_qp->qp->pd; + struct spdk_nvme_rdma_mr_map *mr_map; + const struct spdk_mem_map_ops nvme_rdma_map_ops = { + .notify_cb = nvme_rdma_mr_map_notify, + .are_contiguous = nvme_rdma_check_contiguous_entries + }; + + pthread_mutex_lock(&g_rdma_mr_maps_mutex); + + /* Look up existing mem map registration for this pd */ + LIST_FOREACH(mr_map, &g_rdma_mr_maps, link) { + if (mr_map->pd == pd) { + mr_map->ref++; + rqpair->mr_map = mr_map; + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return 0; + } + } + + mr_map = nvme_rdma_calloc(1, sizeof(*mr_map)); + if (mr_map == NULL) { + SPDK_ERRLOG("Failed to allocate mr_map\n"); + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return -1; + } + + mr_map->ref = 1; + mr_map->pd = pd; + mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops, pd); + if (mr_map->map == NULL) { + SPDK_ERRLOG("spdk_mem_map_alloc() failed\n"); + nvme_rdma_free(mr_map); + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return -1; + } + + rqpair->mr_map = mr_map; + LIST_INSERT_HEAD(&g_rdma_mr_maps, mr_map, link); + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + + return 0; +} + +static void +nvme_rdma_unregister_mem(struct nvme_rdma_qpair *rqpair) +{ + struct spdk_nvme_rdma_mr_map *mr_map; + + mr_map = rqpair->mr_map; + rqpair->mr_map = NULL; + + if (mr_map == NULL) { + return; + } + + pthread_mutex_lock(&g_rdma_mr_maps_mutex); + + assert(mr_map->ref > 0); + mr_map->ref--; + if (mr_map->ref == 0) { + LIST_REMOVE(mr_map, link); + spdk_mem_map_free(&mr_map->map); + nvme_rdma_free(mr_map); + } + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); +} + +static int +_nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct sockaddr_storage dst_addr; + struct sockaddr_storage src_addr; + bool src_addr_specified; + int rc; + struct nvme_rdma_ctrlr *rctrlr; + struct nvme_rdma_qpair *rqpair; + int family; + + rqpair = nvme_rdma_qpair(qpair); + rctrlr = nvme_rdma_ctrlr(ctrlr); + assert(rctrlr != NULL); + + switch (ctrlr->trid.adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); + + memset(&dst_addr, 0, sizeof(dst_addr)); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid); + rc = nvme_rdma_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid); + if (rc != 0) { + SPDK_ERRLOG("dst_addr nvme_rdma_parse_addr() failed\n"); + return -1; + } + + if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { + memset(&src_addr, 0, sizeof(src_addr)); + rc = nvme_rdma_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid); + if (rc != 0) { + SPDK_ERRLOG("src_addr nvme_rdma_parse_addr() failed\n"); + return -1; + } + src_addr_specified = true; + } else { + src_addr_specified = false; + } + + rc = rdma_create_id(rctrlr->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP); + if (rc < 0) { + SPDK_ERRLOG("rdma_create_id() failed\n"); + return -1; + } + + rc = nvme_rdma_resolve_addr(rqpair, + src_addr_specified ? 
(struct sockaddr *)&src_addr : NULL, + (struct sockaddr *)&dst_addr, rctrlr->cm_channel); + if (rc < 0) { + SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n"); + return -1; + } + + rc = nvme_rdma_qpair_init(rqpair); + if (rc < 0) { + SPDK_ERRLOG("nvme_rdma_qpair_init() failed\n"); + return -1; + } + + rc = nvme_rdma_connect(rqpair); + if (rc != 0) { + SPDK_ERRLOG("Unable to connect the rqpair\n"); + return rc; + } + + rc = nvme_rdma_register_reqs(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc) { + SPDK_ERRLOG("Unable to register rqpair RDMA requests\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests registered\n"); + + rc = nvme_rdma_register_rsps(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc < 0) { + SPDK_ERRLOG("Unable to register rqpair RDMA responses\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses registered\n"); + + rc = nvme_rdma_register_mem(rqpair); + if (rc < 0) { + SPDK_ERRLOG("Unable to register memory for RDMA\n"); + return -1; + } + + rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries); + if (rc < 0) { + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; + SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); + return -1; + } + + return 0; +} + +static int +nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + int rc; + int retry_count = 0; + + rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair); + + /* + * -EAGAIN represents the special case where the target side still thought it was connected. + * Most NICs will fail the first connection attempt, and the NICs will clean up whatever + * state they need to. After that, subsequent connection attempts will succeed. + */ + if (rc == -EAGAIN) { + SPDK_NOTICELOG("Detected stale connection on Target side for qpid: %d\n", qpair->id); + do { + nvme_delay(NVME_RDMA_STALE_CONN_RETRY_DELAY_US); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair); + retry_count++; + } while (rc == -EAGAIN && retry_count < NVME_RDMA_STALE_CONN_RETRY_MAX); + } + + return rc; +} + +/* + * Build SGL describing empty payload. + */ +static int +nvme_rdma_build_null_request(struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + /* The RDMA SGL needs one element describing the NVMe command. 
*/ + rdma_req->send_wr.num_sge = 1; + + req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + req->cmd.dptr.sgl1.keyed.length = 0; + req->cmd.dptr.sgl1.keyed.key = 0; + req->cmd.dptr.sgl1.address = 0; + + return 0; +} + +static inline bool +nvme_rdma_get_key(struct spdk_mem_map *map, void *payload, uint64_t size, + enum nvme_rdma_key_type key_type, uint32_t *key) +{ + struct ibv_mr *mr; + uint64_t real_size = size; + uint32_t _key = 0; + + if (!g_nvme_hooks.get_rkey) { + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)payload, &real_size); + + if (spdk_unlikely(!mr)) { + SPDK_ERRLOG("No translation for ptr %p, size %lu\n", payload, size); + return false; + } + switch (key_type) { + case NVME_RDMA_MR_RKEY: + _key = mr->rkey; + break; + case NVME_RDMA_MR_LKEY: + _key = mr->lkey; + break; + default: + SPDK_ERRLOG("Invalid key type %d\n", key_type); + assert(0); + return false; + } + } else { + _key = spdk_mem_map_translate(map, (uint64_t)payload, &real_size); + } + + if (spdk_unlikely(real_size < size)) { + SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n"); + return false; + } + + *key = _key; + return true; +} + +/* + * Build inline SGL describing contiguous payload buffer. + */ +static int +nvme_rdma_build_contig_inline_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + uint32_t lkey = 0; + void *payload; + + payload = req->payload.contig_or_cb_arg + req->payload_offset; + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, payload, req->payload_size, + NVME_RDMA_MR_LKEY, &lkey))) { + return -1; + } + + rdma_req->send_sgl[1].lkey = lkey; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + rdma_req->send_sgl[1].addr = (uint64_t)payload; + rdma_req->send_sgl[1].length = (uint32_t)req->payload_size; + + /* The RDMA SGL contains two elements. The first describes + * the NVMe command and the second describes the data + * payload. */ + rdma_req->send_wr.num_sge = 2; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; + /* Inline only supported for icdoff == 0 currently. This function will + * not get called for controllers with other values. */ + req->cmd.dptr.sgl1.address = (uint64_t)0; + + return 0; +} + +/* + * Build SGL describing contiguous payload buffer. 
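+ *
+ * This is the non-inline path: the payload is described by a keyed SGL
+ * (rkey, address and length) and the target transfers the data itself with
+ * RDMA READ/WRITE rather than the data travelling inside the command capsule.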
+ */ +static int +nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + void *payload = req->payload.contig_or_cb_arg + req->payload_offset; + uint32_t rkey = 0; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + if (spdk_unlikely(req->payload_size > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) { + SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n", + req->payload_size, NVME_RDMA_MAX_KEYED_SGL_LENGTH); + return -1; + } + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, payload, req->payload_size, + NVME_RDMA_MR_RKEY, &rkey))) { + return -1; + } + + req->cmd.dptr.sgl1.keyed.key = rkey; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + /* The RDMA SGL needs one element describing the NVMe command. */ + rdma_req->send_wr.num_sge = 1; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + req->cmd.dptr.sgl1.keyed.length = req->payload_size; + req->cmd.dptr.sgl1.address = (uint64_t)payload; + + return 0; +} + +/* + * Build SGL describing scattered payload buffer. + */ +static int +nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id]; + void *virt_addr; + uint32_t remaining_size; + uint32_t sge_length; + int rc, max_num_sgl, num_sgl_desc; + uint32_t rkey = 0; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + max_num_sgl = req->qpair->ctrlr->max_sges; + + remaining_size = req->payload_size; + num_sgl_desc = 0; + do { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &sge_length); + if (rc) { + return -1; + } + + sge_length = spdk_min(remaining_size, sge_length); + + if (spdk_unlikely(sge_length > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) { + SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n", + sge_length, NVME_RDMA_MAX_KEYED_SGL_LENGTH); + return -1; + } + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, virt_addr, sge_length, + NVME_RDMA_MR_RKEY, &rkey))) { + return -1; + } + + cmd->sgl[num_sgl_desc].keyed.key = rkey; + cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + cmd->sgl[num_sgl_desc].keyed.length = sge_length; + cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr; + + remaining_size -= sge_length; + num_sgl_desc++; + } while (remaining_size > 0 && num_sgl_desc < max_num_sgl); + + + /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */ + if (remaining_size > 0) { + return -1; + } + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + + /* The RDMA SGL needs one element describing some portion + * of the spdk_nvmf_cmd structure. 
*/ + rdma_req->send_wr.num_sge = 1; + + /* + * If only one SGL descriptor is required, it can be embedded directly in the command + * as a data block descriptor. + */ + if (num_sgl_desc == 1) { + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + req->cmd.dptr.sgl1.keyed.type = cmd->sgl[0].keyed.type; + req->cmd.dptr.sgl1.keyed.subtype = cmd->sgl[0].keyed.subtype; + req->cmd.dptr.sgl1.keyed.length = cmd->sgl[0].keyed.length; + req->cmd.dptr.sgl1.keyed.key = cmd->sgl[0].keyed.key; + req->cmd.dptr.sgl1.address = cmd->sgl[0].address; + } else { + /* + * Otherwise, The SGL descriptor embedded in the command must point to the list of + * SGL descriptors used to describe the operation. In that case it is a last segment descriptor. + */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + sizeof(struct + spdk_nvme_sgl_descriptor) * num_sgl_desc; + + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = num_sgl_desc * sizeof(struct spdk_nvme_sgl_descriptor); + req->cmd.dptr.sgl1.address = (uint64_t)0; + } + + return 0; +} + +/* + * Build inline SGL describing sgl payload buffer. + */ +static int +nvme_rdma_build_sgl_inline_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + uint32_t lkey = 0; + uint32_t length; + void *virt_addr; + int rc; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); + if (rc) { + return -1; + } + + if (length < req->payload_size) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Inline SGL request split so sending separately.\n"); + return nvme_rdma_build_sgl_request(rqpair, rdma_req); + } + + if (length > req->payload_size) { + length = req->payload_size; + } + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, virt_addr, length, + NVME_RDMA_MR_LKEY, &lkey))) { + return -1; + } + + rdma_req->send_sgl[1].addr = (uint64_t)virt_addr; + rdma_req->send_sgl[1].length = length; + rdma_req->send_sgl[1].lkey = lkey; + + rdma_req->send_wr.num_sge = 2; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; + /* Inline only supported for icdoff == 0 currently. This function will + * not get called for controllers with other values. 
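+ * With a non-zero ICDOFF the in-capsule data would have to start at an
+ * offset past the submission queue entry instead of immediately after it,
+ * which this host does not implement, so nvme_rdma_req_init() falls back to
+ * the keyed SGL path for such controllers.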
*/ + req->cmd.dptr.sgl1.address = (uint64_t)0; + + return 0; +} + +static int +nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct spdk_nvme_ctrlr *ctrlr = rqpair->qpair.ctrlr; + enum nvme_payload_type payload_type; + bool icd_supported; + int rc; + + assert(rdma_req->req == NULL); + rdma_req->req = req; + req->cmd.cid = rdma_req->id; + payload_type = nvme_payload_type(&req->payload); + /* + * Check if icdoff is non zero, to avoid interop conflicts with + * targets with non-zero icdoff. Both SPDK and the Linux kernel + * targets use icdoff = 0. For targets with non-zero icdoff, we + * will currently just not use inline data for now. + */ + icd_supported = spdk_nvme_opc_get_data_transfer(req->cmd.opc) == SPDK_NVME_DATA_HOST_TO_CONTROLLER + && req->payload_size <= ctrlr->ioccsz_bytes && ctrlr->icdoff == 0; + + if (req->payload_size == 0) { + rc = nvme_rdma_build_null_request(rdma_req); + } else if (payload_type == NVME_PAYLOAD_TYPE_CONTIG) { + if (icd_supported) { + rc = nvme_rdma_build_contig_inline_request(rqpair, rdma_req); + } else { + rc = nvme_rdma_build_contig_request(rqpair, rdma_req); + } + } else if (payload_type == NVME_PAYLOAD_TYPE_SGL) { + if (icd_supported) { + rc = nvme_rdma_build_sgl_inline_request(rqpair, rdma_req); + } else { + rc = nvme_rdma_build_sgl_request(rqpair, rdma_req); + } + } else { + rc = -1; + } + + if (rc) { + rdma_req->req = NULL; + return rc; + } + + memcpy(&rqpair->cmds[rdma_req->id], &req->cmd, sizeof(req->cmd)); + return 0; +} + +static struct spdk_nvme_qpair * +nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, uint32_t qsize, + enum spdk_nvme_qprio qprio, + uint32_t num_requests, + bool delay_cmd_submit) +{ + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + rqpair = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_qpair)); + if (!rqpair) { + SPDK_ERRLOG("failed to get create rqpair\n"); + return NULL; + } + + rqpair->num_entries = qsize; + rqpair->delay_cmd_submit = delay_cmd_submit; + qpair = &rqpair->qpair; + rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests); + if (rc != 0) { + return NULL; + } + + rc = nvme_rdma_alloc_reqs(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc) { + SPDK_ERRLOG("Unable to allocate rqpair RDMA requests\n"); + nvme_rdma_free(rqpair); + return NULL; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests allocated\n"); + + rc = nvme_rdma_alloc_rsps(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc < 0) { + SPDK_ERRLOG("Unable to allocate rqpair RDMA responses\n"); + nvme_rdma_free_reqs(rqpair); + nvme_rdma_free(rqpair); + return NULL; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses allocated\n"); + + return qpair; +} + +static void +nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_ctrlr *rctrlr = NULL; + struct nvme_rdma_cm_event_entry *entry, *tmp; + + nvme_rdma_unregister_mem(rqpair); + nvme_rdma_unregister_reqs(rqpair); + nvme_rdma_unregister_rsps(rqpair); + + if (rqpair->evt) { + rdma_ack_cm_event(rqpair->evt); + rqpair->evt = NULL; + } + + /* + * This works because we have the controller lock both in + * this function and in the function where we add new events. 
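
The icd_supported test in nvme_rdma_req_init() above is the whole inline-data policy: only host-to-controller transfers qualify, the payload must fit the capsule size advertised through ioccsz, and the target must report an ICDOFF of zero. Restated as a standalone predicate; the enum and can_use_inline_data() are simplified stand-ins for the SPDK types.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum data_transfer {          /* simplified stand-in for spdk_nvme_data_transfer */
	DATA_NONE,
	DATA_HOST_TO_CONTROLLER,  /* writes */
	DATA_CONTROLLER_TO_HOST,  /* reads */
};

/* Only writes can carry in-capsule data, the payload has to fit in the capsule
 * (ioccsz_bytes), and the target must report ICDOFF == 0 because this initiator
 * does not pad data out to a non-zero in-capsule data offset. */
static bool
can_use_inline_data(enum data_transfer xfer, uint32_t payload_size,
		    uint32_t ioccsz_bytes, uint16_t icdoff)
{
	return xfer == DATA_HOST_TO_CONTROLLER &&
	       payload_size <= ioccsz_bytes &&
	       icdoff == 0;
}

int
main(void)
{
	printf("%d\n", can_use_inline_data(DATA_HOST_TO_CONTROLLER, 4096, 8192, 0)); /* 1 */
	printf("%d\n", can_use_inline_data(DATA_CONTROLLER_TO_HOST, 4096, 8192, 0)); /* 0 */
	printf("%d\n", can_use_inline_data(DATA_HOST_TO_CONTROLLER, 4096, 8192, 4)); /* 0 */
	return 0;
}
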
+ */ + if (qpair->ctrlr != NULL) { + rctrlr = nvme_rdma_ctrlr(qpair->ctrlr); + STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) { + if (nvme_rdma_qpair(entry->evt->id->context) == rqpair) { + STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link); + rdma_ack_cm_event(entry->evt); + STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link); + } + } + } + + if (rqpair->cm_id) { + if (rqpair->rdma_qp) { + spdk_rdma_qp_disconnect(rqpair->rdma_qp); + if (rctrlr != NULL) { + if (nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_DISCONNECTED)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Target did not respond to qpair disconnect.\n"); + } + } + spdk_rdma_qp_destroy(rqpair->rdma_qp); + rqpair->rdma_qp = NULL; + } + + rdma_destroy_id(rqpair->cm_id); + rqpair->cm_id = NULL; + } + + if (rqpair->cq) { + ibv_destroy_cq(rqpair->cq); + rqpair->cq = NULL; + } +} + +static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); + +static int +nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair; + + rqpair = nvme_rdma_qpair(qpair); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + if (rqpair->defer_deletion_to_pg) { + nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); + return 0; + } + + nvme_rdma_qpair_abort_reqs(qpair, 1); + nvme_qpair_deinit(qpair); + + nvme_rdma_free_reqs(rqpair); + nvme_rdma_free_rsps(rqpair); + nvme_rdma_free(rqpair); + + return 0; +} + +static struct spdk_nvme_qpair * +nvme_rdma_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + return nvme_rdma_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio, + opts->io_queue_requests, + opts->delay_cmd_submit); +} + +static int +nvme_rdma_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + /* do nothing here */ + return 0; +} + +static int nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); + +static struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + struct nvme_rdma_ctrlr *rctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + struct ibv_context **contexts; + struct ibv_device_attr dev_attr; + int i, flag, rc; + + rctrlr = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_ctrlr)); + if (rctrlr == NULL) { + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + rctrlr->ctrlr.opts = *opts; + rctrlr->ctrlr.trid = *trid; + + if (opts->transport_retry_count > NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT) { + SPDK_NOTICELOG("transport_retry_count exceeds max value %d, use max value\n", + NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT); + rctrlr->ctrlr.opts.transport_retry_count = NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT; + } + + if (opts->transport_ack_timeout > NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT) { + SPDK_NOTICELOG("transport_ack_timeout exceeds max value %d, use max value\n", + NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT); + rctrlr->ctrlr.opts.transport_ack_timeout = NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT; + } + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + nvme_rdma_free(rctrlr); + return NULL; + } + + i = 0; + rctrlr->max_sge = NVME_RDMA_MAX_SGL_DESCRIPTORS; + + while (contexts[i] != NULL) { + rc = ibv_query_device(contexts[i], &dev_attr); + if (rc < 0) { + 
SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + rdma_free_devices(contexts); + nvme_rdma_free(rctrlr); + return NULL; + } + rctrlr->max_sge = spdk_min(rctrlr->max_sge, (uint16_t)dev_attr.max_sge); + i++; + } + + rdma_free_devices(contexts); + + rc = nvme_ctrlr_construct(&rctrlr->ctrlr); + if (rc != 0) { + nvme_rdma_free(rctrlr); + return NULL; + } + + STAILQ_INIT(&rctrlr->pending_cm_events); + STAILQ_INIT(&rctrlr->free_cm_events); + rctrlr->cm_events = nvme_rdma_calloc(NVME_RDMA_NUM_CM_EVENTS, sizeof(*rctrlr->cm_events)); + if (rctrlr->cm_events == NULL) { + SPDK_ERRLOG("unable to allocat buffers to hold CM events.\n"); + goto destruct_ctrlr; + } + + for (i = 0; i < NVME_RDMA_NUM_CM_EVENTS; i++) { + STAILQ_INSERT_TAIL(&rctrlr->free_cm_events, &rctrlr->cm_events[i], link); + } + + rctrlr->cm_channel = rdma_create_event_channel(); + if (rctrlr->cm_channel == NULL) { + SPDK_ERRLOG("rdma_create_event_channel() failed\n"); + goto destruct_ctrlr; + } + + flag = fcntl(rctrlr->cm_channel->fd, F_GETFL); + if (fcntl(rctrlr->cm_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("Cannot set event channel to non blocking\n"); + goto destruct_ctrlr; + } + + rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0, + rctrlr->ctrlr.opts.admin_queue_size, 0, + rctrlr->ctrlr.opts.admin_queue_size, false); + if (!rctrlr->ctrlr.adminq) { + SPDK_ERRLOG("failed to create admin qpair\n"); + goto destruct_ctrlr; + } + + rc = nvme_transport_ctrlr_connect_qpair(&rctrlr->ctrlr, rctrlr->ctrlr.adminq); + if (rc < 0) { + SPDK_ERRLOG("failed to connect admin qpair\n"); + goto destruct_ctrlr; + } + + if (nvme_ctrlr_get_cap(&rctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + goto destruct_ctrlr; + } + + if (nvme_ctrlr_get_vs(&rctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + goto destruct_ctrlr; + } + + if (nvme_ctrlr_add_process(&rctrlr->ctrlr, 0) != 0) { + SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n"); + goto destruct_ctrlr; + } + + nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "successfully initialized the nvmf ctrlr\n"); + return &rctrlr->ctrlr; + +destruct_ctrlr: + nvme_ctrlr_destruct(&rctrlr->ctrlr); + return NULL; +} + +static int +nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr); + struct nvme_rdma_cm_event_entry *entry; + + if (ctrlr->adminq) { + nvme_rdma_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq); + } + + STAILQ_FOREACH(entry, &rctrlr->pending_cm_events, link) { + rdma_ack_cm_event(entry->evt); + } + + STAILQ_INIT(&rctrlr->free_cm_events); + STAILQ_INIT(&rctrlr->pending_cm_events); + nvme_rdma_free(rctrlr->cm_events); + + if (rctrlr->cm_channel) { + rdma_destroy_event_channel(rctrlr->cm_channel); + rctrlr->cm_channel = NULL; + } + + nvme_ctrlr_destruct_finish(ctrlr); + + nvme_rdma_free(rctrlr); + + return 0; +} + +static int +nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req) +{ + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_rdma_req *rdma_req; + struct ibv_send_wr *wr; + + rqpair = nvme_rdma_qpair(qpair); + assert(rqpair != NULL); + assert(req != NULL); + + rdma_req = nvme_rdma_req_get(rqpair); + if (!rdma_req) { + /* Inform the upper layer to try again later. 
*/ + return -EAGAIN; + } + + if (nvme_rdma_req_init(rqpair, req, rdma_req)) { + SPDK_ERRLOG("nvme_rdma_req_init() failed\n"); + TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); + nvme_rdma_req_put(rqpair, rdma_req); + return -1; + } + + wr = &rdma_req->send_wr; + wr->next = NULL; + nvme_rdma_trace_ibv_sge(wr->sg_list); + return nvme_rdma_qpair_queue_send_wr(rqpair, wr); +} + +static int +nvme_rdma_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + /* Currently, doing nothing here */ + return 0; +} + +static void +nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + cpl.status.dnr = dnr; + + /* + * We cannot abort requests at the RDMA layer without + * unregistering them. If we do, we can still get error + * free completions on the shared completion queue. + */ + if (nvme_qpair_get_state(qpair) > NVME_QPAIR_DISCONNECTING && + nvme_qpair_get_state(qpair) != NVME_QPAIR_DESTROYING) { + nvme_ctrlr_disconnect_qpair(qpair); + } + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + nvme_rdma_req_complete(rdma_req, &cpl); + nvme_rdma_req_put(rqpair, rdma_req); + } +} + +static void +nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. */ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + if (nvme_request_check_timeout(rdma_req->req, rdma_req->id, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. 
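
The timeout scan above leans on the fact that outstanding requests are kept in submission order, so the walk can stop at the first request that has not yet expired. A standalone sketch of that early-exit scan; pending_req and count_timed_out() are illustrative, with plain tick values standing in for spdk_get_ticks().

#include <stdint.h>
#include <stdio.h>

struct pending_req {
	uint64_t submit_tick;
};

/* Requests are in submission order, so deadlines are monotonically increasing;
 * once one entry has not expired, none of the later ones have either. */
static unsigned
count_timed_out(const struct pending_req *reqs, unsigned nreqs,
		uint64_t now, uint64_t timeout_ticks)
{
	unsigned expired = 0;

	for (unsigned i = 0; i < nreqs; i++) {
		if (now - reqs[i].submit_tick < timeout_ticks) {
			break;  /* in-order list: everything after this is younger */
		}
		expired++;
	}
	return expired;
}

int
main(void)
{
	struct pending_req reqs[] = { {100}, {150}, {400}, {900} };

	printf("expired=%u\n", count_timed_out(reqs, 4, 1000, 700)); /* prints 2 */
	return 0;
}
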
+ */ + break; + } + } +} + +static inline int +nvme_rdma_request_ready(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req) +{ + nvme_rdma_req_complete(rdma_req, &rqpair->rsps[rdma_req->rsp_idx].cpl); + nvme_rdma_req_put(rqpair, rdma_req); + return nvme_rdma_post_recv(rqpair, rdma_req->rsp_idx); +} + +#define MAX_COMPLETIONS_PER_POLL 128 + +static void +nvme_rdma_fail_qpair(struct spdk_nvme_qpair *qpair, int failure_reason) +{ + if (failure_reason == IBV_WC_RETRY_EXC_ERR) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE; + } else if (qpair->transport_failure_reason == SPDK_NVME_QPAIR_FAILURE_NONE) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; + } + + nvme_ctrlr_disconnect_qpair(qpair); +} + +static void +nvme_rdma_conditional_fail_qpair(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poll_group *group) +{ + struct nvme_rdma_destroyed_qpair *qpair_tracker; + + assert(rqpair); + if (group) { + STAILQ_FOREACH(qpair_tracker, &group->destroyed_qpairs, link) { + if (qpair_tracker->destroyed_qpair_tracker == rqpair) { + return; + } + } + } + nvme_rdma_fail_qpair(&rqpair->qpair, 0); +} + +static int +nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size, + struct nvme_rdma_poll_group *group, + struct nvme_rdma_qpair *rdma_qpair) +{ + struct ibv_wc wc[MAX_COMPLETIONS_PER_POLL]; + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_rdma_req *rdma_req; + struct spdk_nvme_rdma_rsp *rdma_rsp; + struct nvme_rdma_wr *rdma_wr; + uint32_t reaped = 0; + int completion_rc = 0; + int rc, i; + + rc = ibv_poll_cq(cq, batch_size, wc); + if (rc < 0) { + SPDK_ERRLOG("Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + return -ECANCELED; + } else if (rc == 0) { + return 0; + } + + for (i = 0; i < rc; i++) { + rdma_wr = (struct nvme_rdma_wr *)wc[i].wr_id; + switch (rdma_wr->type) { + case RDMA_WR_TYPE_RECV: + rdma_rsp = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_rsp, rdma_wr); + rqpair = rdma_rsp->rqpair; + assert(rqpair->current_num_recvs > 0); + rqpair->current_num_recvs--; + + if (wc[i].status) { + SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n", + rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CQ recv completion\n"); + + if (wc[i].byte_len < sizeof(struct spdk_nvme_cpl)) { + SPDK_ERRLOG("recv length %u less than expected response size\n", wc[i].byte_len); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + rdma_req = &rqpair->rdma_reqs[rdma_rsp->cpl.cid]; + rdma_req->completion_flags |= NVME_RDMA_RECV_COMPLETED; + rdma_req->rsp_idx = rdma_rsp->idx; + + if ((rdma_req->completion_flags & NVME_RDMA_SEND_COMPLETED) != 0) { + if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + reaped++; + rqpair->num_completions++; + } + break; + + case RDMA_WR_TYPE_SEND: + rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_req, rdma_wr); + + /* If we are flushing I/O */ + if (wc[i].status) { + rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL; + if (!rqpair) { + rqpair = rdma_qpair != NULL ? 
rdma_qpair : nvme_rdma_poll_group_get_qpair_by_id(group, + wc[i].qp_num); + } + assert(rqpair); + assert(rqpair->current_num_sends > 0); + rqpair->current_num_sends--; + nvme_rdma_conditional_fail_qpair(rqpair, group); + SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n", + rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); + completion_rc = -ENXIO; + continue; + } + + rqpair = nvme_rdma_qpair(rdma_req->req->qpair); + rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED; + rqpair->current_num_sends--; + + if ((rdma_req->completion_flags & NVME_RDMA_RECV_COMPLETED) != 0) { + if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + reaped++; + rqpair->num_completions++; + } + break; + + default: + SPDK_ERRLOG("Received an unexpected opcode on the CQ: %d\n", rdma_wr->type); + return -ECANCELED; + } + } + + if (completion_rc) { + return completion_rc; + } + + return reaped; +} + +static void +dummy_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) +{ + +} + +static int +nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair, + uint32_t max_completions) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + int rc = 0, batch_size; + struct ibv_cq *cq; + struct nvme_rdma_ctrlr *rctrlr; + + /* + * This is used during the connection phase. It's possible that we are still reaping error completions + * from other qpairs so we need to call the poll group function. Also, it's more correct since the cq + * is shared. + */ + if (qpair->poll_group != NULL) { + return spdk_nvme_poll_group_process_completions(qpair->poll_group->group, max_completions, + dummy_disconnected_qpair_cb); + } + + if (max_completions == 0) { + max_completions = rqpair->num_entries; + } else { + max_completions = spdk_min(max_completions, rqpair->num_entries); + } + + if (nvme_qpair_is_admin_queue(&rqpair->qpair)) { + rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); + nvme_rdma_poll_events(rctrlr); + } + nvme_rdma_qpair_process_cm_event(rqpair); + + if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) { + nvme_rdma_fail_qpair(qpair, 0); + return -ENXIO; + } + + cq = rqpair->cq; + + rqpair->num_completions = 0; + do { + batch_size = spdk_min((max_completions - rqpair->num_completions), MAX_COMPLETIONS_PER_POLL); + rc = nvme_rdma_cq_process_completions(cq, batch_size, NULL, rqpair); + + if (rc == 0) { + break; + /* Handle the case where we fail to poll the cq. */ + } else if (rc == -ECANCELED) { + nvme_rdma_fail_qpair(qpair, 0); + return -ENXIO; + } else if (rc == -ENXIO) { + return rc; + } + } while (rqpair->num_completions < max_completions); + + if (spdk_unlikely(nvme_rdma_qpair_submit_sends(rqpair) || + nvme_rdma_qpair_submit_recvs(rqpair))) { + nvme_rdma_fail_qpair(qpair, 0); + return -ENXIO; + } + + if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) { + nvme_rdma_qpair_check_timeout(qpair); + } + + return rqpair->num_completions; +} + +static uint32_t +nvme_rdma_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* max_mr_size by ibv_query_device indicates the largest value that we can + * set for a registered memory region. It is independent from the actual + * I/O size and is very likely to be larger than 2 MiB which is the + * granularity we currently register memory regions. 
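
The completion processing above only hands a request back to the upper layer once both the send work request for the command and the receive work request carrying the response have been seen, tracked through completion_flags. The gating logic in miniature; the flag values and tracked_req are illustrative rather than the transport's definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEND_COMPLETED 0x1u
#define RECV_COMPLETED 0x2u

struct tracked_req {
	uint32_t completion_flags;
};

/* A request is finished only when both completions have arrived; whichever
 * one lands second is the one that triggers the upper-layer callback. */
static bool
on_completion(struct tracked_req *req, uint32_t flag)
{
	req->completion_flags |= flag;
	return (req->completion_flags & (SEND_COMPLETED | RECV_COMPLETED)) ==
	       (SEND_COMPLETED | RECV_COMPLETED);
}

int
main(void)
{
	struct tracked_req req = { 0 };

	printf("after recv: done=%d\n", on_completion(&req, RECV_COMPLETED)); /* 0 */
	printf("after send: done=%d\n", on_completion(&req, SEND_COMPLETED)); /* 1 */
	return 0;
}
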
Hence return + * UINT32_MAX here and let the generic layer use the controller data to + * moderate this value. + */ + return UINT32_MAX; +} + +static uint16_t +nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr); + + return rctrlr->max_sge; +} + +static int +nvme_rdma_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct spdk_nvme_rdma_req *rdma_req, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + rc = iter_fn(rdma_req->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static void +nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + if (rdma_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + continue; + } + + nvme_rdma_req_complete(rdma_req, &cpl); + nvme_rdma_req_put(rqpair, rdma_req); + } +} + +static int +nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *ctx) +{ + struct nvme_rdma_poller *poller; + + poller = calloc(1, sizeof(*poller)); + if (poller == NULL) { + SPDK_ERRLOG("Unable to allocate poller.\n"); + return -ENOMEM; + } + + poller->device = ctx; + poller->cq = ibv_create_cq(poller->device, DEFAULT_NVME_RDMA_CQ_SIZE, group, NULL, 0); + + if (poller->cq == NULL) { + free(poller); + return -EINVAL; + } + + STAILQ_INSERT_HEAD(&group->pollers, poller, link); + group->num_pollers++; + poller->current_num_wc = DEFAULT_NVME_RDMA_CQ_SIZE; + poller->required_num_wc = 0; + return 0; +} + +static void +nvme_rdma_poll_group_free_pollers(struct nvme_rdma_poll_group *group) +{ + struct nvme_rdma_poller *poller, *tmp_poller; + + STAILQ_FOREACH_SAFE(poller, &group->pollers, link, tmp_poller) { + if (poller->cq) { + ibv_destroy_cq(poller->cq); + } + STAILQ_REMOVE(&group->pollers, poller, nvme_rdma_poller, link); + free(poller); + } +} + +static struct spdk_nvme_transport_poll_group * +nvme_rdma_poll_group_create(void) +{ + struct nvme_rdma_poll_group *group; + struct ibv_context **contexts; + int i = 0; + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + SPDK_ERRLOG("Unable to allocate poll group.\n"); + return NULL; + } + + STAILQ_INIT(&group->pollers); + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + free(group); + return NULL; + } + + while (contexts[i] != NULL) { + if (nvme_rdma_poller_create(group, contexts[i])) { + nvme_rdma_poll_group_free_pollers(group); + free(group); + rdma_free_devices(contexts); + return NULL; + } + i++; + } + + rdma_free_devices(contexts); + STAILQ_INIT(&group->destroyed_qpairs); + return &group->group; +} + +struct nvme_rdma_qpair * +nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group, uint32_t qp_num) +{ + struct spdk_nvme_qpair *qpair; + struct nvme_rdma_destroyed_qpair *rqpair_tracker; + struct nvme_rdma_qpair *rqpair; + + STAILQ_FOREACH(qpair, &group->group.disconnected_qpairs, poll_group_stailq) { + rqpair = 
nvme_rdma_qpair(qpair); + if (rqpair->rdma_qp->qp->qp_num == qp_num) { + return rqpair; + } + } + + STAILQ_FOREACH(qpair, &group->group.connected_qpairs, poll_group_stailq) { + rqpair = nvme_rdma_qpair(qpair); + if (rqpair->rdma_qp->qp->qp_num == qp_num) { + return rqpair; + } + } + + STAILQ_FOREACH(rqpair_tracker, &group->destroyed_qpairs, link) { + rqpair = rqpair_tracker->destroyed_qpair_tracker; + if (rqpair->rdma_qp->qp->qp_num == qp_num) { + return rqpair; + } + } + + return NULL; +} + +static int +nvme_rdma_resize_cq(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poller *poller) +{ + int current_num_wc, required_num_wc; + + required_num_wc = poller->required_num_wc + WC_PER_QPAIR(rqpair->num_entries); + current_num_wc = poller->current_num_wc; + if (current_num_wc < required_num_wc) { + current_num_wc = spdk_max(current_num_wc * 2, required_num_wc); + } + + if (poller->current_num_wc != current_num_wc) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Resize RDMA CQ from %d to %d\n", poller->current_num_wc, + current_num_wc); + if (ibv_resize_cq(poller->cq, current_num_wc)) { + SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno)); + return -1; + } + + poller->current_num_wc = current_num_wc; + } + + poller->required_num_wc = required_num_wc; + return 0; +} + +static int +nvme_rdma_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(qpair->poll_group); + struct nvme_rdma_poller *poller; + + assert(rqpair->cq == NULL); + + STAILQ_FOREACH(poller, &group->pollers, link) { + if (poller->device == rqpair->cm_id->verbs) { + if (nvme_rdma_resize_cq(rqpair, poller)) { + return -EPROTO; + } + rqpair->cq = poller->cq; + break; + } + } + + if (rqpair->cq == NULL) { + SPDK_ERRLOG("Unable to find a cq for qpair %p on poll group %p\n", qpair, qpair->poll_group); + return -EINVAL; + } + + return 0; +} + +static int +nvme_rdma_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_poll_group *group; + struct nvme_rdma_destroyed_qpair *destroyed_qpair; + enum nvme_qpair_state state; + + if (rqpair->poll_group_disconnect_in_progress) { + return -EINPROGRESS; + } + + rqpair->poll_group_disconnect_in_progress = true; + state = nvme_qpair_get_state(qpair); + group = nvme_rdma_poll_group(qpair->poll_group); + rqpair->cq = NULL; + + /* + * We want to guard against an endless recursive loop while making + * sure the qpair is disconnected before we disconnect it from the qpair. + */ + if (state > NVME_QPAIR_DISCONNECTING && state != NVME_QPAIR_DESTROYING) { + nvme_ctrlr_disconnect_qpair(qpair); + } + + /* + * If this fails, the system is in serious trouble, + * just let the qpair get cleaned up immediately. 
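
nvme_rdma_resize_cq() above grows a shared completion queue lazily: each added qpair raises required_num_wc, and the capacity jumps to the larger of double the current size or the new requirement before ibv_resize_cq() is issued. The sizing rule by itself, as a hypothetical next_cq_size() helper.

#include <stdio.h>

/* Grow to max(current * 2, required) only when the requirement no longer fits,
 * which amortizes resize calls as qpairs are added to the poller. */
static int
next_cq_size(int current_num_wc, int required_num_wc)
{
	if (current_num_wc >= required_num_wc) {
		return current_num_wc;  /* still big enough, no resize needed */
	}
	return (current_num_wc * 2 > required_num_wc) ? current_num_wc * 2 : required_num_wc;
}

int
main(void)
{
	printf("%d\n", next_cq_size(1024, 512));   /* 1024: unchanged */
	printf("%d\n", next_cq_size(1024, 1536));  /* 2048: doubled */
	printf("%d\n", next_cq_size(1024, 5000));  /* 5000: jumps to the requirement */
	return 0;
}
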
+ */ + destroyed_qpair = calloc(1, sizeof(*destroyed_qpair)); + if (destroyed_qpair == NULL) { + return 0; + } + + destroyed_qpair->destroyed_qpair_tracker = rqpair; + destroyed_qpair->completed_cycles = 0; + STAILQ_INSERT_TAIL(&group->destroyed_qpairs, destroyed_qpair, link); + + rqpair->defer_deletion_to_pg = true; + + rqpair->poll_group_disconnect_in_progress = false; + return 0; +} + +static int +nvme_rdma_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_rdma_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + return nvme_poll_group_disconnect_qpair(qpair); + } + + return 0; +} + +static void +nvme_rdma_poll_group_delete_qpair(struct nvme_rdma_poll_group *group, + struct nvme_rdma_destroyed_qpair *qpair_tracker) +{ + struct nvme_rdma_qpair *rqpair = qpair_tracker->destroyed_qpair_tracker; + + rqpair->defer_deletion_to_pg = false; + if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) { + nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair); + } + STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link); + free(qpair_tracker); +} + +static int64_t +nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker; + struct nvme_rdma_qpair *rqpair; + struct nvme_rdma_poll_group *group; + struct nvme_rdma_poller *poller; + int num_qpairs = 0, batch_size, rc; + int64_t total_completions = 0; + uint64_t completions_allowed = 0; + uint64_t completions_per_poller = 0; + uint64_t poller_completions = 0; + + + if (completions_per_qpair == 0) { + completions_per_qpair = MAX_COMPLETIONS_PER_POLL; + } + + group = nvme_rdma_poll_group(tgroup); + STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + } + + STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) { + rqpair = nvme_rdma_qpair(qpair); + rqpair->num_completions = 0; + nvme_rdma_qpair_process_cm_event(rqpair); + + if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) { + nvme_rdma_fail_qpair(qpair, 0); + disconnected_qpair_cb(qpair, tgroup->group->ctx); + continue; + } + num_qpairs++; + } + + completions_allowed = completions_per_qpair * num_qpairs; + completions_per_poller = spdk_max(completions_allowed / group->num_pollers, 1); + + STAILQ_FOREACH(poller, &group->pollers, link) { + poller_completions = 0; + do { + batch_size = spdk_min((completions_per_poller - poller_completions), MAX_COMPLETIONS_PER_POLL); + rc = nvme_rdma_cq_process_completions(poller->cq, batch_size, group, NULL); + if (rc <= 0) { + if (rc == -ECANCELED) { + return -EIO; + } + break; + } + + poller_completions += rc; + } while (poller_completions < completions_per_poller); + total_completions += poller_completions; + } + + STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) { + rqpair = nvme_rdma_qpair(qpair); + if (spdk_unlikely(qpair->ctrlr->timeout_enabled)) { + nvme_rdma_qpair_check_timeout(qpair); + } + + nvme_rdma_qpair_submit_sends(rqpair); + nvme_rdma_qpair_submit_recvs(rqpair); + 
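
The poll-group completion path above pools the caller's per-qpair allowance across all connected qpairs and then splits it evenly between the per-device pollers, with a floor of one so every poller makes some progress even when pollers outnumber qpairs. A sketch of that budget arithmetic; completions_per_poller() here is a standalone restatement, not the SPDK function.

#include <stdint.h>
#include <stdio.h>

static uint64_t
completions_per_poller(uint32_t completions_per_qpair, int num_qpairs, int num_pollers)
{
	/* Pool the per-qpair budget, divide it across pollers, floor at one. */
	uint64_t allowed = (uint64_t)completions_per_qpair * (uint64_t)num_qpairs;
	uint64_t per_poller = allowed / (uint64_t)num_pollers;

	return per_poller > 0 ? per_poller : 1;
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)completions_per_poller(128, 4, 2)); /* 256 */
	printf("%llu\n", (unsigned long long)completions_per_poller(1, 1, 8));   /* 1 (floor) */
	return 0;
}
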
nvme_qpair_resubmit_requests(&rqpair->qpair, rqpair->num_completions); + } + + /* + * Once a qpair is disconnected, we can still get flushed completions for those disconnected qpairs. + * For most pieces of hardware, those requests will complete immediately. However, there are certain + * cases where flushed requests will linger. Default is to destroy qpair after all completions are freed, + * but have a fallback for other cases where we don't get all of our completions back. + */ + STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) { + qpair_tracker->completed_cycles++; + rqpair = qpair_tracker->destroyed_qpair_tracker; + if ((rqpair->current_num_sends == 0 && rqpair->current_num_recvs == 0) || + qpair_tracker->completed_cycles > NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES) { + nvme_rdma_poll_group_delete_qpair(group, qpair_tracker); + } + } + + return total_completions; +} + +static int +nvme_rdma_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(tgroup); + struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker; + struct nvme_rdma_qpair *rqpair; + + if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { + return -EBUSY; + } + + STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) { + rqpair = qpair_tracker->destroyed_qpair_tracker; + if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) { + rqpair->defer_deletion_to_pg = false; + nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair); + } + + STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link); + free(qpair_tracker); + } + + nvme_rdma_poll_group_free_pollers(group); + free(group); + + return 0; +} + +void +spdk_nvme_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) +{ + g_nvme_hooks = *hooks; +} + +const struct spdk_nvme_transport_ops rdma_ops = { + .name = "RDMA", + .type = SPDK_NVME_TRANSPORT_RDMA, + .ctrlr_construct = nvme_rdma_ctrlr_construct, + .ctrlr_scan = nvme_fabric_ctrlr_scan, + .ctrlr_destruct = nvme_rdma_ctrlr_destruct, + .ctrlr_enable = nvme_rdma_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_rdma_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_rdma_ctrlr_get_max_sges, + + .ctrlr_create_io_qpair = nvme_rdma_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_rdma_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_rdma_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_rdma_ctrlr_disconnect_qpair, + + .qpair_abort_reqs = nvme_rdma_qpair_abort_reqs, + .qpair_reset = nvme_rdma_qpair_reset, + .qpair_submit_request = nvme_rdma_qpair_submit_request, + .qpair_process_completions = nvme_rdma_qpair_process_completions, + .qpair_iterate_requests = nvme_rdma_qpair_iterate_requests, + .admin_qpair_abort_aers = nvme_rdma_admin_qpair_abort_aers, + + .poll_group_create = nvme_rdma_poll_group_create, + .poll_group_connect_qpair = nvme_rdma_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_rdma_poll_group_disconnect_qpair, + .poll_group_add = nvme_rdma_poll_group_add, + .poll_group_remove = nvme_rdma_poll_group_remove, + .poll_group_process_completions = nvme_rdma_poll_group_process_completions, + .poll_group_destroy = nvme_rdma_poll_group_destroy, + 
+}; + +SPDK_NVME_TRANSPORT_REGISTER(rdma, &rdma_ops); diff --git a/src/spdk/lib/nvme/nvme_tcp.c b/src/spdk/lib/nvme/nvme_tcp.c new file mode 100644 index 000000000..98e8c6827 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_tcp.c @@ -0,0 +1,1973 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * NVMe/TCP transport + */ + +#include "nvme_internal.h" + +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/stdinc.h" +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/assert.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/util.h" + +#include "spdk_internal/nvme_tcp.h" + +#define NVME_TCP_RW_BUFFER_SIZE 131072 +#define NVME_TCP_TIME_OUT_IN_SECONDS 2 + +#define NVME_TCP_HPDA_DEFAULT 0 +#define NVME_TCP_MAX_R2T_DEFAULT 1 +#define NVME_TCP_PDU_H2C_MIN_DATA_SIZE 4096 +#define NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE 8192 + +/* NVMe TCP transport extensions for spdk_nvme_ctrlr */ +struct nvme_tcp_ctrlr { + struct spdk_nvme_ctrlr ctrlr; +}; + +struct nvme_tcp_poll_group { + struct spdk_nvme_transport_poll_group group; + struct spdk_sock_group *sock_group; + uint32_t completions_per_qpair; + int64_t num_completions; +}; + +/* NVMe TCP qpair extensions for spdk_nvme_qpair */ +struct nvme_tcp_qpair { + struct spdk_nvme_qpair qpair; + struct spdk_sock *sock; + + TAILQ_HEAD(, nvme_tcp_req) free_reqs; + TAILQ_HEAD(, nvme_tcp_req) outstanding_reqs; + + TAILQ_HEAD(, nvme_tcp_pdu) send_queue; + struct nvme_tcp_pdu recv_pdu; + struct nvme_tcp_pdu send_pdu; /* only for error pdu and init pdu */ + struct nvme_tcp_pdu *send_pdus; /* Used by tcp_reqs */ + enum nvme_tcp_pdu_recv_state recv_state; + + struct nvme_tcp_req *tcp_reqs; + + uint16_t num_entries; + + bool host_hdgst_enable; + bool host_ddgst_enable; + + /** Specifies the maximum number of PDU-Data bytes per H2C Data Transfer PDU */ + uint32_t maxh2cdata; + + uint32_t maxr2t; + + /* 0 based value, which is used to guide the padding */ + uint8_t cpda; + + enum nvme_tcp_qpair_state state; +}; + +enum nvme_tcp_req_state { + NVME_TCP_REQ_FREE, + NVME_TCP_REQ_ACTIVE, + NVME_TCP_REQ_ACTIVE_R2T, +}; + +struct nvme_tcp_req { + struct nvme_request *req; + enum nvme_tcp_req_state state; + uint16_t cid; + uint16_t ttag; + uint32_t datao; + uint32_t r2tl_remain; + uint32_t active_r2ts; + bool in_capsule_data; + /* It is used to track whether the req can be safely freed */ + struct { + uint8_t send_ack : 1; + uint8_t data_recv : 1; + uint8_t r2t_recv : 1; + uint8_t reserved : 5; + } ordering; + struct nvme_tcp_pdu *send_pdu; + struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS]; + uint32_t iovcnt; + struct nvme_tcp_qpair *tqpair; + TAILQ_ENTRY(nvme_tcp_req) link; +}; + +static void nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req); + +static inline struct nvme_tcp_qpair * +nvme_tcp_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_TCP); + return SPDK_CONTAINEROF(qpair, struct nvme_tcp_qpair, qpair); +} + +static inline struct nvme_tcp_poll_group * +nvme_tcp_poll_group(struct spdk_nvme_transport_poll_group *group) +{ + return SPDK_CONTAINEROF(group, struct nvme_tcp_poll_group, group); +} + +static inline struct nvme_tcp_ctrlr * +nvme_tcp_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP); + return SPDK_CONTAINEROF(ctrlr, struct nvme_tcp_ctrlr, ctrlr); +} + +static struct nvme_tcp_req * +nvme_tcp_req_get(struct nvme_tcp_qpair *tqpair) +{ + struct nvme_tcp_req *tcp_req; + + tcp_req = TAILQ_FIRST(&tqpair->free_reqs); + if (!tcp_req) { + return NULL; + } + + assert(tcp_req->state == NVME_TCP_REQ_FREE); + tcp_req->state = NVME_TCP_REQ_ACTIVE; + TAILQ_REMOVE(&tqpair->free_reqs, tcp_req, link); + tcp_req->datao = 0; + tcp_req->req = NULL; + tcp_req->in_capsule_data = false; + 
tcp_req->r2tl_remain = 0; + tcp_req->active_r2ts = 0; + tcp_req->iovcnt = 0; + tcp_req->ordering.send_ack = 0; + tcp_req->ordering.data_recv = 0; + tcp_req->ordering.r2t_recv = 0; + memset(tcp_req->send_pdu, 0, sizeof(struct nvme_tcp_pdu)); + TAILQ_INSERT_TAIL(&tqpair->outstanding_reqs, tcp_req, link); + + return tcp_req; +} + +static void +nvme_tcp_req_put(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) +{ + assert(tcp_req->state != NVME_TCP_REQ_FREE); + tcp_req->state = NVME_TCP_REQ_FREE; + TAILQ_INSERT_HEAD(&tqpair->free_reqs, tcp_req, link); +} + +static int +nvme_tcp_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service) +{ + struct addrinfo *res; + struct addrinfo hints; + int ret; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + ret = getaddrinfo(addr, service, &hints, &res); + if (ret) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret); + return ret; + } + + if (res->ai_addrlen > sizeof(*sa)) { + SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen); + ret = EINVAL; + } else { + memcpy(sa, res->ai_addr, res->ai_addrlen); + } + + freeaddrinfo(res); + return ret; +} + +static void +nvme_tcp_free_reqs(struct nvme_tcp_qpair *tqpair) +{ + free(tqpair->tcp_reqs); + tqpair->tcp_reqs = NULL; + + spdk_free(tqpair->send_pdus); + tqpair->send_pdus = NULL; +} + +static int +nvme_tcp_alloc_reqs(struct nvme_tcp_qpair *tqpair) +{ + uint16_t i; + struct nvme_tcp_req *tcp_req; + + tqpair->tcp_reqs = calloc(tqpair->num_entries, sizeof(struct nvme_tcp_req)); + if (tqpair->tcp_reqs == NULL) { + SPDK_ERRLOG("Failed to allocate tcp_reqs on tqpair=%p\n", tqpair); + goto fail; + } + + tqpair->send_pdus = spdk_zmalloc(tqpair->num_entries * sizeof(struct nvme_tcp_pdu), + 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + + if (tqpair->send_pdus == NULL) { + SPDK_ERRLOG("Failed to allocate send_pdus on tqpair=%p\n", tqpair); + goto fail; + } + + TAILQ_INIT(&tqpair->send_queue); + TAILQ_INIT(&tqpair->free_reqs); + TAILQ_INIT(&tqpair->outstanding_reqs); + for (i = 0; i < tqpair->num_entries; i++) { + tcp_req = &tqpair->tcp_reqs[i]; + tcp_req->cid = i; + tcp_req->tqpair = tqpair; + tcp_req->send_pdu = &tqpair->send_pdus[i]; + TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link); + } + + return 0; +fail: + nvme_tcp_free_reqs(tqpair); + return -ENOMEM; +} + +static void +nvme_tcp_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_pdu *pdu; + + spdk_sock_close(&tqpair->sock); + + /* clear the send_queue */ + while (!TAILQ_EMPTY(&tqpair->send_queue)) { + pdu = TAILQ_FIRST(&tqpair->send_queue); + /* Remove the pdu from the send_queue to prevent the wrong sending out + * in the next round connection + */ + TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); + } +} + +static void nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); + +static int +nvme_tcp_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_qpair *tqpair; + + if (!qpair) { + return -1; + } + + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_tcp_qpair_abort_reqs(qpair, 1); + nvme_qpair_deinit(qpair); + tqpair = nvme_tcp_qpair(qpair); + nvme_tcp_free_reqs(tqpair); + free(tqpair); + + return 0; +} + +static int +nvme_tcp_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + return 0; +} + 
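
nvme_tcp_parse_addr() above resolves the target address with getaddrinfo() and copies the first result into a sockaddr_storage, rejecting results that would overflow it. An equivalent standalone version of that pattern; the address and port in the demo are arbitrary values, not configuration taken from this file.

#include <errno.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static int
parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
{
	struct addrinfo hints, *res = NULL;
	int rc;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = family;
	hints.ai_socktype = SOCK_STREAM;

	rc = getaddrinfo(addr, service, &hints, &res);
	if (rc != 0) {
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
		return rc;
	}

	if (res->ai_addrlen > sizeof(*sa)) {
		rc = EINVAL;  /* result does not fit the destination buffer */
	} else {
		memcpy(sa, res->ai_addr, res->ai_addrlen);
	}

	freeaddrinfo(res);
	return rc;
}

int
main(void)
{
	struct sockaddr_storage sa;
	int rc = parse_addr(&sa, AF_INET, "127.0.0.1", "4420");

	if (rc == 0) {
		printf("resolved, family=%d\n", sa.ss_family);
	}
	return rc == 0 ? 0 : 1;
}
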
+static int +nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_tcp_ctrlr *tctrlr = nvme_tcp_ctrlr(ctrlr); + + if (ctrlr->adminq) { + nvme_tcp_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq); + } + + nvme_ctrlr_destruct_finish(ctrlr); + + free(tctrlr); + + return 0; +} + +static void +_pdu_write_done(void *cb_arg, int err) +{ + struct nvme_tcp_pdu *pdu = cb_arg; + struct nvme_tcp_qpair *tqpair = pdu->qpair; + + TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); + + if (err != 0) { + nvme_transport_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair); + return; + } + + assert(pdu->cb_fn != NULL); + pdu->cb_fn(pdu->cb_arg); +} + +static int +nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu, + nvme_tcp_qpair_xfer_complete_cb cb_fn, + void *cb_arg) +{ + int hlen; + uint32_t crc32c; + uint32_t mapped_length = 0; + + hlen = pdu->hdr.common.hlen; + + /* Header Digest */ + if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->host_hdgst_enable) { + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + MAKE_DIGEST_WORD((uint8_t *)pdu->hdr.raw + hlen, crc32c); + } + + /* Data Digest */ + if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && tqpair->host_ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + MAKE_DIGEST_WORD(pdu->data_digest, crc32c); + } + + pdu->cb_fn = cb_fn; + pdu->cb_arg = cb_arg; + + pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu, + tqpair->host_hdgst_enable, tqpair->host_ddgst_enable, + &mapped_length); + pdu->qpair = tqpair; + pdu->sock_req.cb_fn = _pdu_write_done; + pdu->sock_req.cb_arg = pdu; + TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq); + spdk_sock_writev_async(tqpair->sock, &pdu->sock_req); + + return 0; +} + +/* + * Build SGL describing contiguous payload buffer. + */ +static int +nvme_tcp_build_contig_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) +{ + struct nvme_request *req = tcp_req->req; + + tcp_req->iov[0].iov_base = req->payload.contig_or_cb_arg + req->payload_offset; + tcp_req->iov[0].iov_len = req->payload_size; + tcp_req->iovcnt = 1; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + return 0; +} + +/* + * Build SGL describing scattered payload buffer. + */ +static int +nvme_tcp_build_sgl_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) +{ + int rc; + uint32_t length, remaining_size, iovcnt = 0, max_num_sgl; + struct nvme_request *req = tcp_req->req; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + max_num_sgl = spdk_min(req->qpair->ctrlr->max_sges, NVME_TCP_MAX_SGL_DESCRIPTORS); + remaining_size = req->payload_size; + + do { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &tcp_req->iov[iovcnt].iov_base, + &length); + if (rc) { + return -1; + } + + length = spdk_min(length, remaining_size); + tcp_req->iov[iovcnt].iov_len = length; + remaining_size -= length; + iovcnt++; + } while (remaining_size > 0 && iovcnt < max_num_sgl); + + + /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. 
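
The TCP SGL builder above turns the request's scatter list into an iovec array, trimming the final element to the payload size and failing if the payload cannot be described within the descriptor budget. The same shape in self-contained form; build_iovs() and the MAX_IOVS cap are illustrative, not the transport's nvme_tcp_build_iovs().

#include <stdio.h>
#include <sys/uio.h>

#define MAX_IOVS 16  /* assumed cap, analogous to the transport's SGL descriptor limit */

/* Fill an iovec array from caller-provided buffers, trimming the last element
 * so the total never exceeds payload_size, and fail if the payload cannot be
 * described within the iovec budget. */
static int
build_iovs(struct iovec *iov, int max_iovs,
	   char **bufs, const size_t *lens, int nbufs,
	   size_t payload_size, int *iovcnt_out)
{
	size_t remaining = payload_size;
	int i = 0, iovcnt = 0;

	while (remaining > 0 && iovcnt < max_iovs && i < nbufs) {
		size_t len = lens[i] < remaining ? lens[i] : remaining;

		iov[iovcnt].iov_base = bufs[i];
		iov[iovcnt].iov_len = len;
		remaining -= len;
		iovcnt++;
		i++;
	}

	if (remaining > 0) {
		return -1;  /* ran out of iovec slots or source buffers */
	}

	*iovcnt_out = iovcnt;
	return 0;
}

int
main(void)
{
	static char a[4096], b[4096];
	char *bufs[] = { a, b };
	size_t lens[] = { sizeof(a), sizeof(b) };
	struct iovec iov[MAX_IOVS];
	int iovcnt = 0;

	if (build_iovs(iov, MAX_IOVS, bufs, lens, 2, 6000, &iovcnt) == 0) {
		printf("iovcnt=%d last_len=%zu\n", iovcnt, iov[iovcnt - 1].iov_len); /* 2, 1904 */
	}
	return 0;
}
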
*/ + if (remaining_size > 0) { + SPDK_ERRLOG("Failed to construct tcp_req=%p, and the iovcnt=%u, remaining_size=%u\n", + tcp_req, iovcnt, remaining_size); + return -1; + } + + tcp_req->iovcnt = iovcnt; + + return 0; +} + +static int +nvme_tcp_req_init(struct nvme_tcp_qpair *tqpair, struct nvme_request *req, + struct nvme_tcp_req *tcp_req) +{ + struct spdk_nvme_ctrlr *ctrlr = tqpair->qpair.ctrlr; + int rc = 0; + enum spdk_nvme_data_transfer xfer; + uint32_t max_incapsule_data_size; + + tcp_req->req = req; + req->cmd.cid = tcp_req->cid; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_TRANSPORT; + req->cmd.dptr.sgl1.unkeyed.length = req->payload_size; + + if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { + rc = nvme_tcp_build_contig_request(tqpair, tcp_req); + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) { + rc = nvme_tcp_build_sgl_request(tqpair, tcp_req); + } else { + rc = -1; + } + + if (rc) { + return rc; + } + + if (req->cmd.opc == SPDK_NVME_OPC_FABRIC) { + struct spdk_nvmf_capsule_cmd *nvmf_cmd = (struct spdk_nvmf_capsule_cmd *)&req->cmd; + + xfer = spdk_nvme_opc_get_data_transfer(nvmf_cmd->fctype); + } else { + xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc); + } + if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + max_incapsule_data_size = ctrlr->ioccsz_bytes; + if ((req->cmd.opc == SPDK_NVME_OPC_FABRIC) || nvme_qpair_is_admin_queue(&tqpair->qpair)) { + max_incapsule_data_size = spdk_min(max_incapsule_data_size, NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE); + } + + if (req->payload_size <= max_incapsule_data_size) { + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.address = 0; + tcp_req->in_capsule_data = true; + } + } + + return 0; +} + +static inline void +nvme_tcp_req_put_safe(struct nvme_tcp_req *tcp_req) +{ + if (tcp_req->ordering.send_ack && tcp_req->ordering.data_recv) { + assert(tcp_req->state == NVME_TCP_REQ_ACTIVE); + assert(tcp_req->tqpair != NULL); + nvme_tcp_req_put(tcp_req->tqpair, tcp_req); + } +} + +static void +nvme_tcp_qpair_cmd_send_complete(void *cb_arg) +{ + struct nvme_tcp_req *tcp_req = cb_arg; + + tcp_req->ordering.send_ack = 1; + /* Handle the r2t case */ + if (spdk_unlikely(tcp_req->ordering.r2t_recv)) { + nvme_tcp_send_h2c_data(tcp_req); + } else { + nvme_tcp_req_put_safe(tcp_req); + } +} + +static int +nvme_tcp_qpair_capsule_cmd_send(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_req *tcp_req) +{ + struct nvme_tcp_pdu *pdu; + struct spdk_nvme_tcp_cmd *capsule_cmd; + uint32_t plen = 0, alignment; + uint8_t pdo; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + pdu = tcp_req->send_pdu; + + capsule_cmd = &pdu->hdr.capsule_cmd; + capsule_cmd->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD; + plen = capsule_cmd->common.hlen = sizeof(*capsule_cmd); + capsule_cmd->ccsqe = tcp_req->req->cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "capsule_cmd cid=%u on tqpair(%p)\n", tcp_req->req->cmd.cid, tqpair); + + if (tqpair->host_hdgst_enable) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Header digest is enabled for capsule command on tcp_req=%p\n", + tcp_req); + capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + if ((tcp_req->req->payload_size == 0) || !tcp_req->in_capsule_data) { + goto end; + } + + pdo = plen; + pdu->padding_len = 0; + if 
(tqpair->cpda) { + alignment = (tqpair->cpda + 1) << 2; + if (alignment > plen) { + pdu->padding_len = alignment - plen; + pdo = alignment; + plen = alignment; + } + } + + capsule_cmd->common.pdo = pdo; + plen += tcp_req->req->payload_size; + if (tqpair->host_ddgst_enable) { + capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + tcp_req->datao = 0; + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, + 0, tcp_req->req->payload_size); +end: + capsule_cmd->common.plen = plen; + return nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_qpair_cmd_send_complete, tcp_req); + +} + +static int +nvme_tcp_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req) +{ + struct nvme_tcp_qpair *tqpair; + struct nvme_tcp_req *tcp_req; + + tqpair = nvme_tcp_qpair(qpair); + assert(tqpair != NULL); + assert(req != NULL); + + tcp_req = nvme_tcp_req_get(tqpair); + if (!tcp_req) { + /* Inform the upper layer to try again later. */ + return -EAGAIN; + } + + if (nvme_tcp_req_init(tqpair, req, tcp_req)) { + SPDK_ERRLOG("nvme_tcp_req_init() failed\n"); + TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); + nvme_tcp_req_put(tqpair, tcp_req); + return -1; + } + + return nvme_tcp_qpair_capsule_cmd_send(tqpair, tcp_req); +} + +static int +nvme_tcp_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static void +nvme_tcp_req_complete(struct nvme_tcp_req *tcp_req, + struct spdk_nvme_cpl *rsp) +{ + struct nvme_request *req; + + assert(tcp_req->req != NULL); + req = tcp_req->req; + + TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp); + nvme_free_request(req); +} + +static void +nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_tcp_req *tcp_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + cpl.status.dnr = dnr; + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + nvme_tcp_req_complete(tcp_req, &cpl); + nvme_tcp_req_put(tqpair, tcp_req); + } +} + +static void +nvme_tcp_qpair_set_recv_state(struct nvme_tcp_qpair *tqpair, + enum nvme_tcp_pdu_recv_state state) +{ + if (tqpair->recv_state == state) { + SPDK_ERRLOG("The recv state of tqpair=%p is same with the state(%d) to be set\n", + tqpair, state); + return; + } + + tqpair->recv_state = state; + switch (state) { + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + case NVME_TCP_PDU_RECV_STATE_ERROR: + memset(&tqpair->recv_pdu, 0, sizeof(struct nvme_tcp_pdu)); + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + default: + break; + } +} + +static void +nvme_tcp_qpair_send_h2c_term_req_complete(void *cb_arg) +{ + struct nvme_tcp_qpair *tqpair = cb_arg; + + tqpair->state = NVME_TCP_QPAIR_STATE_EXITING; +} + +static void +nvme_tcp_qpair_send_h2c_term_req(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, + enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_term_req_hdr *h2c_term_req; + uint32_t h2c_term_req_hdr_len = sizeof(*h2c_term_req); + uint8_t copy_len; + + rsp_pdu = &tqpair->send_pdu; + memset(rsp_pdu, 0, sizeof(*rsp_pdu)); + h2c_term_req = &rsp_pdu->hdr.term_req; + h2c_term_req->common.pdu_type = 
SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ; + h2c_term_req->common.hlen = h2c_term_req_hdr_len; + + if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + DSET32(&h2c_term_req->fei, error_offset); + } + + copy_len = pdu->hdr.common.hlen; + if (copy_len > SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) { + copy_len = SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE; + } + + /* Copy the error info into the buffer */ + memcpy((uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, pdu->hdr.raw, copy_len); + nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, copy_len); + + /* Contain the header len of the wrong received pdu */ + h2c_term_req->common.plen = h2c_term_req->common.hlen + copy_len; + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_send_h2c_term_req_complete, NULL); + +} + +static void +nvme_tcp_pdu_ch_handle(struct nvme_tcp_qpair *tqpair) +{ + struct nvme_tcp_pdu *pdu; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + uint32_t expected_hlen, hd_len = 0; + bool plen_error = false; + + pdu = &tqpair->recv_pdu; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "pdu type = %d\n", pdu->hdr.common.pdu_type); + if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP) { + if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) { + SPDK_ERRLOG("Already received IC_RESP PDU, and we should reject this pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + expected_hlen = sizeof(struct spdk_nvme_tcp_ic_resp); + if (pdu->hdr.common.plen != expected_hlen) { + plen_error = true; + } + } else { + if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) { + SPDK_ERRLOG("The TCP/IP tqpair connection is not negotitated\n"); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: + expected_hlen = sizeof(struct spdk_nvme_tcp_rsp); + if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { + hd_len = SPDK_NVME_TCP_DIGEST_LEN; + } + + if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: + expected_hlen = sizeof(struct spdk_nvme_tcp_c2h_data_hdr); + if (pdu->hdr.common.plen < pdu->hdr.common.pdo) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr); + if ((pdu->hdr.common.plen <= expected_hlen) || + (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_R2T: + expected_hlen = sizeof(struct spdk_nvme_tcp_r2t_hdr); + if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { + hd_len = SPDK_NVME_TCP_DIGEST_LEN; + } + + if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { + plen_error = true; + } + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type); + goto err; + } + } + + if (pdu->hdr.common.hlen != expected_hlen) { + SPDK_ERRLOG("Expected PDU header length %u, got %u\n", + expected_hlen, pdu->hdr.common.hlen); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen); + goto err; + + } else if (plen_error) { + 
fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen); + goto err; + } else { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + nvme_tcp_pdu_calc_psh_len(&tqpair->recv_pdu, tqpair->host_hdgst_enable); + return; + } +err: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); +} + +static struct nvme_tcp_req * +get_nvme_active_req_by_cid(struct nvme_tcp_qpair *tqpair, uint32_t cid) +{ + assert(tqpair != NULL); + if ((cid >= tqpair->num_entries) || (tqpair->tcp_reqs[cid].state == NVME_TCP_REQ_FREE)) { + return NULL; + } + + return &tqpair->tcp_reqs[cid]; +} + +static void +nvme_tcp_c2h_data_payload_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu, uint32_t *reaped) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_c2h_data_hdr *c2h_data; + struct spdk_nvme_cpl cpl = {}; + uint8_t flags; + + tcp_req = pdu->req; + assert(tcp_req != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + c2h_data = &pdu->hdr.c2h_data; + tcp_req->datao += pdu->data_len; + flags = c2h_data->common.flags; + + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + if (flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS) { + if (tcp_req->datao == tcp_req->req->payload_size) { + cpl.status.p = 0; + } else { + cpl.status.p = 1; + } + + cpl.cid = tcp_req->cid; + cpl.sqid = tqpair->qpair.id; + nvme_tcp_req_complete(tcp_req, &cpl); + if (tcp_req->ordering.send_ack) { + (*reaped)++; + } + + tcp_req->ordering.data_recv = 1; + nvme_tcp_req_put_safe(tcp_req); + } +} + +static const char *spdk_nvme_tcp_term_req_fes_str[] = { + "Invalid PDU Header Field", + "PDU Sequence Error", + "Header Digest Error", + "Data Transfer Out of Range", + "Data Transfer Limit Exceeded", + "Unsupported parameter", +}; + +static void +nvme_tcp_c2h_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *c2h_term_req) +{ + SPDK_ERRLOG("Error info of pdu(%p): %s\n", c2h_term_req, + spdk_nvme_tcp_term_req_fes_str[c2h_term_req->fes]); + if ((c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "The offset from the start of the PDU header is %u\n", + DGET32(c2h_term_req->fei)); + } + /* we may also need to dump some other info here */ +} + +static void +nvme_tcp_c2h_term_req_payload_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + nvme_tcp_c2h_term_req_dump(&pdu->hdr.term_req); + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); +} + +static void +nvme_tcp_pdu_payload_handle(struct nvme_tcp_qpair *tqpair, + uint32_t *reaped) +{ + int rc = 0; + struct nvme_tcp_pdu *pdu; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + pdu = &tqpair->recv_pdu; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + + /* check data digest if need */ + if (pdu->ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: + nvme_tcp_c2h_data_payload_handle(tqpair, pdu, reaped); + break; + + case 
SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + nvme_tcp_c2h_term_req_payload_handle(tqpair, pdu); + break; + + default: + /* The code should not go to here */ + SPDK_ERRLOG("The code should not go to here\n"); + break; + } +} + +static void +nvme_tcp_send_icreq_complete(void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Complete the icreq send for tqpair=%p\n", + (struct nvme_tcp_qpair *)cb_arg); +} + +static void +nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_ic_resp *ic_resp = &pdu->hdr.ic_resp; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + int recv_buf_size; + + /* Only PFV 0 is defined currently */ + if (ic_resp->pfv != 0) { + SPDK_ERRLOG("Expected ICResp PFV %u, got %u\n", 0u, ic_resp->pfv); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, pfv); + goto end; + } + + if (ic_resp->maxh2cdata < NVME_TCP_PDU_H2C_MIN_DATA_SIZE) { + SPDK_ERRLOG("Expected ICResp maxh2cdata >=%u, got %u\n", NVME_TCP_PDU_H2C_MIN_DATA_SIZE, + ic_resp->maxh2cdata); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, maxh2cdata); + goto end; + } + tqpair->maxh2cdata = ic_resp->maxh2cdata; + + if (ic_resp->cpda > SPDK_NVME_TCP_CPDA_MAX) { + SPDK_ERRLOG("Expected ICResp cpda <=%u, got %u\n", SPDK_NVME_TCP_CPDA_MAX, ic_resp->cpda); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, cpda); + goto end; + } + tqpair->cpda = ic_resp->cpda; + + tqpair->host_hdgst_enable = ic_resp->dgst.bits.hdgst_enable ? true : false; + tqpair->host_ddgst_enable = ic_resp->dgst.bits.ddgst_enable ? true : false; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_hdgst_enable: %u\n", tqpair->host_hdgst_enable); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_ddgst_enable: %u\n", tqpair->host_ddgst_enable); + + /* Now that we know whether digests are enabled, properly size the receive buffer to + * handle several incoming 4K read commands according to SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR + * parameter. */ + recv_buf_size = 0x1000 + sizeof(struct spdk_nvme_tcp_c2h_data_hdr); + + if (tqpair->host_hdgst_enable) { + recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; + } + + if (tqpair->host_ddgst_enable) { + recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; + } + + if (spdk_sock_set_recvbuf(tqpair->sock, recv_buf_size * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR) < 0) { + SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n", + tqpair, + recv_buf_size); + /* Not fatal. 
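+		 * The socket keeps whatever receive buffer size it already had; throughput may
+		 * drop slightly, but the connection remains functional.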
*/ + } + + tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING; + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + return; +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_capsule_resp_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, + uint32_t *reaped) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_rsp *capsule_resp = &pdu->hdr.capsule_resp; + uint32_t cid, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + struct spdk_nvme_cpl cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + cpl = capsule_resp->rccqe; + cid = cpl.cid; + + /* Recv the pdu again */ + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + + tcp_req = get_nvme_active_req_by_cid(tqpair, cid); + if (!tcp_req) { + SPDK_ERRLOG("no tcp_req is found with cid=%u for tqpair=%p\n", cid, tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_rsp, rccqe); + goto end; + + } + + nvme_tcp_req_complete(tcp_req, &cpl); + if (tcp_req->ordering.send_ack) { + (*reaped)++; + } + + tcp_req->ordering.data_recv = 1; + nvme_tcp_req_put_safe(tcp_req); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "complete tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair); + + return; + +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_c2h_term_req_hdr_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_term_req_hdr *c2h_term_req = &pdu->hdr.term_req; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + if (c2h_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) { + SPDK_ERRLOG("Fatal Error Stauts(FES) is unknown for c2h_term_req pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes); + goto end; + } + + /* set the data buffer */ + nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + c2h_term_req->common.hlen, + c2h_term_req->common.plen - c2h_term_req->common.hlen); + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_c2h_data_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_c2h_data_hdr *c2h_data = &pdu->hdr.c2h_data; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "c2h_data info on tqpair(%p): datao=%u, datal=%u, cccid=%d\n", + tqpair, c2h_data->datao, c2h_data->datal, c2h_data->cccid); + tcp_req = get_nvme_active_req_by_cid(tqpair, c2h_data->cccid); + if (!tcp_req) { + SPDK_ERRLOG("no tcp_req found for c2hdata cid=%d\n", c2h_data->cccid); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, cccid); + goto end; + + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "tcp_req(%p) on tqpair(%p): datao=%u, payload_size=%u\n", + tcp_req, tqpair, tcp_req->datao, tcp_req->req->payload_size); + + if (c2h_data->datal > tcp_req->req->payload_size) { + SPDK_ERRLOG("Invalid datal for tcp_req(%p), datal(%u) exceeds payload_size(%u)\n", + tcp_req, c2h_data->datal, tcp_req->req->payload_size); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + goto end; + } + + if 
(tcp_req->datao != c2h_data->datao) { + SPDK_ERRLOG("Invalid datao for tcp_req(%p), received datal(%u) != datao(%u) in tcp_req\n", + tcp_req, c2h_data->datao, tcp_req->datao); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datao); + goto end; + } + + if ((c2h_data->datao + c2h_data->datal) > tcp_req->req->payload_size) { + SPDK_ERRLOG("Invalid data range for tcp_req(%p), received (datao(%u) + datal(%u)) > datao(%u) in tcp_req\n", + tcp_req, c2h_data->datao, c2h_data->datal, tcp_req->req->payload_size); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datal); + goto end; + + } + + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, + c2h_data->datao, c2h_data->datal); + pdu->req = tcp_req; + + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; + +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_qpair_h2c_data_send_complete(void *cb_arg) +{ + struct nvme_tcp_req *tcp_req = cb_arg; + + assert(tcp_req != NULL); + + tcp_req->ordering.send_ack = 1; + if (tcp_req->r2tl_remain) { + nvme_tcp_send_h2c_data(tcp_req); + } else { + assert(tcp_req->active_r2ts > 0); + tcp_req->active_r2ts--; + tcp_req->state = NVME_TCP_REQ_ACTIVE; + /* Need also call this function to free the resource */ + nvme_tcp_req_put_safe(tcp_req); + } +} + +static void +nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(tcp_req->req->qpair); + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_h2c_data_hdr *h2c_data; + uint32_t plen, pdo, alignment; + + /* Reinit the send_ack and r2t_recv bits */ + tcp_req->ordering.send_ack = 0; + tcp_req->ordering.r2t_recv = 0; + rsp_pdu = tcp_req->send_pdu; + memset(rsp_pdu, 0, sizeof(*rsp_pdu)); + h2c_data = &rsp_pdu->hdr.h2c_data; + + h2c_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_DATA; + plen = h2c_data->common.hlen = sizeof(*h2c_data); + h2c_data->cccid = tcp_req->cid; + h2c_data->ttag = tcp_req->ttag; + h2c_data->datao = tcp_req->datao; + + h2c_data->datal = spdk_min(tcp_req->r2tl_remain, tqpair->maxh2cdata); + nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->iov, tcp_req->iovcnt, + h2c_data->datao, h2c_data->datal); + tcp_req->r2tl_remain -= h2c_data->datal; + + if (tqpair->host_hdgst_enable) { + h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + rsp_pdu->padding_len = 0; + pdo = plen; + if (tqpair->cpda) { + alignment = (tqpair->cpda + 1) << 2; + if (alignment > plen) { + rsp_pdu->padding_len = alignment - plen; + pdo = plen = alignment; + } + } + + h2c_data->common.pdo = pdo; + plen += h2c_data->datal; + if (tqpair->host_ddgst_enable) { + h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + h2c_data->common.plen = plen; + tcp_req->datao += h2c_data->datal; + if (!tcp_req->r2tl_remain) { + h2c_data->common.flags |= SPDK_NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "h2c_data info: datao=%u, datal=%u, pdu_len=%u for tqpair=%p\n", + h2c_data->datao, h2c_data->datal, h2c_data->common.plen, tqpair); + + nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_h2c_data_send_complete, tcp_req); +} + +static void +nvme_tcp_r2t_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_r2t_hdr 
*r2t = &pdu->hdr.r2t; + uint32_t cid, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + cid = r2t->cccid; + tcp_req = get_nvme_active_req_by_cid(tqpair, cid); + if (!tcp_req) { + SPDK_ERRLOG("Cannot find tcp_req for tqpair=%p\n", tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, cccid); + goto end; + } + + tcp_req->ordering.r2t_recv = 1; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "r2t info: r2to=%u, r2tl=%u for tqpair=%p\n", r2t->r2to, r2t->r2tl, + tqpair); + + if (tcp_req->state == NVME_TCP_REQ_ACTIVE) { + assert(tcp_req->active_r2ts == 0); + tcp_req->state = NVME_TCP_REQ_ACTIVE_R2T; + } + + tcp_req->active_r2ts++; + if (tcp_req->active_r2ts > tqpair->maxr2t) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_R2T_LIMIT_EXCEEDED; + SPDK_ERRLOG("Invalid R2T: it exceeds the R2T maixmal=%u for tqpair=%p\n", tqpair->maxr2t, tqpair); + goto end; + } + + if (tcp_req->datao != r2t->r2to) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2to); + goto end; + + } + + if ((r2t->r2tl + r2t->r2to) > tcp_req->req->payload_size) { + SPDK_ERRLOG("Invalid R2T info for tcp_req=%p: (r2to(%u) + r2tl(%u)) exceeds payload_size(%u)\n", + tcp_req, r2t->r2to, r2t->r2tl, tqpair->maxh2cdata); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2tl); + goto end; + + } + + tcp_req->ttag = r2t->ttag; + tcp_req->r2tl_remain = r2t->r2tl; + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + + if (spdk_likely(tcp_req->ordering.send_ack)) { + nvme_tcp_send_h2c_data(tcp_req); + } + return; + +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; + +} + +static void +nvme_tcp_pdu_psh_handle(struct nvme_tcp_qpair *tqpair, uint32_t *reaped) +{ + struct nvme_tcp_pdu *pdu; + int rc; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + pdu = &tqpair->recv_pdu; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter: pdu type =%u\n", pdu->hdr.common.pdu_type); + /* check header digest if needed */ + if (pdu->has_hdgst) { + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c); + if (rc == 0) { + SPDK_ERRLOG("header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; + + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_IC_RESP: + nvme_tcp_icresp_handle(tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: + nvme_tcp_capsule_resp_hdr_handle(tqpair, pdu, reaped); + break; + case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: + nvme_tcp_c2h_data_hdr_handle(tqpair, pdu); + break; + + case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + nvme_tcp_c2h_term_req_hdr_handle(tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_R2T: + nvme_tcp_r2t_hdr_handle(tqpair, pdu); + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = 1; + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + break; + } + +} + +static int +nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped) +{ + int rc = 0; + struct nvme_tcp_pdu 
*pdu; + uint32_t data_len; + enum nvme_tcp_pdu_recv_state prev_state; + + /* The loop here is to allow for several back-to-back state changes. */ + do { + prev_state = tqpair->recv_state; + switch (tqpair->recv_state) { + /* If in a new state */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH); + break; + /* common header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + pdu = &tqpair->recv_pdu; + if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) { + rc = nvme_tcp_read_data(tqpair->sock, + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes, + (uint8_t *)&pdu->hdr.common + pdu->ch_valid_bytes); + if (rc < 0) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + break; + } + pdu->ch_valid_bytes += rc; + if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) { + return NVME_TCP_PDU_IN_PROGRESS; + } + } + + /* The command header of this PDU has now been read from the socket. */ + nvme_tcp_pdu_ch_handle(tqpair); + break; + /* Wait for the pdu specific header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + pdu = &tqpair->recv_pdu; + rc = nvme_tcp_read_data(tqpair->sock, + pdu->psh_len - pdu->psh_valid_bytes, + (uint8_t *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes); + if (rc < 0) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + break; + } + + pdu->psh_valid_bytes += rc; + if (pdu->psh_valid_bytes < pdu->psh_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + /* All header(ch, psh, head digist) of this PDU has now been read from the socket. */ + nvme_tcp_pdu_psh_handle(tqpair, reaped); + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + pdu = &tqpair->recv_pdu; + /* check whether the data is valid, if not we just return */ + if (!pdu->data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + data_len = pdu->data_len; + /* data digest */ + if (spdk_unlikely((pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_DATA) && + tqpair->host_ddgst_enable)) { + data_len += SPDK_NVME_TCP_DIGEST_LEN; + pdu->ddgst_enable = true; + } + + rc = nvme_tcp_read_payload_data(tqpair->sock, pdu); + if (rc < 0) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + break; + } + + pdu->readv_offset += rc; + if (pdu->readv_offset < data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + assert(pdu->readv_offset == data_len); + /* All of this PDU has now been read from the socket. */ + nvme_tcp_pdu_payload_handle(tqpair, reaped); + break; + case NVME_TCP_PDU_RECV_STATE_ERROR: + rc = NVME_TCP_PDU_FATAL; + break; + default: + assert(0); + break; + } + } while (prev_state != tqpair->recv_state); + + return rc; +} + +static void +nvme_tcp_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct nvme_tcp_req *tcp_req, *tmp; + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. 
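+	 * Without a registered callback there is nobody to notify, so walking the
+	 * outstanding request list would be wasted work. Applications opt in to this
+	 * path by registering a handler with spdk_nvme_ctrlr_register_timeout_callback().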
*/ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + + if (nvme_request_check_timeout(tcp_req->req, tcp_req->cid, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. + */ + break; + } + } +} + +static int +nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + uint32_t reaped; + int rc; + + rc = spdk_sock_flush(tqpair->sock); + if (rc < 0) { + return rc; + } + + if (max_completions == 0) { + max_completions = tqpair->num_entries; + } else { + max_completions = spdk_min(max_completions, tqpair->num_entries); + } + + reaped = 0; + do { + rc = nvme_tcp_read_pdu(tqpair, &reaped); + if (rc < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + goto fail; + } else if (rc == 0) { + /* Partial PDU is read */ + break; + } + + } while (reaped < max_completions); + + if (spdk_unlikely(tqpair->qpair.ctrlr->timeout_enabled)) { + nvme_tcp_qpair_check_timeout(qpair); + } + + return reaped; +fail: + + /* + * Since admin queues take the ctrlr_lock before entering this function, + * we can call nvme_transport_ctrlr_disconnect_qpair. For other qpairs we need + * to call the generic function which will take the lock for us. + */ + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_transport_ctrlr_disconnect_qpair(qpair->ctrlr, qpair); + } else { + nvme_ctrlr_disconnect_qpair(qpair); + } + return -ENXIO; +} + +static void +nvme_tcp_qpair_sock_cb(void *ctx, struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_nvme_qpair *qpair = ctx; + struct nvme_tcp_poll_group *pgroup = nvme_tcp_poll_group(qpair->poll_group); + int32_t num_completions; + + num_completions = spdk_nvme_qpair_process_completions(qpair, pgroup->completions_per_qpair); + + if (pgroup->num_completions >= 0 && num_completions >= 0) { + pgroup->num_completions += num_completions; + } else { + pgroup->num_completions = -ENXIO; + } +} + +static int +nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair) +{ + struct spdk_nvme_tcp_ic_req *ic_req; + struct nvme_tcp_pdu *pdu; + uint64_t icreq_timeout_tsc; + int rc; + + pdu = &tqpair->send_pdu; + memset(&tqpair->send_pdu, 0, sizeof(tqpair->send_pdu)); + ic_req = &pdu->hdr.ic_req; + + ic_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_REQ; + ic_req->common.hlen = ic_req->common.plen = sizeof(*ic_req); + ic_req->pfv = 0; + ic_req->maxr2t = NVME_TCP_MAX_R2T_DEFAULT - 1; + ic_req->hpda = NVME_TCP_HPDA_DEFAULT; + + ic_req->dgst.bits.hdgst_enable = tqpair->qpair.ctrlr->opts.header_digest; + ic_req->dgst.bits.ddgst_enable = tqpair->qpair.ctrlr->opts.data_digest; + + nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_send_icreq_complete, tqpair); + + icreq_timeout_tsc = spdk_get_ticks() + (NVME_TCP_TIME_OUT_IN_SECONDS * spdk_get_ticks_hz()); + do { + rc = nvme_tcp_qpair_process_completions(&tqpair->qpair, 0); + } while ((tqpair->state == NVME_TCP_QPAIR_STATE_INVALID) && + (rc == 0) && (spdk_get_ticks() <= icreq_timeout_tsc)); + + if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) { + SPDK_ERRLOG("Failed to construct the tqpair=%p via correct icresp\n", tqpair); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Succesfully construct the tqpair=%p via correct 
icresp\n", tqpair); + + return 0; +} + +static int +nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct sockaddr_storage dst_addr; + struct sockaddr_storage src_addr; + int rc; + struct nvme_tcp_qpair *tqpair; + int family; + long int port; + struct spdk_sock_opts opts; + + tqpair = nvme_tcp_qpair(qpair); + + switch (ctrlr->trid.adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); + + memset(&dst_addr, 0, sizeof(dst_addr)); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid); + rc = nvme_tcp_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid); + if (rc != 0) { + SPDK_ERRLOG("dst_addr nvme_tcp_parse_addr() failed\n"); + return -1; + } + + if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { + memset(&src_addr, 0, sizeof(src_addr)); + rc = nvme_tcp_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid); + if (rc != 0) { + SPDK_ERRLOG("src_addr nvme_tcp_parse_addr() failed\n"); + return -1; + } + } + + port = spdk_strtol(ctrlr->trid.trsvcid, 10); + if (port <= 0 || port >= INT_MAX) { + SPDK_ERRLOG("Invalid port: %s\n", ctrlr->trid.trsvcid); + return -1; + } + + opts.opts_size = sizeof(opts); + spdk_sock_get_default_opts(&opts); + opts.priority = ctrlr->trid.priority; + tqpair->sock = spdk_sock_connect_ext(ctrlr->trid.traddr, port, NULL, &opts); + if (!tqpair->sock) { + SPDK_ERRLOG("sock connection error of tqpair=%p with addr=%s, port=%ld\n", + tqpair, ctrlr->trid.traddr, port); + return -1; + } + + tqpair->maxr2t = NVME_TCP_MAX_R2T_DEFAULT; + /* Explicitly set the state and recv_state of tqpair */ + tqpair->state = NVME_TCP_QPAIR_STATE_INVALID; + if (tqpair->recv_state != NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + } + rc = nvme_tcp_qpair_icreq_send(tqpair); + if (rc != 0) { + SPDK_ERRLOG("Unable to connect the tqpair\n"); + return -1; + } + + rc = nvme_fabric_qpair_connect(&tqpair->qpair, tqpair->num_entries); + if (rc < 0) { + SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); + return -1; + } + + return 0; +} + +static struct spdk_nvme_qpair * +nvme_tcp_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, uint32_t qsize, + enum spdk_nvme_qprio qprio, + uint32_t num_requests) +{ + struct nvme_tcp_qpair *tqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + tqpair = calloc(1, sizeof(struct nvme_tcp_qpair)); + if (!tqpair) { + SPDK_ERRLOG("failed to get create tqpair\n"); + return NULL; + } + + tqpair->num_entries = qsize; + qpair = &tqpair->qpair; + rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests); + if (rc != 0) { + free(tqpair); + return NULL; + } + + rc = nvme_tcp_alloc_reqs(tqpair); + if (rc) { + nvme_tcp_ctrlr_delete_io_qpair(ctrlr, qpair); + return NULL; + } + + return qpair; +} + +static struct spdk_nvme_qpair * +nvme_tcp_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + return nvme_tcp_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio, + opts->io_queue_requests); +} + +static struct spdk_nvme_ctrlr *nvme_tcp_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void 
*devhandle) +{ + struct nvme_tcp_ctrlr *tctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + int rc; + + tctrlr = calloc(1, sizeof(*tctrlr)); + if (tctrlr == NULL) { + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + tctrlr->ctrlr.opts = *opts; + tctrlr->ctrlr.trid = *trid; + + rc = nvme_ctrlr_construct(&tctrlr->ctrlr); + if (rc != 0) { + free(tctrlr); + return NULL; + } + + tctrlr->ctrlr.adminq = nvme_tcp_ctrlr_create_qpair(&tctrlr->ctrlr, 0, + tctrlr->ctrlr.opts.admin_queue_size, 0, + tctrlr->ctrlr.opts.admin_queue_size); + if (!tctrlr->ctrlr.adminq) { + SPDK_ERRLOG("failed to create admin qpair\n"); + nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + rc = nvme_transport_ctrlr_connect_qpair(&tctrlr->ctrlr, tctrlr->ctrlr.adminq); + if (rc < 0) { + SPDK_ERRLOG("failed to connect admin qpair\n"); + nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_get_cap(&tctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + nvme_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_get_vs(&tctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + nvme_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_add_process(&tctrlr->ctrlr, 0) != 0) { + SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n"); + nvme_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + nvme_ctrlr_init_cap(&tctrlr->ctrlr, &cap, &vs); + + return &tctrlr->ctrlr; +} + +static uint32_t +nvme_tcp_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* TCP transport doens't limit maximum IO transfer size. */ + return UINT32_MAX; +} + +static uint16_t +nvme_tcp_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + /* + * We do not support >1 SGE in the initiator currently, + * so we can only return 1 here. Once that support is + * added, this should return ctrlr->cdata.nvmf_specific.msdbd + * instead. 
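+	 * MSDBD is the NVMe-oF-specific identify field that advertises how many SGL
+	 * data block descriptors the target accepts per command.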
+ */ + return 1; +} + +static int +nvme_tcp_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_req *tcp_req, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + + rc = iter_fn(tcp_req->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static void +nvme_tcp_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_req *tcp_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + if (tcp_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + continue; + } + + nvme_tcp_req_complete(tcp_req, &cpl); + nvme_tcp_req_put(tqpair, tcp_req); + } +} + +static struct spdk_nvme_transport_poll_group * +nvme_tcp_poll_group_create(void) +{ + struct nvme_tcp_poll_group *group = calloc(1, sizeof(*group)); + + if (group == NULL) { + SPDK_ERRLOG("Unable to allocate poll group.\n"); + return NULL; + } + + group->sock_group = spdk_sock_group_create(group); + if (group->sock_group == NULL) { + free(group); + SPDK_ERRLOG("Unable to allocate sock group.\n"); + return NULL; + } + + return &group->group; +} + +static int +nvme_tcp_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { + return -EPROTO; + } + return 0; +} + +static int +nvme_tcp_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + if (tqpair->sock && group->sock_group) { + if (spdk_sock_group_remove_sock(group->sock_group, tqpair->sock)) { + return -EPROTO; + } + } + return 0; +} + +static int +nvme_tcp_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); + + /* disconnected qpairs won't have a sock to add. 
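+	 * Such qpairs are still tracked by the poll group; their sock is added to the
+	 * sock group later, when nvme_tcp_poll_group_connect_qpair() runs during connect.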
*/ + if (nvme_qpair_get_state(qpair) >= NVME_QPAIR_CONNECTED) { + if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { + return -EPROTO; + } + } + + return 0; +} + +static int +nvme_tcp_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + return nvme_poll_group_disconnect_qpair(qpair); + } + + return 0; +} + +static int64_t +nvme_tcp_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + group->completions_per_qpair = completions_per_qpair; + group->num_completions = 0; + + spdk_sock_group_poll(group->sock_group); + + STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + } + + return group->num_completions; +} + +static int +nvme_tcp_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + int rc; + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); + + if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { + return -EBUSY; + } + + rc = spdk_sock_group_close(&group->sock_group); + if (rc != 0) { + SPDK_ERRLOG("Failed to close the sock group for a tcp poll group.\n"); + assert(false); + } + + free(tgroup); + + return 0; +} + +const struct spdk_nvme_transport_ops tcp_ops = { + .name = "TCP", + .type = SPDK_NVME_TRANSPORT_TCP, + .ctrlr_construct = nvme_tcp_ctrlr_construct, + .ctrlr_scan = nvme_fabric_ctrlr_scan, + .ctrlr_destruct = nvme_tcp_ctrlr_destruct, + .ctrlr_enable = nvme_tcp_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_tcp_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_tcp_ctrlr_get_max_sges, + + .ctrlr_create_io_qpair = nvme_tcp_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_tcp_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_tcp_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_tcp_ctrlr_disconnect_qpair, + + .qpair_abort_reqs = nvme_tcp_qpair_abort_reqs, + .qpair_reset = nvme_tcp_qpair_reset, + .qpair_submit_request = nvme_tcp_qpair_submit_request, + .qpair_process_completions = nvme_tcp_qpair_process_completions, + .qpair_iterate_requests = nvme_tcp_qpair_iterate_requests, + .admin_qpair_abort_aers = nvme_tcp_admin_qpair_abort_aers, + + .poll_group_create = nvme_tcp_poll_group_create, + .poll_group_connect_qpair = nvme_tcp_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_tcp_poll_group_disconnect_qpair, + .poll_group_add = nvme_tcp_poll_group_add, + .poll_group_remove = nvme_tcp_poll_group_remove, + .poll_group_process_completions = nvme_tcp_poll_group_process_completions, + .poll_group_destroy = nvme_tcp_poll_group_destroy, +}; + +SPDK_NVME_TRANSPORT_REGISTER(tcp, &tcp_ops); diff --git a/src/spdk/lib/nvme/nvme_transport.c b/src/spdk/lib/nvme/nvme_transport.c new file mode 100644 index 000000000..76efd5966 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_transport.c @@ -0,0 +1,591 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe transport abstraction + */ + +#include "nvme_internal.h" +#include "spdk/queue.h" + +#define SPDK_MAX_NUM_OF_TRANSPORTS 16 + +struct spdk_nvme_transport { + struct spdk_nvme_transport_ops ops; + TAILQ_ENTRY(spdk_nvme_transport) link; +}; + +TAILQ_HEAD(nvme_transport_list, spdk_nvme_transport) g_spdk_nvme_transports = + TAILQ_HEAD_INITIALIZER(g_spdk_nvme_transports); + +struct spdk_nvme_transport g_spdk_transports[SPDK_MAX_NUM_OF_TRANSPORTS] = {}; +int g_current_transport_index = 0; + +const struct spdk_nvme_transport * +nvme_get_first_transport(void) +{ + return TAILQ_FIRST(&g_spdk_nvme_transports); +} + +const struct spdk_nvme_transport * +nvme_get_next_transport(const struct spdk_nvme_transport *transport) +{ + return TAILQ_NEXT(transport, link); +} + +/* + * Unfortunately, due to NVMe PCIe multiprocess support, we cannot store the + * transport object in either the controller struct or the admin qpair. THis means + * that a lot of admin related transport calls will have to call nvme_get_transport + * in order to knwo which functions to call. + * In the I/O path, we have the ability to store the transport struct in the I/O + * qpairs to avoid taking a performance hit. + */ +const struct spdk_nvme_transport * +nvme_get_transport(const char *transport_name) +{ + struct spdk_nvme_transport *registered_transport; + + TAILQ_FOREACH(registered_transport, &g_spdk_nvme_transports, link) { + if (strcasecmp(transport_name, registered_transport->ops.name) == 0) { + return registered_transport; + } + } + + return NULL; +} + +bool +spdk_nvme_transport_available(enum spdk_nvme_transport_type trtype) +{ + return nvme_get_transport(spdk_nvme_transport_id_trtype_str(trtype)) == NULL ? false : true; +} + +bool +spdk_nvme_transport_available_by_name(const char *transport_name) +{ + return nvme_get_transport(transport_name) == NULL ? 
false : true; +} + +void spdk_nvme_transport_register(const struct spdk_nvme_transport_ops *ops) +{ + struct spdk_nvme_transport *new_transport; + + if (nvme_get_transport(ops->name)) { + SPDK_ERRLOG("Double registering NVMe transport %s is prohibited.\n", ops->name); + assert(false); + } + + if (g_current_transport_index == SPDK_MAX_NUM_OF_TRANSPORTS) { + SPDK_ERRLOG("Unable to register new NVMe transport.\n"); + assert(false); + return; + } + new_transport = &g_spdk_transports[g_current_transport_index++]; + + new_transport->ops = *ops; + TAILQ_INSERT_TAIL(&g_spdk_nvme_transports, new_transport, link); +} + +struct spdk_nvme_ctrlr *nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(trid->trstring); + struct spdk_nvme_ctrlr *ctrlr; + + if (transport == NULL) { + SPDK_ERRLOG("Transport %s doesn't exist.", trid->trstring); + return NULL; + } + + ctrlr = transport->ops.ctrlr_construct(trid, opts, devhandle); + + return ctrlr; +} + +int +nvme_transport_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(probe_ctx->trid.trstring); + + if (transport == NULL) { + SPDK_ERRLOG("Transport %s doesn't exist.", probe_ctx->trid.trstring); + return -ENOENT; + } + + return transport->ops.ctrlr_scan(probe_ctx, direct_connect); +} + +int +nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_destruct(ctrlr); +} + +int +nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_enable(ctrlr); +} + +int +nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_set_reg_4(ctrlr, offset, value); +} + +int +nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_set_reg_8(ctrlr, offset, value); +} + +int +nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_reg_4(ctrlr, offset, value); +} + +int +nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_reg_8(ctrlr, offset, value); +} + +uint32_t +nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_max_xfer_size(ctrlr); +} + +uint16_t +nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = 
nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_max_sges(ctrlr); +} + +int +nvme_transport_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + if (transport->ops.ctrlr_reserve_cmb != NULL) { + return transport->ops.ctrlr_reserve_cmb(ctrlr); + } + + return -ENOTSUP; +} + +void * +nvme_transport_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + if (transport->ops.ctrlr_map_cmb != NULL) { + return transport->ops.ctrlr_map_cmb(ctrlr, size); + } + + return NULL; +} + +int +nvme_transport_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + if (transport->ops.ctrlr_unmap_cmb != NULL) { + return transport->ops.ctrlr_unmap_cmb(ctrlr); + } + + return 0; +} + +struct spdk_nvme_qpair * +nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct spdk_nvme_qpair *qpair; + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + qpair = transport->ops.ctrlr_create_io_qpair(ctrlr, qid, opts); + if (qpair != NULL && !nvme_qpair_is_admin_queue(qpair)) { + qpair->transport = transport; + } + + return qpair; +} + +int +nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + + /* Do not rely on qpair->transport. For multi-process cases, a foreign process may delete + * the IO qpair, in which case the transport object would be invalid (each process has their + * own unique transport objects since they contain function pointers). So we look up the + * transport object in the delete_io_qpair case. + */ + return transport->ops.ctrlr_delete_io_qpair(ctrlr, qpair); +} + +int +nvme_transport_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + uint8_t transport_failure_reason; + int rc; + + assert(transport != NULL); + if (!nvme_qpair_is_admin_queue(qpair)) { + qpair->transport = transport; + } + + transport_failure_reason = qpair->transport_failure_reason; + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_NONE; + + nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTING); + rc = transport->ops.ctrlr_connect_qpair(ctrlr, qpair); + if (rc != 0) { + goto err; + } + + nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED); + if (qpair->poll_group) { + rc = nvme_poll_group_connect_qpair(qpair); + if (rc) { + goto err; + } + } + + return rc; + +err: + /* If the qpair was unable to reconnect, restore the original failure reason. 
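+	 * That way a caller inspecting the qpair afterwards still sees the original
+	 * cause of the failure rather than a generic reconnect error.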
*/ + qpair->transport_failure_reason = transport_failure_reason; + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED); + return rc; +} + +void +nvme_transport_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING || + nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTED) { + return; + } + + nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTING); + assert(transport != NULL); + if (qpair->poll_group) { + nvme_poll_group_disconnect_qpair(qpair); + } + + transport->ops.ctrlr_disconnect_qpair(ctrlr, qpair); + + nvme_qpair_abort_reqs(qpair, 0); + nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED); +} + +void +nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + const struct spdk_nvme_transport *transport; + + assert(dnr <= 1); + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + qpair->transport->ops.qpair_abort_reqs(qpair, dnr); + } else { + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + transport->ops.qpair_abort_reqs(qpair, dnr); + } +} + +int +nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_reset(qpair); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_reset(qpair); +} + +int +nvme_transport_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_submit_request(qpair, req); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_submit_request(qpair, req); +} + +int32_t +nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_process_completions(qpair, max_completions); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_process_completions(qpair, max_completions); +} + +int +nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_iterate_requests(qpair, iter_fn, arg); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_iterate_requests(qpair, iter_fn, arg); +} + +void +nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + + assert(transport != NULL); + transport->ops.admin_qpair_abort_aers(qpair); +} + +struct spdk_nvme_transport_poll_group * +nvme_transport_poll_group_create(const struct spdk_nvme_transport *transport) +{ + struct spdk_nvme_transport_poll_group *group = NULL; + + group = transport->ops.poll_group_create(); + if (group) 
{ + group->transport = transport; + STAILQ_INIT(&group->connected_qpairs); + STAILQ_INIT(&group->disconnected_qpairs); + } + + return group; +} + +int +nvme_transport_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + int rc; + + rc = tgroup->transport->ops.poll_group_add(tgroup, qpair); + if (rc == 0) { + qpair->poll_group = tgroup; + assert(nvme_qpair_get_state(qpair) < NVME_QPAIR_CONNECTED); + qpair->poll_group_tailq_head = &tgroup->disconnected_qpairs; + STAILQ_INSERT_TAIL(&tgroup->disconnected_qpairs, qpair, poll_group_stailq); + } + + return rc; +} + +int +nvme_transport_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + int rc; + + rc = tgroup->transport->ops.poll_group_remove(tgroup, qpair); + if (rc == 0) { + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + STAILQ_REMOVE(&tgroup->connected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + } else if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) { + STAILQ_REMOVE(&tgroup->disconnected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + } else { + return -ENOENT; + } + + qpair->poll_group = NULL; + qpair->poll_group_tailq_head = NULL; + } + + return rc; +} + +int64_t +nvme_transport_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_qpair *qpair; + int64_t rc; + + tgroup->in_completion_context = true; + rc = tgroup->transport->ops.poll_group_process_completions(tgroup, completions_per_qpair, + disconnected_qpair_cb); + tgroup->in_completion_context = false; + + if (spdk_unlikely(tgroup->num_qpairs_to_delete > 0)) { + /* deleted qpairs are more likely to be in the disconnected qpairs list. */ + STAILQ_FOREACH(qpair, &tgroup->disconnected_qpairs, poll_group_stailq) { + if (spdk_unlikely(qpair->delete_after_completion_context)) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + if (--tgroup->num_qpairs_to_delete == 0) { + return rc; + } + } + } + + STAILQ_FOREACH(qpair, &tgroup->connected_qpairs, poll_group_stailq) { + if (spdk_unlikely(qpair->delete_after_completion_context)) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + if (--tgroup->num_qpairs_to_delete == 0) { + return rc; + } + } + } + /* Just in case. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Mismatch between qpairs to delete and poll group number.\n"); + tgroup->num_qpairs_to_delete = 0; + } + + return rc; +} + +int +nvme_transport_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + return tgroup->transport->ops.poll_group_destroy(tgroup); +} + +int +nvme_transport_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + int rc; + + tgroup = qpair->poll_group; + + if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) { + return 0; + } + + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + rc = tgroup->transport->ops.poll_group_disconnect_qpair(qpair); + if (rc == 0) { + qpair->poll_group_tailq_head = &tgroup->disconnected_qpairs; + STAILQ_REMOVE(&tgroup->connected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + STAILQ_INSERT_TAIL(&tgroup->disconnected_qpairs, qpair, poll_group_stailq); + /* EINPROGRESS indicates that a call has already been made to this function. + * It just keeps us from segfaulting on a double removal/insert. 
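+			 * A second disconnect request that races with one already in flight is
+			 * therefore treated as success below instead of bubbling the error up
+			 * to the caller.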
+ */ + } else if (rc == -EINPROGRESS) { + rc = 0; + } + return rc; + } + + return -EINVAL; +} + +int +nvme_transport_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + int rc; + + tgroup = qpair->poll_group; + + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + return 0; + } + + if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) { + rc = tgroup->transport->ops.poll_group_connect_qpair(qpair); + if (rc == 0) { + qpair->poll_group_tailq_head = &tgroup->connected_qpairs; + STAILQ_REMOVE(&tgroup->disconnected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + STAILQ_INSERT_TAIL(&tgroup->connected_qpairs, qpair, poll_group_stailq); + } + + return rc == -EINPROGRESS ? 0 : rc; + } + + + return -EINVAL; +} diff --git a/src/spdk/lib/nvme/nvme_uevent.c b/src/spdk/lib/nvme/nvme_uevent.c new file mode 100644 index 000000000..1bcfff1cb --- /dev/null +++ b/src/spdk/lib/nvme/nvme_uevent.c @@ -0,0 +1,213 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" + +#include "spdk/log.h" + +#include "nvme_uevent.h" + +#ifdef __linux__ + +#include <linux/netlink.h> + +#define SPDK_UEVENT_MSG_LEN 4096 + +int +nvme_uevent_connect(void) +{ + struct sockaddr_nl addr; + int netlink_fd; + int size = 64 * 1024; + int flag; + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + addr.nl_pid = getpid(); + addr.nl_groups = 0xffffffff; + + netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT); + if (netlink_fd < 0) { + return -1; + } + + setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE, &size, sizeof(size)); + + flag = fcntl(netlink_fd, F_GETFL); + if (fcntl(netlink_fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", netlink_fd, + spdk_strerror(errno)); + close(netlink_fd); + return -1; + } + + if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + close(netlink_fd); + return -1; + } + return netlink_fd; +} + +/* Note: We only parse the event from uio subsystem and will ignore + * all the event from other subsystem. the event from uio subsystem + * as below: + * action: "add" or "remove" + * subsystem: "uio" + * dev_path: "/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0" + */ +static int +parse_event(const char *buf, struct spdk_uevent *event) +{ + char action[SPDK_UEVENT_MSG_LEN]; + char subsystem[SPDK_UEVENT_MSG_LEN]; + char dev_path[SPDK_UEVENT_MSG_LEN]; + char driver[SPDK_UEVENT_MSG_LEN]; + char vfio_pci_addr[SPDK_UEVENT_MSG_LEN]; + + memset(action, 0, SPDK_UEVENT_MSG_LEN); + memset(subsystem, 0, SPDK_UEVENT_MSG_LEN); + memset(dev_path, 0, SPDK_UEVENT_MSG_LEN); + memset(driver, 0, SPDK_UEVENT_MSG_LEN); + memset(vfio_pci_addr, 0, SPDK_UEVENT_MSG_LEN); + + while (*buf) { + if (!strncmp(buf, "ACTION=", 7)) { + buf += 7; + snprintf(action, sizeof(action), "%s", buf); + } else if (!strncmp(buf, "DEVPATH=", 8)) { + buf += 8; + snprintf(dev_path, sizeof(dev_path), "%s", buf); + } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { + buf += 10; + snprintf(subsystem, sizeof(subsystem), "%s", buf); + } else if (!strncmp(buf, "DRIVER=", 7)) { + buf += 7; + snprintf(driver, sizeof(driver), "%s", buf); + } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { + buf += 14; + snprintf(vfio_pci_addr, sizeof(vfio_pci_addr), "%s", buf); + } + while (*buf++) + ; + } + + if (!strncmp(subsystem, "uio", 3)) { + char *pci_address, *tmp; + struct spdk_pci_addr pci_addr; + + event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UIO; + if (!strncmp(action, "add", 3)) { + event->action = SPDK_NVME_UEVENT_ADD; + } + if (!strncmp(action, "remove", 6)) { + event->action = SPDK_NVME_UEVENT_REMOVE; + } + tmp = strstr(dev_path, "/uio/"); + + memset(tmp, 0, SPDK_UEVENT_MSG_LEN - (tmp - dev_path)); + + pci_address = strrchr(dev_path, '/'); + pci_address++; + if (spdk_pci_addr_parse(&pci_addr, pci_address) != 0) { + SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", pci_address); + return -1; + } + spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr); + return 1; + } + if (!strncmp(driver, "vfio-pci", 8)) { + struct spdk_pci_addr pci_addr; + + event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_VFIO; + if (!strncmp(action, "bind", 4)) { + event->action = SPDK_NVME_UEVENT_ADD; + } + if (!strncmp(action, "remove", 6)) { + event->action = SPDK_NVME_UEVENT_REMOVE; + } + if (spdk_pci_addr_parse(&pci_addr, vfio_pci_addr) != 0) { + SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", vfio_pci_addr); + return -1; + } + 
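+		/* Re-format the parsed address into canonical BDF form so the caller always
+		 * sees a consistent traddr string, regardless of how the kernel spelled it. */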
spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr); + return 1; + + } + return -1; +} + +int +nvme_get_uevent(int fd, struct spdk_uevent *uevent) +{ + int ret; + char buf[SPDK_UEVENT_MSG_LEN]; + + memset(uevent, 0, sizeof(struct spdk_uevent)); + memset(buf, 0, SPDK_UEVENT_MSG_LEN); + + ret = recv(fd, buf, SPDK_UEVENT_MSG_LEN - 1, MSG_DONTWAIT); + if (ret > 0) { + return parse_event(buf, uevent); + } + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } else { + SPDK_ERRLOG("Socket read error(%d): %s\n", errno, spdk_strerror(errno)); + return -1; + } + } + + /* connection closed */ + if (ret == 0) { + return -1; + } + return 0; +} + +#else /* Not Linux */ + +int +nvme_uevent_connect(void) +{ + return -1; +} + +int +nvme_get_uevent(int fd, struct spdk_uevent *uevent) +{ + return -1; +} +#endif diff --git a/src/spdk/lib/nvme/nvme_uevent.h b/src/spdk/lib/nvme/nvme_uevent.h new file mode 100644 index 000000000..778d73c2a --- /dev/null +++ b/src/spdk/lib/nvme/nvme_uevent.h @@ -0,0 +1,61 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * SPDK uevent + */ + +#include "spdk/env.h" +#include "spdk/nvmf_spec.h" + +#ifndef SPDK_UEVENT_H_ +#define SPDK_UEVENT_H_ + +#define SPDK_NVME_UEVENT_SUBSYSTEM_UIO 1 +#define SPDK_NVME_UEVENT_SUBSYSTEM_VFIO 2 + +enum spdk_nvme_uevent_action { + SPDK_NVME_UEVENT_ADD = 0, + SPDK_NVME_UEVENT_REMOVE = 1, +}; + +struct spdk_uevent { + enum spdk_nvme_uevent_action action; + int subsystem; + char traddr[SPDK_NVMF_TRADDR_MAX_LEN + 1]; +}; + +int nvme_uevent_connect(void); +int nvme_get_uevent(int fd, struct spdk_uevent *uevent); + +#endif /* SPDK_UEVENT_H_ */ diff --git a/src/spdk/lib/nvme/spdk_nvme.map b/src/spdk/lib/nvme/spdk_nvme.map new file mode 100644 index 000000000..63a04eeca --- /dev/null +++ b/src/spdk/lib/nvme/spdk_nvme.map @@ -0,0 +1,185 @@ +{ + global: + + # public functions from nvme.h + spdk_nvme_transport_register; + spdk_nvme_transport_available; + spdk_nvme_transport_available_by_name; + spdk_nvme_transport_id_parse; + spdk_nvme_transport_id_populate_trstring; + spdk_nvme_transport_id_parse_trtype; + spdk_nvme_transport_id_trtype_str; + spdk_nvme_transport_id_adrfam_str; + spdk_nvme_transport_id_parse_adrfam; + spdk_nvme_transport_id_compare; + spdk_nvme_trid_populate_transport; + spdk_nvme_host_id_parse; + + spdk_nvme_prchk_flags_parse; + spdk_nvme_prchk_flags_str; + + spdk_nvme_probe; + spdk_nvme_connect; + spdk_nvme_connect_async; + spdk_nvme_probe_async; + spdk_nvme_probe_poll_async; + spdk_nvme_detach; + + spdk_nvme_ctrlr_is_discovery; + spdk_nvme_ctrlr_get_default_ctrlr_opts; + spdk_nvme_ctrlr_set_trid; + spdk_nvme_ctrlr_reset; + spdk_nvme_ctrlr_fail; + spdk_nvme_ctrlr_is_failed; + spdk_nvme_ctrlr_get_data; + spdk_nvme_ctrlr_get_regs_csts; + spdk_nvme_ctrlr_get_regs_cap; + spdk_nvme_ctrlr_get_regs_vs; + spdk_nvme_ctrlr_get_regs_cmbsz; + spdk_nvme_ctrlr_get_num_ns; + spdk_nvme_ctrlr_get_pci_device; + spdk_nvme_ctrlr_get_max_xfer_size; + spdk_nvme_ctrlr_is_active_ns; + spdk_nvme_ctrlr_get_first_active_ns; + spdk_nvme_ctrlr_get_next_active_ns; + spdk_nvme_ctrlr_is_log_page_supported; + spdk_nvme_ctrlr_is_feature_supported; + spdk_nvme_ctrlr_register_aer_callback; + spdk_nvme_ctrlr_register_timeout_callback; + spdk_nvme_ctrlr_get_default_io_qpair_opts; + spdk_nvme_ctrlr_alloc_io_qpair; + spdk_nvme_ctrlr_connect_io_qpair; + spdk_nvme_ctrlr_disconnect_io_qpair; + spdk_nvme_ctrlr_reconnect_io_qpair; + spdk_nvme_ctrlr_get_admin_qp_failure_reason; + spdk_nvme_ctrlr_free_io_qpair; + spdk_nvme_ctrlr_io_cmd_raw_no_payload_build; + spdk_nvme_ctrlr_cmd_io_raw; + spdk_nvme_ctrlr_cmd_io_raw_with_md; + spdk_nvme_ctrlr_cmd_admin_raw; + spdk_nvme_ctrlr_process_admin_completions; + spdk_nvme_ctrlr_get_ns; + spdk_nvme_ctrlr_cmd_get_log_page; + spdk_nvme_ctrlr_cmd_get_log_page_ext; + spdk_nvme_ctrlr_cmd_abort; + spdk_nvme_ctrlr_cmd_abort_ext; + spdk_nvme_ctrlr_cmd_set_feature; + spdk_nvme_ctrlr_cmd_get_feature; + spdk_nvme_ctrlr_cmd_get_feature_ns; + spdk_nvme_ctrlr_cmd_set_feature_ns; + spdk_nvme_ctrlr_cmd_security_receive; + spdk_nvme_ctrlr_cmd_security_send; + spdk_nvme_ctrlr_security_receive; + spdk_nvme_ctrlr_security_send; + spdk_nvme_ctrlr_get_flags; + spdk_nvme_ctrlr_attach_ns; + spdk_nvme_ctrlr_detach_ns; + spdk_nvme_ctrlr_create_ns; + spdk_nvme_ctrlr_delete_ns; + spdk_nvme_ctrlr_format; + spdk_nvme_ctrlr_update_firmware; + spdk_nvme_ctrlr_get_registers; + spdk_nvme_ctrlr_reserve_cmb; + spdk_nvme_ctrlr_map_cmb; + spdk_nvme_ctrlr_unmap_cmb; + spdk_nvme_ctrlr_get_transport_id; + + spdk_nvme_poll_group_create; + spdk_nvme_poll_group_add; + spdk_nvme_poll_group_remove; 
+ spdk_nvme_poll_group_destroy; + spdk_nvme_poll_group_process_completions; + spdk_nvme_poll_group_get_ctx; + + spdk_nvme_ns_get_data; + spdk_nvme_ns_get_id; + spdk_nvme_ns_get_ctrlr; + spdk_nvme_ns_is_active; + spdk_nvme_ns_get_max_io_xfer_size; + spdk_nvme_ns_get_sector_size; + spdk_nvme_ns_get_extended_sector_size; + spdk_nvme_ns_get_num_sectors; + spdk_nvme_ns_get_size; + spdk_nvme_ns_get_pi_type; + spdk_nvme_ns_get_md_size; + spdk_nvme_ns_supports_extended_lba; + spdk_nvme_ns_supports_compare; + spdk_nvme_ns_get_dealloc_logical_block_read_value; + spdk_nvme_ns_get_optimal_io_boundary; + spdk_nvme_ns_get_uuid; + spdk_nvme_ns_get_flags; + + spdk_nvme_ns_cmd_write; + spdk_nvme_ns_cmd_writev; + spdk_nvme_ns_cmd_writev_with_md; + spdk_nvme_ns_cmd_write_with_md; + spdk_nvme_ns_cmd_write_zeroes; + spdk_nvme_ns_cmd_write_uncorrectable; + spdk_nvme_ns_cmd_read; + spdk_nvme_ns_cmd_readv; + spdk_nvme_ns_cmd_readv_with_md; + spdk_nvme_ns_cmd_read_with_md; + spdk_nvme_ns_cmd_dataset_management; + spdk_nvme_ns_cmd_flush; + spdk_nvme_ns_cmd_reservation_register; + spdk_nvme_ns_cmd_reservation_release; + spdk_nvme_ns_cmd_reservation_acquire; + spdk_nvme_ns_cmd_reservation_report; + spdk_nvme_ns_cmd_compare; + spdk_nvme_ns_cmd_comparev; + spdk_nvme_ns_cmd_comparev_with_md; + spdk_nvme_ns_cmd_compare_with_md; + + spdk_nvme_qpair_process_completions; + spdk_nvme_qpair_get_failure_reason; + spdk_nvme_qpair_add_cmd_error_injection; + spdk_nvme_qpair_remove_cmd_error_injection; + spdk_nvme_qpair_print_command; + spdk_nvme_qpair_print_completion; + spdk_nvme_print_command; + spdk_nvme_print_completion; + + spdk_nvme_cpl_get_status_string; + + spdk_nvme_rdma_init_hooks; + + spdk_nvme_cuse_get_ctrlr_name; + spdk_nvme_cuse_get_ns_name; + spdk_nvme_cuse_register; + spdk_nvme_cuse_unregister; + spdk_nvme_cuse_update_namespaces; + + spdk_nvme_map_prps; + + # public functions from nvme_ocssd.h + spdk_nvme_ctrlr_is_ocssd_supported; + spdk_nvme_ocssd_ctrlr_cmd_geometry; + spdk_nvme_ocssd_ns_cmd_vector_reset; + spdk_nvme_ocssd_ns_cmd_vector_write; + spdk_nvme_ocssd_ns_cmd_vector_write_with_md; + spdk_nvme_ocssd_ns_cmd_vector_read; + spdk_nvme_ocssd_ns_cmd_vector_read_with_md; + spdk_nvme_ocssd_ns_cmd_vector_copy; + + # public functions from opal.h + spdk_opal_dev_construct; + spdk_opal_dev_destruct; + spdk_opal_get_d0_features_info; + spdk_opal_supported; + spdk_opal_cmd_take_ownership; + spdk_opal_cmd_revert_tper; + spdk_opal_cmd_activate_locking_sp; + spdk_opal_cmd_lock_unlock; + spdk_opal_cmd_setup_locking_range; + spdk_opal_cmd_get_max_ranges; + spdk_opal_cmd_get_locking_range_info; + spdk_opal_cmd_enable_user; + spdk_opal_cmd_add_user_to_locking_range; + spdk_opal_cmd_set_new_passwd; + spdk_opal_cmd_erase_locking_range; + spdk_opal_cmd_secure_erase_locking_range; + spdk_opal_get_locking_range_info; + spdk_opal_free_locking_range_info; + + local: *; +}; diff --git a/src/spdk/lib/nvmf/Makefile b/src/spdk/lib/nvmf/Makefile new file mode 100644 index 000000000..b4556564a --- /dev/null +++ b/src/spdk/lib/nvmf/Makefile @@ -0,0 +1,75 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 5 +SO_MINOR := 0 + +C_SRCS = ctrlr.c ctrlr_discovery.c ctrlr_bdev.c \ + subsystem.c nvmf.c nvmf_rpc.c transport.c tcp.c + +C_SRCS-$(CONFIG_RDMA) += rdma.c +LIBNAME = nvmf +LOCAL_SYS_LIBS = -luuid +ifeq ($(CONFIG_RDMA),y) +LOCAL_SYS_LIBS += -libverbs -lrdmacm +#Attach only if FreeBSD and RDMA is specified with configure +ifeq ($(OS),FreeBSD) +# Mellanox - MLX4 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx4.*)","") +LOCAL_SYS_LIBS += -lmlx4 +endif +# Mellanox - MLX5 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx5.*)","") +LOCAL_SYS_LIBS += -lmlx5 +endif +# Chelsio HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libcxgb4.*)","") +LOCAL_SYS_LIBS += -lcxgb4 +endif +endif +endif + +ifeq ($(CONFIG_FC),y) +C_SRCS += fc.c fc_ls.c +CFLAGS += -I$(CURDIR) +ifneq ($(strip $(CONFIG_FC_PATH)),) +CFLAGS += -I$(CONFIG_FC_PATH) +endif +endif + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nvmf.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nvmf/ctrlr.c b/src/spdk/lib/nvmf/ctrlr.c new file mode 100644 index 000000000..638cde9d2 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr.c @@ -0,0 +1,3224 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/bit_array.h" +#include "spdk/endian.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/nvme_spec.h" +#include "spdk/nvmf_cmd.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/version.h" + +#include "spdk_internal/log.h" + +#define MIN_KEEP_ALIVE_TIMEOUT_IN_MS 10000 +#define NVMF_DISC_KATO_IN_MS 120000 +#define KAS_TIME_UNIT_IN_MS 100 +#define KAS_DEFAULT_VALUE (MIN_KEEP_ALIVE_TIMEOUT_IN_MS / KAS_TIME_UNIT_IN_MS) + +/* + * Report the SPDK version as the firmware revision. + * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. + */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +/* + * Support for custom admin command handlers + */ +struct spdk_nvmf_custom_admin_cmd { + spdk_nvmf_custom_cmd_hdlr hdlr; + uint32_t nsid; /* nsid to forward */ +}; + +static struct spdk_nvmf_custom_admin_cmd g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_MAX_OPC + 1]; + +static void _nvmf_request_complete(void *ctx); + +static inline void +nvmf_invalid_connect_response(struct spdk_nvmf_fabric_connect_rsp *rsp, + uint8_t iattr, uint16_t ipo) +{ + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + rsp->status_code_specific.invalid.iattr = iattr; + rsp->status_code_specific.invalid.ipo = ipo; +} + +#define SPDK_NVMF_INVALID_CONNECT_CMD(rsp, field) \ + nvmf_invalid_connect_response(rsp, 0, offsetof(struct spdk_nvmf_fabric_connect_cmd, field)) +#define SPDK_NVMF_INVALID_CONNECT_DATA(rsp, field) \ + nvmf_invalid_connect_response(rsp, 1, offsetof(struct spdk_nvmf_fabric_connect_data, field)) + +static void +nvmf_ctrlr_stop_keep_alive_timer(struct spdk_nvmf_ctrlr *ctrlr) +{ + if (!ctrlr) { + SPDK_ERRLOG("Controller is NULL\n"); + return; + } + + if (ctrlr->keep_alive_poller == NULL) { + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Stop keep alive poller\n"); + spdk_poller_unregister(&ctrlr->keep_alive_poller); +} + +static void +nvmf_ctrlr_disconnect_qpairs_done(struct spdk_io_channel_iter *i, int status) +{ + if (status == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ctrlr disconnect qpairs complete successfully\n"); + } else { + SPDK_ERRLOG("Fail to disconnect ctrlr qpairs\n"); + } +} + +static int +_nvmf_ctrlr_disconnect_qpairs_on_pg(struct spdk_io_channel_iter *i, bool include_admin) +{ + int rc = 0; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_qpair *qpair, *temp_qpair; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + + ctrlr = spdk_io_channel_iter_get_ctx(i); + ch = spdk_io_channel_iter_get_channel(i); + group = 
spdk_io_channel_get_ctx(ch); + + TAILQ_FOREACH_SAFE(qpair, &group->qpairs, link, temp_qpair) { + if (qpair->ctrlr == ctrlr && (include_admin || !nvmf_qpair_is_admin_queue(qpair))) { + rc = spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + if (rc) { + SPDK_ERRLOG("Qpair disconnect failed\n"); + return rc; + } + } + } + + return rc; +} + +static void +nvmf_ctrlr_disconnect_qpairs_on_pg(struct spdk_io_channel_iter *i) +{ + spdk_for_each_channel_continue(i, _nvmf_ctrlr_disconnect_qpairs_on_pg(i, true)); +} + +static void +nvmf_ctrlr_disconnect_io_qpairs_on_pg(struct spdk_io_channel_iter *i) +{ + spdk_for_each_channel_continue(i, _nvmf_ctrlr_disconnect_qpairs_on_pg(i, false)); +} + +static int +nvmf_ctrlr_keep_alive_poll(void *ctx) +{ + uint64_t keep_alive_timeout_tick; + uint64_t now = spdk_get_ticks(); + struct spdk_nvmf_ctrlr *ctrlr = ctx; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Polling ctrlr keep alive timeout\n"); + + /* If the Keep alive feature is in use and the timer expires */ + keep_alive_timeout_tick = ctrlr->last_keep_alive_tick + + ctrlr->feat.keep_alive_timer.bits.kato * spdk_get_ticks_hz() / UINT64_C(1000); + if (now > keep_alive_timeout_tick) { + SPDK_NOTICELOG("Disconnecting host from subsystem %s due to keep alive timeout.\n", + ctrlr->subsys->subnqn); + /* set the Controller Fatal Status bit to '1' */ + if (ctrlr->vcprop.csts.bits.cfs == 0) { + ctrlr->vcprop.csts.bits.cfs = 1; + + /* + * disconnect qpairs, terminate Transport connection + * destroy ctrlr, break the host to controller association + * disconnect qpairs with qpair->ctrlr == ctrlr + */ + spdk_for_each_channel(ctrlr->subsys->tgt, + nvmf_ctrlr_disconnect_qpairs_on_pg, + ctrlr, + nvmf_ctrlr_disconnect_qpairs_done); + } + } + + return SPDK_POLLER_BUSY; +} + +static void +nvmf_ctrlr_start_keep_alive_timer(struct spdk_nvmf_ctrlr *ctrlr) +{ + if (!ctrlr) { + SPDK_ERRLOG("Controller is NULL\n"); + return; + } + + /* if cleared to 0 then the Keep Alive Timer is disabled */ + if (ctrlr->feat.keep_alive_timer.bits.kato != 0) { + + ctrlr->last_keep_alive_tick = spdk_get_ticks(); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Ctrlr add keep alive poller\n"); + ctrlr->keep_alive_poller = SPDK_POLLER_REGISTER(nvmf_ctrlr_keep_alive_poll, ctrlr, + ctrlr->feat.keep_alive_timer.bits.kato * 1000); + } +} + +static void +ctrlr_add_qpair_and_update_rsp(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_fabric_connect_rsp *rsp) +{ + assert(ctrlr->admin_qpair->group->thread == spdk_get_thread()); + + /* check if we would exceed ctrlr connection limit */ + if (qpair->qid >= spdk_bit_array_capacity(ctrlr->qpair_mask)) { + SPDK_ERRLOG("Requested QID %u but Max QID is %u\n", + qpair->qid, spdk_bit_array_capacity(ctrlr->qpair_mask) - 1); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return; + } + + if (spdk_bit_array_get(ctrlr->qpair_mask, qpair->qid)) { + SPDK_ERRLOG("Got I/O connect with duplicate QID %u\n", qpair->qid); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return; + } + + qpair->ctrlr = ctrlr; + spdk_bit_array_set(ctrlr->qpair_mask, qpair->qid); + + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + rsp->status_code_specific.success.cntlid = ctrlr->cntlid; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "connect capsule response: cntlid = 0x%04x\n", + rsp->status_code_specific.success.cntlid); +} + +static void +_nvmf_ctrlr_add_admin_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct 
spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + ctrlr->admin_qpair = qpair; + nvmf_ctrlr_start_keep_alive_timer(ctrlr); + ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp); + _nvmf_request_complete(req); +} + +static void +_nvmf_subsystem_add_ctrlr(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + if (nvmf_subsystem_add_ctrlr(ctrlr->subsys, ctrlr)) { + SPDK_ERRLOG("Unable to add controller to subsystem\n"); + spdk_bit_array_free(&ctrlr->qpair_mask); + free(ctrlr); + qpair->ctrlr = NULL; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_nvmf_request_complete(req); + return; + } + + spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_add_admin_qpair, req); +} + +static void +nvmf_ctrlr_cdata_init(struct spdk_nvmf_transport *transport, struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr_data *cdata) +{ + cdata->kas = KAS_DEFAULT_VALUE; + cdata->sgls.supported = 1; + cdata->sgls.keyed_sgl = 1; + cdata->sgls.sgl_offset = 1; + cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16; + cdata->nvmf_specific.ioccsz += transport->opts.in_capsule_data_size / 16; + cdata->nvmf_specific.iorcsz = sizeof(struct spdk_nvme_cpl) / 16; + cdata->nvmf_specific.icdoff = 0; /* offset starts directly after SQE */ + cdata->nvmf_specific.ctrattr.ctrlr_model = SPDK_NVMF_CTRLR_MODEL_DYNAMIC; + cdata->nvmf_specific.msdbd = 1; + + if (transport->ops->cdata_init) { + transport->ops->cdata_init(transport, subsystem, cdata); + } +} + +static struct spdk_nvmf_ctrlr * +nvmf_ctrlr_create(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_request *req, + struct spdk_nvmf_fabric_connect_cmd *connect_cmd, + struct spdk_nvmf_fabric_connect_data *connect_data) +{ + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_transport *transport; + + ctrlr = calloc(1, sizeof(*ctrlr)); + if (ctrlr == NULL) { + SPDK_ERRLOG("Memory allocation failed\n"); + return NULL; + } + + TAILQ_INIT(&ctrlr->log_head); + ctrlr->subsys = subsystem; + ctrlr->thread = req->qpair->group->thread; + + transport = req->qpair->transport; + ctrlr->qpair_mask = spdk_bit_array_create(transport->opts.max_qpairs_per_ctrlr); + if (!ctrlr->qpair_mask) { + SPDK_ERRLOG("Failed to allocate controller qpair mask\n"); + free(ctrlr); + return NULL; + } + + nvmf_ctrlr_cdata_init(transport, subsystem, &ctrlr->cdata); + + /* + * KAS: This field indicates the granularity of the Keep Alive Timer in 100ms units. + * If this field is cleared to 0h, then Keep Alive is not supported. + */ + if (ctrlr->cdata.kas) { + ctrlr->feat.keep_alive_timer.bits.kato = spdk_divide_round_up(connect_cmd->kato, + KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS) * + KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS; + } + + ctrlr->feat.async_event_configuration.bits.ns_attr_notice = 1; + ctrlr->feat.volatile_write_cache.bits.wce = 1; + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + /* + * If keep-alive timeout is not set, discovery controllers use some + * arbitrary high value in order to cleanup stale discovery sessions + * + * From the 1.0a nvme-of spec: + * "The Keep Alive command is reserved for + * Discovery controllers. A transport may specify a + * fixed Discovery controller activity timeout value + * (e.g., 2 minutes). 
If no commands are received + * by a Discovery controller within that time + * period, the controller may perform the + * actions for Keep Alive Timer expiration". + * kato is in millisecond. + */ + if (ctrlr->feat.keep_alive_timer.bits.kato == 0) { + ctrlr->feat.keep_alive_timer.bits.kato = NVMF_DISC_KATO_IN_MS; + } + } + + /* Subtract 1 for admin queue, 1 for 0's based */ + ctrlr->feat.number_of_queues.bits.ncqr = transport->opts.max_qpairs_per_ctrlr - 1 - + 1; + ctrlr->feat.number_of_queues.bits.nsqr = transport->opts.max_qpairs_per_ctrlr - 1 - + 1; + + spdk_uuid_copy(&ctrlr->hostid, (struct spdk_uuid *)connect_data->hostid); + memcpy(ctrlr->hostnqn, connect_data->hostnqn, sizeof(ctrlr->hostnqn)); + + ctrlr->vcprop.cap.raw = 0; + ctrlr->vcprop.cap.bits.cqr = 1; /* NVMe-oF specification required */ + ctrlr->vcprop.cap.bits.mqes = transport->opts.max_queue_depth - + 1; /* max queue depth */ + ctrlr->vcprop.cap.bits.ams = 0; /* optional arb mechanisms */ + ctrlr->vcprop.cap.bits.to = 1; /* ready timeout - 500 msec units */ + ctrlr->vcprop.cap.bits.dstrd = 0; /* fixed to 0 for NVMe-oF */ + ctrlr->vcprop.cap.bits.css = SPDK_NVME_CAP_CSS_NVM; /* NVM command set */ + ctrlr->vcprop.cap.bits.mpsmin = 0; /* 2 ^ (12 + mpsmin) == 4k */ + ctrlr->vcprop.cap.bits.mpsmax = 0; /* 2 ^ (12 + mpsmax) == 4k */ + + /* Version Supported: 1.3 */ + ctrlr->vcprop.vs.bits.mjr = 1; + ctrlr->vcprop.vs.bits.mnr = 3; + ctrlr->vcprop.vs.bits.ter = 0; + + ctrlr->vcprop.cc.raw = 0; + ctrlr->vcprop.cc.bits.en = 0; /* Init controller disabled */ + + ctrlr->vcprop.csts.raw = 0; + ctrlr->vcprop.csts.bits.rdy = 0; /* Init controller as not ready */ + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cap 0x%" PRIx64 "\n", ctrlr->vcprop.cap.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "vs 0x%x\n", ctrlr->vcprop.vs.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cc 0x%x\n", ctrlr->vcprop.cc.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "csts 0x%x\n", ctrlr->vcprop.csts.raw); + + ctrlr->dif_insert_or_strip = transport->opts.dif_insert_or_strip; + + req->qpair->ctrlr = ctrlr; + spdk_thread_send_msg(subsystem->thread, _nvmf_subsystem_add_ctrlr, req); + + return ctrlr; +} + +static void +_nvmf_ctrlr_destruct(void *ctx) +{ + struct spdk_nvmf_ctrlr *ctrlr = ctx; + struct spdk_nvmf_reservation_log *log, *log_tmp; + + nvmf_ctrlr_stop_keep_alive_timer(ctrlr); + + TAILQ_FOREACH_SAFE(log, &ctrlr->log_head, link, log_tmp) { + TAILQ_REMOVE(&ctrlr->log_head, log, link); + free(log); + } + free(ctrlr); +} + +void +nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr) +{ + nvmf_subsystem_remove_ctrlr(ctrlr->subsys, ctrlr); + + spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_destruct, ctrlr); +} + +static void +nvmf_ctrlr_add_io_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + /* Unit test will check qpair->ctrlr after calling spdk_nvmf_ctrlr_connect. + * For error case, the value should be NULL. So set it to NULL at first. 
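Editor's note: a small self-contained sketch of the keep-alive granularity applied in nvmf_ctrlr_create() above. The constants mirror the ones defined at the top of this file, divide_round_up() stands in for spdk_divide_round_up(), and the 15000 ms request is a made-up host value.

#include <stdint.h>
#include <stdio.h>

#define KAS_TIME_UNIT_IN_MS 100
#define KAS_DEFAULT_VALUE   (10000 / KAS_TIME_UNIT_IN_MS)	/* 100, i.e. KAS reports 100 units of 100 ms */

static uint32_t
divide_round_up(uint32_t num, uint32_t denom)
{
	return (num + denom - 1) / denom;
}

int
main(void)
{
	uint32_t requested_kato = 15000;	/* ms, hypothetical value from the connect command */
	uint32_t granularity = KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS;	/* 10000 ms */
	uint32_t kato = divide_round_up(requested_kato, granularity) * granularity;

	printf("KATO rounded up from %u ms to %u ms\n", requested_kato, kato);	/* 20000 */
	return 0;
}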
+ */ + qpair->ctrlr = NULL; + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + SPDK_ERRLOG("I/O connect not allowed on discovery controller\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (!ctrlr->vcprop.cc.bits.en) { + SPDK_ERRLOG("Got I/O connect before ctrlr was enabled\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (1u << ctrlr->vcprop.cc.bits.iosqes != sizeof(struct spdk_nvme_cmd)) { + SPDK_ERRLOG("Got I/O connect with invalid IOSQES %u\n", + ctrlr->vcprop.cc.bits.iosqes); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (1u << ctrlr->vcprop.cc.bits.iocqes != sizeof(struct spdk_nvme_cpl)) { + SPDK_ERRLOG("Got I/O connect with invalid IOCQES %u\n", + ctrlr->vcprop.cc.bits.iocqes); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp); +end: + spdk_nvmf_request_complete(req); +} + +static void +_nvmf_ctrlr_add_io_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_qpair *admin_qpair; + struct spdk_nvmf_tgt *tgt = qpair->transport->tgt; + struct spdk_nvmf_subsystem *subsystem; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect I/O Queue for controller id 0x%x\n", data->cntlid); + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, data->subnqn); + /* We already checked this in spdk_nvmf_ctrlr_connect */ + assert(subsystem != NULL); + + ctrlr = nvmf_subsystem_get_ctrlr(subsystem, data->cntlid); + if (ctrlr == NULL) { + SPDK_ERRLOG("Unknown controller ID 0x%x\n", data->cntlid); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid); + spdk_nvmf_request_complete(req); + return; + } + + admin_qpair = ctrlr->admin_qpair; + qpair->ctrlr = ctrlr; + spdk_thread_send_msg(admin_qpair->group->thread, nvmf_ctrlr_add_io_qpair, req); +} + +static bool +nvmf_qpair_access_allowed(struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_subsystem *subsystem, + const char *hostnqn) +{ + struct spdk_nvme_transport_id listen_trid = {}; + + if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) { + SPDK_ERRLOG("Subsystem '%s' does not allow host '%s'\n", subsystem->subnqn, hostnqn); + return false; + } + + if (spdk_nvmf_qpair_get_listen_trid(qpair, &listen_trid)) { + SPDK_ERRLOG("Subsystem '%s' is unable to enforce access control due to an internal error.\n", + subsystem->subnqn); + return false; + } + + if (!spdk_nvmf_subsystem_listener_allowed(subsystem, &listen_trid)) { + SPDK_ERRLOG("Subsystem '%s' does not allow host '%s' to connect at this address.\n", + subsystem->subnqn, hostnqn); + return false; + } + + return true; +} + +static int +_nvmf_ctrlr_connect(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_fabric_connect_cmd *cmd = &req->cmd->connect_cmd; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_transport *transport = qpair->transport; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_subsystem *subsystem; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "recfmt 0x%x qid %u sqsize %u\n", + cmd->recfmt, cmd->qid, cmd->sqsize); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect data:\n"); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " cntlid: 0x%04x\n", data->cntlid); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostid: %08x-%04x-%04x-%02x%02x-%04x%08x ***\n", + 
ntohl(*(uint32_t *)&data->hostid[0]), + ntohs(*(uint16_t *)&data->hostid[4]), + ntohs(*(uint16_t *)&data->hostid[6]), + data->hostid[8], + data->hostid[9], + ntohs(*(uint16_t *)&data->hostid[10]), + ntohl(*(uint32_t *)&data->hostid[12])); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " subnqn: \"%s\"\n", data->subnqn); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostnqn: \"%s\"\n", data->hostnqn); + + subsystem = spdk_nvmf_tgt_find_subsystem(transport->tgt, data->subnqn); + if (!subsystem) { + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (cmd->recfmt != 0) { + SPDK_ERRLOG("Connect command unsupported RECFMT %u\n", cmd->recfmt); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* + * SQSIZE is a 0-based value, so it must be at least 1 (minimum queue depth is 2) and + * strictly less than max_aq_depth (admin queues) or max_queue_depth (io queues). + */ + if (cmd->sqsize == 0) { + SPDK_ERRLOG("Invalid SQSIZE = 0\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (cmd->qid == 0) { + if (cmd->sqsize >= transport->opts.max_aq_depth) { + SPDK_ERRLOG("Invalid SQSIZE for admin queue %u (min 1, max %u)\n", + cmd->sqsize, transport->opts.max_aq_depth - 1); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else if (cmd->sqsize >= transport->opts.max_queue_depth) { + SPDK_ERRLOG("Invalid SQSIZE %u (min 1, max %u)\n", + cmd->sqsize, transport->opts.max_queue_depth - 1); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + qpair->sq_head_max = cmd->sqsize; + qpair->qid = cmd->qid; + + if (0 == qpair->qid) { + qpair->group->stat.admin_qpairs++; + } else { + qpair->group->stat.io_qpairs++; + } + + if (cmd->qid == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect Admin Queue for controller ID 0x%x\n", data->cntlid); + + if (data->cntlid != 0xFFFF) { + /* This NVMf target only supports dynamic mode. 
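(Editor's note on the connect validation above: SQSIZE is a 0's based value, so with a hypothetical max_queue_depth of 128 the largest accepted I/O SQSIZE is 127, which describes a 128-entry queue, and SQSIZE = 0 is always rejected; the admin queue is bounded by max_aq_depth in the same way. Likewise, because only the dynamic controller model is supported, an admin-queue connect must carry CNTLID = 0xFFFF and the target allocates the controller ID itself, returning it in the connect response so that later I/O-queue connects can reference it.)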
*/ + SPDK_ERRLOG("The NVMf target only supports dynamic mode (CNTLID = 0x%x).\n", data->cntlid); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Establish a new ctrlr */ + ctrlr = nvmf_ctrlr_create(subsystem, req, cmd, data); + if (!ctrlr) { + SPDK_ERRLOG("nvmf_ctrlr_create() failed\n"); + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } else { + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + } else { + spdk_thread_send_msg(subsystem->thread, _nvmf_ctrlr_add_io_qpair, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } +} + +static inline bool +nvmf_request_is_fabric_connect(struct spdk_nvmf_request *req) +{ + return req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC && + req->cmd->nvmf_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_CONNECT; +} + +static struct spdk_nvmf_subsystem_poll_group * +nvmf_subsystem_pg_from_connect_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_data *data; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + assert(nvmf_request_is_fabric_connect(req)); + assert(req->qpair->ctrlr == NULL); + + data = req->data; + tgt = req->qpair->transport->tgt; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, data->subnqn); + if (subsystem == NULL) { + return NULL; + } + + return &req->qpair->group->sgroups[subsystem->id]; +} + +int +spdk_nvmf_ctrlr_connect(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup; + enum spdk_nvmf_request_exec_status status; + + sgroup = nvmf_subsystem_pg_from_connect_cmd(req); + if (!sgroup) { + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + status = SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + goto out; + } + + sgroup->io_outstanding++; + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + + status = _nvmf_ctrlr_connect(req); + +out: + if (status == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + _nvmf_request_complete(req); + } + + return status; +} + +static int +nvmf_ctrlr_cmd_connect(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_transport *transport = req->qpair->transport; + struct spdk_nvmf_subsystem *subsystem; + + if (req->length < sizeof(struct spdk_nvmf_fabric_connect_data)) { + SPDK_ERRLOG("Connect command data length 0x%x too small\n", req->length); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(transport->tgt, data->subnqn); + if (!subsystem) { + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if ((subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSING) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING)) { + SPDK_ERRLOG("Subsystem '%s' is not ready\n", subsystem->subnqn); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_CONTROLLER_BUSY; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Ensure that hostnqn is null terminated */ + if (!memchr(data->hostnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1)) { + SPDK_ERRLOG("Connect HOSTNQN is not null terminated\n"); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, 
hostnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (!nvmf_qpair_access_allowed(req->qpair, subsystem, data->hostnqn)) { + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_HOST; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return _nvmf_ctrlr_connect(req); +} + +static void +nvmf_ctrlr_cc_reset_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_ctrlr *ctrlr = spdk_io_channel_iter_get_ctx(i); + + if (status < 0) { + SPDK_ERRLOG("Fail to disconnect io ctrlr qpairs\n"); + assert(false); + } + + /* Only a subset of the registers are cleared out on a reset */ + ctrlr->vcprop.cc.raw = 0; + ctrlr->vcprop.csts.raw = 0; + +} + +const struct spdk_nvmf_registers * +spdk_nvmf_ctrlr_get_regs(struct spdk_nvmf_ctrlr *ctrlr) +{ + return &ctrlr->vcprop; +} + +static uint64_t +nvmf_prop_get_cap(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.cap.raw; +} + +static uint64_t +nvmf_prop_get_vs(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.vs.raw; +} + +static uint64_t +nvmf_prop_get_cc(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.cc.raw; +} + +static bool +nvmf_prop_set_cc(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + union spdk_nvme_cc_register cc, diff; + + cc.raw = value; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cur CC: 0x%08x\n", ctrlr->vcprop.cc.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "new CC: 0x%08x\n", cc.raw); + + /* + * Calculate which bits changed between the current and new CC. + * Mark each bit as 0 once it is handled to determine if any unhandled bits were changed. + */ + diff.raw = cc.raw ^ ctrlr->vcprop.cc.raw; + + if (diff.bits.en) { + if (cc.bits.en) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Enable!\n"); + ctrlr->vcprop.cc.bits.en = 1; + ctrlr->vcprop.csts.bits.rdy = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Disable!\n"); + ctrlr->vcprop.cc.bits.en = 0; + spdk_for_each_channel(ctrlr->subsys->tgt, + nvmf_ctrlr_disconnect_io_qpairs_on_pg, + ctrlr, + nvmf_ctrlr_cc_reset_done); + } + diff.bits.en = 0; + } + + if (diff.bits.shn) { + if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || + cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Shutdown %u%ub!\n", + cc.bits.shn >> 1, cc.bits.shn & 1); + ctrlr->vcprop.cc.bits.shn = cc.bits.shn; + ctrlr->vcprop.cc.bits.en = 0; + ctrlr->vcprop.csts.bits.rdy = 0; + ctrlr->vcprop.csts.bits.shst = SPDK_NVME_SHST_COMPLETE; + } else if (cc.bits.shn == 0) { + ctrlr->vcprop.cc.bits.shn = 0; + } else { + SPDK_ERRLOG("Prop Set CC: Invalid SHN value %u%ub\n", + cc.bits.shn >> 1, cc.bits.shn & 1); + return false; + } + diff.bits.shn = 0; + } + + if (diff.bits.iosqes) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOSQES = %u (%u bytes)\n", + cc.bits.iosqes, 1u << cc.bits.iosqes); + ctrlr->vcprop.cc.bits.iosqes = cc.bits.iosqes; + diff.bits.iosqes = 0; + } + + if (diff.bits.iocqes) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOCQES = %u (%u bytes)\n", + cc.bits.iocqes, 1u << cc.bits.iocqes); + ctrlr->vcprop.cc.bits.iocqes = cc.bits.iocqes; + diff.bits.iocqes = 0; + } + + if (diff.bits.ams) { + SPDK_ERRLOG("Arbitration Mechanism Selected (AMS) 0x%x not supported!\n", cc.bits.ams); + return false; + } + + if (diff.bits.mps) { + SPDK_ERRLOG("Memory Page Size (MPS) %u KiB not supported!\n", (1 << (2 + cc.bits.mps))); + return false; + } + + if (diff.bits.css) { + SPDK_ERRLOG("I/O Command Set Selected (CSS) 0x%x not supported!\n", cc.bits.css); + return false; + } + + if (diff.raw != 0) { + 
SPDK_ERRLOG("Prop Set CC toggled reserved bits 0x%x!\n", diff.raw); + return false; + } + + return true; +} + +static uint64_t +nvmf_prop_get_csts(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.csts.raw; +} + +static uint64_t +nvmf_prop_get_aqa(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.aqa.raw; +} + +static bool +nvmf_prop_set_aqa(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + union spdk_nvme_aqa_register aqa; + + aqa.raw = value; + + if (aqa.bits.asqs > ctrlr->vcprop.cap.bits.mqes || + aqa.bits.acqs > ctrlr->vcprop.cap.bits.mqes) { + return false; + } + + ctrlr->vcprop.aqa.raw = value; + + return true; +} + +static uint64_t +nvmf_prop_get_asq(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.asq; +} + +static bool +nvmf_prop_set_asq_lower(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + ctrlr->vcprop.asq = (ctrlr->vcprop.asq & (0xFFFFFFFFULL << 32ULL)) | value; + + return true; +} + +static bool +nvmf_prop_set_asq_upper(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + ctrlr->vcprop.asq = (ctrlr->vcprop.asq & 0xFFFFFFFFULL) | ((uint64_t)value << 32ULL); + + return true; +} + +static uint64_t +nvmf_prop_get_acq(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.acq; +} + +static bool +nvmf_prop_set_acq_lower(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + ctrlr->vcprop.acq = (ctrlr->vcprop.acq & (0xFFFFFFFFULL << 32ULL)) | value; + + return true; +} + +static bool +nvmf_prop_set_acq_upper(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + ctrlr->vcprop.acq = (ctrlr->vcprop.acq & 0xFFFFFFFFULL) | ((uint64_t)value << 32ULL); + + return true; +} + +struct nvmf_prop { + uint32_t ofst; + uint8_t size; + char name[11]; + uint64_t (*get_cb)(struct spdk_nvmf_ctrlr *ctrlr); + bool (*set_cb)(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value); + bool (*set_upper_cb)(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value); +}; + +#define PROP(field, size, get_cb, set_cb, set_upper_cb) \ + { \ + offsetof(struct spdk_nvme_registers, field), \ + size, \ + #field, \ + get_cb, set_cb, set_upper_cb \ + } + +static const struct nvmf_prop nvmf_props[] = { + PROP(cap, 8, nvmf_prop_get_cap, NULL, NULL), + PROP(vs, 4, nvmf_prop_get_vs, NULL, NULL), + PROP(cc, 4, nvmf_prop_get_cc, nvmf_prop_set_cc, NULL), + PROP(csts, 4, nvmf_prop_get_csts, NULL, NULL), + PROP(aqa, 4, nvmf_prop_get_aqa, nvmf_prop_set_aqa, NULL), + PROP(asq, 8, nvmf_prop_get_asq, nvmf_prop_set_asq_lower, nvmf_prop_set_asq_upper), + PROP(acq, 8, nvmf_prop_get_acq, nvmf_prop_set_acq_lower, nvmf_prop_set_acq_upper), +}; + +static const struct nvmf_prop * +find_prop(uint32_t ofst, uint8_t size) +{ + size_t i; + + for (i = 0; i < SPDK_COUNTOF(nvmf_props); i++) { + const struct nvmf_prop *prop = &nvmf_props[i]; + + if ((ofst >= prop->ofst) && (ofst + size <= prop->ofst + prop->size)) { + return prop; + } + } + + return NULL; +} + +static int +nvmf_property_get(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_fabric_prop_get_cmd *cmd = &req->cmd->prop_get_cmd; + struct spdk_nvmf_fabric_prop_get_rsp *response = &req->rsp->prop_get_rsp; + const struct nvmf_prop *prop; + uint8_t size; + + response->status.sc = 0; + response->value.u64 = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x\n", + cmd->attrib.size, cmd->ofst); + + switch (cmd->attrib.size) { + case SPDK_NVMF_PROP_SIZE_4: + size = 4; + break; + case SPDK_NVMF_PROP_SIZE_8: + size = 8; + break; + default: + SPDK_ERRLOG("Invalid size value %d\n", cmd->attrib.size); + response->status.sct = 
SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + prop = find_prop(cmd->ofst, size); + if (prop == NULL || prop->get_cb == NULL) { + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name); + + response->value.u64 = prop->get_cb(ctrlr); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "response value: 0x%" PRIx64 "\n", response->value.u64); + + if (size != prop->size) { + /* The size must be 4 and the prop->size is 8. Figure out which part of the property to read. */ + assert(size == 4); + assert(prop->size == 8); + + if (cmd->ofst == prop->ofst) { + /* Keep bottom 4 bytes only */ + response->value.u64 &= 0xFFFFFFFF; + } else { + /* Keep top 4 bytes only */ + response->value.u64 >>= 32; + } + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_property_set(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_fabric_prop_set_cmd *cmd = &req->cmd->prop_set_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + const struct nvmf_prop *prop; + uint64_t value; + uint8_t size; + bool ret; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x, value 0x%" PRIx64 "\n", + cmd->attrib.size, cmd->ofst, cmd->value.u64); + + switch (cmd->attrib.size) { + case SPDK_NVMF_PROP_SIZE_4: + size = 4; + break; + case SPDK_NVMF_PROP_SIZE_8: + size = 8; + break; + default: + SPDK_ERRLOG("Invalid size value %d\n", cmd->attrib.size); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + prop = find_prop(cmd->ofst, size); + if (prop == NULL || prop->set_cb == NULL) { + SPDK_ERRLOG("Invalid offset 0x%x\n", cmd->ofst); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name); + + value = cmd->value.u64; + + if (prop->size == 4) { + ret = prop->set_cb(ctrlr, (uint32_t)value); + } else if (size != prop->size) { + /* The size must be 4 and the prop->size is 8. Figure out which part of the property to write. 
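Editor's note: the 4-byte access path in nvmf_property_get() and nvmf_property_set() splits an 8-byte register on a dword boundary, selecting the half by comparing the requested offset with the property's base offset. A minimal sketch with a made-up ASQ value (ASQ occupies offsets 0x28-0x2F in the NVMe controller register map):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t asq = 0x11223344aabbccddULL;	/* hypothetical register content */

	uint32_t lower = (uint32_t)(asq & 0xFFFFFFFFu);	/* ofst == prop->ofst (0x28) */
	uint32_t upper = (uint32_t)(asq >> 32);		/* ofst == prop->ofst + 4 (0x2C) */

	printf("lower dword 0x%08x, upper dword 0x%08x\n", lower, upper);
	/* prints: lower dword 0xaabbccdd, upper dword 0x11223344 */
	return 0;
}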
*/ + assert(size == 4); + assert(prop->size == 8); + + if (cmd->ofst == prop->ofst) { + ret = prop->set_cb(ctrlr, (uint32_t)value); + } else { + ret = prop->set_upper_cb(ctrlr, (uint32_t)value); + } + } else { + ret = prop->set_cb(ctrlr, (uint32_t)value); + if (ret) { + ret = prop->set_upper_cb(ctrlr, (uint32_t)(value >> 32)); + } + } + + if (!ret) { + SPDK_ERRLOG("prop set_cb failed\n"); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_arbitration(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Arbitration (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.arbitration.raw = cmd->cdw11; + ctrlr->feat.arbitration.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_power_management(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Power Management (cdw11 = 0x%0x)\n", cmd->cdw11); + + /* Only PS = 0 is allowed, since we report NPSS = 0 */ + if (cmd->cdw11_bits.feat_power_management.bits.ps != 0) { + SPDK_ERRLOG("Invalid power state %u\n", cmd->cdw11_bits.feat_power_management.bits.ps); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->feat.power_management.raw = cmd->cdw11; + ctrlr->feat.power_management.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static bool +temp_threshold_opts_valid(const union spdk_nvme_feat_temperature_threshold *opts) +{ + /* + * Valid TMPSEL values: + * 0000b - 1000b: temperature sensors + * 1111b: set all implemented temperature sensors + */ + if (opts->bits.tmpsel >= 9 && opts->bits.tmpsel != 15) { + /* 1001b - 1110b: reserved */ + SPDK_ERRLOG("Invalid TMPSEL %u\n", opts->bits.tmpsel); + return false; + } + + /* + * Valid THSEL values: + * 00b: over temperature threshold + * 01b: under temperature threshold + */ + if (opts->bits.thsel > 1) { + /* 10b - 11b: reserved */ + SPDK_ERRLOG("Invalid THSEL %u\n", opts->bits.thsel); + return false; + } + + return true; +} + +static int +nvmf_ctrlr_set_features_temperature_threshold(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11); + + if (!temp_threshold_opts_valid(&cmd->cdw11_bits.feat_temp_threshold)) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: no sensors implemented - ignore new values */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features_temperature_threshold(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11); + + if 
(!temp_threshold_opts_valid(&cmd->cdw11_bits.feat_temp_threshold)) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: no sensors implemented - return 0 for all thresholds */ + rsp->cdw0 = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_error_recovery(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Error Recovery (cdw11 = 0x%0x)\n", cmd->cdw11); + + if (cmd->cdw11_bits.feat_error_recovery.bits.dulbe) { + /* + * Host is not allowed to set this bit, since we don't advertise it in + * Identify Namespace. + */ + SPDK_ERRLOG("Host set unsupported DULBE bit\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->feat.error_recovery.raw = cmd->cdw11; + ctrlr->feat.error_recovery.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_volatile_write_cache(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.volatile_write_cache.raw = cmd->cdw11; + ctrlr->feat.volatile_write_cache.bits.reserved = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache %s\n", + ctrlr->feat.volatile_write_cache.bits.wce ? "Enabled" : "Disabled"); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_write_atomicity(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Write Atomicity (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.write_atomicity.raw = cmd->cdw11; + ctrlr->feat.write_atomicity.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_host_identifier(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + SPDK_ERRLOG("Set Features - Host Identifier not allowed\n"); + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features_host_identifier(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Host Identifier\n"); + + if (!cmd->cdw11_bits.feat_host_identifier.bits.exhid) { + /* NVMe over Fabrics requires EXHID=1 (128-bit/16-byte host ID) */ + SPDK_ERRLOG("Get Features - Host Identifier with EXHID=0 not allowed\n"); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (req->data == NULL || req->length < sizeof(ctrlr->hostid)) { + SPDK_ERRLOG("Invalid data buffer for Get Features - Host Identifier\n"); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + spdk_uuid_copy((struct spdk_uuid *)req->data, &ctrlr->hostid); + return 
SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features_reservation_notification_mask(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_ns *ns; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "get Features - Reservation Notificaton Mask\n"); + + if (cmd->nsid == 0xffffffffu) { + SPDK_ERRLOG("get Features - Invalid Namespace ID\n"); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + if (ns == NULL) { + SPDK_ERRLOG("Set Features - Invalid Namespace ID\n"); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + rsp->cdw0 = ns->mask; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_reservation_notification_mask(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_ns *ns; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Reservation Notificaton Mask\n"); + + if (cmd->nsid == 0xffffffffu) { + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + ns->mask = cmd->cdw11; + } + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + if (ns == NULL) { + SPDK_ERRLOG("Set Features - Invalid Namespace ID\n"); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + ns->mask = cmd->cdw11; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features_reservation_persistence(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + struct spdk_nvmf_ns *ns; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Reservation Persistence\n"); + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + /* NSID with 0xffffffffu also included */ + if (ns == NULL) { + SPDK_ERRLOG("Get Features - Invalid Namespace ID\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + response->cdw0 = ns->ptpl_activated; + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_reservation_persistence(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + struct spdk_nvmf_ns *ns; + bool ptpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Reservation Persistence\n"); + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + ptpl = cmd->cdw11_bits.feat_rsv_persistence.bits.ptpl; + + if (cmd->nsid != 0xffffffffu && ns && ns->ptpl_file) { + ns->ptpl_activated = ptpl; + } else if (cmd->nsid == 0xffffffffu) { + for (ns = spdk_nvmf_subsystem_get_first_ns(ctrlr->subsys); ns && ns->ptpl_file; + ns = spdk_nvmf_subsystem_get_next_ns(ctrlr->subsys, ns)) { + 
ns->ptpl_activated = ptpl; + } + } else { + SPDK_ERRLOG("Set Features - Invalid Namespace ID or Reservation Configuration\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: Feature not changeable for now */ + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_keep_alive_timer(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer (%u ms)\n", cmd->cdw11); + + /* + * if attempts to disable keep alive by setting kato to 0h + * a status value of keep alive invalid shall be returned + */ + if (cmd->cdw11_bits.feat_keep_alive_timer.bits.kato == 0) { + rsp->status.sc = SPDK_NVME_SC_KEEP_ALIVE_INVALID; + } else if (cmd->cdw11_bits.feat_keep_alive_timer.bits.kato < MIN_KEEP_ALIVE_TIMEOUT_IN_MS) { + ctrlr->feat.keep_alive_timer.bits.kato = MIN_KEEP_ALIVE_TIMEOUT_IN_MS; + } else { + /* round up to milliseconds */ + ctrlr->feat.keep_alive_timer.bits.kato = spdk_divide_round_up( + cmd->cdw11_bits.feat_keep_alive_timer.bits.kato, + KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS) * + KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS; + } + + /* + * if change the keep alive timeout value successfully + * update the keep alive poller. + */ + if (cmd->cdw11_bits.feat_keep_alive_timer.bits.kato != 0) { + if (ctrlr->keep_alive_poller != NULL) { + spdk_poller_unregister(&ctrlr->keep_alive_poller); + } + ctrlr->keep_alive_poller = SPDK_POLLER_REGISTER(nvmf_ctrlr_keep_alive_poll, ctrlr, + ctrlr->feat.keep_alive_timer.bits.kato * 1000); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer set to %u ms\n", + ctrlr->feat.keep_alive_timer.bits.kato); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_number_of_queues(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint32_t count; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Number of Queues, cdw11 0x%x\n", + req->cmd->nvme_cmd.cdw11); + + count = spdk_bit_array_count_set(ctrlr->qpair_mask); + /* verify that the controller is ready to process commands */ + if (count > 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Queue pairs already active!\n"); + rsp->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + } else { + /* + * Ignore the value requested by the host - + * always return the pre-configured value based on max_qpairs_allowed. 
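Editor's note: a small sketch of the value this handler returns, using a hypothetical max_qpairs_per_ctrlr of 128 and the dword layout of the NVMe Number of Queues feature (NCQA in bits 31:16, NSQA in bits 15:0, both 0's based).

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t max_qpairs_per_ctrlr = 128;	/* hypothetical transport option */

	/* As in nvmf_ctrlr_create(): one queue pair is reserved for the admin
	 * queue and the counts are 0's based, hence the "- 1 - 1". */
	uint16_t nsqr = max_qpairs_per_ctrlr - 1 - 1;	/* 126 -> 127 usable I/O SQs */
	uint16_t ncqr = max_qpairs_per_ctrlr - 1 - 1;	/* 126 -> 127 usable I/O CQs */

	uint32_t cdw0 = ((uint32_t)ncqr << 16) | nsqr;

	printf("Number of Queues cdw0 = 0x%08x\n", cdw0);	/* 0x007e007e */
	return 0;
}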
+ */ + rsp->cdw0 = ctrlr->feat.number_of_queues.raw; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_async_event_configuration(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Async Event Configuration, cdw11 0x%08x\n", + cmd->cdw11); + ctrlr->feat.async_event_configuration.raw = cmd->cdw11; + ctrlr->feat.async_event_configuration.bits.reserved = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_async_event_request(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_subsystem_poll_group *sgroup; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Async Event Request\n"); + + /* Four asynchronous events are supported for now */ + if (ctrlr->nr_aer_reqs >= NVMF_MAX_ASYNC_EVENTS) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "AERL exceeded\n"); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->notice_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) { + rsp->cdw0 = ctrlr->notice_event.raw; + ctrlr->notice_event.raw = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->reservation_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_IO) { + rsp->cdw0 = ctrlr->reservation_event.raw; + ctrlr->reservation_event.raw = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* AER cmd is an exception */ + sgroup = &req->qpair->group->sgroups[ctrlr->subsys->id]; + assert(sgroup != NULL); + sgroup->io_outstanding--; + + ctrlr->aer_req[ctrlr->nr_aer_reqs++] = req; + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static void +nvmf_get_firmware_slot_log_page(void *buffer, uint64_t offset, uint32_t length) +{ + struct spdk_nvme_firmware_page fw_page; + size_t copy_len; + + memset(&fw_page, 0, sizeof(fw_page)); + fw_page.afi.active_slot = 1; + fw_page.afi.next_reset_slot = 0; + spdk_strcpy_pad(fw_page.revision[0], FW_VERSION, sizeof(fw_page.revision[0]), ' '); + + if (offset < sizeof(fw_page)) { + copy_len = spdk_min(sizeof(fw_page) - offset, length); + if (copy_len > 0) { + memcpy(buffer, (const char *)&fw_page + offset, copy_len); + } + } +} + +void +nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid) +{ + uint16_t max_changes = SPDK_COUNTOF(ctrlr->changed_ns_list.ns_list); + uint16_t i; + bool found = false; + + for (i = 0; i < ctrlr->changed_ns_list_count; i++) { + if (ctrlr->changed_ns_list.ns_list[i] == nsid) { + /* nsid is already in the list */ + found = true; + break; + } + } + + if (!found) { + if (ctrlr->changed_ns_list_count == max_changes) { + /* Out of space - set first entry to FFFFFFFFh and zero-fill the rest. 
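(Editor's note: the Changed Namespace List log page defined by NVMe holds up to 1024 NSIDs, which is what SPDK_COUNTOF(ctrlr->changed_ns_list.ns_list) evaluates to here; writing FFFFFFFFh into the first entry signals the host that more namespaces changed than the list can describe, so it should re-enumerate its namespaces rather than rely on individual entries. The list is cleared each time the log page is read, as implemented in nvmf_get_changed_ns_list_log_page() below.)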
*/ + ctrlr->changed_ns_list.ns_list[0] = 0xFFFFFFFFu; + for (i = 1; i < max_changes; i++) { + ctrlr->changed_ns_list.ns_list[i] = 0; + } + } else { + ctrlr->changed_ns_list.ns_list[ctrlr->changed_ns_list_count++] = nsid; + } + } +} + +static void +nvmf_get_changed_ns_list_log_page(struct spdk_nvmf_ctrlr *ctrlr, + void *buffer, uint64_t offset, uint32_t length) +{ + size_t copy_length; + + if (offset < sizeof(ctrlr->changed_ns_list)) { + copy_length = spdk_min(length, sizeof(ctrlr->changed_ns_list) - offset); + if (copy_length) { + memcpy(buffer, (char *)&ctrlr->changed_ns_list + offset, copy_length); + } + } + + /* Clear log page each time it is read */ + ctrlr->changed_ns_list_count = 0; + memset(&ctrlr->changed_ns_list, 0, sizeof(ctrlr->changed_ns_list)); +} + +/* The structure can be modified if we provide support for other commands in future */ +static const struct spdk_nvme_cmds_and_effect_log_page g_cmds_and_effect_log_page = { + .admin_cmds_supported = { + /* CSUPP, LBCC, NCC, NIC, CCC, CSE */ + /* Get Log Page */ + [SPDK_NVME_OPC_GET_LOG_PAGE] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Identify */ + [SPDK_NVME_OPC_IDENTIFY] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Abort */ + [SPDK_NVME_OPC_ABORT] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Set Features */ + [SPDK_NVME_OPC_SET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Get Features */ + [SPDK_NVME_OPC_GET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Async Event Request */ + [SPDK_NVME_OPC_ASYNC_EVENT_REQUEST] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Keep Alive */ + [SPDK_NVME_OPC_KEEP_ALIVE] = {1, 0, 0, 0, 0, 0, 0, 0}, + }, + .io_cmds_supported = { + /* FLUSH */ + [SPDK_NVME_OPC_FLUSH] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* WRITE */ + [SPDK_NVME_OPC_WRITE] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* READ */ + [SPDK_NVME_OPC_READ] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* WRITE ZEROES */ + [SPDK_NVME_OPC_WRITE_ZEROES] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* DATASET MANAGEMENT */ + [SPDK_NVME_OPC_DATASET_MANAGEMENT] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* COMPARE */ + [SPDK_NVME_OPC_COMPARE] = {1, 0, 0, 0, 0, 0, 0, 0}, + }, +}; + +static void +nvmf_get_cmds_and_effects_log_page(void *buffer, + uint64_t offset, uint32_t length) +{ + uint32_t page_size = sizeof(struct spdk_nvme_cmds_and_effect_log_page); + size_t copy_len = 0; + size_t zero_len = length; + + if (offset < page_size) { + copy_len = spdk_min(page_size - offset, length); + zero_len -= copy_len; + memcpy(buffer, (char *)(&g_cmds_and_effect_log_page) + offset, copy_len); + } + + if (zero_len) { + memset((char *)buffer + copy_len, 0, zero_len); + } +} + +static void +nvmf_get_reservation_notification_log_page(struct spdk_nvmf_ctrlr *ctrlr, + void *data, uint64_t offset, uint32_t length) +{ + uint32_t unit_log_len, avail_log_len, next_pos, copy_len; + struct spdk_nvmf_reservation_log *log, *log_tmp; + uint8_t *buf = data; + + unit_log_len = sizeof(struct spdk_nvme_reservation_notification_log); + /* No available log, return 1 zeroed log page */ + if (!ctrlr->num_avail_log_pages) { + memset(buf, 0, spdk_min(length, unit_log_len)); + return; + } + + avail_log_len = ctrlr->num_avail_log_pages * unit_log_len; + if (offset >= avail_log_len) { + return; + } + + next_pos = copy_len = 0; + TAILQ_FOREACH_SAFE(log, &ctrlr->log_head, link, log_tmp) { + TAILQ_REMOVE(&ctrlr->log_head, log, link); + ctrlr->num_avail_log_pages--; + + next_pos += unit_log_len; + if (next_pos > offset) { + copy_len = spdk_min(next_pos - offset, length); + memcpy(buf, &log->log, copy_len); + length -= copy_len; + offset += copy_len; + buf += copy_len; + } + free(log); + + if (length == 0) 
{ + break; + } + } + return; +} + +static int +nvmf_ctrlr_get_log_page(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + uint64_t offset, len; + uint32_t numdl, numdu; + uint8_t lid; + + if (req->data == NULL) { + SPDK_ERRLOG("get log command with no buffer\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + offset = (uint64_t)cmd->cdw12 | ((uint64_t)cmd->cdw13 << 32); + if (offset & 3) { + SPDK_ERRLOG("Invalid log page offset 0x%" PRIx64 "\n", offset); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + numdl = cmd->cdw10_bits.get_log_page.numdl; + numdu = cmd->cdw11_bits.get_log_page.numdu; + len = ((numdu << 16) + numdl + (uint64_t)1) * 4; + if (len > req->length) { + SPDK_ERRLOG("Get log page: len (%" PRIu64 ") > buf size (%u)\n", + len, req->length); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + lid = cmd->cdw10_bits.get_log_page.lid; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get log page: LID=0x%02X offset=0x%" PRIx64 " len=0x%" PRIx64 "\n", + lid, offset, len); + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + switch (lid) { + case SPDK_NVME_LOG_DISCOVERY: + nvmf_get_discovery_log_page(subsystem->tgt, ctrlr->hostnqn, req->iov, req->iovcnt, offset, + len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + goto invalid_log_page; + } + } else { + switch (lid) { + case SPDK_NVME_LOG_ERROR: + case SPDK_NVME_LOG_HEALTH_INFORMATION: + /* TODO: actually fill out log page data */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_FIRMWARE_SLOT: + nvmf_get_firmware_slot_log_page(req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_COMMAND_EFFECTS_LOG: + nvmf_get_cmds_and_effects_log_page(req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_CHANGED_NS_LIST: + nvmf_get_changed_ns_list_log_page(ctrlr, req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_RESERVATION_NOTIFICATION: + nvmf_get_reservation_notification_log_page(ctrlr, req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + goto invalid_log_page; + } + } + +invalid_log_page: + SPDK_ERRLOG("Unsupported Get Log Page 0x%02X\n", lid); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +spdk_nvmf_ctrlr_identify_ns(struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + struct spdk_nvme_ns_data *nsdata) +{ + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_ns *ns; + uint32_t max_num_blocks; + + if (cmd->nsid == 0 || cmd->nsid > subsystem->max_nsid) { + SPDK_ERRLOG("Identify Namespace for invalid NSID %u\n", cmd->nsid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _nvmf_subsystem_get_ns(subsystem, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + 
/* + * Inactive namespaces should return a zero filled data structure. + * The data buffer is already zeroed by nvmf_ctrlr_process_admin_cmd(), + * so we can just return early here. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Identify Namespace for inactive NSID %u\n", cmd->nsid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + nvmf_bdev_ctrlr_identify_ns(ns, nsdata, ctrlr->dif_insert_or_strip); + + /* Due to bug in the Linux kernel NVMe driver we have to set noiob no larger than mdts */ + max_num_blocks = ctrlr->admin_qpair->transport->opts.max_io_size / + (1U << nsdata->lbaf[nsdata->flbas.format].lbads); + if (nsdata->noiob > max_num_blocks) { + nsdata->noiob = max_num_blocks; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static void +nvmf_ctrlr_populate_oacs(struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvme_ctrlr_data *cdata) +{ + cdata->oacs.virtualization_management = + g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT].hdlr != NULL; + cdata->oacs.nvme_mi = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NVME_MI_SEND].hdlr != NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NVME_MI_RECEIVE].hdlr != NULL; + cdata->oacs.directives = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_DIRECTIVE_SEND].hdlr != NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_DIRECTIVE_RECEIVE].hdlr != NULL; + cdata->oacs.device_self_test = + g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_DEVICE_SELF_TEST].hdlr != NULL; + cdata->oacs.ns_manage = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NS_MANAGEMENT].hdlr != NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NS_ATTACHMENT].hdlr != NULL; + cdata->oacs.firmware = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD].hdlr != + NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_FIRMWARE_COMMIT].hdlr != NULL; + cdata->oacs.format = + g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_FORMAT_NVM].hdlr != NULL; + cdata->oacs.security = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_SECURITY_SEND].hdlr != NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_SECURITY_RECEIVE].hdlr != NULL; + cdata->oacs.get_lba_status = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_GET_LBA_STATUS].hdlr != + NULL; +} + +int +spdk_nvmf_ctrlr_identify_ctrlr(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_ctrlr_data *cdata) +{ + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_transport *transport = ctrlr->admin_qpair->transport; + + /* + * Common fields for discovery and NVM subsystems + */ + spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + assert((transport->opts.max_io_size % 4096) == 0); + cdata->mdts = spdk_u32log2(transport->opts.max_io_size / 4096); + cdata->cntlid = ctrlr->cntlid; + cdata->ver = ctrlr->vcprop.vs; + cdata->aerl = NVMF_MAX_ASYNC_EVENTS - 1; + cdata->lpa.edlp = 1; + cdata->elpe = 127; + cdata->maxcmd = transport->opts.max_queue_depth; + cdata->sgls = ctrlr->cdata.sgls; + cdata->fuses.compare_and_write = 1; + cdata->acwu = 1; + spdk_strcpy_pad(cdata->subnqn, subsystem->subnqn, sizeof(cdata->subnqn), '\0'); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ctrlr data: maxcmd 0x%x\n", cdata->maxcmd); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "sgls data: 0x%x\n", from_le32(&cdata->sgls)); + + /* + * NVM subsystem fields (reserved for discovery subsystems) + */ + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_NVME) { + spdk_strcpy_pad(cdata->mn, spdk_nvmf_subsystem_get_mn(subsystem), sizeof(cdata->mn), ' '); + 
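The identify paths above report MDTS as a power of two in 4 KiB units of the transport's max_io_size, and clamp NOIOB so one "optimal" I/O cannot exceed that same limit. A rough standalone sketch of the arithmetic, with sample values chosen purely for illustration:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t
    u32log2(uint32_t x)
    {
        uint32_t r = 0;

        while (x > 1) {
            x >>= 1;
            r++;
        }
        return r;
    }

    int
    main(void)
    {
        uint32_t max_io_size = 131072;  /* bytes, assumed transport limit */
        uint32_t lbads = 12;            /* 4 KiB blocks */
        uint32_t mdts = u32log2(max_io_size / 4096);
        uint32_t noiob_cap = max_io_size / (1u << lbads);

        printf("mdts=%u noiob_cap=%u blocks\n", mdts, noiob_cap); /* mdts=5, cap=32 */
        return 0;
    }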
spdk_strcpy_pad(cdata->sn, spdk_nvmf_subsystem_get_sn(subsystem), sizeof(cdata->sn), ' '); + cdata->kas = ctrlr->cdata.kas; + + cdata->rab = 6; + cdata->cmic.multi_port = 1; + cdata->cmic.multi_host = 1; + cdata->oaes.ns_attribute_notices = 1; + cdata->ctratt.host_id_exhid_supported = 1; + /* TODO: Concurrent execution of multiple abort commands. */ + cdata->acl = 0; + cdata->aerl = 0; + cdata->frmw.slot1_ro = 1; + cdata->frmw.num_slots = 1; + + cdata->lpa.celp = 1; /* Command Effects log page supported */ + + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->nn = subsystem->max_nsid; + cdata->vwc.present = 1; + cdata->vwc.flush_broadcast = SPDK_NVME_FLUSH_BROADCAST_NOT_SUPPORTED; + + cdata->nvmf_specific = ctrlr->cdata.nvmf_specific; + + cdata->oncs.dsm = nvmf_ctrlr_dsm_supported(ctrlr); + cdata->oncs.write_zeroes = nvmf_ctrlr_write_zeroes_supported(ctrlr); + cdata->oncs.reservations = 1; + + nvmf_ctrlr_populate_oacs(ctrlr, cdata); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ioccsz 0x%x\n", + cdata->nvmf_specific.ioccsz); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: iorcsz 0x%x\n", + cdata->nvmf_specific.iorcsz); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: icdoff 0x%x\n", + cdata->nvmf_specific.icdoff); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ctrattr 0x%x\n", + *(uint8_t *)&cdata->nvmf_specific.ctrattr); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: msdbd 0x%x\n", + cdata->nvmf_specific.msdbd); + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_identify_active_ns_list(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + struct spdk_nvme_ns_list *ns_list) +{ + struct spdk_nvmf_ns *ns; + uint32_t count = 0; + + if (cmd->nsid >= 0xfffffffeUL) { + SPDK_ERRLOG("Identify Active Namespace List with invalid NSID %u\n", cmd->nsid); + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + if (ns->opts.nsid <= cmd->nsid) { + continue; + } + + ns_list->ns_list[count++] = ns->opts.nsid; + if (count == SPDK_COUNTOF(ns_list->ns_list)) { + break; + } + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static void +_add_ns_id_desc(void **buf_ptr, size_t *buf_remain, + enum spdk_nvme_nidt type, + const void *data, size_t data_size) +{ + struct spdk_nvme_ns_id_desc *desc; + size_t desc_size = sizeof(*desc) + data_size; + + /* + * These should never fail in practice, since all valid NS ID descriptors + * should be defined so that they fit in the available 4096-byte buffer. 
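The Active Namespace List handler above reports only NSIDs strictly greater than the NSID given in CDW10, in ascending order, up to the entries that fit in one 4 KiB identify buffer. A standalone sketch of that filter, with a sorted plain array standing in for the subsystem's namespace iterator; build_active_ns_list() is an invented name.

    #include <stddef.h>
    #include <stdint.h>

    #define NS_LIST_ENTRIES 1024u

    static uint32_t
    build_active_ns_list(const uint32_t *active_nsids, size_t num_active,
                         uint32_t start_nsid, uint32_t out[NS_LIST_ENTRIES])
    {
        uint32_t count = 0;

        for (size_t i = 0; i < num_active && count < NS_LIST_ENTRIES; i++) {
            if (active_nsids[i] > start_nsid) {
                out[count++] = active_nsids[i];
            }
        }

        /* Entries past `count` are left alone; the admin path relies on the
         * response buffer having been zeroed beforehand. */
        return count;
    }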
+ */ + assert(data_size > 0); + assert(data_size <= UINT8_MAX); + assert(desc_size < *buf_remain); + if (data_size == 0 || data_size > UINT8_MAX || desc_size > *buf_remain) { + return; + } + + desc = *buf_ptr; + desc->nidt = type; + desc->nidl = data_size; + memcpy(desc->nid, data, data_size); + + *buf_ptr += desc_size; + *buf_remain -= desc_size; +} + +static int +nvmf_ctrlr_identify_ns_id_descriptor_list( + struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + void *id_desc_list, size_t id_desc_list_size) +{ + struct spdk_nvmf_ns *ns; + size_t buf_remain = id_desc_list_size; + void *buf_ptr = id_desc_list; + + ns = _nvmf_subsystem_get_ns(subsystem, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + +#define ADD_ID_DESC(type, data, size) \ + do { \ + if (!spdk_mem_all_zero(data, size)) { \ + _add_ns_id_desc(&buf_ptr, &buf_remain, type, data, size); \ + } \ + } while (0) + + ADD_ID_DESC(SPDK_NVME_NIDT_EUI64, ns->opts.eui64, sizeof(ns->opts.eui64)); + ADD_ID_DESC(SPDK_NVME_NIDT_NGUID, ns->opts.nguid, sizeof(ns->opts.nguid)); + ADD_ID_DESC(SPDK_NVME_NIDT_UUID, &ns->opts.uuid, sizeof(ns->opts.uuid)); + + /* + * The list is automatically 0-terminated because controller to host buffers in + * admin commands always get zeroed in nvmf_ctrlr_process_admin_cmd(). + */ + +#undef ADD_ID_DESC + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_identify(struct spdk_nvmf_request *req) +{ + uint8_t cns; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + + if (req->data == NULL || req->length < 4096) { + SPDK_ERRLOG("identify command with invalid buffer\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + cns = cmd->cdw10_bits.identify.cns; + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY && + cns != SPDK_NVME_IDENTIFY_CTRLR) { + /* Discovery controllers only support Identify Controller */ + goto invalid_cns; + } + + switch (cns) { + case SPDK_NVME_IDENTIFY_NS: + return spdk_nvmf_ctrlr_identify_ns(ctrlr, cmd, rsp, req->data); + case SPDK_NVME_IDENTIFY_CTRLR: + return spdk_nvmf_ctrlr_identify_ctrlr(ctrlr, req->data); + case SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST: + return nvmf_ctrlr_identify_active_ns_list(subsystem, cmd, rsp, req->data); + case SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST: + return nvmf_ctrlr_identify_ns_id_descriptor_list(subsystem, cmd, rsp, req->data, req->length); + default: + goto invalid_cns; + } + +invalid_cns: + SPDK_ERRLOG("Identify command with unsupported CNS 0x%02x\n", cns); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static bool +nvmf_qpair_abort_aer(struct spdk_nvmf_qpair *qpair, uint16_t cid) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvmf_request *req; + int i; + + if (!nvmf_qpair_is_admin_queue(qpair)) { + return false; + } + + for (i = 0; i < ctrlr->nr_aer_reqs; i++) { + if (ctrlr->aer_req[i]->cmd->nvme_cmd.cid == cid) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Aborting AER request\n"); + req = ctrlr->aer_req[i]; + ctrlr->aer_req[i] = NULL; + ctrlr->nr_aer_reqs--; + + /* 
Move the last req to the aborting position for making aer_reqs + * in continuous + */ + if (i < ctrlr->nr_aer_reqs) { + ctrlr->aer_req[i] = ctrlr->aer_req[ctrlr->nr_aer_reqs]; + ctrlr->aer_req[ctrlr->nr_aer_reqs] = NULL; + } + + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + _nvmf_request_complete(req); + return true; + } + } + + return false; +} + +static void +nvmf_qpair_abort_request(struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_request *req) +{ + uint16_t cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; + + if (nvmf_qpair_abort_aer(qpair, cid)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "abort ctrlr=%p sqid=%u cid=%u successful\n", + qpair->ctrlr, qpair->qid, cid); + req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command successfully aborted */ + + spdk_nvmf_request_complete(req); + return; + } + + nvmf_transport_qpair_abort_request(qpair, req); +} + +static void +nvmf_ctrlr_abort_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); + + if (status == 0) { + /* There was no qpair whose ID matches SQID of the abort command. + * Hence call _nvmf_request_complete() here. + */ + _nvmf_request_complete(req); + } +} + +static void +nvmf_ctrlr_abort_on_pg(struct spdk_io_channel_iter *i) +{ + struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch); + uint16_t sqid = req->cmd->nvme_cmd.cdw10_bits.abort.sqid; + struct spdk_nvmf_qpair *qpair; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if (qpair->ctrlr == req->qpair->ctrlr && qpair->qid == sqid) { + /* Found the qpair */ + + nvmf_qpair_abort_request(qpair, req); + + /* Return -1 for the status so the iteration across threads stops. */ + spdk_for_each_channel_continue(i, -1); + return; + } + } + + spdk_for_each_channel_continue(i, 0); +} + +static int +nvmf_ctrlr_abort(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = 1U; /* Command not aborted */ + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + + /* Send a message to each poll group, searching for this ctrlr, sqid, and command. 
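The AER abort path above removes one entry from a small array by moving the last element into the freed slot, which keeps the array contiguous in O(1) instead of shifting every remaining element. A generic sketch of that swap-remove step; the helper name is illustrative.

    #include <stddef.h>

    static int
    swap_remove(void **array, int *count, int idx)
    {
        if (idx < 0 || idx >= *count) {
            return -1;
        }

        (*count)--;
        if (idx < *count) {
            array[idx] = array[*count];  /* fill the hole with the last entry */
        }
        array[*count] = NULL;

        return 0;
    }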
*/ + spdk_for_each_channel(req->qpair->ctrlr->subsys->tgt, + nvmf_ctrlr_abort_on_pg, + req, + nvmf_ctrlr_abort_done + ); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_ctrlr_abort_request(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_request *req_to_abort = req->req_to_abort; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + int rc; + + assert(req_to_abort != NULL); + + if (g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_ABORT].hdlr && + nvmf_qpair_is_admin_queue(req_to_abort->qpair)) { + return g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_ABORT].hdlr(req); + } + + rc = spdk_nvmf_request_get_bdev(req_to_abort->cmd->nvme_cmd.nsid, req_to_abort, + &bdev, &desc, &ch); + if (rc != 0) { + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return spdk_nvmf_bdev_ctrlr_abort_cmd(bdev, desc, ch, req, req_to_abort); +} + +static int +get_features_generic(struct spdk_nvmf_request *req, uint32_t cdw0) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = cdw0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features(struct spdk_nvmf_request *req) +{ + uint8_t feature; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + feature = cmd->cdw10_bits.get_features.fid; + switch (feature) { + case SPDK_NVME_FEAT_ARBITRATION: + return get_features_generic(req, ctrlr->feat.arbitration.raw); + case SPDK_NVME_FEAT_POWER_MANAGEMENT: + return get_features_generic(req, ctrlr->feat.power_management.raw); + case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD: + return nvmf_ctrlr_get_features_temperature_threshold(req); + case SPDK_NVME_FEAT_ERROR_RECOVERY: + return get_features_generic(req, ctrlr->feat.error_recovery.raw); + case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE: + return get_features_generic(req, ctrlr->feat.volatile_write_cache.raw); + case SPDK_NVME_FEAT_NUMBER_OF_QUEUES: + return get_features_generic(req, ctrlr->feat.number_of_queues.raw); + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + return get_features_generic(req, ctrlr->feat.write_atomicity.raw); + case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + return get_features_generic(req, ctrlr->feat.async_event_configuration.raw); + case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER: + return get_features_generic(req, ctrlr->feat.keep_alive_timer.raw); + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + return nvmf_ctrlr_get_features_host_identifier(req); + case SPDK_NVME_FEAT_HOST_RESERVE_MASK: + return nvmf_ctrlr_get_features_reservation_notification_mask(req); + case SPDK_NVME_FEAT_HOST_RESERVE_PERSIST: + return nvmf_ctrlr_get_features_reservation_persistence(req); + default: + SPDK_ERRLOG("Get Features command with unsupported feature ID 0x%02x\n", feature); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +static int +nvmf_ctrlr_set_features(struct spdk_nvmf_request *req) +{ + uint8_t feature, save; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + /* + * Features are not saveable by the controller as indicated by + * ONCS field of the Identify Controller data. 
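Abort above presets CDW0 bit 0 to 1 ("command not aborted") and clears it only when a matching command is found and aborted on one of the poll groups. A tiny sketch of that completion convention; abort_mark_result() is an invented helper.

    #include <stdbool.h>
    #include <stdint.h>

    static void
    abort_mark_result(uint32_t *cdw0, bool aborted)
    {
        if (aborted) {
            *cdw0 &= ~1u;  /* command was successfully aborted */
        } else {
            *cdw0 |= 1u;   /* command could not be aborted */
        }
    }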
+ * */ + save = cmd->cdw10_bits.set_features.sv; + if (save) { + response->status.sc = SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE; + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + feature = cmd->cdw10_bits.set_features.fid; + switch (feature) { + case SPDK_NVME_FEAT_ARBITRATION: + return nvmf_ctrlr_set_features_arbitration(req); + case SPDK_NVME_FEAT_POWER_MANAGEMENT: + return nvmf_ctrlr_set_features_power_management(req); + case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD: + return nvmf_ctrlr_set_features_temperature_threshold(req); + case SPDK_NVME_FEAT_ERROR_RECOVERY: + return nvmf_ctrlr_set_features_error_recovery(req); + case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE: + return nvmf_ctrlr_set_features_volatile_write_cache(req); + case SPDK_NVME_FEAT_NUMBER_OF_QUEUES: + return nvmf_ctrlr_set_features_number_of_queues(req); + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + return nvmf_ctrlr_set_features_write_atomicity(req); + case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + return nvmf_ctrlr_set_features_async_event_configuration(req); + case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER: + return nvmf_ctrlr_set_features_keep_alive_timer(req); + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + return nvmf_ctrlr_set_features_host_identifier(req); + case SPDK_NVME_FEAT_HOST_RESERVE_MASK: + return nvmf_ctrlr_set_features_reservation_notification_mask(req); + case SPDK_NVME_FEAT_HOST_RESERVE_PERSIST: + return nvmf_ctrlr_set_features_reservation_persistence(req); + default: + SPDK_ERRLOG("Set Features command with unsupported feature ID 0x%02x\n", feature); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +static int +nvmf_ctrlr_keep_alive(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Keep Alive\n"); + /* + * To handle keep alive just clear or reset the + * ctrlr based keep alive duration counter. + * When added, a separate timer based process + * will monitor if the time since last recorded + * keep alive has exceeded the max duration and + * take appropriate action. + */ + ctrlr->last_keep_alive_tick = spdk_get_ticks(); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int rc; + + if (ctrlr == NULL) { + SPDK_ERRLOG("Admin command sent before CONNECT\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->vcprop.cc.bits.en != 1) { + SPDK_ERRLOG("Admin command sent to disabled controller\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (req->data && spdk_nvme_opc_get_data_transfer(cmd->opc) == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + memset(req->data, 0, req->length); + } + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + /* Discovery controllers only support Get Log Page, Identify and Keep Alive. */ + switch (cmd->opc) { + case SPDK_NVME_OPC_IDENTIFY: + case SPDK_NVME_OPC_GET_LOG_PAGE: + case SPDK_NVME_OPC_KEEP_ALIVE: + break; + default: + goto invalid_opcode; + } + } + + /* Call a custom adm cmd handler if set. 
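The admin dispatch above restricts discovery controllers to Identify, Get Log Page and Keep Alive. A standalone sketch of that allow-list, using the standard NVMe admin opcode values for those three commands; the helper name is an assumption.

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    discovery_admin_opc_allowed(uint8_t opc)
    {
        switch (opc) {
        case 0x02: /* Get Log Page */
        case 0x06: /* Identify */
        case 0x18: /* Keep Alive */
            return true;
        default:
            return false;  /* completed with Invalid Opcode */
        }
    }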
Aborts are handled in a different path (see nvmf_passthru_admin_cmd) */ + if (g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].hdlr && cmd->opc != SPDK_NVME_OPC_ABORT) { + rc = g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].hdlr(req); + if (rc >= SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + /* The handler took care of this commmand */ + return rc; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_GET_LOG_PAGE: + return nvmf_ctrlr_get_log_page(req); + case SPDK_NVME_OPC_IDENTIFY: + return nvmf_ctrlr_identify(req); + case SPDK_NVME_OPC_ABORT: + return nvmf_ctrlr_abort(req); + case SPDK_NVME_OPC_GET_FEATURES: + return nvmf_ctrlr_get_features(req); + case SPDK_NVME_OPC_SET_FEATURES: + return nvmf_ctrlr_set_features(req); + case SPDK_NVME_OPC_ASYNC_EVENT_REQUEST: + return nvmf_ctrlr_async_event_request(req); + case SPDK_NVME_OPC_KEEP_ALIVE: + return nvmf_ctrlr_keep_alive(req); + + case SPDK_NVME_OPC_CREATE_IO_SQ: + case SPDK_NVME_OPC_CREATE_IO_CQ: + case SPDK_NVME_OPC_DELETE_IO_SQ: + case SPDK_NVME_OPC_DELETE_IO_CQ: + /* Create and Delete I/O CQ/SQ not allowed in NVMe-oF */ + goto invalid_opcode; + + default: + goto invalid_opcode; + } + +invalid_opcode: + SPDK_ERRLOG("Unsupported admin opcode 0x%x\n", cmd->opc); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_capsule_cmd *cap_hdr; + + cap_hdr = &req->cmd->nvmf_cmd; + + if (qpair->ctrlr == NULL) { + /* No ctrlr established yet; the only valid command is Connect */ + if (cap_hdr->fctype == SPDK_NVMF_FABRIC_COMMAND_CONNECT) { + return nvmf_ctrlr_cmd_connect(req); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Got fctype 0x%x, expected Connect\n", + cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else if (nvmf_qpair_is_admin_queue(qpair)) { + /* + * Controller session is established, and this is an admin queue. + * Disallow Connect and allow other fabrics commands. 
+ */ + switch (cap_hdr->fctype) { + case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET: + return nvmf_property_set(req); + case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET: + return nvmf_property_get(req); + default: + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "unknown fctype 0x%02x\n", + cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else { + /* Controller session is established, and this is an I/O queue */ + /* For now, no I/O-specific Fabrics commands are implemented (other than Connect) */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Unexpected I/O fctype 0x%x\n", cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +static inline int +nvmf_ctrlr_async_event_notification(struct spdk_nvmf_ctrlr *ctrlr, + union spdk_nvme_async_event_completion *event) +{ + struct spdk_nvmf_request *req; + struct spdk_nvme_cpl *rsp; + + assert(ctrlr->nr_aer_reqs > 0); + + req = ctrlr->aer_req[--ctrlr->nr_aer_reqs]; + rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = event->raw; + + _nvmf_request_complete(req); + ctrlr->aer_req[ctrlr->nr_aer_reqs] = NULL; + + return 0; +} + +int +nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr) +{ + union spdk_nvme_async_event_completion event = {0}; + + /* Users may disable the event notification */ + if (!ctrlr->feat.async_event_configuration.bits.ns_attr_notice) { + return 0; + } + + event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE; + event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED; + event.bits.log_page_identifier = SPDK_NVME_LOG_CHANGED_NS_LIST; + + /* If there is no outstanding AER request, queue the event. Then + * if an AER is later submitted, this event can be sent as a + * response. + */ + if (ctrlr->nr_aer_reqs == 0) { + if (ctrlr->notice_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) { + return 0; + } + + ctrlr->notice_event.raw = event.raw; + return 0; + } + + return nvmf_ctrlr_async_event_notification(ctrlr, &event); +} + +void +nvmf_ctrlr_async_event_reservation_notification(struct spdk_nvmf_ctrlr *ctrlr) +{ + union spdk_nvme_async_event_completion event = {0}; + + if (!ctrlr->num_avail_log_pages) { + return; + } + event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_IO; + event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_RESERVATION_LOG_AVAIL; + event.bits.log_page_identifier = SPDK_NVME_LOG_RESERVATION_NOTIFICATION; + + /* If there is no outstanding AER request, queue the event. Then + * if an AER is later submitted, this event can be sent as a + * response. 
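The AER notice path above completes an outstanding Asynchronous Event Request immediately when one is available, and otherwise remembers at most one pending event per category so it can answer a later AER. A hedged sketch of that coalescing rule, with invented type and function names:

    #include <stdint.h>

    struct pending_event {
        uint32_t raw;  /* 0 means nothing queued */
    };

    static void
    post_or_queue_event(struct pending_event *pending, int aer_outstanding,
                        uint32_t event_raw, void (*complete_aer)(uint32_t cdw0))
    {
        if (aer_outstanding == 0) {
            if (pending->raw == 0) {
                pending->raw = event_raw;  /* coalesce: keep only the first */
            }
            return;
        }

        complete_aer(event_raw);  /* event is returned to the host in CDW0 */
    }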
+ */ + if (ctrlr->nr_aer_reqs == 0) { + if (ctrlr->reservation_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_IO) { + return; + } + + ctrlr->reservation_event.raw = event.raw; + return; + } + + nvmf_ctrlr_async_event_notification(ctrlr, &event); +} + +void +nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + int i; + + if (!nvmf_qpair_is_admin_queue(qpair)) { + return; + } + + for (i = 0; i < ctrlr->nr_aer_reqs; i++) { + spdk_nvmf_request_free(ctrlr->aer_req[i]); + ctrlr->aer_req[i] = NULL; + } + + ctrlr->nr_aer_reqs = 0; +} + +void +nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr) +{ + struct spdk_nvmf_request *req; + int i; + + for (i = 0; i < ctrlr->nr_aer_reqs; i++) { + req = ctrlr->aer_req[i]; + + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + _nvmf_request_complete(req); + + ctrlr->aer_req[i] = NULL; + } + + ctrlr->nr_aer_reqs = 0; +} + +static void +_nvmf_ctrlr_add_reservation_log(void *ctx) +{ + struct spdk_nvmf_reservation_log *log = (struct spdk_nvmf_reservation_log *)ctx; + struct spdk_nvmf_ctrlr *ctrlr = log->ctrlr; + + ctrlr->log_page_count++; + + /* Maximum number of queued log pages is 255 */ + if (ctrlr->num_avail_log_pages == 0xff) { + struct spdk_nvmf_reservation_log *entry; + entry = TAILQ_LAST(&ctrlr->log_head, log_page_head); + entry->log.log_page_count = ctrlr->log_page_count; + free(log); + return; + } + + log->log.log_page_count = ctrlr->log_page_count; + log->log.num_avail_log_pages = ctrlr->num_avail_log_pages++; + TAILQ_INSERT_TAIL(&ctrlr->log_head, log, link); + + nvmf_ctrlr_async_event_reservation_notification(ctrlr); +} + +void +nvmf_ctrlr_reservation_notice_log(struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_ns *ns, + enum spdk_nvme_reservation_notification_log_page_type type) +{ + struct spdk_nvmf_reservation_log *log; + + switch (type) { + case SPDK_NVME_RESERVATION_LOG_PAGE_EMPTY: + return; + case SPDK_NVME_REGISTRATION_PREEMPTED: + if (ns->mask & SPDK_NVME_REGISTRATION_PREEMPTED_MASK) { + return; + } + break; + case SPDK_NVME_RESERVATION_RELEASED: + if (ns->mask & SPDK_NVME_RESERVATION_RELEASED_MASK) { + return; + } + break; + case SPDK_NVME_RESERVATION_PREEMPTED: + if (ns->mask & SPDK_NVME_RESERVATION_PREEMPTED_MASK) { + return; + } + break; + default: + return; + } + + log = calloc(1, sizeof(*log)); + if (!log) { + SPDK_ERRLOG("Alloc log page failed, ignore the log\n"); + return; + } + log->ctrlr = ctrlr; + log->log.type = type; + log->log.nsid = ns->nsid; + + spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_add_reservation_log, log); +} + +/* Check from subsystem poll group's namespace information data structure */ +static bool +nvmf_ns_info_ctrlr_is_registrant(struct spdk_nvmf_subsystem_pg_ns_info *ns_info, + struct spdk_nvmf_ctrlr *ctrlr) +{ + uint32_t i; + + for (i = 0; i < SPDK_NVMF_MAX_NUM_REGISTRANTS; i++) { + if (!spdk_uuid_compare(&ns_info->reg_hostid[i], &ctrlr->hostid)) { + return true; + } + } + + return false; +} + +/* + * Check the NVMe command is permitted or not for current controller(Host). 
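The reservation code above treats a controller (host) as a registrant when its host identifier matches one of the per-namespace registrant slots tracked by the poll group. A standalone sketch of that lookup, with a raw 16-byte identifier standing in for spdk_uuid:

    #include <stdbool.h>
    #include <string.h>

    #define HOSTID_LEN 16

    static bool
    is_registrant(const unsigned char (*reg_hostid)[HOSTID_LEN], size_t num_slots,
                  const unsigned char *hostid)
    {
        for (size_t i = 0; i < num_slots; i++) {
            if (memcmp(reg_hostid[i], hostid, HOSTID_LEN) == 0) {
                return true;
            }
        }

        return false;
    }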
+ */ +static int +nvmf_ns_reservation_request_check(struct spdk_nvmf_subsystem_pg_ns_info *ns_info, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + enum spdk_nvme_reservation_type rtype = ns_info->rtype; + uint8_t status = SPDK_NVME_SC_SUCCESS; + uint8_t racqa; + bool is_registrant; + + /* No valid reservation */ + if (!rtype) { + return 0; + } + + is_registrant = nvmf_ns_info_ctrlr_is_registrant(ns_info, ctrlr); + /* All registrants type and current ctrlr is a valid registrant */ + if ((rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE_ALL_REGS || + rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS) && is_registrant) { + return 0; + } else if (!spdk_uuid_compare(&ns_info->holder_id, &ctrlr->hostid)) { + return 0; + } + + /* Non-holder for current controller */ + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + case SPDK_NVME_OPC_COMPARE: + if (rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + if ((rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_REG_ONLY || + rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS) && !is_registrant) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + } + break; + case SPDK_NVME_OPC_FLUSH: + case SPDK_NVME_OPC_WRITE: + case SPDK_NVME_OPC_WRITE_UNCORRECTABLE: + case SPDK_NVME_OPC_WRITE_ZEROES: + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + if (rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE || + rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + if (!is_registrant) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + } + break; + case SPDK_NVME_OPC_RESERVATION_ACQUIRE: + racqa = cmd->cdw10_bits.resv_acquire.racqa; + if (racqa == SPDK_NVME_RESERVE_ACQUIRE) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + if (!is_registrant) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + } + break; + case SPDK_NVME_OPC_RESERVATION_RELEASE: + if (!is_registrant) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + } + break; + default: + break; + } + +exit: + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + if (status == SPDK_NVME_SC_RESERVATION_CONFLICT) { + return -EPERM; + } + + return 0; +} + +static int +nvmf_ctrlr_process_io_fused_cmd(struct spdk_nvmf_request *req, struct spdk_bdev *bdev, + struct spdk_bdev_desc *desc, struct spdk_io_channel *ch) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_request *first_fused_req = req->qpair->first_fused_req; + int rc; + + if (cmd->fuse == SPDK_NVME_CMD_FUSE_FIRST) { + /* first fused operation (should be compare) */ + if (first_fused_req != NULL) { + struct spdk_nvme_cpl *fused_response = &first_fused_req->rsp->nvme_cpl; + + SPDK_ERRLOG("Wrong sequence of fused operations\n"); + + /* abort req->qpair->first_fused_request and continue with new fused command */ + fused_response->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED; + fused_response->status.sct = SPDK_NVME_SCT_GENERIC; + _nvmf_request_complete(first_fused_req); + } else if (cmd->opc != SPDK_NVME_OPC_COMPARE) { + SPDK_ERRLOG("Wrong op code of fused operations\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + req->qpair->first_fused_req = req; + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } else if (cmd->fuse == SPDK_NVME_CMD_FUSE_SECOND) { + /* second 
fused operation (should be write) */ + if (first_fused_req == NULL) { + SPDK_ERRLOG("Wrong sequence of fused operations\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } else if (cmd->opc != SPDK_NVME_OPC_WRITE) { + struct spdk_nvme_cpl *fused_response = &first_fused_req->rsp->nvme_cpl; + + SPDK_ERRLOG("Wrong op code of fused operations\n"); + + /* abort req->qpair->first_fused_request and fail current command */ + fused_response->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED; + fused_response->status.sct = SPDK_NVME_SCT_GENERIC; + _nvmf_request_complete(first_fused_req); + + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE; + req->qpair->first_fused_req = NULL; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* save request of first command to generate response later */ + req->first_fused_req = first_fused_req; + req->qpair->first_fused_req = NULL; + } else { + SPDK_ERRLOG("Invalid fused command fuse field.\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = nvmf_bdev_ctrlr_compare_and_write_cmd(bdev, desc, ch, req->first_fused_req, req); + + if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + if (spdk_nvme_cpl_is_error(rsp)) { + struct spdk_nvme_cpl *fused_response = &first_fused_req->rsp->nvme_cpl; + + fused_response->status = rsp->status; + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; + /* Complete first of fused commands. Second will be completed by upper layer */ + _nvmf_request_complete(first_fused_req); + req->first_fused_req = NULL; + } + } + + return rc; +} + +int +nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req) +{ + uint32_t nsid; + struct spdk_nvmf_ns *ns; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group = req->qpair->group; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + struct spdk_nvmf_subsystem_pg_ns_info *ns_info; + + /* pre-set response details for this command */ + response->status.sc = SPDK_NVME_SC_SUCCESS; + nsid = cmd->nsid; + + if (spdk_unlikely(ctrlr == NULL)) { + SPDK_ERRLOG("I/O command sent before CONNECT\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(ctrlr->vcprop.cc.bits.en != 1)) { + SPDK_ERRLOG("I/O command sent to disabled controller\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); + if (ns == NULL || ns->bdev == NULL) { + SPDK_ERRLOG("Unsuccessful query for nsid %u\n", cmd->nsid); + response->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + response->status.dnr = 1; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* scan-build falsely reporting dereference of null pointer */ + assert(group != NULL && group->sgroups != NULL); + ns_info = &group->sgroups[ctrlr->subsys->id].ns_info[nsid - 1]; + if (nvmf_ns_reservation_request_check(ns_info, ctrlr, req)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Reservation Conflict for nsid %u, opcode %u\n", + 
cmd->nsid, cmd->opc); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + bdev = ns->bdev; + desc = ns->desc; + ch = ns_info->channel; + + if (spdk_unlikely(cmd->fuse & SPDK_NVME_CMD_FUSE_MASK)) { + return nvmf_ctrlr_process_io_fused_cmd(req, bdev, desc, ch); + } else if (spdk_unlikely(req->qpair->first_fused_req != NULL)) { + struct spdk_nvme_cpl *fused_response = &req->qpair->first_fused_req->rsp->nvme_cpl; + + SPDK_ERRLOG("Expected second of fused commands - failing first of fused commands\n"); + + /* abort req->qpair->first_fused_request and continue with new command */ + fused_response->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED; + fused_response->status.sct = SPDK_NVME_SCT_GENERIC; + _nvmf_request_complete(req->qpair->first_fused_req); + req->qpair->first_fused_req = NULL; + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + return nvmf_bdev_ctrlr_read_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_WRITE: + return nvmf_bdev_ctrlr_write_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_COMPARE: + return nvmf_bdev_ctrlr_compare_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_WRITE_ZEROES: + return nvmf_bdev_ctrlr_write_zeroes_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_FLUSH: + return nvmf_bdev_ctrlr_flush_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + return nvmf_bdev_ctrlr_dsm_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_RESERVATION_REGISTER: + case SPDK_NVME_OPC_RESERVATION_ACQUIRE: + case SPDK_NVME_OPC_RESERVATION_RELEASE: + case SPDK_NVME_OPC_RESERVATION_REPORT: + spdk_thread_send_msg(ctrlr->subsys->thread, nvmf_ns_reservation_request, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + default: + return nvmf_bdev_ctrlr_nvme_passthru_io(bdev, desc, ch, req); + } +} + +static void +nvmf_qpair_request_cleanup(struct spdk_nvmf_qpair *qpair) +{ + if (qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING) { + assert(qpair->state_cb != NULL); + + if (TAILQ_EMPTY(&qpair->outstanding)) { + qpair->state_cb(qpair->state_cb_arg, 0); + } + } +} + +int +spdk_nvmf_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + + TAILQ_REMOVE(&qpair->outstanding, req, link); + if (nvmf_transport_req_free(req)) { + SPDK_ERRLOG("Unable to free transport level request resources.\n"); + } + + nvmf_qpair_request_cleanup(qpair); + + return 0; +} + +static void +_nvmf_request_complete(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup = NULL; + bool is_aer = false; + + rsp->sqid = 0; + rsp->status.p = 0; + rsp->cid = req->cmd->nvme_cmd.cid; + + qpair = req->qpair; + if (qpair->ctrlr) { + sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + assert(sgroup != NULL); + is_aer = req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; + } else if (spdk_unlikely(nvmf_request_is_fabric_connect(req))) { + sgroup = nvmf_subsystem_pg_from_connect_cmd(req); + } + + if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf")) { + spdk_nvme_print_completion(qpair->qid, rsp); + } + + TAILQ_REMOVE(&qpair->outstanding, req, link); + if (nvmf_transport_req_complete(req)) { + SPDK_ERRLOG("Transport request completion error!\n"); + } + + /* AER cmd is an exception */ + if (sgroup && !is_aer) { + assert(sgroup->io_outstanding > 0); + sgroup->io_outstanding--; + if (sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSING && + sgroup->io_outstanding == 0) { + sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSED; + sgroup->cb_fn(sgroup->cb_arg, 0); + } + } 
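Request completion above decrements the subsystem poll group's outstanding counter for every non-AER command, and the last completion while the group is pausing flips it to paused and fires the pause callback. A minimal sketch of that handshake, with invented type names:

    #include <stddef.h>

    enum pg_state { PG_ACTIVE, PG_PAUSING, PG_PAUSED };

    struct pg {
        enum pg_state state;
        unsigned int io_outstanding;
        void (*paused_cb)(void *cb_arg);
        void *cb_arg;
    };

    static void
    pg_io_complete(struct pg *g)
    {
        g->io_outstanding--;
        if (g->state == PG_PAUSING && g->io_outstanding == 0) {
            g->state = PG_PAUSED;
            if (g->paused_cb != NULL) {
                g->paused_cb(g->cb_arg);
            }
        }
    }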
+ + nvmf_qpair_request_cleanup(qpair); +} + +int +spdk_nvmf_request_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + + if (spdk_likely(qpair->group->thread == spdk_get_thread())) { + _nvmf_request_complete(req); + } else { + spdk_thread_send_msg(qpair->group->thread, + _nvmf_request_complete, req); + } + + return 0; +} + +static void +_nvmf_request_exec(struct spdk_nvmf_request *req, + struct spdk_nvmf_subsystem_poll_group *sgroup) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + enum spdk_nvmf_request_exec_status status; + + if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf")) { + spdk_nvme_print_command(qpair->qid, &req->cmd->nvme_cmd); + } + + if (sgroup) { + sgroup->io_outstanding++; + } + + /* Place the request on the outstanding list so we can keep track of it */ + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + + if (spdk_unlikely(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC)) { + status = nvmf_ctrlr_process_fabrics_cmd(req); + } else if (spdk_unlikely(nvmf_qpair_is_admin_queue(qpair))) { + status = nvmf_ctrlr_process_admin_cmd(req); + } else { + status = nvmf_ctrlr_process_io_cmd(req); + } + + if (status == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + _nvmf_request_complete(req); + } +} + +void +spdk_nvmf_request_exec_fabrics(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup = NULL; + + assert(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC); + + if (qpair->ctrlr) { + sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + assert(sgroup != NULL); + } else { + sgroup = nvmf_subsystem_pg_from_connect_cmd(req); + } + + _nvmf_request_exec(req, sgroup); +} + +void +spdk_nvmf_request_exec(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup = NULL; + + if (qpair->ctrlr) { + sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + assert(sgroup != NULL); + } else if (spdk_unlikely(nvmf_request_is_fabric_connect(req))) { + sgroup = nvmf_subsystem_pg_from_connect_cmd(req); + } + + if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) { + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + /* Place the request on the outstanding list so we can keep track of it */ + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + /* Still increment io_outstanding because request_complete decrements it */ + if (sgroup != NULL) { + sgroup->io_outstanding++; + } + _nvmf_request_complete(req); + return; + } + + /* Check if the subsystem is paused (if there is a subsystem) */ + if (sgroup != NULL) { + if (sgroup->state != SPDK_NVMF_SUBSYSTEM_ACTIVE) { + /* The subsystem is not currently active. Queue this request. 
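Request execution above gates admission in two steps: a request arriving on an inactive queue pair fails immediately with Command Sequence Error, a request against a non-active (pausing or paused) subsystem is parked on the subsystem's queued list, and only otherwise is it dispatched. A hedged sketch of that decision, with invented names:

    #include <stdbool.h>

    enum admit_action { ADMIT_FAIL_SEQ_ERROR, ADMIT_QUEUE, ADMIT_EXECUTE };

    static enum admit_action
    admit_request(bool qpair_active, bool subsystem_active)
    {
        if (!qpair_active) {
            return ADMIT_FAIL_SEQ_ERROR;  /* complete with Command Sequence Error */
        }
        if (!subsystem_active) {
            return ADMIT_QUEUE;           /* park until the subsystem resumes */
        }
        return ADMIT_EXECUTE;
    }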
*/ + TAILQ_INSERT_TAIL(&sgroup->queued, req, link); + return; + } + } + + _nvmf_request_exec(req, sgroup); +} + +static bool +nvmf_ctrlr_get_dif_ctx(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, + struct spdk_dif_ctx *dif_ctx) +{ + struct spdk_nvmf_ns *ns; + struct spdk_bdev *bdev; + + if (ctrlr == NULL || cmd == NULL) { + return false; + } + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + return false; + } + + bdev = ns->bdev; + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + case SPDK_NVME_OPC_WRITE: + case SPDK_NVME_OPC_COMPARE: + return nvmf_bdev_ctrlr_get_dif_ctx(bdev, cmd, dif_ctx); + default: + break; + } + + return false; +} + +bool +spdk_nvmf_request_get_dif_ctx(struct spdk_nvmf_request *req, struct spdk_dif_ctx *dif_ctx) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + if (spdk_likely(ctrlr == NULL || !ctrlr->dif_insert_or_strip)) { + return false; + } + + if (spdk_unlikely(qpair->state != SPDK_NVMF_QPAIR_ACTIVE)) { + return false; + } + + if (spdk_unlikely(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC)) { + return false; + } + + if (spdk_unlikely(nvmf_qpair_is_admin_queue(qpair))) { + return false; + } + + return nvmf_ctrlr_get_dif_ctx(ctrlr, &req->cmd->nvme_cmd, dif_ctx); +} + +void +spdk_nvmf_set_custom_admin_cmd_hdlr(uint8_t opc, spdk_nvmf_custom_cmd_hdlr hdlr) +{ + g_nvmf_custom_admin_cmd_hdlrs[opc].hdlr = hdlr; +} + +static int +nvmf_passthru_admin_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + struct spdk_nvme_cmd *cmd = spdk_nvmf_request_get_cmd(req); + struct spdk_nvme_cpl *response = spdk_nvmf_request_get_response(req); + uint32_t bdev_nsid; + int rc; + + if (g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].nsid == 0) { + bdev_nsid = cmd->nsid; + } else { + bdev_nsid = g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].nsid; + } + + rc = spdk_nvmf_request_get_bdev(bdev_nsid, req, &bdev, &desc, &ch); + if (rc) { + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + return spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(bdev, desc, ch, req, NULL); +} + +void +spdk_nvmf_set_passthru_admin_cmd(uint8_t opc, uint32_t forward_nsid) +{ + g_nvmf_custom_admin_cmd_hdlrs[opc].hdlr = nvmf_passthru_admin_cmd; + g_nvmf_custom_admin_cmd_hdlrs[opc].nsid = forward_nsid; +} + +int +spdk_nvmf_request_get_bdev(uint32_t nsid, struct spdk_nvmf_request *req, + struct spdk_bdev **bdev, struct spdk_bdev_desc **desc, struct spdk_io_channel **ch) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_poll_group *group = req->qpair->group; + struct spdk_nvmf_subsystem_pg_ns_info *ns_info; + + *bdev = NULL; + *desc = NULL; + *ch = NULL; + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); + if (ns == NULL || ns->bdev == NULL) { + return -EINVAL; + } + + assert(group != NULL && group->sgroups != NULL); + ns_info = &group->sgroups[ctrlr->subsys->id].ns_info[nsid - 1]; + *bdev = ns->bdev; + *desc = ns->desc; + *ch = ns_info->channel; + + return 0; +} + +struct spdk_nvmf_ctrlr *spdk_nvmf_request_get_ctrlr(struct spdk_nvmf_request *req) +{ + return req->qpair->ctrlr; +} + +struct spdk_nvme_cmd *spdk_nvmf_request_get_cmd(struct spdk_nvmf_request *req) +{ + return &req->cmd->nvme_cmd; +} + +struct spdk_nvme_cpl *spdk_nvmf_request_get_response(struct spdk_nvmf_request *req) 
+{ + return &req->rsp->nvme_cpl; +} + +struct spdk_nvmf_subsystem *spdk_nvmf_request_get_subsystem(struct spdk_nvmf_request *req) +{ + return req->qpair->ctrlr->subsys; +} + +void spdk_nvmf_request_get_data(struct spdk_nvmf_request *req, void **data, uint32_t *length) +{ + *data = req->data; + *length = req->length; +} + +struct spdk_nvmf_subsystem *spdk_nvmf_ctrlr_get_subsystem(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->subsys; +} + +uint16_t spdk_nvmf_ctrlr_get_id(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->cntlid; +} + +struct spdk_nvmf_request *spdk_nvmf_request_get_req_to_abort(struct spdk_nvmf_request *req) +{ + return req->req_to_abort; +} diff --git a/src/spdk/lib/nvmf/ctrlr_bdev.c b/src/spdk/lib/nvmf/ctrlr_bdev.c new file mode 100644 index 000000000..13e0a4309 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr_bdev.c @@ -0,0 +1,761 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" + +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_cmd.h" +#include "spdk/nvmf_spec.h" +#include "spdk/trace.h" +#include "spdk/scsi_spec.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +static bool +nvmf_subsystem_bdev_io_type_supported(struct spdk_nvmf_subsystem *subsystem, + enum spdk_bdev_io_type io_type) +{ + struct spdk_nvmf_ns *ns; + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + if (ns->bdev == NULL) { + continue; + } + + if (!spdk_bdev_io_type_supported(ns->bdev, io_type)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, + "Subsystem %s namespace %u (%s) does not support io_type %d\n", + spdk_nvmf_subsystem_get_nqn(subsystem), + ns->opts.nsid, spdk_bdev_get_name(ns->bdev), (int)io_type); + return false; + } + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "All devices in Subsystem %s support io_type %d\n", + spdk_nvmf_subsystem_get_nqn(subsystem), (int)io_type); + return true; +} + +bool +nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr) +{ + return nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_UNMAP); +} + +bool +nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr) +{ + return nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_WRITE_ZEROES); +} + +static void +nvmf_bdev_ctrlr_complete_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_nvmf_request *req = cb_arg; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int first_sc = 0, first_sct = 0, second_sc = 0, second_sct = 0; + uint32_t cdw0 = 0; + struct spdk_nvmf_request *first_req = req->first_fused_req; + + if (spdk_unlikely(first_req != NULL)) { + /* fused commands - get status for both operations */ + struct spdk_nvme_cpl *fused_response = &first_req->rsp->nvme_cpl; + + spdk_bdev_io_get_nvme_fused_status(bdev_io, &cdw0, &second_sct, &second_sc, &first_sct, &first_sc); + fused_response->cdw0 = cdw0; + fused_response->status.sc = second_sc; + fused_response->status.sct = second_sct; + + /* first request should be completed */ + spdk_nvmf_request_complete(first_req); + req->first_fused_req = NULL; + } else { + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &first_sct, &first_sc); + } + + response->cdw0 = cdw0; + response->status.sc = first_sc; + response->status.sct = first_sct; + + spdk_nvmf_request_complete(req); + spdk_bdev_free_io(bdev_io); +} + +static void +nvmf_bdev_ctrlr_complete_admin_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_nvmf_request *req = cb_arg; + + if (req->cmd_cb_fn) { + req->cmd_cb_fn(req); + } + + nvmf_bdev_ctrlr_complete_cmd(bdev_io, success, req); +} + +void +nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata, + bool dif_insert_or_strip) +{ + struct spdk_bdev *bdev = ns->bdev; + uint64_t num_blocks; + + num_blocks = spdk_bdev_get_num_blocks(bdev); + + nsdata->nsze = num_blocks; + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->nacwu = spdk_bdev_get_acwu(bdev); + if (!dif_insert_or_strip) { + nsdata->lbaf[0].ms = spdk_bdev_get_md_size(bdev); + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(bdev)); + if (nsdata->lbaf[0].ms != 0) { + nsdata->flbas.extended = 1; + nsdata->mc.extended = 1; + nsdata->mc.pointer 
= 0; + nsdata->dps.md_start = spdk_bdev_is_dif_head_of_md(bdev); + + switch (spdk_bdev_get_dif_type(bdev)) { + case SPDK_DIF_TYPE1: + nsdata->dpc.pit1 = 1; + nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_TYPE1; + break; + case SPDK_DIF_TYPE2: + nsdata->dpc.pit2 = 1; + nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_TYPE2; + break; + case SPDK_DIF_TYPE3: + nsdata->dpc.pit3 = 1; + nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_TYPE3; + break; + default: + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Protection Disabled\n"); + nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_DISABLE; + break; + } + } + } else { + nsdata->lbaf[0].ms = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_data_block_size(bdev)); + } + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(bdev); + nsdata->nmic.can_share = 1; + if (ns->ptpl_file != NULL) { + nsdata->nsrescap.rescap.persist = 1; + } + nsdata->nsrescap.rescap.write_exclusive = 1; + nsdata->nsrescap.rescap.exclusive_access = 1; + nsdata->nsrescap.rescap.write_exclusive_reg_only = 1; + nsdata->nsrescap.rescap.exclusive_access_reg_only = 1; + nsdata->nsrescap.rescap.write_exclusive_all_reg = 1; + nsdata->nsrescap.rescap.exclusive_access_all_reg = 1; + nsdata->nsrescap.rescap.ignore_existing_key = 1; + + SPDK_STATIC_ASSERT(sizeof(nsdata->nguid) == sizeof(ns->opts.nguid), "size mismatch"); + memcpy(nsdata->nguid, ns->opts.nguid, sizeof(nsdata->nguid)); + + SPDK_STATIC_ASSERT(sizeof(nsdata->eui64) == sizeof(ns->opts.eui64), "size mismatch"); + memcpy(&nsdata->eui64, ns->opts.eui64, sizeof(nsdata->eui64)); +} + +static void +nvmf_bdev_ctrlr_get_rw_params(const struct spdk_nvme_cmd *cmd, uint64_t *start_lba, + uint64_t *num_blocks) +{ + /* SLBA: CDW10 and CDW11 */ + *start_lba = from_le64(&cmd->cdw10); + + /* NLB: CDW12 bits 15:00, 0's based */ + *num_blocks = (from_le32(&cmd->cdw12) & 0xFFFFu) + 1; +} + +static bool +nvmf_bdev_ctrlr_lba_in_range(uint64_t bdev_num_blocks, uint64_t io_start_lba, + uint64_t io_num_blocks) +{ + if (io_start_lba + io_num_blocks > bdev_num_blocks || + io_start_lba + io_num_blocks < io_start_lba) { + return false; + } + + return true; +} + +static void +nvmf_ctrlr_process_io_cmd_resubmit(void *arg) +{ + struct spdk_nvmf_request *req = arg; + + nvmf_ctrlr_process_io_cmd(req); +} + +static void +nvmf_ctrlr_process_admin_cmd_resubmit(void *arg) +{ + struct spdk_nvmf_request *req = arg; + + nvmf_ctrlr_process_admin_cmd(req); +} + +static void +nvmf_bdev_ctrl_queue_io(struct spdk_nvmf_request *req, struct spdk_bdev *bdev, + struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn, void *cb_arg) +{ + int rc; + + req->bdev_io_wait.bdev = bdev; + req->bdev_io_wait.cb_fn = cb_fn; + req->bdev_io_wait.cb_arg = cb_arg; + + rc = spdk_bdev_queue_io_wait(bdev, ch, &req->bdev_io_wait); + if (rc != 0) { + assert(false); + } + req->qpair->group->stat.pending_bdev_io++; +} + +int +nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = 
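A compact illustration of the read/write parameter handling defined above: the starting LBA spans CDW10-CDW11, NLB is the low 16 bits of CDW12 and is zero-based, and the range check must also guard against integer wrap-around. This is a standalone sketch over plain integers, not the SPDK command structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* NLB is a 0's-based 16-bit field, so the real block count is the field value + 1. */
static void get_rw_params(uint64_t cdw10_11, uint32_t cdw12,
			  uint64_t *start_lba, uint64_t *num_blocks)
{
	*start_lba = cdw10_11;
	*num_blocks = (cdw12 & 0xFFFFu) + 1;
}

/* Reject I/O that runs past the device or whose LBA arithmetic wraps around. */
static bool lba_in_range(uint64_t bdev_num_blocks, uint64_t lba, uint64_t count)
{
	return !(lba + count > bdev_num_blocks || lba + count < lba);
}

int main(void)
{
	uint64_t lba, count;

	get_rw_params(100, 7, &lba, &count);  /* NLB field value 7 -> 8 blocks */
	printf("lba=%llu count=%llu ok=%d\n",
	       (unsigned long long)lba, (unsigned long long)count,
	       lba_in_range(1024, lba, count));
	return 0;
}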
SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * block_size > req->length)) { + SPDK_ERRLOG("Read NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_readv_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * block_size > req->length)) { + SPDK_ERRLOG("Write NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_writev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_compare_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * block_size > req->length)) { + SPDK_ERRLOG("Compare NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, 
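Each submission path above handles -ENOMEM the same way: the request is parked on the bdev's io-wait queue and resubmitted later instead of being failed. A schematic version of that retry flow, with hypothetical submit_io()/queue_io_wait() helpers standing in for spdk_bdev_readv_blocks() and spdk_bdev_queue_io_wait():

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-ins; only the control flow is the point of this sketch. */
static int submit_io(void *req) { (void)req; return -ENOMEM; }
static void queue_io_wait(void *req, void (*resubmit)(void *)) { (void)req; (void)resubmit; }

static void resubmit_cb(void *req);

/* Returns 0 when the I/O was either submitted or queued for a later retry. */
static int try_submit(void *req)
{
	int rc = submit_io(req);

	if (rc == -ENOMEM) {
		/* No bdev_io available right now: wait and resubmit from the callback. */
		queue_io_wait(req, resubmit_cb);
		return 0;
	}
	if (rc != 0) {
		/* Any other error completes the command with a failure status. */
		return rc;
	}
	return 0;
}

static void resubmit_cb(void *req)
{
	try_submit(req);
}

int main(void)
{
	int dummy_req;

	printf("rc=%d\n", try_submit(&dummy_req));
	return 0;
}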
req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_comparev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_compare_and_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *cmp_req, struct spdk_nvmf_request *write_req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmp_cmd = &cmp_req->cmd->nvme_cmd; + struct spdk_nvme_cmd *write_cmd = &write_req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &write_req->rsp->nvme_cpl; + uint64_t write_start_lba, cmp_start_lba; + uint64_t write_num_blocks, cmp_num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmp_cmd, &cmp_start_lba, &cmp_num_blocks); + nvmf_bdev_ctrlr_get_rw_params(write_cmd, &write_start_lba, &write_num_blocks); + + if (spdk_unlikely(write_start_lba != cmp_start_lba || write_num_blocks != cmp_num_blocks)) { + SPDK_ERRLOG("Fused command start lba / num blocks mismatch\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, write_start_lba, + write_num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(write_num_blocks * block_size > write_req->length)) { + SPDK_ERRLOG("Write NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + write_num_blocks, block_size, write_req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_comparev_and_writev_blocks(desc, ch, cmp_req->iov, cmp_req->iovcnt, write_req->iov, + write_req->iovcnt, write_start_lba, write_num_blocks, nvmf_bdev_ctrlr_complete_cmd, write_req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(cmp_req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, cmp_req); + nvmf_bdev_ctrl_queue_io(write_req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, write_req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_write_zeroes_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, 
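For the fused compare-and-write pair above, both halves must describe exactly the same LBA range before the pair is submitted as a single bdev operation, and on completion each half receives its own NVMe status. A minimal version of the range check, using a hypothetical simplified parameter struct:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical, simplified view of the two halves of a fused command. */
struct rw_params {
	uint64_t start_lba;
	uint64_t num_blocks;
};

/* The compare half and the write half must cover the same range. */
static bool fused_ranges_match(const struct rw_params *cmp, const struct rw_params *wr)
{
	return cmp->start_lba == wr->start_lba &&
	       cmp->num_blocks == wr->num_blocks;
}

int main(void)
{
	struct rw_params cmp = { .start_lba = 16, .num_blocks = 8 };
	struct rw_params wr  = { .start_lba = 16, .num_blocks = 8 };

	printf("fused ok=%d\n", fused_ranges_match(&cmp, &wr));
	return 0;
}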
&num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_write_zeroes_blocks(desc, ch, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_flush_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int rc; + + /* As for NVMeoF controller, SPDK always set volatile write + * cache bit to 1, return success for those block devices + * which can't support FLUSH command. + */ + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_flush_blocks(desc, ch, 0, spdk_bdev_get_num_blocks(bdev), + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +struct nvmf_bdev_ctrlr_unmap { + struct spdk_nvmf_request *req; + uint32_t count; + struct spdk_bdev_desc *desc; + struct spdk_bdev *bdev; + struct spdk_io_channel *ch; + uint32_t range_index; +}; + +static void +nvmf_bdev_ctrlr_unmap_cpl(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct nvmf_bdev_ctrlr_unmap *unmap_ctx = cb_arg; + struct spdk_nvmf_request *req = unmap_ctx->req; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int sc, sct; + uint32_t cdw0; + + unmap_ctx->count--; + + if (response->status.sct == SPDK_NVME_SCT_GENERIC && + response->status.sc == SPDK_NVME_SC_SUCCESS) { + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); + response->cdw0 = cdw0; + response->status.sc = sc; + response->status.sct = sct; + } + + if (unmap_ctx->count == 0) { + spdk_nvmf_request_complete(req); + free(unmap_ctx); + } + spdk_bdev_free_io(bdev_io); +} + +static int +nvmf_bdev_ctrlr_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct nvmf_bdev_ctrlr_unmap *unmap_ctx); +static void +nvmf_bdev_ctrlr_unmap_resubmit(void *arg) +{ + struct nvmf_bdev_ctrlr_unmap *unmap_ctx = arg; + struct spdk_nvmf_request *req = unmap_ctx->req; + struct spdk_bdev_desc *desc = unmap_ctx->desc; + struct spdk_bdev *bdev = unmap_ctx->bdev; + struct spdk_io_channel *ch = unmap_ctx->ch; + + nvmf_bdev_ctrlr_unmap(bdev, desc, ch, req, unmap_ctx); +} + +static int +nvmf_bdev_ctrlr_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct nvmf_bdev_ctrlr_unmap *unmap_ctx) +{ + uint16_t nr, i; + struct 
spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + struct spdk_nvme_dsm_range *dsm_range; + uint64_t lba; + uint32_t lba_count; + int rc; + + nr = cmd->cdw10_bits.dsm.nr + 1; + if (nr * sizeof(struct spdk_nvme_dsm_range) > req->length) { + SPDK_ERRLOG("Dataset Management number of ranges > SGL length\n"); + response->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (unmap_ctx == NULL) { + unmap_ctx = calloc(1, sizeof(*unmap_ctx)); + if (!unmap_ctx) { + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + unmap_ctx->req = req; + unmap_ctx->desc = desc; + unmap_ctx->ch = ch; + unmap_ctx->bdev = bdev; + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + } else { + unmap_ctx->count--; /* dequeued */ + } + + dsm_range = (struct spdk_nvme_dsm_range *)req->data; + for (i = unmap_ctx->range_index; i < nr; i++) { + lba = dsm_range[i].starting_lba; + lba_count = dsm_range[i].length; + + unmap_ctx->count++; + + rc = spdk_bdev_unmap_blocks(desc, ch, lba, lba_count, + nvmf_bdev_ctrlr_unmap_cpl, unmap_ctx); + if (rc) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_bdev_ctrlr_unmap_resubmit, unmap_ctx); + /* Unmap was not yet submitted to bdev */ + /* unmap_ctx->count will be decremented when the request is dequeued */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + unmap_ctx->count--; + /* We can't return here - we may have to wait for any other + * unmaps already sent to complete */ + break; + } + unmap_ctx->range_index++; + } + + if (unmap_ctx->count == 0) { + free(unmap_ctx); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + if (cmd->cdw11_bits.dsm.ad) { + return nvmf_bdev_ctrlr_unmap(bdev, desc, ch, req, NULL); + } + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +nvmf_bdev_ctrlr_nvme_passthru_io(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + int rc; + + rc = spdk_bdev_nvme_io_passthru(desc, ch, &req->cmd->nvme_cmd, req->data, req->length, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + spdk_nvmf_nvme_passthru_cmd_cb cb_fn) +{ + int rc; + + req->cmd_cb_fn = cb_fn; + + rc = spdk_bdev_nvme_admin_passthru(desc, ch, &req->cmd->nvme_cmd, req->data, req->length, + nvmf_bdev_ctrlr_complete_admin_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == 
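The Dataset Management path above expands one NVMe command into up to NR+1 unmap calls and completes the request only after every outstanding unmap has called back; the context's count field acts as that reference count. A standalone sketch of the range walk and completion counting, using a simplified range layout (attributes, length in blocks, starting LBA) and no SPDK calls:

#include <stdint.h>
#include <stdio.h>

/* Simplified Dataset Management range as carried in the command payload. */
struct dsm_range {
	uint32_t attributes;
	uint32_t length;        /* in blocks */
	uint64_t starting_lba;
};

struct unmap_ctx {
	uint32_t outstanding;   /* plays the role of unmap_ctx->count above */
};

/* Completion callback: only the last unmap finishes the NVMe command. */
static void unmap_done(struct unmap_ctx *ctx)
{
	if (--ctx->outstanding == 0) {
		printf("all ranges unmapped, complete the request\n");
	}
}

int main(void)
{
	/* NR is 0's based: a field value of 1 means two ranges. */
	uint16_t nr = 1 + 1;
	struct dsm_range ranges[2] = {
		{ 0, 128, 0 },
		{ 0, 64, 4096 },
	};
	struct unmap_ctx ctx = { 0 };

	for (uint16_t i = 0; i < nr; i++) {
		ctx.outstanding++;
		printf("unmap lba=%llu count=%u\n",
		       (unsigned long long)ranges[i].starting_lba,
		       (unsigned)ranges[i].length);
	}

	/* Simulate the per-range completions arriving later. */
	for (uint16_t i = 0; i < nr; i++) {
		unmap_done(&ctx);
	}
	return 0;
}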
-ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_admin_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static void +nvmf_bdev_ctrlr_complete_abort_cmd(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_nvmf_request *req = cb_arg; + + if (success) { + req->rsp->nvme_cpl.cdw0 &= ~1U; + } + + spdk_nvmf_request_complete(req); + spdk_bdev_free_io(bdev_io); +} + +int +spdk_nvmf_bdev_ctrlr_abort_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct spdk_nvmf_request *req_to_abort) +{ + int rc; + + assert((req->rsp->nvme_cpl.cdw0 & 1U) != 0); + + rc = spdk_bdev_abort(desc, ch, req_to_abort, nvmf_bdev_ctrlr_complete_abort_cmd, req); + if (spdk_likely(rc == 0)) { + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } else if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_admin_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } else { + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +bool +nvmf_bdev_ctrlr_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_nvme_cmd *cmd, + struct spdk_dif_ctx *dif_ctx) +{ + uint32_t init_ref_tag, dif_check_flags = 0; + int rc; + + if (spdk_bdev_get_md_size(bdev) == 0) { + return false; + } + + /* Initial Reference Tag is the lower 32 bits of the start LBA. */ + init_ref_tag = (uint32_t)from_le64(&cmd->cdw10); + + if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) { + dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK; + } + + if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) { + dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK; + } + + rc = spdk_dif_ctx_init(dif_ctx, + spdk_bdev_get_block_size(bdev), + spdk_bdev_get_md_size(bdev), + spdk_bdev_is_md_interleaved(bdev), + spdk_bdev_is_dif_head_of_md(bdev), + spdk_bdev_get_dif_type(bdev), + dif_check_flags, + init_ref_tag, 0, 0, 0, 0); + + return (rc == 0) ? true : false; +} diff --git a/src/spdk/lib/nvmf/ctrlr_discovery.c b/src/spdk/lib/nvmf/ctrlr_discovery.c new file mode 100644 index 000000000..ab1c46ba1 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr_discovery.c @@ -0,0 +1,159 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over Fabrics discovery service + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/nvmf_spec.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +static struct spdk_nvmf_discovery_log_page * +nvmf_generate_discovery_log(struct spdk_nvmf_tgt *tgt, const char *hostnqn, size_t *log_page_size) +{ + uint64_t numrec = 0; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_subsystem_listener *listener; + struct spdk_nvmf_discovery_log_page_entry *entry; + struct spdk_nvmf_discovery_log_page *disc_log; + size_t cur_size; + uint32_t sid; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Generating log page for genctr %" PRIu64 "\n", + tgt->discovery_genctr); + + cur_size = sizeof(struct spdk_nvmf_discovery_log_page); + disc_log = calloc(1, cur_size); + if (disc_log == NULL) { + SPDK_ERRLOG("Discovery log page memory allocation error\n"); + return NULL; + } + + for (sid = 0; sid < tgt->max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if ((subsystem == NULL) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING)) { + continue; + } + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + continue; + } + + if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) { + continue; + } + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + size_t new_size = cur_size + sizeof(*entry); + void *new_log_page = realloc(disc_log, new_size); + + if (new_log_page == NULL) { + SPDK_ERRLOG("Discovery log page memory allocation error\n"); + break; + } + + disc_log = new_log_page; + cur_size = new_size; + + entry = &disc_log->entries[numrec]; + memset(entry, 0, sizeof(*entry)); + entry->portid = numrec; + entry->cntlid = 0xffff; + entry->asqsz = listener->transport->opts.max_aq_depth; + entry->subtype = subsystem->subtype; + snprintf(entry->subnqn, sizeof(entry->subnqn), "%s", subsystem->subnqn); + + nvmf_transport_listener_discover(listener->transport, listener->trid, entry); + + numrec++; + } + } + + disc_log->numrec = numrec; + disc_log->genctr = tgt->discovery_genctr; + *log_page_size = cur_size; + + return disc_log; +} + +void +nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, const char *hostnqn, struct iovec *iov, + uint32_t iovcnt, uint64_t offset, uint32_t length) +{ + size_t copy_len = 0; + size_t zero_len = 0; + struct iovec *tmp; + size_t log_page_size = 0; + struct spdk_nvmf_discovery_log_page *discovery_log_page; + + discovery_log_page = nvmf_generate_discovery_log(tgt, hostnqn, &log_page_size); + + /* Copy the valid part of the discovery log page, if any */ + if (discovery_log_page) { + for (tmp = iov; tmp < iov + iovcnt; tmp++) { + copy_len = spdk_min(tmp->iov_len, length); + copy_len = spdk_min(log_page_size - offset, copy_len); + + 
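nvmf_generate_discovery_log() above grows the log page by one entry per allowed listener; the copy loop here then transfers only the requested offset/length window into the caller's iovec and zero-fills whatever the valid log cannot cover. A small standalone sketch of that windowed copy over plain buffers rather than iovecs:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

/* Copy log[offset .. offset+length) into buf, zero-filling the part the
 * log cannot cover.  Mirrors the intent of the iovec loop above. */
static void copy_log_window(const uint8_t *log, size_t log_size,
			    uint64_t offset, uint8_t *buf, size_t length)
{
	size_t copy_len = 0;

	if (offset < log_size) {
		copy_len = min_sz(length, log_size - offset);
		memcpy(buf, log + offset, copy_len);
	}
	memset(buf + copy_len, 0, length - copy_len);
}

int main(void)
{
	uint8_t log[16], buf[8];

	memset(log, 0xAA, sizeof(log));
	copy_log_window(log, sizeof(log), 12, buf, sizeof(buf));
	/* Bytes 0-3 come from the log, bytes 4-7 are zero padding. */
	printf("%02x %02x\n", buf[0], buf[7]);
	return 0;
}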
memcpy(tmp->iov_base, (char *)discovery_log_page + offset, copy_len); + + offset += copy_len; + length -= copy_len; + zero_len = tmp->iov_len - copy_len; + if (log_page_size <= offset || length == 0) { + break; + } + } + /* Zero out the rest of the payload */ + if (zero_len) { + memset((char *)tmp->iov_base + copy_len, 0, zero_len); + } + + for (++tmp; tmp < iov + iovcnt; tmp++) { + memset((char *)tmp->iov_base, 0, tmp->iov_len); + } + + free(discovery_log_page); + } +} diff --git a/src/spdk/lib/nvmf/fc.c b/src/spdk/lib/nvmf/fc.c new file mode 100644 index 000000000..678cfc681 --- /dev/null +++ b/src/spdk/lib/nvmf/fc.c @@ -0,0 +1,3957 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2018-2019 Broadcom. All Rights Reserved. + * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe_FC transport functions. 
+ */ + +#include "spdk/env.h" +#include "spdk/assert.h" +#include "spdk/nvmf_transport.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" +#include "spdk/likely.h" +#include "spdk/endian.h" +#include "spdk/log.h" +#include "spdk/thread.h" + +#include "spdk_internal/log.h" + +#include "nvmf_fc.h" +#include "fc_lld.h" + +#ifndef DEV_VERIFY +#define DEV_VERIFY assert +#endif + +#ifndef ASSERT_SPDK_FC_MASTER_THREAD +#define ASSERT_SPDK_FC_MASTER_THREAD() \ + DEV_VERIFY(spdk_get_thread() == nvmf_fc_get_master_thread()); +#endif + +/* + * PRLI service parameters + */ +enum spdk_nvmf_fc_service_parameters { + SPDK_NVMF_FC_FIRST_BURST_SUPPORTED = 0x0001, + SPDK_NVMF_FC_DISCOVERY_SERVICE = 0x0008, + SPDK_NVMF_FC_TARGET_FUNCTION = 0x0010, + SPDK_NVMF_FC_INITIATOR_FUNCTION = 0x0020, + SPDK_NVMF_FC_CONFIRMED_COMPLETION_SUPPORTED = 0x0080, +}; + +static char *fc_req_state_strs[] = { + "SPDK_NVMF_FC_REQ_INIT", + "SPDK_NVMF_FC_REQ_READ_BDEV", + "SPDK_NVMF_FC_REQ_READ_XFER", + "SPDK_NVMF_FC_REQ_READ_RSP", + "SPDK_NVMF_FC_REQ_WRITE_BUFFS", + "SPDK_NVMF_FC_REQ_WRITE_XFER", + "SPDK_NVMF_FC_REQ_WRITE_BDEV", + "SPDK_NVMF_FC_REQ_WRITE_RSP", + "SPDK_NVMF_FC_REQ_NONE_BDEV", + "SPDK_NVMF_FC_REQ_NONE_RSP", + "SPDK_NVMF_FC_REQ_SUCCESS", + "SPDK_NVMF_FC_REQ_FAILED", + "SPDK_NVMF_FC_REQ_ABORTED", + "SPDK_NVMF_FC_REQ_BDEV_ABORTED", + "SPDK_NVMF_FC_REQ_PENDING" +}; + +#define OBJECT_NVMF_FC_IO 0xA0 + +#define TRACE_GROUP_NVMF_FC 0x8 +#define TRACE_FC_REQ_INIT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x01) +#define TRACE_FC_REQ_READ_BDEV SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x02) +#define TRACE_FC_REQ_READ_XFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x03) +#define TRACE_FC_REQ_READ_RSP SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x04) +#define TRACE_FC_REQ_WRITE_BUFFS SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x05) +#define TRACE_FC_REQ_WRITE_XFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x06) +#define TRACE_FC_REQ_WRITE_BDEV SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x07) +#define TRACE_FC_REQ_WRITE_RSP SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x08) +#define TRACE_FC_REQ_NONE_BDEV SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x09) +#define TRACE_FC_REQ_NONE_RSP SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0A) +#define TRACE_FC_REQ_SUCCESS SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0B) +#define TRACE_FC_REQ_FAILED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0C) +#define TRACE_FC_REQ_ABORTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0D) +#define TRACE_FC_REQ_BDEV_ABORTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0E) +#define TRACE_FC_REQ_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0F) + +SPDK_TRACE_REGISTER_FN(nvmf_fc_trace, "nvmf_fc", TRACE_GROUP_NVMF_FC) +{ + spdk_trace_register_object(OBJECT_NVMF_FC_IO, 'r'); + spdk_trace_register_description("FC_REQ_NEW", + TRACE_FC_REQ_INIT, + OWNER_NONE, OBJECT_NVMF_FC_IO, 1, 1, ""); + spdk_trace_register_description("FC_REQ_READ_SUBMIT_TO_BDEV", + TRACE_FC_REQ_READ_BDEV, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_READ_XFER_DATA", + TRACE_FC_REQ_READ_XFER, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_READ_RSP", + TRACE_FC_REQ_READ_RSP, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_WRITE_NEED_BUFFER", + TRACE_FC_REQ_WRITE_BUFFS, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_WRITE_XFER_DATA", + TRACE_FC_REQ_WRITE_XFER, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_WRITE_SUBMIT_TO_BDEV", + TRACE_FC_REQ_WRITE_BDEV, + OWNER_NONE, 
OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_WRITE_RSP", + TRACE_FC_REQ_WRITE_RSP, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_NONE_SUBMIT_TO_BDEV", + TRACE_FC_REQ_NONE_BDEV, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_NONE_RSP", + TRACE_FC_REQ_NONE_RSP, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_SUCCESS", + TRACE_FC_REQ_SUCCESS, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("FC_REQ_FAILED", + TRACE_FC_REQ_FAILED, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("FC_REQ_ABORTED", + TRACE_FC_REQ_ABORTED, + OWNER_NONE, OBJECT_NONE, 0, 1, ""); + spdk_trace_register_description("FC_REQ_ABORTED_SUBMIT_TO_BDEV", + TRACE_FC_REQ_BDEV_ABORTED, + OWNER_NONE, OBJECT_NONE, 0, 1, ""); + spdk_trace_register_description("FC_REQ_PENDING", + TRACE_FC_REQ_PENDING, + OWNER_NONE, OBJECT_NONE, 0, 1, ""); +} + +/** + * The structure used by all fc adm functions + */ +struct spdk_nvmf_fc_adm_api_data { + void *api_args; + spdk_nvmf_fc_callback cb_func; +}; + +/** + * The callback structure for nport-delete + */ +struct spdk_nvmf_fc_adm_nport_del_cb_data { + struct spdk_nvmf_fc_nport *nport; + uint8_t port_handle; + spdk_nvmf_fc_callback fc_cb_func; + void *fc_cb_ctx; +}; + +/** + * The callback structure for it-delete + */ +struct spdk_nvmf_fc_adm_i_t_del_cb_data { + struct spdk_nvmf_fc_nport *nport; + struct spdk_nvmf_fc_remote_port_info *rport; + uint8_t port_handle; + spdk_nvmf_fc_callback fc_cb_func; + void *fc_cb_ctx; +}; + + +typedef void (*spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn)(void *arg, uint32_t err); + +/** + * The callback structure for the it-delete-assoc callback + */ +struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data { + struct spdk_nvmf_fc_nport *nport; + struct spdk_nvmf_fc_remote_port_info *rport; + uint8_t port_handle; + spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn cb_func; + void *cb_ctx; +}; + +/* + * Call back function pointer for HW port quiesce. 
+ */ +typedef void (*spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn)(void *ctx, int err); + +/** + * Context structure for quiescing a hardware port + */ +struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx { + int quiesce_count; + void *ctx; + spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn cb_func; +}; + +/** + * Context structure used to reset a hardware port + */ +struct spdk_nvmf_fc_adm_hw_port_reset_ctx { + void *reset_args; + spdk_nvmf_fc_callback reset_cb_func; +}; + +/** + * The callback structure for HW port link break event + */ +struct spdk_nvmf_fc_adm_port_link_break_cb_data { + struct spdk_nvmf_hw_port_link_break_args *args; + struct spdk_nvmf_fc_nport_delete_args nport_del_args; + spdk_nvmf_fc_callback cb_func; +}; + +struct spdk_nvmf_fc_transport { + struct spdk_nvmf_transport transport; + pthread_mutex_t lock; +}; + +static struct spdk_nvmf_fc_transport *g_nvmf_ftransport; + +static TAILQ_HEAD(, spdk_nvmf_fc_port) g_spdk_nvmf_fc_port_list = + TAILQ_HEAD_INITIALIZER(g_spdk_nvmf_fc_port_list); + +static struct spdk_thread *g_nvmf_fc_master_thread = NULL; + +static uint32_t g_nvmf_fgroup_count = 0; +static TAILQ_HEAD(, spdk_nvmf_fc_poll_group) g_nvmf_fgroups = + TAILQ_HEAD_INITIALIZER(g_nvmf_fgroups); + +struct spdk_thread * +nvmf_fc_get_master_thread(void) +{ + return g_nvmf_fc_master_thread; +} + +static inline void +nvmf_fc_record_req_trace_point(struct spdk_nvmf_fc_request *fc_req, + enum spdk_nvmf_fc_request_state state) +{ + uint16_t tpoint_id = SPDK_TRACE_MAX_TPOINT_ID; + + switch (state) { + case SPDK_NVMF_FC_REQ_INIT: + /* Start IO tracing */ + tpoint_id = TRACE_FC_REQ_INIT; + break; + case SPDK_NVMF_FC_REQ_READ_BDEV: + tpoint_id = TRACE_FC_REQ_READ_BDEV; + break; + case SPDK_NVMF_FC_REQ_READ_XFER: + tpoint_id = TRACE_FC_REQ_READ_XFER; + break; + case SPDK_NVMF_FC_REQ_READ_RSP: + tpoint_id = TRACE_FC_REQ_READ_RSP; + break; + case SPDK_NVMF_FC_REQ_WRITE_BUFFS: + tpoint_id = TRACE_FC_REQ_WRITE_BUFFS; + break; + case SPDK_NVMF_FC_REQ_WRITE_XFER: + tpoint_id = TRACE_FC_REQ_WRITE_XFER; + break; + case SPDK_NVMF_FC_REQ_WRITE_BDEV: + tpoint_id = TRACE_FC_REQ_WRITE_BDEV; + break; + case SPDK_NVMF_FC_REQ_WRITE_RSP: + tpoint_id = TRACE_FC_REQ_WRITE_RSP; + break; + case SPDK_NVMF_FC_REQ_NONE_BDEV: + tpoint_id = TRACE_FC_REQ_NONE_BDEV; + break; + case SPDK_NVMF_FC_REQ_NONE_RSP: + tpoint_id = TRACE_FC_REQ_NONE_RSP; + break; + case SPDK_NVMF_FC_REQ_SUCCESS: + tpoint_id = TRACE_FC_REQ_SUCCESS; + break; + case SPDK_NVMF_FC_REQ_FAILED: + tpoint_id = TRACE_FC_REQ_FAILED; + break; + case SPDK_NVMF_FC_REQ_ABORTED: + tpoint_id = TRACE_FC_REQ_ABORTED; + break; + case SPDK_NVMF_FC_REQ_BDEV_ABORTED: + tpoint_id = TRACE_FC_REQ_ABORTED; + break; + case SPDK_NVMF_FC_REQ_PENDING: + tpoint_id = TRACE_FC_REQ_PENDING; + break; + default: + assert(0); + break; + } + if (tpoint_id != SPDK_TRACE_MAX_TPOINT_ID) { + spdk_trace_record(tpoint_id, fc_req->poller_lcore, 0, + (uint64_t)(&fc_req->req), 0); + } +} + +static void +nvmf_fc_handle_connection_failure(void *arg) +{ + struct spdk_nvmf_fc_conn *fc_conn = arg; + struct spdk_nvmf_fc_ls_add_conn_api_data *api_data = NULL; + + if (!fc_conn->create_opd) { + return; + } + api_data = &fc_conn->create_opd->u.add_conn; + + nvmf_fc_ls_add_conn_failure(api_data->assoc, api_data->ls_rqst, + api_data->args.fc_conn, api_data->aq_conn); +} + +static void +nvmf_fc_handle_assoc_deletion(void *arg) +{ + struct spdk_nvmf_fc_conn *fc_conn = arg; + + nvmf_fc_delete_association(fc_conn->fc_assoc->tgtport, + fc_conn->fc_assoc->assoc_id, false, true, NULL, NULL); +} + +static int 
+nvmf_fc_create_req_mempool(struct spdk_nvmf_fc_hwqp *hwqp) +{ + uint32_t i; + struct spdk_nvmf_fc_request *fc_req; + + TAILQ_INIT(&hwqp->free_reqs); + TAILQ_INIT(&hwqp->in_use_reqs); + + hwqp->fc_reqs_buf = calloc(hwqp->rq_size, sizeof(struct spdk_nvmf_fc_request)); + if (hwqp->fc_reqs_buf == NULL) { + SPDK_ERRLOG("create fc request pool failed\n"); + return -ENOMEM; + } + + for (i = 0; i < hwqp->rq_size; i++) { + fc_req = hwqp->fc_reqs_buf + i; + + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_INIT); + TAILQ_INSERT_TAIL(&hwqp->free_reqs, fc_req, link); + } + + return 0; +} + +static inline struct spdk_nvmf_fc_request * +nvmf_fc_hwqp_alloc_fc_request(struct spdk_nvmf_fc_hwqp *hwqp) +{ + struct spdk_nvmf_fc_request *fc_req; + + if (TAILQ_EMPTY(&hwqp->free_reqs)) { + SPDK_ERRLOG("Alloc request buffer failed\n"); + return NULL; + } + + fc_req = TAILQ_FIRST(&hwqp->free_reqs); + TAILQ_REMOVE(&hwqp->free_reqs, fc_req, link); + + memset(fc_req, 0, sizeof(struct spdk_nvmf_fc_request)); + TAILQ_INSERT_TAIL(&hwqp->in_use_reqs, fc_req, link); + TAILQ_INIT(&fc_req->abort_cbs); + return fc_req; +} + +static inline void +nvmf_fc_hwqp_free_fc_request(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_request *fc_req) +{ + if (fc_req->state != SPDK_NVMF_FC_REQ_SUCCESS) { + /* Log an error for debug purpose. */ + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_FAILED); + } + + /* set the magic to mark req as no longer valid. */ + fc_req->magic = 0xDEADBEEF; + + TAILQ_REMOVE(&hwqp->in_use_reqs, fc_req, link); + TAILQ_INSERT_HEAD(&hwqp->free_reqs, fc_req, link); +} + +static inline bool +nvmf_fc_req_in_get_buff(struct spdk_nvmf_fc_request *fc_req) +{ + switch (fc_req->state) { + case SPDK_NVMF_FC_REQ_WRITE_BUFFS: + return true; + default: + return false; + } +} + +void +nvmf_fc_init_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp) +{ + nvmf_fc_init_rqpair_buffers(hwqp); +} + +struct spdk_nvmf_fc_conn * +nvmf_fc_hwqp_find_fc_conn(struct spdk_nvmf_fc_hwqp *hwqp, uint64_t conn_id) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + TAILQ_FOREACH(fc_conn, &hwqp->connection_list, link) { + if (fc_conn->conn_id == conn_id) { + return fc_conn; + } + } + + return NULL; +} + +void +nvmf_fc_hwqp_reinit_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp, void *queues_curr) +{ + struct spdk_nvmf_fc_abts_ctx *ctx; + struct spdk_nvmf_fc_poller_api_queue_sync_args *args = NULL, *tmp = NULL; + + /* Clean up any pending sync callbacks */ + TAILQ_FOREACH_SAFE(args, &hwqp->sync_cbs, link, tmp) { + TAILQ_REMOVE(&hwqp->sync_cbs, args, link); + ctx = args->cb_info.cb_data; + if (ctx) { + if (++ctx->hwqps_responded == ctx->num_hwqps) { + free(ctx->sync_poller_args); + free(ctx->abts_poller_args); + free(ctx); + } + } + } + + nvmf_fc_reinit_q(hwqp->queues, queues_curr); +} + +void +nvmf_fc_init_hwqp(struct spdk_nvmf_fc_port *fc_port, struct spdk_nvmf_fc_hwqp *hwqp) +{ + hwqp->fc_port = fc_port; + + /* clear counters */ + memset(&hwqp->counters, 0, sizeof(struct spdk_nvmf_fc_errors)); + + nvmf_fc_init_poller_queues(hwqp); + if (&fc_port->ls_queue != hwqp) { + nvmf_fc_create_req_mempool(hwqp); + } + + nvmf_fc_init_q(hwqp); + TAILQ_INIT(&hwqp->connection_list); + TAILQ_INIT(&hwqp->sync_cbs); + TAILQ_INIT(&hwqp->ls_pending_queue); +} + +static struct spdk_nvmf_fc_poll_group * +nvmf_fc_get_idlest_poll_group(void) +{ + uint32_t max_count = UINT32_MAX; + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_poll_group *ret_fgroup = NULL; + + /* find poll group with least number of hwqp's assigned to it */ + TAILQ_FOREACH(fgroup, 
&g_nvmf_fgroups, link) { + if (fgroup->hwqp_count < max_count) { + ret_fgroup = fgroup; + max_count = fgroup->hwqp_count; + } + } + + return ret_fgroup; +} + +void +nvmf_fc_poll_group_add_hwqp(struct spdk_nvmf_fc_hwqp *hwqp) +{ + struct spdk_nvmf_fc_poll_group *fgroup = NULL; + + assert(hwqp); + if (hwqp == NULL) { + SPDK_ERRLOG("Error: hwqp is NULL\n"); + return; + } + + assert(g_nvmf_fgroup_count); + + fgroup = nvmf_fc_get_idlest_poll_group(); + if (!fgroup) { + SPDK_ERRLOG("Could not assign poll group for hwqp (%d)\n", hwqp->hwqp_id); + return; + } + + hwqp->thread = fgroup->group.group->thread; + hwqp->fgroup = fgroup; + fgroup->hwqp_count++; + nvmf_fc_poller_api_func(hwqp, SPDK_NVMF_FC_POLLER_API_ADD_HWQP, NULL); +} + +void +nvmf_fc_poll_group_remove_hwqp(struct spdk_nvmf_fc_hwqp *hwqp) +{ + assert(hwqp); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "Remove hwqp from poller: for port: %d, hwqp: %d\n", + hwqp->fc_port->port_hdl, hwqp->hwqp_id); + + if (!hwqp->fgroup) { + SPDK_ERRLOG("HWQP (%d) not assigned to poll group\n", hwqp->hwqp_id); + } else { + hwqp->fgroup->hwqp_count--; + nvmf_fc_poller_api_func(hwqp, SPDK_NVMF_FC_POLLER_API_REMOVE_HWQP, NULL); + } +} + +/* + * Note: This needs to be used only on master poller. + */ +static uint64_t +nvmf_fc_get_abts_unique_id(void) +{ + static uint32_t u_id = 0; + + return (uint64_t)(++u_id); +} + +static void +nvmf_fc_queue_synced_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct spdk_nvmf_fc_abts_ctx *ctx = cb_data; + struct spdk_nvmf_fc_poller_api_abts_recvd_args *args, *poller_arg; + + ctx->hwqps_responded++; + + if (ctx->hwqps_responded < ctx->num_hwqps) { + /* Wait for all pollers to complete. */ + return; + } + + /* Free the queue sync poller args. */ + free(ctx->sync_poller_args); + + /* Mark as queue synced */ + ctx->queue_synced = true; + + /* Reset the ctx values */ + ctx->hwqps_responded = 0; + ctx->handled = false; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "QueueSync(0x%lx) completed for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + + /* Resend ABTS to pollers */ + args = ctx->abts_poller_args; + for (int i = 0; i < ctx->num_hwqps; i++) { + poller_arg = args + i; + nvmf_fc_poller_api_func(poller_arg->hwqp, + SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED, + poller_arg); + } +} + +static int +nvmf_fc_handle_abts_notfound(struct spdk_nvmf_fc_abts_ctx *ctx) +{ + struct spdk_nvmf_fc_poller_api_queue_sync_args *args, *poller_arg; + struct spdk_nvmf_fc_poller_api_abts_recvd_args *abts_args, *abts_poller_arg; + + /* check if FC driver supports queue sync */ + if (!nvmf_fc_q_sync_available()) { + return -EPERM; + } + + assert(ctx); + if (!ctx) { + SPDK_ERRLOG("NULL ctx pointer"); + return -EINVAL; + } + + /* Reset the ctx values */ + ctx->hwqps_responded = 0; + + args = calloc(ctx->num_hwqps, + sizeof(struct spdk_nvmf_fc_poller_api_queue_sync_args)); + if (!args) { + SPDK_ERRLOG("QueueSync(0x%lx) failed for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + return -ENOMEM; + } + ctx->sync_poller_args = args; + + abts_args = ctx->abts_poller_args; + for (int i = 0; i < ctx->num_hwqps; i++) { + abts_poller_arg = abts_args + i; + poller_arg = args + i; + poller_arg->u_id = ctx->u_id; + poller_arg->hwqp = abts_poller_arg->hwqp; + poller_arg->cb_info.cb_func = nvmf_fc_queue_synced_cb; + poller_arg->cb_info.cb_data = ctx; + poller_arg->cb_info.cb_thread = spdk_get_thread(); + + /* Send a Queue sync 
message to interested pollers */ + nvmf_fc_poller_api_func(poller_arg->hwqp, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC, + poller_arg); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "QueueSync(0x%lx) Sent for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + + /* Post Marker to queue to track aborted request */ + nvmf_fc_issue_q_sync(ctx->ls_hwqp, ctx->u_id, ctx->fcp_rq_id); + + return 0; +} + +static void +nvmf_fc_abts_handled_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct spdk_nvmf_fc_abts_ctx *ctx = cb_data; + struct spdk_nvmf_fc_nport *nport = NULL; + + if (ret != SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND) { + ctx->handled = true; + } + + ctx->hwqps_responded++; + + if (ctx->hwqps_responded < ctx->num_hwqps) { + /* Wait for all pollers to complete. */ + return; + } + + nport = nvmf_fc_nport_find(ctx->port_hdl, ctx->nport_hdl); + + if (ctx->nport != nport) { + /* Nport can be deleted while this abort is being + * processed by the pollers. + */ + SPDK_NOTICELOG("nport_%d deleted while processing ABTS frame, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + } else { + if (!ctx->handled) { + /* Try syncing the queues and try one more time */ + if (!ctx->queue_synced && (nvmf_fc_handle_abts_notfound(ctx) == 0)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "QueueSync(0x%lx) for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + return; + } else { + /* Send Reject */ + nvmf_fc_xmt_bls_rsp(&ctx->nport->fc_port->ls_queue, + ctx->oxid, ctx->rxid, ctx->rpi, true, + FCNVME_BLS_REJECT_EXP_INVALID_OXID, NULL, NULL); + } + } else { + /* Send Accept */ + nvmf_fc_xmt_bls_rsp(&ctx->nport->fc_port->ls_queue, + ctx->oxid, ctx->rxid, ctx->rpi, false, + 0, NULL, NULL); + } + } + SPDK_NOTICELOG("BLS_%s sent for ABTS frame nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + (ctx->handled) ? "ACC" : "REJ", ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + + free(ctx->abts_poller_args); + free(ctx); +} + +void +nvmf_fc_handle_abts_frame(struct spdk_nvmf_fc_nport *nport, uint16_t rpi, + uint16_t oxid, uint16_t rxid) +{ + struct spdk_nvmf_fc_abts_ctx *ctx = NULL; + struct spdk_nvmf_fc_poller_api_abts_recvd_args *args = NULL, *poller_arg; + struct spdk_nvmf_fc_association *assoc = NULL; + struct spdk_nvmf_fc_conn *conn = NULL; + uint32_t hwqp_cnt = 0; + bool skip_hwqp_cnt; + struct spdk_nvmf_fc_hwqp **hwqps = NULL; + uint32_t i; + + SPDK_NOTICELOG("Handle ABTS frame for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + nport->nport_hdl, rpi, oxid, rxid); + + /* Allocate memory to track hwqp's with at least 1 active connection. */ + hwqps = calloc(nport->fc_port->num_io_queues, sizeof(struct spdk_nvmf_fc_hwqp *)); + if (hwqps == NULL) { + SPDK_ERRLOG("Unable to allocate temp. hwqp array for abts processing!\n"); + goto bls_rej; + } + + TAILQ_FOREACH(assoc, &nport->fc_associations, link) { + TAILQ_FOREACH(conn, &assoc->fc_conns, assoc_link) { + if (conn->rpi != rpi) { + continue; + } + + skip_hwqp_cnt = false; + for (i = 0; i < hwqp_cnt; i++) { + if (hwqps[i] == conn->hwqp) { + /* Skip. 
This is already present */ + skip_hwqp_cnt = true; + break; + } + } + if (!skip_hwqp_cnt) { + assert(hwqp_cnt < nport->fc_port->num_io_queues); + hwqps[hwqp_cnt] = conn->hwqp; + hwqp_cnt++; + } + } + } + + if (!hwqp_cnt) { + goto bls_rej; + } + + args = calloc(hwqp_cnt, + sizeof(struct spdk_nvmf_fc_poller_api_abts_recvd_args)); + if (!args) { + goto bls_rej; + } + + ctx = calloc(1, sizeof(struct spdk_nvmf_fc_abts_ctx)); + if (!ctx) { + goto bls_rej; + } + ctx->rpi = rpi; + ctx->oxid = oxid; + ctx->rxid = rxid; + ctx->nport = nport; + ctx->nport_hdl = nport->nport_hdl; + ctx->port_hdl = nport->fc_port->port_hdl; + ctx->num_hwqps = hwqp_cnt; + ctx->ls_hwqp = &nport->fc_port->ls_queue; + ctx->fcp_rq_id = nport->fc_port->fcp_rq_id; + ctx->abts_poller_args = args; + + /* Get a unique context for this ABTS */ + ctx->u_id = nvmf_fc_get_abts_unique_id(); + + for (i = 0; i < hwqp_cnt; i++) { + poller_arg = args + i; + poller_arg->hwqp = hwqps[i]; + poller_arg->cb_info.cb_func = nvmf_fc_abts_handled_cb; + poller_arg->cb_info.cb_data = ctx; + poller_arg->cb_info.cb_thread = spdk_get_thread(); + poller_arg->ctx = ctx; + + nvmf_fc_poller_api_func(poller_arg->hwqp, + SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED, + poller_arg); + } + + free(hwqps); + + return; +bls_rej: + free(args); + free(hwqps); + + /* Send Reject */ + nvmf_fc_xmt_bls_rsp(&nport->fc_port->ls_queue, oxid, rxid, rpi, + true, FCNVME_BLS_REJECT_EXP_NOINFO, NULL, NULL); + SPDK_NOTICELOG("BLS_RJT for ABTS frame for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + nport->nport_hdl, rpi, oxid, rxid); + return; +} + +/*** Accessor functions for the FC structures - BEGIN */ +/* + * Returns true if the port is in offline state. + */ +bool +nvmf_fc_port_is_offline(struct spdk_nvmf_fc_port *fc_port) +{ + if (fc_port && (fc_port->hw_port_status == SPDK_FC_PORT_OFFLINE)) { + return true; + } + + return false; +} + +/* + * Returns true if the port is in online state. 
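The ABTS handling above fans the abort out to every hwqp that carries a connection for the reporting RPI and compares hwqps_responded against num_hwqps to decide when all pollers have answered; only the last responder sends the BLS accept or reject. The same gather pattern in a standalone sketch with hypothetical callback plumbing:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Per-ABTS context: one broadcast, num_hwqps expected answers. */
struct abts_ctx {
	uint32_t num_hwqps;
	uint32_t hwqps_responded;
	bool handled;
};

/* Called once per hwqp poller; the last responder decides ACC vs RJT. */
static void abts_handled(struct abts_ctx *ctx, bool found)
{
	if (found) {
		ctx->handled = true;
	}
	if (++ctx->hwqps_responded < ctx->num_hwqps) {
		return;	/* still waiting for the other pollers */
	}
	printf("send BLS_%s\n", ctx->handled ? "ACC" : "RJT");
}

int main(void)
{
	struct abts_ctx ctx = { .num_hwqps = 3 };

	abts_handled(&ctx, false);
	abts_handled(&ctx, true);
	abts_handled(&ctx, false);	/* third answer triggers the response */
	return 0;
}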
+ */ +bool +nvmf_fc_port_is_online(struct spdk_nvmf_fc_port *fc_port) +{ + if (fc_port && (fc_port->hw_port_status == SPDK_FC_PORT_ONLINE)) { + return true; + } + + return false; +} + +int +nvmf_fc_port_set_online(struct spdk_nvmf_fc_port *fc_port) +{ + if (fc_port && (fc_port->hw_port_status != SPDK_FC_PORT_ONLINE)) { + fc_port->hw_port_status = SPDK_FC_PORT_ONLINE; + return 0; + } + + return -EPERM; +} + +int +nvmf_fc_port_set_offline(struct spdk_nvmf_fc_port *fc_port) +{ + if (fc_port && (fc_port->hw_port_status != SPDK_FC_PORT_OFFLINE)) { + fc_port->hw_port_status = SPDK_FC_PORT_OFFLINE; + return 0; + } + + return -EPERM; +} + +int +nvmf_fc_hwqp_set_online(struct spdk_nvmf_fc_hwqp *hwqp) +{ + if (hwqp && (hwqp->state != SPDK_FC_HWQP_ONLINE)) { + hwqp->state = SPDK_FC_HWQP_ONLINE; + /* reset some queue counters */ + hwqp->num_conns = 0; + return nvmf_fc_set_q_online_state(hwqp, true); + } + + return -EPERM; +} + +int +nvmf_fc_hwqp_set_offline(struct spdk_nvmf_fc_hwqp *hwqp) +{ + if (hwqp && (hwqp->state != SPDK_FC_HWQP_OFFLINE)) { + hwqp->state = SPDK_FC_HWQP_OFFLINE; + return nvmf_fc_set_q_online_state(hwqp, false); + } + + return -EPERM; +} + +void +nvmf_fc_port_add(struct spdk_nvmf_fc_port *fc_port) +{ + TAILQ_INSERT_TAIL(&g_spdk_nvmf_fc_port_list, fc_port, link); +} + +struct spdk_nvmf_fc_port * +nvmf_fc_port_lookup(uint8_t port_hdl) +{ + struct spdk_nvmf_fc_port *fc_port = NULL; + + TAILQ_FOREACH(fc_port, &g_spdk_nvmf_fc_port_list, link) { + if (fc_port->port_hdl == port_hdl) { + return fc_port; + } + } + return NULL; +} + +static void +nvmf_fc_port_cleanup(void) +{ + struct spdk_nvmf_fc_port *fc_port, *tmp; + struct spdk_nvmf_fc_hwqp *hwqp; + uint32_t i; + + TAILQ_FOREACH_SAFE(fc_port, &g_spdk_nvmf_fc_port_list, link, tmp) { + TAILQ_REMOVE(&g_spdk_nvmf_fc_port_list, fc_port, link); + for (i = 0; i < fc_port->num_io_queues; i++) { + hwqp = &fc_port->io_queues[i]; + if (hwqp->fc_reqs_buf) { + free(hwqp->fc_reqs_buf); + } + } + free(fc_port); + } +} + +uint32_t +nvmf_fc_get_prli_service_params(void) +{ + return (SPDK_NVMF_FC_DISCOVERY_SERVICE | SPDK_NVMF_FC_TARGET_FUNCTION); +} + +int +nvmf_fc_port_add_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport) +{ + if (fc_port) { + TAILQ_INSERT_TAIL(&fc_port->nport_list, nport, link); + fc_port->num_nports++; + return 0; + } + + return -EINVAL; +} + +int +nvmf_fc_port_remove_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport) +{ + if (fc_port && nport) { + TAILQ_REMOVE(&fc_port->nport_list, nport, link); + fc_port->num_nports--; + return 0; + } + + return -EINVAL; +} + +static struct spdk_nvmf_fc_nport * +nvmf_fc_nport_hdl_lookup(struct spdk_nvmf_fc_port *fc_port, uint16_t nport_hdl) +{ + struct spdk_nvmf_fc_nport *fc_nport = NULL; + + TAILQ_FOREACH(fc_nport, &fc_port->nport_list, link) { + if (fc_nport->nport_hdl == nport_hdl) { + return fc_nport; + } + } + + return NULL; +} + +struct spdk_nvmf_fc_nport * +nvmf_fc_nport_find(uint8_t port_hdl, uint16_t nport_hdl) +{ + struct spdk_nvmf_fc_port *fc_port = NULL; + + fc_port = nvmf_fc_port_lookup(port_hdl); + if (fc_port) { + return nvmf_fc_nport_hdl_lookup(fc_port, nport_hdl); + } + + return NULL; +} + +static inline int +nvmf_fc_hwqp_find_nport_and_rport(struct spdk_nvmf_fc_hwqp *hwqp, + uint32_t d_id, struct spdk_nvmf_fc_nport **nport, + uint32_t s_id, struct spdk_nvmf_fc_remote_port_info **rport) +{ + struct spdk_nvmf_fc_nport *n_port; + struct spdk_nvmf_fc_remote_port_info *r_port; + + assert(hwqp); + if (hwqp == NULL) { + 
SPDK_ERRLOG("Error: hwqp is NULL\n"); + return -EINVAL; + } + assert(nport); + if (nport == NULL) { + SPDK_ERRLOG("Error: nport is NULL\n"); + return -EINVAL; + } + assert(rport); + if (rport == NULL) { + SPDK_ERRLOG("Error: rport is NULL\n"); + return -EINVAL; + } + + TAILQ_FOREACH(n_port, &hwqp->fc_port->nport_list, link) { + if (n_port->d_id == d_id) { + TAILQ_FOREACH(r_port, &n_port->rem_port_list, link) { + if (r_port->s_id == s_id) { + *nport = n_port; + *rport = r_port; + return 0; + } + } + break; + } + } + + return -ENOENT; +} + +/* Returns true if the Nport is empty of all rem_ports */ +bool +nvmf_fc_nport_has_no_rport(struct spdk_nvmf_fc_nport *nport) +{ + if (nport && TAILQ_EMPTY(&nport->rem_port_list)) { + assert(nport->rport_count == 0); + return true; + } else { + return false; + } +} + +int +nvmf_fc_nport_set_state(struct spdk_nvmf_fc_nport *nport, + enum spdk_nvmf_fc_object_state state) +{ + if (nport) { + nport->nport_state = state; + return 0; + } else { + return -EINVAL; + } +} + +bool +nvmf_fc_nport_add_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port) +{ + if (nport && rem_port) { + TAILQ_INSERT_TAIL(&nport->rem_port_list, rem_port, link); + nport->rport_count++; + return 0; + } else { + return -EINVAL; + } +} + +bool +nvmf_fc_nport_remove_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port) +{ + if (nport && rem_port) { + TAILQ_REMOVE(&nport->rem_port_list, rem_port, link); + nport->rport_count--; + return 0; + } else { + return -EINVAL; + } +} + +int +nvmf_fc_rport_set_state(struct spdk_nvmf_fc_remote_port_info *rport, + enum spdk_nvmf_fc_object_state state) +{ + if (rport) { + rport->rport_state = state; + return 0; + } else { + return -EINVAL; + } +} +int +nvmf_fc_assoc_set_state(struct spdk_nvmf_fc_association *assoc, + enum spdk_nvmf_fc_object_state state) +{ + if (assoc) { + assoc->assoc_state = state; + return 0; + } else { + return -EINVAL; + } +} + +static struct spdk_nvmf_fc_association * +nvmf_ctrlr_get_fc_assoc(struct spdk_nvmf_ctrlr *ctrlr) +{ + struct spdk_nvmf_qpair *qpair = ctrlr->admin_qpair; + struct spdk_nvmf_fc_conn *fc_conn; + + if (!qpair) { + SPDK_ERRLOG("Controller %d has no associations\n", ctrlr->cntlid); + return NULL; + } + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + + return fc_conn->fc_assoc; +} + +bool +nvmf_ctrlr_is_on_nport(uint8_t port_hdl, uint16_t nport_hdl, + struct spdk_nvmf_ctrlr *ctrlr) +{ + struct spdk_nvmf_fc_nport *fc_nport = NULL; + struct spdk_nvmf_fc_association *assoc = NULL; + + if (!ctrlr) { + return false; + } + + fc_nport = nvmf_fc_nport_find(port_hdl, nport_hdl); + if (!fc_nport) { + return false; + } + + assoc = nvmf_ctrlr_get_fc_assoc(ctrlr); + if (assoc && assoc->tgtport == fc_nport) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "Controller: %d corresponding to association: %p(%lu:%d) is on port: %d nport: %d\n", + ctrlr->cntlid, assoc, assoc->assoc_id, assoc->assoc_state, port_hdl, + nport_hdl); + return true; + } + return false; +} + +static inline bool +nvmf_fc_req_in_bdev(struct spdk_nvmf_fc_request *fc_req) +{ + switch (fc_req->state) { + case SPDK_NVMF_FC_REQ_READ_BDEV: + case SPDK_NVMF_FC_REQ_WRITE_BDEV: + case SPDK_NVMF_FC_REQ_NONE_BDEV: + return true; + default: + return false; + } +} + +static inline bool +nvmf_fc_req_in_pending(struct spdk_nvmf_fc_request *fc_req) +{ + struct spdk_nvmf_request *tmp = NULL; + + STAILQ_FOREACH(tmp, &fc_req->hwqp->fgroup->group.pending_buf_queue, buf_link) { + if (tmp == 
&fc_req->req) { + return true; + } + } + return false; +} + +static void +nvmf_fc_req_bdev_abort(void *arg1) +{ + struct spdk_nvmf_fc_request *fc_req = arg1; + struct spdk_nvmf_ctrlr *ctrlr = fc_req->req.qpair->ctrlr; + int i; + + /* Initial release - we don't have to abort Admin Queue or + * Fabric commands. The AQ commands supported at this time are + * Get-Log-Page, + * Identify + * Set Features + * Get Features + * AER -> Special case and handled differently. + * Every one of the above Admin commands (except AER) run + * to completion and so an Abort of such commands doesn't + * make sense. + */ + /* The Fabric commands supported are + * Property Set + * Property Get + * Connect -> Special case (async. handling). Not sure how to + * handle at this point. Let it run to completion. + */ + for (i = 0; i < NVMF_MAX_ASYNC_EVENTS; i++) { + if (ctrlr->aer_req[i] == &fc_req->req) { + SPDK_NOTICELOG("Abort AER request\n"); + nvmf_qpair_free_aer(fc_req->req.qpair); + } + } +} + +void +nvmf_fc_request_abort_complete(void *arg1) +{ + struct spdk_nvmf_fc_request *fc_req = + (struct spdk_nvmf_fc_request *)arg1; + struct spdk_nvmf_fc_caller_ctx *ctx = NULL, *tmp = NULL; + + /* Request abort completed. Notify all the callbacks */ + TAILQ_FOREACH_SAFE(ctx, &fc_req->abort_cbs, link, tmp) { + /* Notify */ + ctx->cb(fc_req->hwqp, 0, ctx->cb_args); + /* Remove */ + TAILQ_REMOVE(&fc_req->abort_cbs, ctx, link); + /* free */ + free(ctx); + } + + SPDK_NOTICELOG("FC Request(%p) in state :%s aborted\n", fc_req, + fc_req_state_strs[fc_req->state]); + + _nvmf_fc_request_free(fc_req); +} + +void +nvmf_fc_request_abort(struct spdk_nvmf_fc_request *fc_req, bool send_abts, + spdk_nvmf_fc_caller_cb cb, void *cb_args) +{ + struct spdk_nvmf_fc_caller_ctx *ctx = NULL; + bool kill_req = false; + + /* Add the cb to list */ + if (cb) { + ctx = calloc(1, sizeof(struct spdk_nvmf_fc_caller_ctx)); + if (!ctx) { + SPDK_ERRLOG("ctx alloc failed.\n"); + return; + } + ctx->cb = cb; + ctx->cb_args = cb_args; + + TAILQ_INSERT_TAIL(&fc_req->abort_cbs, ctx, link); + } + + if (!fc_req->is_aborted) { + /* Increment aborted command counter */ + fc_req->hwqp->counters.num_aborted++; + } + + /* If port is dead, skip abort wqe */ + kill_req = nvmf_fc_is_port_dead(fc_req->hwqp); + if (kill_req && nvmf_fc_req_in_xfer(fc_req)) { + fc_req->is_aborted = true; + goto complete; + } + + /* Check if the request is already marked for deletion */ + if (fc_req->is_aborted) { + return; + } + + /* Mark request as aborted */ + fc_req->is_aborted = true; + + /* If xchg is allocated, then save if we need to send abts or not. */ + if (fc_req->xchg) { + fc_req->xchg->send_abts = send_abts; + fc_req->xchg->aborted = true; + } + + if (fc_req->state == SPDK_NVMF_FC_REQ_BDEV_ABORTED) { + /* Aborted by backend */ + goto complete; + } else if (nvmf_fc_req_in_bdev(fc_req)) { + /* Notify bdev */ + spdk_thread_send_msg(fc_req->hwqp->thread, + nvmf_fc_req_bdev_abort, (void *)fc_req); + } else if (nvmf_fc_req_in_xfer(fc_req)) { + /* Notify HBA to abort this exchange */ + nvmf_fc_issue_abort(fc_req->hwqp, fc_req->xchg, NULL, NULL); + } else if (nvmf_fc_req_in_get_buff(fc_req)) { + /* Will be completed by request_complete callback. 
*/ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Abort req when getting buffers.\n"); + } else if (nvmf_fc_req_in_pending(fc_req)) { + /* Remove from pending */ + STAILQ_REMOVE(&fc_req->hwqp->fgroup->group.pending_buf_queue, &fc_req->req, + spdk_nvmf_request, buf_link); + goto complete; + } else { + /* Should never happen */ + SPDK_ERRLOG("Request in invalid state\n"); + goto complete; + } + + return; +complete: + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_ABORTED); + nvmf_fc_poller_api_func(fc_req->hwqp, SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE, + (void *)fc_req); +} + +static int +nvmf_fc_request_alloc_buffers(struct spdk_nvmf_fc_request *fc_req) +{ + uint32_t length = fc_req->req.length; + struct spdk_nvmf_fc_poll_group *fgroup = fc_req->hwqp->fgroup; + struct spdk_nvmf_transport_poll_group *group = &fgroup->group; + struct spdk_nvmf_transport *transport = group->transport; + + if (spdk_nvmf_request_get_buffers(&fc_req->req, group, transport, length)) { + return -ENOMEM; + } + + return 0; +} + +static int +nvmf_fc_request_execute(struct spdk_nvmf_fc_request *fc_req) +{ + /* Allocate an XCHG if we dont use send frame for this command. */ + if (!nvmf_fc_use_send_frame(&fc_req->req)) { + fc_req->xchg = nvmf_fc_get_xri(fc_req->hwqp); + if (!fc_req->xchg) { + fc_req->hwqp->counters.no_xchg++; + printf("NO XCHGs!\n"); + goto pending; + } + } + + if (fc_req->req.length) { + if (nvmf_fc_request_alloc_buffers(fc_req) < 0) { + fc_req->hwqp->counters.buf_alloc_err++; + goto pending; + } + fc_req->req.data = fc_req->req.iov[0].iov_base; + } + + if (fc_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "WRITE CMD.\n"); + + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_WRITE_XFER); + + if (nvmf_fc_recv_data(fc_req)) { + /* Dropped return success to caller */ + fc_req->hwqp->counters.unexpected_err++; + _nvmf_fc_request_free(fc_req); + } + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "READ/NONE CMD\n"); + + if (fc_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_READ_BDEV); + } else { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_NONE_BDEV); + } + spdk_nvmf_request_exec(&fc_req->req); + } + + return 0; + +pending: + if (fc_req->xchg) { + nvmf_fc_put_xchg(fc_req->hwqp, fc_req->xchg); + fc_req->xchg = NULL; + } + + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_PENDING); + + return -EAGAIN; +} + +static int +nvmf_fc_hwqp_handle_request(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_frame_hdr *frame, + uint32_t buf_idx, struct spdk_nvmf_fc_buffer_desc *buffer, uint32_t plen) +{ + uint16_t cmnd_len; + uint64_t rqst_conn_id; + struct spdk_nvmf_fc_request *fc_req = NULL; + struct spdk_nvmf_fc_cmnd_iu *cmd_iu = NULL; + struct spdk_nvmf_fc_conn *fc_conn = NULL; + enum spdk_nvme_data_transfer xfer; + + cmd_iu = buffer->virt; + cmnd_len = cmd_iu->cmnd_iu_len; + cmnd_len = from_be16(&cmnd_len); + + /* check for a valid cmnd_iu format */ + if ((cmd_iu->fc_id != FCNVME_CMND_IU_FC_ID) || + (cmd_iu->scsi_id != FCNVME_CMND_IU_SCSI_ID) || + (cmnd_len != sizeof(struct spdk_nvmf_fc_cmnd_iu) / 4)) { + SPDK_ERRLOG("IU CMD error\n"); + hwqp->counters.nvme_cmd_iu_err++; + return -ENXIO; + } + + xfer = spdk_nvme_opc_get_data_transfer(cmd_iu->flags); + if (xfer == SPDK_NVME_DATA_BIDIRECTIONAL) { + SPDK_ERRLOG("IU CMD xfer error\n"); + hwqp->counters.nvme_cmd_xfer_err++; + return -EPERM; + } + + rqst_conn_id = from_be64(&cmd_iu->conn_id); + + /* Check if conn id is valid */ + fc_conn = 
nvmf_fc_hwqp_find_fc_conn(hwqp, rqst_conn_id); + if (!fc_conn) { + SPDK_ERRLOG("IU CMD conn(%ld) invalid\n", rqst_conn_id); + hwqp->counters.invalid_conn_err++; + return -ENODEV; + } + + /* If association/connection is being deleted - return */ + if (fc_conn->fc_assoc->assoc_state != SPDK_NVMF_FC_OBJECT_CREATED) { + SPDK_ERRLOG("Association state not valid\n"); + return -EACCES; + } + + if (fc_conn->qpair.state == SPDK_NVMF_QPAIR_ERROR) { + return -EACCES; + } + + /* Make sure xfer len is according to mdts */ + if (from_be32(&cmd_iu->data_len) > + hwqp->fgroup->group.transport->opts.max_io_size) { + SPDK_ERRLOG("IO length requested is greater than MDTS\n"); + return -EINVAL; + } + + /* allocate a request buffer */ + fc_req = nvmf_fc_hwqp_alloc_fc_request(hwqp); + if (fc_req == NULL) { + /* Should not happen. Since fc_reqs == RQ buffers */ + return -ENOMEM; + } + + fc_req->req.length = from_be32(&cmd_iu->data_len); + fc_req->req.qpair = &fc_conn->qpair; + fc_req->req.cmd = (union nvmf_h2c_msg *)&cmd_iu->cmd; + fc_req->req.rsp = (union nvmf_c2h_msg *)&fc_req->ersp.rsp; + fc_req->oxid = frame->ox_id; + fc_req->oxid = from_be16(&fc_req->oxid); + fc_req->rpi = fc_conn->rpi; + fc_req->buf_index = buf_idx; + fc_req->poller_lcore = hwqp->lcore_id; + fc_req->poller_thread = hwqp->thread; + fc_req->hwqp = hwqp; + fc_req->fc_conn = fc_conn; + fc_req->req.xfer = xfer; + fc_req->s_id = (uint32_t)frame->s_id; + fc_req->d_id = (uint32_t)frame->d_id; + fc_req->s_id = from_be32(&fc_req->s_id) >> 8; + fc_req->d_id = from_be32(&fc_req->d_id) >> 8; + + nvmf_fc_record_req_trace_point(fc_req, SPDK_NVMF_FC_REQ_INIT); + if (nvmf_fc_request_execute(fc_req)) { + STAILQ_INSERT_TAIL(&hwqp->fgroup->group.pending_buf_queue, &fc_req->req, buf_link); + } + + return 0; +} + +/* + * These functions are called from the FC LLD + */ + +void +_nvmf_fc_request_free(struct spdk_nvmf_fc_request *fc_req) +{ + struct spdk_nvmf_fc_hwqp *hwqp = fc_req->hwqp; + struct spdk_nvmf_fc_poll_group *fgroup = hwqp->fgroup; + struct spdk_nvmf_transport_poll_group *group = &fgroup->group; + struct spdk_nvmf_transport *transport = group->transport; + + if (!fc_req) { + return; + } + + if (fc_req->xchg) { + nvmf_fc_put_xchg(hwqp, fc_req->xchg); + fc_req->xchg = NULL; + } + + /* Release IO buffers */ + if (fc_req->req.data_from_pool) { + spdk_nvmf_request_free_buffers(&fc_req->req, group, transport); + } + fc_req->req.data = NULL; + fc_req->req.iovcnt = 0; + + /* Release Q buffer */ + nvmf_fc_rqpair_buffer_release(hwqp, fc_req->buf_index); + + /* Free Fc request */ + nvmf_fc_hwqp_free_fc_request(hwqp, fc_req); +} + +void +nvmf_fc_request_set_state(struct spdk_nvmf_fc_request *fc_req, + enum spdk_nvmf_fc_request_state state) +{ + assert(fc_req->magic != 0xDEADBEEF); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "FC Request(%p):\n\tState Old:%s New:%s\n", fc_req, + nvmf_fc_request_get_state_str(fc_req->state), + nvmf_fc_request_get_state_str(state)); + nvmf_fc_record_req_trace_point(fc_req, state); + fc_req->state = state; +} + +char * +nvmf_fc_request_get_state_str(int state) +{ + static char *unk_str = "unknown"; + + return (state >= 0 && state < (int)(sizeof(fc_req_state_strs) / sizeof(char *)) ? 
+ fc_req_state_strs[state] : unk_str); +} + +int +nvmf_fc_hwqp_process_frame(struct spdk_nvmf_fc_hwqp *hwqp, + uint32_t buff_idx, + struct spdk_nvmf_fc_frame_hdr *frame, + struct spdk_nvmf_fc_buffer_desc *buffer, + uint32_t plen) +{ + int rc = 0; + uint32_t s_id, d_id; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_remote_port_info *rport = NULL; + + s_id = (uint32_t)frame->s_id; + d_id = (uint32_t)frame->d_id; + s_id = from_be32(&s_id) >> 8; + d_id = from_be32(&d_id) >> 8; + + /* Note: In tracelog below, we directly do endian conversion on rx_id and. + * ox_id Since these are fields, we can't pass address to from_be16(). + * Since ox_id and rx_id are only needed for tracelog, assigning to local + * vars. and doing conversion is a waste of time in non-debug builds. */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "Process NVME frame s_id:0x%x d_id:0x%x oxid:0x%x rxid:0x%x.\n", + s_id, d_id, + ((frame->ox_id << 8) & 0xff00) | ((frame->ox_id >> 8) & 0xff), + ((frame->rx_id << 8) & 0xff00) | ((frame->rx_id >> 8) & 0xff)); + + rc = nvmf_fc_hwqp_find_nport_and_rport(hwqp, d_id, &nport, s_id, &rport); + if (rc) { + if (nport == NULL) { + SPDK_ERRLOG("Nport not found. Dropping\n"); + /* increment invalid nport counter */ + hwqp->counters.nport_invalid++; + } else if (rport == NULL) { + SPDK_ERRLOG("Rport not found. Dropping\n"); + /* increment invalid rport counter */ + hwqp->counters.rport_invalid++; + } + return rc; + } + + if (nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED || + rport->rport_state != SPDK_NVMF_FC_OBJECT_CREATED) { + SPDK_ERRLOG("%s state not created. Dropping\n", + nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED ? + "Nport" : "Rport"); + return -EACCES; + } + + if ((frame->r_ctl == FCNVME_R_CTL_LS_REQUEST) && + (frame->type == FCNVME_TYPE_NVMF_DATA)) { + struct spdk_nvmf_fc_rq_buf_ls_request *req_buf = buffer->virt; + struct spdk_nvmf_fc_ls_rqst *ls_rqst; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Process LS NVME frame\n"); + + /* Use the RQ buffer for holding LS request. */ + ls_rqst = (struct spdk_nvmf_fc_ls_rqst *)&req_buf->ls_rqst; + + /* Fill in the LS request structure */ + ls_rqst->rqstbuf.virt = (void *)&req_buf->rqst; + ls_rqst->rqstbuf.phys = buffer->phys + + offsetof(struct spdk_nvmf_fc_rq_buf_ls_request, rqst); + ls_rqst->rqstbuf.buf_index = buff_idx; + ls_rqst->rqst_len = plen; + + ls_rqst->rspbuf.virt = (void *)&req_buf->resp; + ls_rqst->rspbuf.phys = buffer->phys + + offsetof(struct spdk_nvmf_fc_rq_buf_ls_request, resp); + ls_rqst->rsp_len = FCNVME_MAX_LS_RSP_SIZE; + + ls_rqst->private_data = (void *)hwqp; + ls_rqst->rpi = rport->rpi; + ls_rqst->oxid = (uint16_t)frame->ox_id; + ls_rqst->oxid = from_be16(&ls_rqst->oxid); + ls_rqst->s_id = s_id; + ls_rqst->d_id = d_id; + ls_rqst->nport = nport; + ls_rqst->rport = rport; + ls_rqst->nvmf_tgt = g_nvmf_ftransport->transport.tgt; + + ls_rqst->xchg = nvmf_fc_get_xri(hwqp); + if (ls_rqst->xchg) { + /* Handover the request to LS module */ + nvmf_fc_handle_ls_rqst(ls_rqst); + } else { + /* No XCHG available. Add to pending list. */ + hwqp->counters.no_xchg++; + TAILQ_INSERT_TAIL(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link); + } + } else if ((frame->r_ctl == FCNVME_R_CTL_CMD_REQ) && + (frame->type == FCNVME_TYPE_FC_EXCHANGE)) { + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Process IO NVME frame\n"); + rc = nvmf_fc_hwqp_handle_request(hwqp, frame, buff_idx, buffer, plen); + } else { + + SPDK_ERRLOG("Unknown frame received. 
Dropping\n"); + hwqp->counters.unknown_frame++; + rc = -EINVAL; + } + + return rc; +} + +void +nvmf_fc_hwqp_process_pending_reqs(struct spdk_nvmf_fc_hwqp *hwqp) +{ + struct spdk_nvmf_request *req = NULL, *tmp; + struct spdk_nvmf_fc_request *fc_req; + int budget = 64; + + if (!hwqp->fgroup) { + /* LS queue is tied to acceptor_poll group and LS pending requests + * are stagged and processed using hwqp->ls_pending_queue. + */ + return; + } + + STAILQ_FOREACH_SAFE(req, &hwqp->fgroup->group.pending_buf_queue, buf_link, tmp) { + fc_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_fc_request, req); + if (!nvmf_fc_request_execute(fc_req)) { + /* Succesfuly posted, Delete from pending. */ + STAILQ_REMOVE_HEAD(&hwqp->fgroup->group.pending_buf_queue, buf_link); + } + + if (budget) { + budget--; + } else { + return; + } + } +} + +void +nvmf_fc_hwqp_process_pending_ls_rqsts(struct spdk_nvmf_fc_hwqp *hwqp) +{ + struct spdk_nvmf_fc_ls_rqst *ls_rqst = NULL, *tmp; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_remote_port_info *rport = NULL; + + TAILQ_FOREACH_SAFE(ls_rqst, &hwqp->ls_pending_queue, ls_pending_link, tmp) { + /* lookup nport and rport again - make sure they are still valid */ + int rc = nvmf_fc_hwqp_find_nport_and_rport(hwqp, ls_rqst->d_id, &nport, ls_rqst->s_id, &rport); + if (rc) { + if (nport == NULL) { + SPDK_ERRLOG("Nport not found. Dropping\n"); + /* increment invalid nport counter */ + hwqp->counters.nport_invalid++; + } else if (rport == NULL) { + SPDK_ERRLOG("Rport not found. Dropping\n"); + /* increment invalid rport counter */ + hwqp->counters.rport_invalid++; + } + TAILQ_REMOVE(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link); + /* Return buffer to chip */ + nvmf_fc_rqpair_buffer_release(hwqp, ls_rqst->rqstbuf.buf_index); + continue; + } + if (nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED || + rport->rport_state != SPDK_NVMF_FC_OBJECT_CREATED) { + SPDK_ERRLOG("%s state not created. Dropping\n", + nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED ? + "Nport" : "Rport"); + TAILQ_REMOVE(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link); + /* Return buffer to chip */ + nvmf_fc_rqpair_buffer_release(hwqp, ls_rqst->rqstbuf.buf_index); + continue; + } + + ls_rqst->xchg = nvmf_fc_get_xri(hwqp); + if (ls_rqst->xchg) { + /* Got an XCHG */ + TAILQ_REMOVE(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link); + /* Handover the request to LS module */ + nvmf_fc_handle_ls_rqst(ls_rqst); + } else { + /* No more XCHGs. Stop processing. 
*/ + hwqp->counters.no_xchg++; + return; + } + } +} + +int +nvmf_fc_handle_rsp(struct spdk_nvmf_fc_request *fc_req) +{ + int rc = 0; + struct spdk_nvmf_request *req = &fc_req->req; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_fc_conn *fc_conn = nvmf_fc_get_conn(qpair); + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint16_t ersp_len = 0; + + /* set sq head value in resp */ + rsp->sqhd = nvmf_fc_advance_conn_sqhead(qpair); + + /* Increment connection responses */ + fc_conn->rsp_count++; + + if (nvmf_fc_send_ersp_required(fc_req, fc_conn->rsp_count, + fc_req->transfered_len)) { + /* Fill ERSP Len */ + to_be16(&ersp_len, (sizeof(struct spdk_nvmf_fc_ersp_iu) / + sizeof(uint32_t))); + fc_req->ersp.ersp_len = ersp_len; + + /* Fill RSN */ + to_be32(&fc_req->ersp.response_seq_no, fc_conn->rsn); + fc_conn->rsn++; + + /* Fill transfer length */ + to_be32(&fc_req->ersp.transferred_data_len, fc_req->transfered_len); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Posting ERSP.\n"); + rc = nvmf_fc_xmt_rsp(fc_req, (uint8_t *)&fc_req->ersp, + sizeof(struct spdk_nvmf_fc_ersp_iu)); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Posting RSP.\n"); + rc = nvmf_fc_xmt_rsp(fc_req, NULL, 0); + } + + return rc; +} + +bool +nvmf_fc_send_ersp_required(struct spdk_nvmf_fc_request *fc_req, + uint32_t rsp_cnt, uint32_t xfer_len) +{ + struct spdk_nvmf_request *req = &fc_req->req; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_fc_conn *fc_conn = nvmf_fc_get_conn(qpair); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint16_t status = *((uint16_t *)&rsp->status); + + /* + * Check if we need to send ERSP + * 1) For every N responses where N == ersp_ratio + * 2) Fabric commands. + * 3) Completion status failed or Completion dw0 or dw1 valid. + * 4) SQ == 90% full. + * 5) Transfer length not equal to CMD IU length + */ + + if (!(rsp_cnt % fc_conn->esrp_ratio) || + (cmd->opc == SPDK_NVME_OPC_FABRIC) || + (status & 0xFFFE) || rsp->cdw0 || rsp->rsvd1 || + (req->length != xfer_len)) { + return true; + } + return false; +} + +static int +nvmf_fc_request_complete(struct spdk_nvmf_request *req) +{ + int rc = 0; + struct spdk_nvmf_fc_request *fc_req = nvmf_fc_get_fc_req(req); + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + if (fc_req->is_aborted) { + /* Defer this to make sure we dont call io cleanup in same context. 
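+ * REQ_ABORT_COMPLETE ends in nvmf_fc_request_abort_complete() above, which notifies the registered abort callbacks and frees the request.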
*/ + nvmf_fc_poller_api_func(fc_req->hwqp, SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE, + (void *)fc_req); + } else if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && + req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_READ_XFER); + + rc = nvmf_fc_send_data(fc_req); + } else { + if (req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_WRITE_RSP); + } else if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_READ_RSP); + } else { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_NONE_RSP); + } + + rc = nvmf_fc_handle_rsp(fc_req); + } + + if (rc) { + SPDK_ERRLOG("Error in request complete.\n"); + _nvmf_fc_request_free(fc_req); + } + return 0; +} + +struct spdk_nvmf_tgt * +nvmf_fc_get_tgt(void) +{ + if (g_nvmf_ftransport) { + return g_nvmf_ftransport->transport.tgt; + } + return NULL; +} + +/* + * FC Transport Public API begins here + */ + +#define SPDK_NVMF_FC_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_FC_DEFAULT_AQ_DEPTH 32 +#define SPDK_NVMF_FC_DEFAULT_MAX_QPAIRS_PER_CTRLR 5 +#define SPDK_NVMF_FC_DEFAULT_IN_CAPSULE_DATA_SIZE 0 +#define SPDK_NVMF_FC_DEFAULT_MAX_IO_SIZE 65536 +#define SPDK_NVMF_FC_DEFAULT_IO_UNIT_SIZE 4096 +#define SPDK_NVMF_FC_DEFAULT_NUM_SHARED_BUFFERS 8192 +#define SPDK_NVMF_FC_DEFAULT_MAX_SGE (SPDK_NVMF_FC_DEFAULT_MAX_IO_SIZE / \ + SPDK_NVMF_FC_DEFAULT_IO_UNIT_SIZE) + +static void +nvmf_fc_opts_init(struct spdk_nvmf_transport_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_FC_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_FC_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_FC_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_FC_DEFAULT_MAX_IO_SIZE; + opts->io_unit_size = SPDK_NVMF_FC_DEFAULT_IO_UNIT_SIZE; + opts->max_aq_depth = SPDK_NVMF_FC_DEFAULT_AQ_DEPTH; + opts->num_shared_buffers = SPDK_NVMF_FC_DEFAULT_NUM_SHARED_BUFFERS; +} + +static struct spdk_nvmf_transport * +nvmf_fc_create(struct spdk_nvmf_transport_opts *opts) +{ + uint32_t sge_count; + + SPDK_INFOLOG(SPDK_LOG_NVMF_FC, "*** FC Transport Init ***\n" + " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" + " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" + " max_aq_depth=%d\n", + opts->max_queue_depth, + opts->max_io_size, + opts->max_qpairs_per_ctrlr - 1, + opts->io_unit_size, + opts->max_aq_depth); + + if (g_nvmf_ftransport) { + SPDK_ERRLOG("Duplicate NVMF-FC transport create request!\n"); + return NULL; + } + + if (spdk_env_get_last_core() < 1) { + SPDK_ERRLOG("Not enough cores/threads (%d) to run NVMF-FC transport!\n", + spdk_env_get_last_core() + 1); + return NULL; + } + + sge_count = opts->max_io_size / opts->io_unit_size; + if (sge_count > SPDK_NVMF_FC_DEFAULT_MAX_SGE) { + SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); + return NULL; + } + + g_nvmf_fc_master_thread = spdk_get_thread(); + g_nvmf_fgroup_count = 0; + g_nvmf_ftransport = calloc(1, sizeof(*g_nvmf_ftransport)); + + if (!g_nvmf_ftransport) { + SPDK_ERRLOG("Failed to allocate NVMF-FC transport\n"); + return NULL; + } + + if (pthread_mutex_init(&g_nvmf_ftransport->lock, NULL)) { + SPDK_ERRLOG("pthread_mutex_init() failed\n"); + free(g_nvmf_ftransport); + g_nvmf_ftransport = NULL; + return NULL; + } + + /* initialize the low level FC driver */ + nvmf_fc_lld_init(); + + return &g_nvmf_ftransport->transport; +} + +static int +nvmf_fc_destroy(struct spdk_nvmf_transport *transport) +{ + if (transport) 
{ + struct spdk_nvmf_fc_transport *ftransport; + struct spdk_nvmf_fc_poll_group *fgroup, *pg_tmp; + + ftransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_fc_transport, transport); + + free(ftransport); + + /* clean up any FC poll groups still around */ + TAILQ_FOREACH_SAFE(fgroup, &g_nvmf_fgroups, link, pg_tmp) { + TAILQ_REMOVE(&g_nvmf_fgroups, fgroup, link); + free(fgroup); + } + g_nvmf_fgroup_count = 0; + + /* low level FC driver clean up */ + nvmf_fc_lld_fini(); + + nvmf_fc_port_cleanup(); + } + + return 0; +} + +static int +nvmf_fc_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + return 0; +} + +static void +nvmf_fc_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *_trid) +{ +} + +static uint32_t +nvmf_fc_accept(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_fc_port *fc_port = NULL; + uint32_t count = 0; + static bool start_lld = false; + + if (spdk_unlikely(!start_lld)) { + start_lld = true; + nvmf_fc_lld_start(); + } + + /* poll the LS queue on each port */ + TAILQ_FOREACH(fc_port, &g_spdk_nvmf_fc_port_list, link) { + if (fc_port->hw_port_status == SPDK_FC_PORT_ONLINE) { + count += nvmf_fc_process_queue(&fc_port->ls_queue); + } + } + + return count; +} + +static void +nvmf_fc_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + entry->trtype = (enum spdk_nvme_transport_type) SPDK_NVMF_TRTYPE_FC; + entry->adrfam = trid->adrfam; + entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED; + + spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); + spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); +} + +static struct spdk_nvmf_transport_poll_group * +nvmf_fc_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_transport *ftransport = + SPDK_CONTAINEROF(transport, struct spdk_nvmf_fc_transport, transport); + + fgroup = calloc(1, sizeof(struct spdk_nvmf_fc_poll_group)); + if (!fgroup) { + SPDK_ERRLOG("Unable to alloc FC poll group\n"); + return NULL; + } + + TAILQ_INIT(&fgroup->hwqp_list); + + pthread_mutex_lock(&ftransport->lock); + TAILQ_INSERT_TAIL(&g_nvmf_fgroups, fgroup, link); + g_nvmf_fgroup_count++; + pthread_mutex_unlock(&ftransport->lock); + + return &fgroup->group; +} + +static void +nvmf_fc_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_transport *ftransport = + SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_fc_transport, transport); + + fgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_fc_poll_group, group); + pthread_mutex_lock(&ftransport->lock); + TAILQ_REMOVE(&g_nvmf_fgroups, fgroup, link); + g_nvmf_fgroup_count--; + pthread_mutex_unlock(&ftransport->lock); + + free(fgroup); +} + +static int +nvmf_fc_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + struct spdk_nvmf_fc_ls_add_conn_api_data *api_data = NULL; + bool hwqp_found = false; + + fgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_fc_poll_group, group); + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + + TAILQ_FOREACH(hwqp, &fgroup->hwqp_list, link) { + if (fc_conn->fc_assoc->tgtport->fc_port == hwqp->fc_port) { + 
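+ /* The new connection's association and this hwqp belong to the same physical FC port. */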
hwqp_found = true; + break; + } + } + + if (!hwqp_found) { + SPDK_ERRLOG("No valid hwqp found for new QP.\n"); + goto err; + } + + if (!nvmf_fc_assign_conn_to_hwqp(hwqp, + &fc_conn->conn_id, + fc_conn->max_queue_depth)) { + SPDK_ERRLOG("Failed to get a connection id for new QP.\n"); + goto err; + } + + fc_conn->hwqp = hwqp; + + /* If this is for ADMIN connection, then update assoc ID. */ + if (fc_conn->qpair.qid == 0) { + fc_conn->fc_assoc->assoc_id = fc_conn->conn_id; + } + + api_data = &fc_conn->create_opd->u.add_conn; + nvmf_fc_poller_api_func(hwqp, SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION, &api_data->args); + return 0; +err: + return -1; +} + +static int +nvmf_fc_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + uint32_t count = 0; + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_hwqp *hwqp; + + fgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_fc_poll_group, group); + + TAILQ_FOREACH(hwqp, &fgroup->hwqp_list, link) { + if (hwqp->state == SPDK_FC_HWQP_ONLINE) { + count += nvmf_fc_process_queue(hwqp); + } + } + + return (int) count; +} + +static int +nvmf_fc_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fc_request *fc_req = nvmf_fc_get_fc_req(req); + + if (!fc_req->is_aborted) { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_BDEV_ABORTED); + nvmf_fc_request_abort(fc_req, true, NULL, NULL); + } else { + nvmf_fc_request_abort_complete(fc_req); + } + return 0; +} + + +static void +nvmf_fc_close_qpair(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + + if (fc_conn->conn_id == NVMF_FC_INVALID_CONN_ID) { + /* QP creation failure in FC tranport. Cleanup. */ + spdk_thread_send_msg(nvmf_fc_get_master_thread(), + nvmf_fc_handle_connection_failure, fc_conn); + } else if (fc_conn->fc_assoc->assoc_id == fc_conn->conn_id && + fc_conn->fc_assoc->assoc_state != SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* Admin connection */ + spdk_thread_send_msg(nvmf_fc_get_master_thread(), + nvmf_fc_handle_assoc_deletion, fc_conn); + } +} + +static int +nvmf_fc_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + memcpy(trid, &fc_conn->trid, sizeof(struct spdk_nvme_transport_id)); + return 0; +} + +static int +nvmf_fc_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + memcpy(trid, &fc_conn->trid, sizeof(struct spdk_nvme_transport_id)); + return 0; +} + +static int +nvmf_fc_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + memcpy(trid, &fc_conn->trid, sizeof(struct spdk_nvme_transport_id)); + return 0; +} + +static void +nvmf_fc_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + spdk_nvmf_request_complete(req); +} + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_fc = { + .name = "FC", + .type = (enum spdk_nvme_transport_type) SPDK_NVMF_TRTYPE_FC, + .opts_init = nvmf_fc_opts_init, + .create = nvmf_fc_create, + .destroy = nvmf_fc_destroy, + + .listen = nvmf_fc_listen, + .stop_listen = nvmf_fc_stop_listen, + .accept = nvmf_fc_accept, + + .listener_discover = 
nvmf_fc_discover, + + .poll_group_create = nvmf_fc_poll_group_create, + .poll_group_destroy = nvmf_fc_poll_group_destroy, + .poll_group_add = nvmf_fc_poll_group_add, + .poll_group_poll = nvmf_fc_poll_group_poll, + + .req_complete = nvmf_fc_request_complete, + .req_free = nvmf_fc_request_free, + .qpair_fini = nvmf_fc_close_qpair, + .qpair_get_peer_trid = nvmf_fc_qpair_get_peer_trid, + .qpair_get_local_trid = nvmf_fc_qpair_get_local_trid, + .qpair_get_listen_trid = nvmf_fc_qpair_get_listen_trid, + .qpair_abort_request = nvmf_fc_qpair_abort_request, +}; + +/* + * Re-initialize the FC-Port after an offline event. + * Only the queue information needs to be populated. XCHG, lcore and other hwqp information remains + * unchanged after the first initialization. + * + */ +static int +nvmf_fc_adm_hw_port_reinit_validate(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_hw_port_init_args *args) +{ + uint32_t i; + + /* Verify that the port was previously in offline or quiesced state */ + if (nvmf_fc_port_is_online(fc_port)) { + SPDK_ERRLOG("SPDK FC port %d already initialized and online.\n", args->port_handle); + return -EINVAL; + } + + /* Reinit information in new LS queue from previous queue */ + nvmf_fc_hwqp_reinit_poller_queues(&fc_port->ls_queue, args->ls_queue); + + fc_port->fcp_rq_id = args->fcp_rq_id; + + /* Initialize the LS queue */ + fc_port->ls_queue.queues = args->ls_queue; + nvmf_fc_init_poller_queues(fc_port->ls_queue.queues); + + for (i = 0; i < fc_port->num_io_queues; i++) { + /* Reinit information in new IO queue from previous queue */ + nvmf_fc_hwqp_reinit_poller_queues(&fc_port->io_queues[i], + args->io_queues[i]); + fc_port->io_queues[i].queues = args->io_queues[i]; + /* Initialize the IO queues */ + nvmf_fc_init_poller_queues(fc_port->io_queues[i].queues); + } + + fc_port->hw_port_status = SPDK_FC_PORT_OFFLINE; + + /* Validate the port information */ + DEV_VERIFY(TAILQ_EMPTY(&fc_port->nport_list)); + DEV_VERIFY(fc_port->num_nports == 0); + if (!TAILQ_EMPTY(&fc_port->nport_list) || (fc_port->num_nports != 0)) { + return -EINVAL; + } + + return 0; +} + +/* Initializes the data for the creation of a FC-Port object in the SPDK + * library. The spdk_nvmf_fc_port is a well defined structure that is part of + * the API to the library. The contents added to this well defined structure + * is private to each vendors implementation. + */ +static int +nvmf_fc_adm_hw_port_data_init(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_hw_port_init_args *args) +{ + /* Used a high number for the LS HWQP so that it does not clash with the + * IO HWQP's and immediately shows a LS queue during tracing. + */ + uint32_t i; + + fc_port->port_hdl = args->port_handle; + fc_port->hw_port_status = SPDK_FC_PORT_OFFLINE; + fc_port->fcp_rq_id = args->fcp_rq_id; + fc_port->num_io_queues = args->io_queue_cnt; + + /* + * Set port context from init args. Used for FCP port stats. + */ + fc_port->port_ctx = args->port_ctx; + + /* + * Initialize the LS queue wherever needed. + */ + fc_port->ls_queue.queues = args->ls_queue; + fc_port->ls_queue.thread = nvmf_fc_get_master_thread(); + fc_port->ls_queue.hwqp_id = SPDK_MAX_NUM_OF_FC_PORTS * fc_port->num_io_queues; + + /* + * Initialize the LS queue. + */ + nvmf_fc_init_hwqp(fc_port, &fc_port->ls_queue); + + /* + * Initialize the IO queues. 
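+ * Each IO hwqp takes its hwqp_id, queue pointers and RQ size from the init args.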
+ */ + for (i = 0; i < args->io_queue_cnt; i++) { + struct spdk_nvmf_fc_hwqp *hwqp = &fc_port->io_queues[i]; + hwqp->hwqp_id = i; + hwqp->queues = args->io_queues[i]; + hwqp->rq_size = args->io_queue_size; + nvmf_fc_init_hwqp(fc_port, hwqp); + } + + /* + * Initialize the LS processing for port + */ + nvmf_fc_ls_init(fc_port); + + /* + * Initialize the list of nport on this HW port. + */ + TAILQ_INIT(&fc_port->nport_list); + fc_port->num_nports = 0; + + return 0; +} + +static void +nvmf_fc_adm_port_hwqp_offline_del_poller(struct spdk_nvmf_fc_port *fc_port) +{ + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + int i = 0; + + hwqp = &fc_port->ls_queue; + (void)nvmf_fc_hwqp_set_offline(hwqp); + + /* Remove poller for all the io queues. */ + for (i = 0; i < (int)fc_port->num_io_queues; i++) { + hwqp = &fc_port->io_queues[i]; + (void)nvmf_fc_hwqp_set_offline(hwqp); + nvmf_fc_poll_group_remove_hwqp(hwqp); + } +} + +/* + * Callback function for HW port link break operation. + * + * Notice that this callback is being triggered when spdk_fc_nport_delete() + * completes, if that spdk_fc_nport_delete() called is issued by + * nvmf_fc_adm_evnt_hw_port_link_break(). + * + * Since nvmf_fc_adm_evnt_hw_port_link_break() can invoke spdk_fc_nport_delete() multiple + * times (one per nport in the HW port's nport_list), a single call to + * nvmf_fc_adm_evnt_hw_port_link_break() can result in multiple calls to this callback function. + * + * As a result, this function only invokes a callback to the caller of + * nvmf_fc_adm_evnt_hw_port_link_break() only when the HW port's nport_list is empty. + */ +static void +nvmf_fc_adm_hw_port_link_break_cb(uint8_t port_handle, + enum spdk_fc_event event_type, void *cb_args, int spdk_err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_port_link_break_cb_data *offline_cb_args = cb_args; + struct spdk_nvmf_hw_port_link_break_args *offline_args = NULL; + spdk_nvmf_fc_callback cb_func = NULL; + int err = 0; + struct spdk_nvmf_fc_port *fc_port = NULL; + int num_nports = 0; + char log_str[256]; + + if (0 != spdk_err) { + DEV_VERIFY(!"port link break cb: spdk_err not success."); + SPDK_ERRLOG("port link break cb: spdk_err:%d.\n", spdk_err); + goto out; + } + + if (!offline_cb_args) { + DEV_VERIFY(!"port link break cb: port_offline_args is NULL."); + err = -EINVAL; + goto out; + } + + offline_args = offline_cb_args->args; + if (!offline_args) { + DEV_VERIFY(!"port link break cb: offline_args is NULL."); + err = -EINVAL; + goto out; + } + + if (port_handle != offline_args->port_handle) { + DEV_VERIFY(!"port link break cb: port_handle mismatch."); + err = -EINVAL; + goto out; + } + + cb_func = offline_cb_args->cb_func; + if (!cb_func) { + DEV_VERIFY(!"port link break cb: cb_func is NULL."); + err = -EINVAL; + goto out; + } + + fc_port = nvmf_fc_port_lookup(port_handle); + if (!fc_port) { + DEV_VERIFY(!"port link break cb: fc_port is NULL."); + SPDK_ERRLOG("port link break cb: Unable to find port:%d\n", + offline_args->port_handle); + err = -EINVAL; + goto out; + } + + num_nports = fc_port->num_nports; + if (!TAILQ_EMPTY(&fc_port->nport_list)) { + /* + * Don't call the callback unless all nports have been deleted. + */ + goto out; + } + + if (num_nports != 0) { + DEV_VERIFY(!"port link break cb: num_nports in non-zero."); + SPDK_ERRLOG("port link break cb: # of ports should be 0. Instead, num_nports:%d\n", + num_nports); + err = -EINVAL; + } + + /* + * Mark the hwqps as offline and unregister the pollers. 
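+ * This mirrors the per-queue teardown performed by the explicit HW port offline event.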
+ */ + (void)nvmf_fc_adm_port_hwqp_offline_del_poller(fc_port); + + /* + * Since there are no more nports, execute the callback(s). + */ + (void)cb_func(port_handle, SPDK_FC_LINK_BREAK, + (void *)offline_args->cb_ctx, spdk_err); + +out: + free(offline_cb_args); + + snprintf(log_str, sizeof(log_str), + "port link break cb: port:%d evt_type:%d num_nports:%d err:%d spdk_err:%d.\n", + port_handle, event_type, num_nports, err, spdk_err); + + if (err != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } + return; +} + +/* + * FC port must have all its nports deleted before transitioning to offline state. + */ +static void +nvmf_fc_adm_hw_port_offline_nport_delete(struct spdk_nvmf_fc_port *fc_port) +{ + struct spdk_nvmf_fc_nport *nport = NULL; + /* All nports must have been deleted at this point for this fc port */ + DEV_VERIFY(fc_port && TAILQ_EMPTY(&fc_port->nport_list)); + DEV_VERIFY(fc_port->num_nports == 0); + /* Mark the nport states to be zombie, if they exist */ + if (fc_port && !TAILQ_EMPTY(&fc_port->nport_list)) { + TAILQ_FOREACH(nport, &fc_port->nport_list, link) { + (void)nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_ZOMBIE); + } + } +} + +static void +nvmf_fc_adm_i_t_delete_cb(void *args, uint32_t err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_i_t_del_cb_data *cb_data = args; + struct spdk_nvmf_fc_nport *nport = cb_data->nport; + struct spdk_nvmf_fc_remote_port_info *rport = cb_data->rport; + spdk_nvmf_fc_callback cb_func = cb_data->fc_cb_func; + int spdk_err = 0; + uint8_t port_handle = cb_data->port_handle; + uint32_t s_id = rport->s_id; + uint32_t rpi = rport->rpi; + uint32_t assoc_count = rport->assoc_count; + uint32_t nport_hdl = nport->nport_hdl; + uint32_t d_id = nport->d_id; + char log_str[256]; + + /* + * Assert on any delete failure. + */ + if (0 != err) { + DEV_VERIFY(!"Error in IT Delete callback."); + goto out; + } + + if (cb_func != NULL) { + (void)cb_func(port_handle, SPDK_FC_IT_DELETE, cb_data->fc_cb_ctx, spdk_err); + } + +out: + free(cb_data); + + snprintf(log_str, sizeof(log_str), + "IT delete assoc_cb on nport %d done, port_handle:%d s_id:%d d_id:%d rpi:%d rport_assoc_count:%d rc = %d.\n", + nport_hdl, port_handle, s_id, d_id, rpi, assoc_count, err); + + if (err != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } +} + +static void +nvmf_fc_adm_i_t_delete_assoc_cb(void *args, uint32_t err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data *cb_data = args; + struct spdk_nvmf_fc_nport *nport = cb_data->nport; + struct spdk_nvmf_fc_remote_port_info *rport = cb_data->rport; + spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn cb_func = cb_data->cb_func; + uint32_t s_id = rport->s_id; + uint32_t rpi = rport->rpi; + uint32_t assoc_count = rport->assoc_count; + uint32_t nport_hdl = nport->nport_hdl; + uint32_t d_id = nport->d_id; + char log_str[256]; + + /* + * Assert on any association delete failure. We continue to delete other + * associations in promoted builds. + */ + if (0 != err) { + DEV_VERIFY(!"Nport's association delete callback returned error"); + if (nport->assoc_count > 0) { + nport->assoc_count--; + } + if (rport->assoc_count > 0) { + rport->assoc_count--; + } + } + + /* + * If this is the last association being deleted for the ITN, + * execute the callback(s). + */ + if (0 == rport->assoc_count) { + /* Remove the rport from the remote port list. 
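+ * nvmf_fc_nport_remove_rem_port() also decrements the nport's rport_count.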
*/ + if (nvmf_fc_nport_remove_rem_port(nport, rport) != 0) { + SPDK_ERRLOG("Error while removing rport from list.\n"); + DEV_VERIFY(!"Error while removing rport from list."); + } + + if (cb_func != NULL) { + /* + * Callback function is provided by the caller + * of nvmf_fc_adm_i_t_delete_assoc(). + */ + (void)cb_func(cb_data->cb_ctx, 0); + } + free(rport); + free(args); + } + + snprintf(log_str, sizeof(log_str), + "IT delete assoc_cb on nport %d done, s_id:%d d_id:%d rpi:%d rport_assoc_count:%d err = %d.\n", + nport_hdl, s_id, d_id, rpi, assoc_count, err); + + if (err != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } +} + +/** + * Process a IT delete. + */ +static void +nvmf_fc_adm_i_t_delete_assoc(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rport, + spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn cb_func, + void *cb_ctx) +{ + int err = 0; + struct spdk_nvmf_fc_association *assoc = NULL; + int assoc_err = 0; + uint32_t num_assoc = 0; + uint32_t num_assoc_del_scheduled = 0; + struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data *cb_data = NULL; + uint8_t port_hdl = nport->port_hdl; + uint32_t s_id = rport->s_id; + uint32_t rpi = rport->rpi; + uint32_t assoc_count = rport->assoc_count; + char log_str[256]; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "IT delete associations on nport:%d begin.\n", + nport->nport_hdl); + + /* + * Allocate memory for callback data. + * This memory will be freed by the callback function. + */ + cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data)); + if (NULL == cb_data) { + SPDK_ERRLOG("Failed to allocate memory for cb_data on nport:%d.\n", nport->nport_hdl); + err = -ENOMEM; + goto out; + } + cb_data->nport = nport; + cb_data->rport = rport; + cb_data->port_handle = port_hdl; + cb_data->cb_func = cb_func; + cb_data->cb_ctx = cb_ctx; + + /* + * Delete all associations, if any, related with this ITN/remote_port. + */ + TAILQ_FOREACH(assoc, &nport->fc_associations, link) { + num_assoc++; + if (assoc->s_id == s_id) { + assoc_err = nvmf_fc_delete_association(nport, + assoc->assoc_id, + false /* send abts */, false, + nvmf_fc_adm_i_t_delete_assoc_cb, cb_data); + if (0 != assoc_err) { + /* + * Mark this association as zombie. + */ + err = -EINVAL; + DEV_VERIFY(!"Error while deleting association"); + (void)nvmf_fc_assoc_set_state(assoc, SPDK_NVMF_FC_OBJECT_ZOMBIE); + } else { + num_assoc_del_scheduled++; + } + } + } + +out: + if ((cb_data) && (num_assoc_del_scheduled == 0)) { + /* + * Since there are no association_delete calls + * successfully scheduled, the association_delete + * callback function will never be called. + * In this case, call the callback function now. + */ + nvmf_fc_adm_i_t_delete_assoc_cb(cb_data, 0); + } + + snprintf(log_str, sizeof(log_str), + "IT delete associations on nport:%d end. 
" + "s_id:%d rpi:%d assoc_count:%d assoc:%d assoc_del_scheduled:%d rc:%d.\n", + nport->nport_hdl, s_id, rpi, assoc_count, num_assoc, num_assoc_del_scheduled, err); + + if (err == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } else { + SPDK_ERRLOG("%s", log_str); + } +} + +static void +nvmf_fc_adm_queue_quiesce_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_poller_api_quiesce_queue_args *quiesce_api_data = NULL; + struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx *port_quiesce_ctx = NULL; + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + struct spdk_nvmf_fc_port *fc_port = NULL; + int err = 0; + + quiesce_api_data = (struct spdk_nvmf_fc_poller_api_quiesce_queue_args *)cb_data; + hwqp = quiesce_api_data->hwqp; + fc_port = hwqp->fc_port; + port_quiesce_ctx = (struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx *)quiesce_api_data->ctx; + spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn cb_func = port_quiesce_ctx->cb_func; + + /* + * Decrement the callback/quiesced queue count. + */ + port_quiesce_ctx->quiesce_count--; + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Queue%d Quiesced\n", quiesce_api_data->hwqp->hwqp_id); + + free(quiesce_api_data); + /* + * Wait for call backs i.e. max_ioq_queues + LS QUEUE. + */ + if (port_quiesce_ctx->quiesce_count > 0) { + return; + } + + if (fc_port->hw_port_status == SPDK_FC_PORT_QUIESCED) { + SPDK_ERRLOG("Port %d already in quiesced state.\n", fc_port->port_hdl); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d quiesced.\n", fc_port->port_hdl); + fc_port->hw_port_status = SPDK_FC_PORT_QUIESCED; + } + + if (cb_func) { + /* + * Callback function for the called of quiesce. + */ + cb_func(port_quiesce_ctx->ctx, err); + } + + /* + * Free the context structure. + */ + free(port_quiesce_ctx); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d quiesce done, rc = %d.\n", fc_port->port_hdl, + err); +} + +static int +nvmf_fc_adm_hw_queue_quiesce(struct spdk_nvmf_fc_hwqp *fc_hwqp, void *ctx, + spdk_nvmf_fc_poller_api_cb cb_func) +{ + struct spdk_nvmf_fc_poller_api_quiesce_queue_args *args; + enum spdk_nvmf_fc_poller_api_ret rc = SPDK_NVMF_FC_POLLER_API_SUCCESS; + int err = 0; + + args = calloc(1, sizeof(struct spdk_nvmf_fc_poller_api_quiesce_queue_args)); + + if (args == NULL) { + err = -ENOMEM; + SPDK_ERRLOG("Failed to allocate memory for poller quiesce args, hwqp:%d\n", fc_hwqp->hwqp_id); + goto done; + } + args->hwqp = fc_hwqp; + args->ctx = ctx; + args->cb_info.cb_func = cb_func; + args->cb_info.cb_data = args; + args->cb_info.cb_thread = spdk_get_thread(); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Quiesce queue %d\n", fc_hwqp->hwqp_id); + rc = nvmf_fc_poller_api_func(fc_hwqp, SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE, args); + if (rc) { + free(args); + err = -EINVAL; + } + +done: + return err; +} + +/* + * Hw port Quiesce + */ +static int +nvmf_fc_adm_hw_port_quiesce(struct spdk_nvmf_fc_port *fc_port, void *ctx, + spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn cb_func) +{ + struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx *port_quiesce_ctx = NULL; + uint32_t i = 0; + int err = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port:%d is being quiesced.\n", fc_port->port_hdl); + + /* + * If the port is in an OFFLINE state, set the state to QUIESCED + * and execute the callback. 
+ */ + if (fc_port->hw_port_status == SPDK_FC_PORT_OFFLINE) { + fc_port->hw_port_status = SPDK_FC_PORT_QUIESCED; + } + + if (fc_port->hw_port_status == SPDK_FC_PORT_QUIESCED) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Port %d already in quiesced state.\n", + fc_port->port_hdl); + /* + * Execute the callback function directly. + */ + cb_func(ctx, err); + goto out; + } + + port_quiesce_ctx = calloc(1, sizeof(struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx)); + + if (port_quiesce_ctx == NULL) { + err = -ENOMEM; + SPDK_ERRLOG("Failed to allocate memory for LS queue quiesce ctx, port:%d\n", + fc_port->port_hdl); + goto out; + } + + port_quiesce_ctx->quiesce_count = 0; + port_quiesce_ctx->ctx = ctx; + port_quiesce_ctx->cb_func = cb_func; + + /* + * Quiesce the LS queue. + */ + err = nvmf_fc_adm_hw_queue_quiesce(&fc_port->ls_queue, port_quiesce_ctx, + nvmf_fc_adm_queue_quiesce_cb); + if (err != 0) { + SPDK_ERRLOG("Failed to quiesce the LS queue.\n"); + goto out; + } + port_quiesce_ctx->quiesce_count++; + + /* + * Quiesce the IO queues. + */ + for (i = 0; i < fc_port->num_io_queues; i++) { + err = nvmf_fc_adm_hw_queue_quiesce(&fc_port->io_queues[i], + port_quiesce_ctx, + nvmf_fc_adm_queue_quiesce_cb); + if (err != 0) { + DEV_VERIFY(0); + SPDK_ERRLOG("Failed to quiesce the IO queue:%d.\n", fc_port->io_queues[i].hwqp_id); + } + port_quiesce_ctx->quiesce_count++; + } + +out: + if (port_quiesce_ctx && err != 0) { + free(port_quiesce_ctx); + } + return err; +} + +/* + * Initialize and add a HW port entry to the global + * HW port list. + */ +static void +nvmf_fc_adm_evnt_hw_port_init(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_port *fc_port = NULL; + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_port_init_args *args = (struct spdk_nvmf_fc_hw_port_init_args *) + api_data->api_args; + int err = 0; + + if (args->io_queue_cnt > spdk_env_get_core_count()) { + SPDK_ERRLOG("IO queues count greater than cores for %d.\n", args->port_handle); + err = EINVAL; + goto abort_port_init; + } + + /* + * 1. Check for duplicate initialization. + */ + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port != NULL) { + /* Port already exists, check if it has to be re-initialized */ + err = nvmf_fc_adm_hw_port_reinit_validate(fc_port, args); + if (err) { + /* + * In case of an error we do not want to free the fc_port + * so we set that pointer to NULL. + */ + fc_port = NULL; + } + goto abort_port_init; + } + + /* + * 2. Get the memory to instantiate a fc port. + */ + fc_port = calloc(1, sizeof(struct spdk_nvmf_fc_port) + + (args->io_queue_cnt * sizeof(struct spdk_nvmf_fc_hwqp))); + if (fc_port == NULL) { + SPDK_ERRLOG("Failed to allocate memory for fc_port %d.\n", args->port_handle); + err = -ENOMEM; + goto abort_port_init; + } + + /* assign the io_queues array */ + fc_port->io_queues = (struct spdk_nvmf_fc_hwqp *)((uint8_t *)fc_port + sizeof( + struct spdk_nvmf_fc_port)); + + /* + * 3. Initialize the contents for the FC-port + */ + err = nvmf_fc_adm_hw_port_data_init(fc_port, args); + + if (err != 0) { + SPDK_ERRLOG("Data initialization failed for fc_port %d.\n", args->port_handle); + DEV_VERIFY(!"Data initialization failed for fc_port"); + goto abort_port_init; + } + + /* + * 4. Add this port to the global fc port list in the library. 
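+ * The port stays in SPDK_FC_PORT_OFFLINE state until a subsequent port online event brings it up.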
+ */ + nvmf_fc_port_add(fc_port); + +abort_port_init: + if (err && fc_port) { + free(fc_port); + } + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_INIT, args->cb_ctx, err); + } + + free(arg); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d initialize done, rc = %d.\n", + args->port_handle, err); +} + +/* + * Online a HW port. + */ +static void +nvmf_fc_adm_evnt_hw_port_online(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_port *fc_port = NULL; + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_port_online_args *args = (struct spdk_nvmf_fc_hw_port_online_args *) + api_data->api_args; + int i = 0; + int err = 0; + + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port) { + /* Set the port state to online */ + err = nvmf_fc_port_set_online(fc_port); + if (err != 0) { + SPDK_ERRLOG("Hw port %d online failed. err = %d\n", fc_port->port_hdl, err); + DEV_VERIFY(!"Hw port online failed"); + goto out; + } + + hwqp = &fc_port->ls_queue; + hwqp->context = NULL; + (void)nvmf_fc_hwqp_set_online(hwqp); + + /* Cycle through all the io queues and setup a hwqp poller for each. */ + for (i = 0; i < (int)fc_port->num_io_queues; i++) { + hwqp = &fc_port->io_queues[i]; + hwqp->context = NULL; + (void)nvmf_fc_hwqp_set_online(hwqp); + nvmf_fc_poll_group_add_hwqp(hwqp); + } + } else { + SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle); + err = -EINVAL; + } + +out: + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_ONLINE, args->cb_ctx, err); + } + + free(arg); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d online done, rc = %d.\n", args->port_handle, + err); +} + +/* + * Offline a HW port. + */ +static void +nvmf_fc_adm_evnt_hw_port_offline(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_port *fc_port = NULL; + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_port_offline_args *args = (struct spdk_nvmf_fc_hw_port_offline_args *) + api_data->api_args; + int i = 0; + int err = 0; + + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port) { + /* Set the port state to offline, if it is not already. */ + err = nvmf_fc_port_set_offline(fc_port); + if (err != 0) { + SPDK_ERRLOG("Hw port %d already offline. err = %d\n", fc_port->port_hdl, err); + err = 0; + goto out; + } + + hwqp = &fc_port->ls_queue; + (void)nvmf_fc_hwqp_set_offline(hwqp); + + /* Remove poller for all the io queues. */ + for (i = 0; i < (int)fc_port->num_io_queues; i++) { + hwqp = &fc_port->io_queues[i]; + (void)nvmf_fc_hwqp_set_offline(hwqp); + nvmf_fc_poll_group_remove_hwqp(hwqp); + } + + /* + * Delete all the nports. Ideally, the nports should have been purged + * before the offline event, in which case, only a validation is required. 
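+ * Any nport still on the list is only marked SPDK_NVMF_FC_OBJECT_ZOMBIE here; no memory is freed.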
+ */ + nvmf_fc_adm_hw_port_offline_nport_delete(fc_port); + } else { + SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle); + err = -EINVAL; + } +out: + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_OFFLINE, args->cb_ctx, err); + } + + free(arg); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d offline done, rc = %d.\n", args->port_handle, + err); +} + +struct nvmf_fc_add_rem_listener_ctx { + struct spdk_nvmf_subsystem *subsystem; + bool add_listener; + struct spdk_nvme_transport_id trid; +}; + +static void +nvmf_fc_adm_subsystem_resume_cb(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct nvmf_fc_add_rem_listener_ctx *ctx = (struct nvmf_fc_add_rem_listener_ctx *)cb_arg; + free(ctx); +} + +static void +nvmf_fc_adm_listen_done(void *cb_arg, int status) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct nvmf_fc_add_rem_listener_ctx *ctx = cb_arg; + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_fc_adm_subsystem_resume_cb, ctx)) { + SPDK_ERRLOG("Failed to resume subsystem: %s\n", ctx->subsystem->subnqn); + free(ctx); + } +} + +static void +nvmf_fc_adm_subsystem_paused_cb(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct nvmf_fc_add_rem_listener_ctx *ctx = (struct nvmf_fc_add_rem_listener_ctx *)cb_arg; + + if (ctx->add_listener) { + spdk_nvmf_subsystem_add_listener(subsystem, &ctx->trid, nvmf_fc_adm_listen_done, ctx); + } else { + spdk_nvmf_subsystem_remove_listener(subsystem, &ctx->trid); + nvmf_fc_adm_listen_done(ctx, 0); + } +} + +static int +nvmf_fc_adm_add_rem_nport_listener(struct spdk_nvmf_fc_nport *nport, bool add) +{ + struct spdk_nvmf_tgt *tgt = nvmf_fc_get_tgt(); + struct spdk_nvmf_subsystem *subsystem; + + if (!tgt) { + SPDK_ERRLOG("No nvmf target defined\n"); + return -EINVAL; + } + + subsystem = spdk_nvmf_subsystem_get_first(tgt); + while (subsystem) { + struct nvmf_fc_add_rem_listener_ctx *ctx; + + if (spdk_nvmf_subsytem_any_listener_allowed(subsystem) == true) { + ctx = calloc(1, sizeof(struct nvmf_fc_add_rem_listener_ctx)); + if (ctx) { + ctx->add_listener = add; + ctx->subsystem = subsystem; + nvmf_fc_create_trid(&ctx->trid, + nport->fc_nodename.u.wwn, + nport->fc_portname.u.wwn); + + if (spdk_nvmf_tgt_listen(subsystem->tgt, &ctx->trid)) { + SPDK_ERRLOG("Failed to add transport address %s to tgt listeners\n", + ctx->trid.traddr); + free(ctx); + } else if (spdk_nvmf_subsystem_pause(subsystem, + nvmf_fc_adm_subsystem_paused_cb, + ctx)) { + SPDK_ERRLOG("Failed to pause subsystem: %s\n", + subsystem->subnqn); + free(ctx); + } + } + } + + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + } + + return 0; +} + +/* + * Create a Nport. + */ +static void +nvmf_fc_adm_evnt_nport_create(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_nport_create_args *args = (struct spdk_nvmf_fc_nport_create_args *) + api_data->api_args; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_port *fc_port = NULL; + int err = 0; + + /* + * Get the physical port. + */ + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port == NULL) { + err = -EINVAL; + goto out; + } + + /* + * Check for duplicate initialization. 
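+ * An nport handle that already exists on this FC port is rejected with -EINVAL.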
+ */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport != NULL) { + SPDK_ERRLOG("Duplicate SPDK FC nport %d exists for FC port:%d.\n", args->nport_handle, + args->port_handle); + err = -EINVAL; + goto out; + } + + /* + * Get the memory to instantiate a fc nport. + */ + nport = calloc(1, sizeof(struct spdk_nvmf_fc_nport)); + if (nport == NULL) { + SPDK_ERRLOG("Failed to allocate memory for nport %d.\n", + args->nport_handle); + err = -ENOMEM; + goto out; + } + + /* + * Initialize the contents for the nport + */ + nport->nport_hdl = args->nport_handle; + nport->port_hdl = args->port_handle; + nport->nport_state = SPDK_NVMF_FC_OBJECT_CREATED; + nport->fc_nodename = args->fc_nodename; + nport->fc_portname = args->fc_portname; + nport->d_id = args->d_id; + nport->fc_port = nvmf_fc_port_lookup(args->port_handle); + + (void)nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_CREATED); + TAILQ_INIT(&nport->rem_port_list); + nport->rport_count = 0; + TAILQ_INIT(&nport->fc_associations); + nport->assoc_count = 0; + + /* + * Populate the nport address (as listening address) to the nvmf subsystems. + */ + err = nvmf_fc_adm_add_rem_nport_listener(nport, true); + + (void)nvmf_fc_port_add_nport(fc_port, nport); +out: + if (err && nport) { + free(nport); + } + + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_NPORT_CREATE, args->cb_ctx, err); + } + + free(arg); +} + +static void +nvmf_fc_adm_delete_nport_cb(uint8_t port_handle, enum spdk_fc_event event_type, + void *cb_args, int spdk_err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_nport_del_cb_data *cb_data = cb_args; + struct spdk_nvmf_fc_nport *nport = cb_data->nport; + spdk_nvmf_fc_callback cb_func = cb_data->fc_cb_func; + int err = 0; + uint16_t nport_hdl = 0; + char log_str[256]; + + /* + * Assert on any delete failure. + */ + if (nport == NULL) { + SPDK_ERRLOG("Nport delete callback returned null nport"); + DEV_VERIFY(!"nport is null."); + goto out; + } + + nport_hdl = nport->nport_hdl; + if (0 != spdk_err) { + SPDK_ERRLOG("Nport delete callback returned error. FC Port: " + "%d, Nport: %d\n", + nport->port_hdl, nport->nport_hdl); + DEV_VERIFY(!"nport delete callback error."); + } + + /* + * Free the nport if this is the last rport being deleted and + * execute the callback(s). + */ + if (nvmf_fc_nport_has_no_rport(nport)) { + if (0 != nport->assoc_count) { + SPDK_ERRLOG("association count != 0\n"); + DEV_VERIFY(!"association count != 0"); + } + + err = nvmf_fc_port_remove_nport(nport->fc_port, nport); + if (0 != err) { + SPDK_ERRLOG("Nport delete callback: Failed to remove " + "nport from nport list. FC Port:%d Nport:%d\n", + nport->port_hdl, nport->nport_hdl); + } + /* Free the nport */ + free(nport); + + if (cb_func != NULL) { + (void)cb_func(cb_data->port_handle, SPDK_FC_NPORT_DELETE, cb_data->fc_cb_ctx, spdk_err); + } + free(cb_data); + } +out: + snprintf(log_str, sizeof(log_str), + "port:%d nport:%d delete cb exit, evt_type:%d rc:%d.\n", + port_handle, nport_hdl, event_type, spdk_err); + + if (err != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } +} + +/* + * Delete Nport. 
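+ * Removes the nport's listen addresses from all subsystems and schedules an IT delete for every remote port still attached to it.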
+ */ +static void +nvmf_fc_adm_evnt_nport_delete(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_nport_delete_args *args = (struct spdk_nvmf_fc_nport_delete_args *) + api_data->api_args; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_adm_nport_del_cb_data *cb_data = NULL; + struct spdk_nvmf_fc_remote_port_info *rport_iter = NULL; + int err = 0; + uint32_t rport_cnt = 0; + int rc = 0; + + /* + * Make sure that the nport exists. + */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC nport %d for FC Port: %d.\n", args->nport_handle, + args->port_handle); + err = -EINVAL; + goto out; + } + + /* + * Allocate memory for callback data. + */ + cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_nport_del_cb_data)); + if (NULL == cb_data) { + SPDK_ERRLOG("Failed to allocate memory for cb_data %d.\n", args->nport_handle); + err = -ENOMEM; + goto out; + } + + cb_data->nport = nport; + cb_data->port_handle = args->port_handle; + cb_data->fc_cb_func = api_data->cb_func; + cb_data->fc_cb_ctx = args->cb_ctx; + + /* + * Begin nport tear down + */ + if (nport->nport_state == SPDK_NVMF_FC_OBJECT_CREATED) { + (void)nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_TO_BE_DELETED); + } else if (nport->nport_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* + * Deletion of this nport already in progress. Register callback + * and return. + */ + /* TODO: Register callback in callback vector. For now, set the error and return. */ + err = -ENODEV; + goto out; + } else { + /* nport partially created/deleted */ + DEV_VERIFY(nport->nport_state == SPDK_NVMF_FC_OBJECT_ZOMBIE); + DEV_VERIFY(0 != "Nport in zombie state"); + err = -ENODEV; + goto out; + } + + /* + * Remove this nport from listening addresses across subsystems + */ + rc = nvmf_fc_adm_add_rem_nport_listener(nport, false); + + if (0 != rc) { + err = nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_ZOMBIE); + SPDK_ERRLOG("Unable to remove the listen addr in the subsystems for nport %d.\n", + nport->nport_hdl); + goto out; + } + + /* + * Delete all the remote ports (if any) for the nport + */ + /* TODO - Need to do this with a "first" and a "next" accessor function + * for completeness. Look at app-subsystem as examples. + */ + if (nvmf_fc_nport_has_no_rport(nport)) { + /* No rports to delete. Complete the nport deletion. 
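+ * The delete callback frees the nport and notifies the caller.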
*/ + nvmf_fc_adm_delete_nport_cb(nport->port_hdl, SPDK_FC_NPORT_DELETE, cb_data, 0); + goto out; + } + + TAILQ_FOREACH(rport_iter, &nport->rem_port_list, link) { + struct spdk_nvmf_fc_hw_i_t_delete_args *it_del_args = calloc( + 1, sizeof(struct spdk_nvmf_fc_hw_i_t_delete_args)); + + if (it_del_args == NULL) { + err = -ENOMEM; + SPDK_ERRLOG("SPDK_FC_IT_DELETE no mem to delete rport with rpi:%d s_id:%d.\n", + rport_iter->rpi, rport_iter->s_id); + DEV_VERIFY(!"SPDK_FC_IT_DELETE failed, cannot allocate memory"); + goto out; + } + + rport_cnt++; + it_del_args->port_handle = nport->port_hdl; + it_del_args->nport_handle = nport->nport_hdl; + it_del_args->cb_ctx = (void *)cb_data; + it_del_args->rpi = rport_iter->rpi; + it_del_args->s_id = rport_iter->s_id; + + nvmf_fc_master_enqueue_event(SPDK_FC_IT_DELETE, (void *)it_del_args, + nvmf_fc_adm_delete_nport_cb); + } + +out: + /* On failure, execute the callback function now */ + if ((err != 0) || (rc != 0)) { + SPDK_ERRLOG("NPort %d delete failed, error:%d, fc port:%d, " + "rport_cnt:%d rc:%d.\n", + args->nport_handle, err, args->port_handle, + rport_cnt, rc); + if (cb_data) { + free(cb_data); + } + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_NPORT_DELETE, args->cb_ctx, err); + } + + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, + "NPort %d delete done succesfully, fc port:%d. " + "rport_cnt:%d\n", + args->nport_handle, args->port_handle, rport_cnt); + } + + free(arg); +} + +/* + * Process an PRLI/IT add. + */ +static void +nvmf_fc_adm_evnt_i_t_add(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_i_t_add_args *args = (struct spdk_nvmf_fc_hw_i_t_add_args *) + api_data->api_args; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_remote_port_info *rport_iter = NULL; + struct spdk_nvmf_fc_remote_port_info *rport = NULL; + int err = 0; + + /* + * Make sure the nport port exists. + */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC nport %d\n", args->nport_handle); + err = -EINVAL; + goto out; + } + + /* + * Check for duplicate i_t_add. + */ + TAILQ_FOREACH(rport_iter, &nport->rem_port_list, link) { + if ((rport_iter->s_id == args->s_id) && (rport_iter->rpi == args->rpi)) { + SPDK_ERRLOG("Duplicate rport found for FC nport %d: sid:%d rpi:%d\n", + args->nport_handle, rport_iter->s_id, rport_iter->rpi); + err = -EEXIST; + goto out; + } + } + + /* + * Get the memory to instantiate the remote port + */ + rport = calloc(1, sizeof(struct spdk_nvmf_fc_remote_port_info)); + if (rport == NULL) { + SPDK_ERRLOG("Memory allocation for rem port failed.\n"); + err = -ENOMEM; + goto out; + } + + /* + * Initialize the contents for the rport + */ + (void)nvmf_fc_rport_set_state(rport, SPDK_NVMF_FC_OBJECT_CREATED); + rport->s_id = args->s_id; + rport->rpi = args->rpi; + rport->fc_nodename = args->fc_nodename; + rport->fc_portname = args->fc_portname; + + /* + * Add remote port to nport + */ + if (nvmf_fc_nport_add_rem_port(nport, rport) != 0) { + DEV_VERIFY(!"Error while adding rport to list"); + }; + + /* + * TODO: Do we validate the initiators service parameters? + */ + + /* + * Get the targets service parameters from the library + * to return back to the driver. 
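+ *
+ * The PRLI service-parameter word comes from
+ * nvmf_fc_get_prli_service_params() and is handed back through
+ * args->target_prli_info; presumably the low-level driver uses it when
+ * responding to the initiator's PRLI (driver-side behavior is an
+ * assumption, not defined in this file).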
+ */ + args->target_prli_info = nvmf_fc_get_prli_service_params(); + +out: + if (api_data->cb_func != NULL) { + /* + * Passing pointer to the args struct as the first argument. + * The cb_func should handle this appropriately. + */ + (void)api_data->cb_func(args->port_handle, SPDK_FC_IT_ADD, args->cb_ctx, err); + } + + free(arg); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, + "IT add on nport %d done, rc = %d.\n", + args->nport_handle, err); +} + +/** + * Process a IT delete. + */ +static void +nvmf_fc_adm_evnt_i_t_delete(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_i_t_delete_args *args = (struct spdk_nvmf_fc_hw_i_t_delete_args *) + api_data->api_args; + int rc = 0; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_adm_i_t_del_cb_data *cb_data = NULL; + struct spdk_nvmf_fc_remote_port_info *rport_iter = NULL; + struct spdk_nvmf_fc_remote_port_info *rport = NULL; + uint32_t num_rport = 0; + char log_str[256]; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "IT delete on nport:%d begin.\n", args->nport_handle); + + /* + * Make sure the nport port exists. If it does not, error out. + */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC nport:%d\n", args->nport_handle); + rc = -EINVAL; + goto out; + } + + /* + * Find this ITN / rport (remote port). + */ + TAILQ_FOREACH(rport_iter, &nport->rem_port_list, link) { + num_rport++; + if ((rport_iter->s_id == args->s_id) && + (rport_iter->rpi == args->rpi) && + (rport_iter->rport_state == SPDK_NVMF_FC_OBJECT_CREATED)) { + rport = rport_iter; + break; + } + } + + /* + * We should find either zero or exactly one rport. + * + * If we find zero rports, that means that a previous request has + * removed the rport by the time we reached here. In this case, + * simply return out. + */ + if (rport == NULL) { + rc = -ENODEV; + goto out; + } + + /* + * We have found exactly one rport. Allocate memory for callback data. + */ + cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_i_t_del_cb_data)); + if (NULL == cb_data) { + SPDK_ERRLOG("Failed to allocate memory for cb_data for nport:%d.\n", args->nport_handle); + rc = -ENOMEM; + goto out; + } + + cb_data->nport = nport; + cb_data->rport = rport; + cb_data->port_handle = args->port_handle; + cb_data->fc_cb_func = api_data->cb_func; + cb_data->fc_cb_ctx = args->cb_ctx; + + /* + * Validate rport object state. + */ + if (rport->rport_state == SPDK_NVMF_FC_OBJECT_CREATED) { + (void)nvmf_fc_rport_set_state(rport, SPDK_NVMF_FC_OBJECT_TO_BE_DELETED); + } else if (rport->rport_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* + * Deletion of this rport already in progress. Register callback + * and return. + */ + /* TODO: Register callback in callback vector. For now, set the error and return. */ + rc = -ENODEV; + goto out; + } else { + /* rport partially created/deleted */ + DEV_VERIFY(rport->rport_state == SPDK_NVMF_FC_OBJECT_ZOMBIE); + DEV_VERIFY(!"Invalid rport_state"); + rc = -ENODEV; + goto out; + } + + /* + * We have successfully found a rport to delete. Call + * nvmf_fc_i_t_delete_assoc(), which will perform further + * IT-delete processing as well as free the cb_data. + */ + nvmf_fc_adm_i_t_delete_assoc(nport, rport, nvmf_fc_adm_i_t_delete_cb, + (void *)cb_data); + +out: + if (rc != 0) { + /* + * We have entered here because either we encountered an + * error, or we did not find a rport to delete. 
+ * As a result, we will not call the function + * nvmf_fc_i_t_delete_assoc() for further IT-delete + * processing. Therefore, execute the callback function now. + */ + if (cb_data) { + free(cb_data); + } + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_IT_DELETE, args->cb_ctx, rc); + } + } + + snprintf(log_str, sizeof(log_str), + "IT delete on nport:%d end. num_rport:%d rc = %d.\n", + args->nport_handle, num_rport, rc); + + if (rc != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } + + free(arg); +} + +/* + * Process ABTS received + */ +static void +nvmf_fc_adm_evnt_abts_recv(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_abts_args *args = (struct spdk_nvmf_fc_abts_args *)api_data->api_args; + struct spdk_nvmf_fc_nport *nport = NULL; + int err = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "FC ABTS received. RPI:%d, oxid:%d, rxid:%d\n", args->rpi, + args->oxid, args->rxid); + + /* + * 1. Make sure the nport port exists. + */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC nport %d\n", args->nport_handle); + err = -EINVAL; + goto out; + } + + /* + * 2. If the nport is in the process of being deleted, drop the ABTS. + */ + if (nport->nport_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, + "FC ABTS dropped because the nport is being deleted; RPI:%d, oxid:%d, rxid:%d\n", + args->rpi, args->oxid, args->rxid); + err = 0; + goto out; + + } + + /* + * 3. Pass the received ABTS-LS to the library for handling. + */ + nvmf_fc_handle_abts_frame(nport, args->rpi, args->oxid, args->rxid); + +out: + if (api_data->cb_func != NULL) { + /* + * Passing pointer to the args struct as the first argument. + * The cb_func should handle this appropriately. + */ + (void)api_data->cb_func(args->port_handle, SPDK_FC_ABTS_RECV, args, err); + } else { + /* No callback set, free the args */ + free(args); + } + + free(arg); +} + +/* + * Callback function for hw port quiesce. + */ +static void +nvmf_fc_adm_hw_port_quiesce_reset_cb(void *ctx, int err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_hw_port_reset_ctx *reset_ctx = + (struct spdk_nvmf_fc_adm_hw_port_reset_ctx *)ctx; + struct spdk_nvmf_fc_hw_port_reset_args *args = reset_ctx->reset_args; + spdk_nvmf_fc_callback cb_func = reset_ctx->reset_cb_func; + struct spdk_nvmf_fc_queue_dump_info dump_info; + struct spdk_nvmf_fc_port *fc_port = NULL; + char *dump_buf = NULL; + uint32_t dump_buf_size = SPDK_FC_HW_DUMP_BUF_SIZE; + + /* + * Free the callback context struct. + */ + free(ctx); + + if (err != 0) { + SPDK_ERRLOG("Port %d quiesce operation failed.\n", args->port_handle); + goto out; + } + + if (args->dump_queues == false) { + /* + * Queues need not be dumped. + */ + goto out; + } + + SPDK_ERRLOG("Dumping queues for HW port %d\n", args->port_handle); + + /* + * Get the fc port. + */ + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle); + err = -EINVAL; + goto out; + } + + /* + * Allocate memory for the dump buffer. + * This memory will be freed by FCT. 
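+ *
+ * The buffer is SPDK_FC_HW_DUMP_BUF_SIZE bytes; its address is returned
+ * to the caller through *args->dump_buf, so ownership leaves this
+ * function once the dump completes.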
+ */
+ dump_buf = (char *)calloc(1, dump_buf_size);
+ if (dump_buf == NULL) {
+ err = -ENOMEM;
+ SPDK_ERRLOG("Memory allocation for dump buffer failed, SPDK FC port %d\n", args->port_handle);
+ goto out;
+ }
+ *args->dump_buf = (uint32_t *)dump_buf;
+ dump_info.buffer = dump_buf;
+ dump_info.offset = 0;
+
+ /*
+ * Add the dump reason to the top of the buffer.
+ */
+ nvmf_fc_dump_buf_print(&dump_info, "%s\n", args->reason);
+
+ /*
+ * Dump the hwqp.
+ */
+ nvmf_fc_dump_all_queues(&fc_port->ls_queue, fc_port->io_queues,
+ fc_port->num_io_queues, &dump_info);
+
+out:
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d reset done, queues_dumped = %d, rc = %d.\n",
+ args->port_handle, args->dump_queues, err);
+
+ if (cb_func != NULL) {
+ (void)cb_func(args->port_handle, SPDK_FC_HW_PORT_RESET, args->cb_ctx, err);
+ }
+}
+
+/*
+ * HW port reset.
+ */
+static void
+nvmf_fc_adm_evnt_hw_port_reset(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_hw_port_reset_args *args = (struct spdk_nvmf_fc_hw_port_reset_args *)
+ api_data->api_args;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ struct spdk_nvmf_fc_adm_hw_port_reset_ctx *ctx = NULL;
+ int err = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d dump\n", args->port_handle);
+
+ /*
+ * Make sure the physical port exists.
+ */
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (fc_port == NULL) {
+ SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Save the reset event args and the callback in a context struct.
+ */
+ ctx = calloc(1, sizeof(struct spdk_nvmf_fc_adm_hw_port_reset_ctx));
+
+ if (ctx == NULL) {
+ err = -ENOMEM;
+ SPDK_ERRLOG("Memory allocation for reset ctx failed, SPDK FC port %d\n", args->port_handle);
+ goto fail;
+ }
+
+ ctx->reset_args = arg;
+ ctx->reset_cb_func = api_data->cb_func;
+
+ /*
+ * Quiesce the hw port.
+ */
+ err = nvmf_fc_adm_hw_port_quiesce(fc_port, ctx, nvmf_fc_adm_hw_port_quiesce_reset_cb);
+ if (err != 0) {
+ goto fail;
+ }
+
+ /*
+ * Once the port is successfully quiesced, reset processing continues
+ * in the callback function nvmf_fc_adm_hw_port_quiesce_reset_cb().
+ */
+ return;
+fail:
+ free(ctx);
+
+out:
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d dump done, rc = %d.\n", args->port_handle,
+ err);
+
+ if (api_data->cb_func != NULL) {
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_RESET, args->cb_ctx, err);
+ }
+
+ free(arg);
+}
+
+/*
+ * Process a link break event on a HW port.
+ */
+static void
+nvmf_fc_adm_evnt_hw_port_link_break(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_hw_port_link_break_args *args = (struct spdk_nvmf_hw_port_link_break_args *)
+ api_data->api_args;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ int err = 0;
+ struct spdk_nvmf_fc_adm_port_link_break_cb_data *cb_data = NULL;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ uint32_t nport_deletes_sent = 0;
+ uint32_t nport_deletes_skipped = 0;
+ struct spdk_nvmf_fc_nport_delete_args *nport_del_args = NULL;
+ char log_str[256];
+
+ /*
+ * Get the fc port using the port handle.
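+ *
+ * Link-break handling, as implemented below: take the port offline, then
+ * enqueue one SPDK_FC_NPORT_DELETE event for every nport still in the
+ * CREATED state; completions are handled by
+ * nvmf_fc_adm_hw_port_link_break_cb. If no deletes needed to be sent,
+ * the hwqps are marked offline and their pollers removed directly here.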
+ */
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (!fc_port) {
+ SPDK_ERRLOG("port link break: Unable to find the SPDK FC port %d\n",
+ args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Set the port state to offline, if it is not already.
+ */
+ err = nvmf_fc_port_set_offline(fc_port);
+ if (err != 0) {
+ SPDK_ERRLOG("port link break: HW port %d already offline. rc = %d\n",
+ fc_port->port_hdl, err);
+ err = 0;
+ goto out;
+ }
+
+ /*
+ * Delete all the nports, if any.
+ */
+ if (!TAILQ_EMPTY(&fc_port->nport_list)) {
+ TAILQ_FOREACH(nport, &fc_port->nport_list, link) {
+ /* Skip nports that are not in the CREATED state. */
+ if (nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED) {
+ nport_deletes_skipped++;
+ continue;
+ }
+
+ /* Allocate memory for callback data. */
+ cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_port_link_break_cb_data));
+ if (NULL == cb_data) {
+ SPDK_ERRLOG("port link break: Failed to allocate memory for cb_data %d.\n",
+ args->port_handle);
+ err = -ENOMEM;
+ goto out;
+ }
+ cb_data->args = args;
+ cb_data->cb_func = api_data->cb_func;
+ nport_del_args = &cb_data->nport_del_args;
+ nport_del_args->port_handle = args->port_handle;
+ nport_del_args->nport_handle = nport->nport_hdl;
+ nport_del_args->cb_ctx = cb_data;
+
+ nvmf_fc_master_enqueue_event(SPDK_FC_NPORT_DELETE,
+ (void *)nport_del_args,
+ nvmf_fc_adm_hw_port_link_break_cb);
+
+ nport_deletes_sent++;
+ }
+ }
+
+ if (nport_deletes_sent == 0 && err == 0) {
+ /*
+ * Mark the hwqps as offline and unregister the pollers.
+ */
+ (void)nvmf_fc_adm_port_hwqp_offline_del_poller(fc_port);
+ }
+
+out:
+ snprintf(log_str, sizeof(log_str),
+ "port link break done: port:%d nport_deletes_sent:%d nport_deletes_skipped:%d rc:%d.\n",
+ args->port_handle, nport_deletes_sent, nport_deletes_skipped, err);
+
+ if (err != 0) {
+ SPDK_ERRLOG("%s", log_str);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str);
+ }
+
+ if ((api_data->cb_func != NULL) && (nport_deletes_sent == 0)) {
+ /*
+ * No nport deletes were sent, so nothing else will eventually
+ * invoke the port_link_break callback. Therefore, call it here.
+ */
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_LINK_BREAK, args->cb_ctx, err);
+ }
+
+ free(arg);
+}
+
+static inline void
+nvmf_fc_adm_run_on_master_thread(spdk_msg_fn fn, void *args)
+{
+ if (nvmf_fc_get_master_thread()) {
+ spdk_thread_send_msg(nvmf_fc_get_master_thread(), fn, args);
+ }
+}
+
+/*
+ * Queue up an event in the SPDK master's event queue.
+ * Used by the FC driver to notify the SPDK master of FC-related events.
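+ *
+ * Returns 0 if the event was accepted; on failure nothing has been
+ * queued and the caller retains ownership of 'args'.
+ *
+ * Illustrative usage sketch only (the driver-side variables 'port_handle',
+ * 'nport_handle', 'lld_ctx' and the completion callback
+ * 'lld_nport_delete_done' are assumptions, not part of this file):
+ *
+ *	struct spdk_nvmf_fc_nport_delete_args *del = calloc(1, sizeof(*del));
+ *
+ *	if (del != NULL) {
+ *		del->port_handle = port_handle;
+ *		del->nport_handle = nport_handle;
+ *		del->cb_ctx = lld_ctx;
+ *		if (nvmf_fc_master_enqueue_event(SPDK_FC_NPORT_DELETE, del,
+ *						 lld_nport_delete_done) != 0) {
+ *			free(del);
+ *		}
+ *	}
+ *
+ * On success 'del' is handed to the SPDK_FC_NPORT_DELETE handler and
+ * 'lld_ctx' comes back through the completion callback; how 'del' itself
+ * is released afterwards is a driver-side contract (assumption).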
+ */ +int +nvmf_fc_master_enqueue_event(enum spdk_fc_event event_type, void *args, + spdk_nvmf_fc_callback cb_func) +{ + int err = 0; + struct spdk_nvmf_fc_adm_api_data *api_data = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Enqueue event %d.\n", event_type); + + if (event_type >= SPDK_FC_EVENT_MAX) { + SPDK_ERRLOG("Invalid spdk_fc_event_t %d.\n", event_type); + err = -EINVAL; + goto done; + } + + if (args == NULL) { + SPDK_ERRLOG("Null args for event %d.\n", event_type); + err = -EINVAL; + goto done; + } + + api_data = calloc(1, sizeof(*api_data)); + + if (api_data == NULL) { + SPDK_ERRLOG("Failed to alloc api data for event %d.\n", event_type); + err = -ENOMEM; + goto done; + } + + api_data->api_args = args; + api_data->cb_func = cb_func; + + switch (event_type) { + case SPDK_FC_HW_PORT_INIT: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_init, + (void *)api_data); + break; + + case SPDK_FC_HW_PORT_ONLINE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_online, + (void *)api_data); + break; + + case SPDK_FC_HW_PORT_OFFLINE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_offline, + (void *)api_data); + break; + + case SPDK_FC_NPORT_CREATE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_nport_create, + (void *)api_data); + break; + + case SPDK_FC_NPORT_DELETE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_nport_delete, + (void *)api_data); + break; + + case SPDK_FC_IT_ADD: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_i_t_add, + (void *)api_data); + break; + + case SPDK_FC_IT_DELETE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_i_t_delete, + (void *)api_data); + break; + + case SPDK_FC_ABTS_RECV: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_abts_recv, + (void *)api_data); + break; + + case SPDK_FC_LINK_BREAK: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_link_break, + (void *)api_data); + break; + + case SPDK_FC_HW_PORT_RESET: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_reset, + (void *)api_data); + break; + + case SPDK_FC_UNRECOVERABLE_ERR: + default: + SPDK_ERRLOG("Invalid spdk_fc_event_t: %d\n", event_type); + err = -EINVAL; + break; + } + +done: + + if (err == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Enqueue event %d done successfully\n", event_type); + } else { + SPDK_ERRLOG("Enqueue event %d failed, err = %d\n", event_type, err); + if (api_data) { + free(api_data); + } + } + + return err; +} + +SPDK_NVMF_TRANSPORT_REGISTER(fc, &spdk_nvmf_transport_fc); +SPDK_LOG_REGISTER_COMPONENT("nvmf_fc_adm_api", SPDK_LOG_NVMF_FC_ADM_API); +SPDK_LOG_REGISTER_COMPONENT("nvmf_fc", SPDK_LOG_NVMF_FC) diff --git a/src/spdk/lib/nvmf/fc_ls.c b/src/spdk/lib/nvmf/fc_ls.c new file mode 100644 index 000000000..1aa06bd45 --- /dev/null +++ b/src/spdk/lib/nvmf/fc_ls.c @@ -0,0 +1,1678 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2018-2019 Broadcom. All Rights Reserved. + * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/env.h" +#include "spdk/assert.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" +#include "spdk/endian.h" +#include "spdk_internal/log.h" +#include "nvmf_internal.h" +#include "transport.h" + +#include "nvmf_fc.h" +#include "fc_lld.h" + +/* set to 1 to send ls disconnect in response to ls disconnect from host (per standard) */ +#define NVMF_FC_LS_SEND_LS_DISCONNECT 0 + +/* Validation Error indexes into the string table below */ +enum { + VERR_NO_ERROR = 0, + VERR_CR_ASSOC_LEN = 1, + VERR_CR_ASSOC_RQST_LEN = 2, + VERR_CR_ASSOC_CMD = 3, + VERR_CR_ASSOC_CMD_LEN = 4, + VERR_ERSP_RATIO = 5, + VERR_ASSOC_ALLOC_FAIL = 6, + VERR_CONN_ALLOC_FAIL = 7, + VERR_CR_CONN_LEN = 8, + VERR_CR_CONN_RQST_LEN = 9, + VERR_ASSOC_ID = 10, + VERR_ASSOC_ID_LEN = 11, + VERR_NO_ASSOC = 12, + VERR_CONN_ID = 13, + VERR_CONN_ID_LEN = 14, + VERR_NO_CONN = 15, + VERR_CR_CONN_CMD = 16, + VERR_CR_CONN_CMD_LEN = 17, + VERR_DISCONN_LEN = 18, + VERR_DISCONN_RQST_LEN = 19, + VERR_DISCONN_CMD = 20, + VERR_DISCONN_CMD_LEN = 21, + VERR_DISCONN_SCOPE = 22, + VERR_RS_LEN = 23, + VERR_RS_RQST_LEN = 24, + VERR_RS_CMD = 25, + VERR_RS_CMD_LEN = 26, + VERR_RS_RCTL = 27, + VERR_RS_RO = 28, + VERR_CONN_TOO_MANY = 29, + VERR_SUBNQN = 30, + VERR_HOSTNQN = 31, + VERR_SQSIZE = 32, + VERR_NO_RPORT = 33, + VERR_SUBLISTENER = 34, +}; + +static char *validation_errors[] = { + "OK", + "Bad CR_ASSOC Length", + "Bad CR_ASSOC Rqst Length", + "Not CR_ASSOC Cmd", + "Bad CR_ASSOC Cmd Length", + "Bad Ersp Ratio", + "Association Allocation Failed", + "Queue Allocation Failed", + "Bad CR_CONN Length", + "Bad CR_CONN Rqst Length", + "Not Association ID", + "Bad Association ID Length", + "No Association", + "Not Connection ID", + "Bad Connection ID Length", + "No Connection", + "Not CR_CONN Cmd", + "Bad CR_CONN Cmd Length", + "Bad DISCONN Length", + "Bad DISCONN Rqst Length", + "Not DISCONN Cmd", + "Bad DISCONN Cmd Length", + "Bad Disconnect Scope", + "Bad RS Length", + "Bad RS Rqst Length", + "Not RS Cmd", + "Bad RS Cmd Length", + "Bad RS R_CTL", + "Bad RS Relative Offset", + "Too many connections for association", + "Invalid subnqn or subsystem not found", + "Invalid hostnqn or subsystem doesn't allow host", + "SQ size = 0 or too big", + "No Remote Port", + "Bad Subsystem Port", +}; + +static inline void +nvmf_fc_add_assoc_to_tgt_port(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_remote_port_info *rport); + +static 
inline FCNVME_BE32 cpu_to_be32(uint32_t in) +{ + uint32_t t; + + to_be32(&t, in); + return (FCNVME_BE32)t; +} + +static inline FCNVME_BE32 nvmf_fc_lsdesc_len(size_t sz) +{ + uint32_t t; + + to_be32(&t, sz - (2 * sizeof(uint32_t))); + return (FCNVME_BE32)t; +} + +static void +nvmf_fc_ls_format_rsp_hdr(void *buf, uint8_t ls_cmd, uint32_t desc_len, + uint8_t rqst_ls_cmd) +{ + struct spdk_nvmf_fc_ls_acc_hdr *acc_hdr = buf; + + acc_hdr->w0.ls_cmd = ls_cmd; + acc_hdr->desc_list_len = desc_len; + to_be32(&acc_hdr->rqst.desc_tag, FCNVME_LSDESC_RQST); + acc_hdr->rqst.desc_len = + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_rqst)); + acc_hdr->rqst.w0.ls_cmd = rqst_ls_cmd; +} + +static int +nvmf_fc_ls_format_rjt(void *buf, uint16_t buflen, uint8_t ls_cmd, + uint8_t reason, uint8_t explanation, uint8_t vendor) +{ + struct spdk_nvmf_fc_ls_rjt *rjt = buf; + + bzero(buf, sizeof(struct spdk_nvmf_fc_ls_rjt)); + nvmf_fc_ls_format_rsp_hdr(buf, FCNVME_LSDESC_RQST, + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_ls_rjt)), + ls_cmd); + to_be32(&rjt->rjt.desc_tag, FCNVME_LSDESC_RJT); + rjt->rjt.desc_len = nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_rjt)); + rjt->rjt.reason_code = reason; + rjt->rjt.reason_explanation = explanation; + rjt->rjt.vendor = vendor; + + return sizeof(struct spdk_nvmf_fc_ls_rjt); +} + +/* ************************************************** */ +/* Allocators/Deallocators (assocations, connections, */ +/* poller API data) */ + +static inline void +nvmf_fc_ls_free_association(struct spdk_nvmf_fc_association *assoc) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + /* return the q slots of the conns for the association */ + TAILQ_FOREACH(fc_conn, &assoc->avail_fc_conns, assoc_avail_link) { + if (fc_conn->conn_id != NVMF_FC_INVALID_CONN_ID) { + nvmf_fc_release_conn(fc_conn->hwqp, fc_conn->conn_id, + fc_conn->max_queue_depth); + } + } + + /* free assocation's send disconnect buffer */ + if (assoc->snd_disconn_bufs) { + nvmf_fc_free_srsr_bufs(assoc->snd_disconn_bufs); + } + + /* free assocation's connections */ + free(assoc->conns_buf); + + /* free the association */ + free(assoc); +} + +static int +nvmf_fc_ls_alloc_connections(struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_transport *nvmf_transport) +{ + uint32_t i; + struct spdk_nvmf_fc_conn *fc_conn; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Pre-alloc %d qpairs for host NQN %s\n", + nvmf_transport->opts.max_qpairs_per_ctrlr, assoc->host_nqn); + + /* allocate memory for all connections at once */ + assoc->conns_buf = calloc(nvmf_transport->opts.max_qpairs_per_ctrlr + 1, + sizeof(struct spdk_nvmf_fc_conn)); + if (assoc->conns_buf == NULL) { + SPDK_ERRLOG("Out of memory for connections for new association\n"); + return -ENOMEM; + } + + for (i = 0; i < nvmf_transport->opts.max_qpairs_per_ctrlr; i++) { + fc_conn = assoc->conns_buf + (i * sizeof(struct spdk_nvmf_fc_conn)); + fc_conn->conn_id = NVMF_FC_INVALID_CONN_ID; + fc_conn->qpair.state = SPDK_NVMF_QPAIR_UNINITIALIZED; + fc_conn->qpair.transport = nvmf_transport; + + TAILQ_INSERT_TAIL(&assoc->avail_fc_conns, fc_conn, assoc_avail_link); + } + + return 0; +} + +static struct spdk_nvmf_fc_association * +nvmf_fc_ls_new_association(uint32_t s_id, + struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_remote_port_info *rport, + struct spdk_nvmf_fc_lsdesc_cr_assoc_cmd *a_cmd, + struct spdk_nvmf_subsystem *subsys, + uint16_t rpi, + struct spdk_nvmf_transport *nvmf_transport) +{ + struct spdk_nvmf_fc_association *assoc; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "New 
Association request for port %d nport %d rpi 0x%x\n", + tgtport->fc_port->port_hdl, tgtport->nport_hdl, rpi); + + assert(rport); + if (!rport) { + SPDK_ERRLOG("rport is null.\n"); + return NULL; + } + + assoc = calloc(1, sizeof(struct spdk_nvmf_fc_association)); + if (!assoc) { + SPDK_ERRLOG("unable to allocate memory for new association\n"); + return NULL; + } + + /* initialize association */ +#if (NVMF_FC_LS_SEND_LS_DISCONNECT == 1) + /* allocate buffers to send LS disconnect command to host */ + assoc->snd_disconn_bufs = + nvmf_fc_alloc_srsr_bufs(sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst), + sizeof(struct spdk_nvmf_fc_ls_rjt)); + if (!assoc->snd_disconn_bufs) { + SPDK_ERRLOG("no dma memory for association's ls disconnect bufs\n"); + free(assoc); + return NULL; + } + + assoc->snd_disconn_bufs->rpi = rpi; +#endif + assoc->s_id = s_id; + assoc->tgtport = tgtport; + assoc->rport = rport; + assoc->subsystem = subsys; + assoc->assoc_state = SPDK_NVMF_FC_OBJECT_CREATED; + memcpy(assoc->host_id, a_cmd->hostid, FCNVME_ASSOC_HOSTID_LEN); + memcpy(assoc->host_nqn, a_cmd->hostnqn, SPDK_NVME_NQN_FIELD_SIZE); + memcpy(assoc->sub_nqn, a_cmd->subnqn, SPDK_NVME_NQN_FIELD_SIZE); + TAILQ_INIT(&assoc->fc_conns); + TAILQ_INIT(&assoc->avail_fc_conns); + assoc->ls_del_op_ctx = NULL; + + /* allocate and assign connections for association */ + rc = nvmf_fc_ls_alloc_connections(assoc, nvmf_transport); + if (rc != 0) { + nvmf_fc_ls_free_association(assoc); + return NULL; + } + + /* add association to target port's association list */ + nvmf_fc_add_assoc_to_tgt_port(tgtport, assoc, rport); + return assoc; +} + +static inline void +nvmf_fc_ls_append_del_cb_ctx(struct spdk_nvmf_fc_association *assoc, + struct nvmf_fc_ls_op_ctx *opd) +{ + /* append to delete assoc callback list */ + if (!assoc->ls_del_op_ctx) { + assoc->ls_del_op_ctx = (void *)opd; + } else { + struct nvmf_fc_ls_op_ctx *nxt = + (struct nvmf_fc_ls_op_ctx *) assoc->ls_del_op_ctx; + while (nxt->next_op_ctx) { + nxt = nxt->next_op_ctx; + } + nxt->next_op_ctx = opd; + } +} + +static struct spdk_nvmf_fc_conn * +nvmf_fc_ls_new_connection(struct spdk_nvmf_fc_association *assoc, uint16_t qid, + uint16_t esrp_ratio, uint16_t rpi, uint16_t sq_size, + struct spdk_nvmf_fc_nport *tgtport) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = TAILQ_FIRST(&assoc->avail_fc_conns); + if (!fc_conn) { + SPDK_ERRLOG("out of connections for association %p\n", assoc); + return NULL; + } + + /* Remove from avail list and add to in use. */ + TAILQ_REMOVE(&assoc->avail_fc_conns, fc_conn, assoc_avail_link); + TAILQ_INSERT_TAIL(&assoc->fc_conns, fc_conn, assoc_link); + + if (qid == 0) { + /* AdminQ connection. 
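+ * qid 0 is the NVMe over Fabrics admin queue, created together with the
+ * association (LS create association); I/O queues (qid > 0) are created
+ * later through LS create connection and do not set aq_conn.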
*/ + assoc->aq_conn = fc_conn; + } + + fc_conn->qpair.qid = qid; + fc_conn->qpair.sq_head_max = sq_size; + TAILQ_INIT(&fc_conn->qpair.outstanding); + fc_conn->esrp_ratio = esrp_ratio; + fc_conn->fc_assoc = assoc; + fc_conn->rpi = rpi; + fc_conn->max_queue_depth = sq_size + 1; + + /* save target port trid in connection (for subsystem + * listener validation in fabric connect command) + */ + nvmf_fc_create_trid(&fc_conn->trid, tgtport->fc_nodename.u.wwn, + tgtport->fc_portname.u.wwn); + + return fc_conn; +} + +static inline void +nvmf_fc_ls_free_connection(struct spdk_nvmf_fc_conn *fc_conn) +{ + TAILQ_INSERT_TAIL(&fc_conn->fc_assoc->avail_fc_conns, fc_conn, assoc_avail_link); +} + +/* End - Allocators/Deallocators (assocations, connections, */ +/* poller API data) */ +/* ******************************************************** */ + +static inline struct spdk_nvmf_fc_association * +nvmf_fc_ls_find_assoc(struct spdk_nvmf_fc_nport *tgtport, uint64_t assoc_id) +{ + struct spdk_nvmf_fc_association *assoc = NULL; + + TAILQ_FOREACH(assoc, &tgtport->fc_associations, link) { + if (assoc->assoc_id == assoc_id) { + if (assoc->assoc_state == SPDK_NVMF_FC_OBJECT_ZOMBIE) { + assoc = NULL; + } + break; + } + } + return assoc; +} + +static inline void +nvmf_fc_add_assoc_to_tgt_port(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_remote_port_info *rport) +{ + TAILQ_INSERT_TAIL(&tgtport->fc_associations, assoc, link); + tgtport->assoc_count++; + rport->assoc_count++; +} + +static inline void +nvmf_fc_del_assoc_from_tgt_port(struct spdk_nvmf_fc_association *assoc) +{ + struct spdk_nvmf_fc_nport *tgtport = assoc->tgtport; + + TAILQ_REMOVE(&tgtport->fc_associations, assoc, link); + tgtport->assoc_count--; + assoc->rport->assoc_count--; +} + +static void +nvmf_fc_ls_rsp_fail_del_conn_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct nvmf_fc_ls_op_ctx *opd = + (struct nvmf_fc_ls_op_ctx *)cb_data; + struct spdk_nvmf_fc_ls_del_conn_api_data *dp = &opd->u.del_conn; + struct spdk_nvmf_fc_association *assoc = dp->assoc; + struct spdk_nvmf_fc_conn *fc_conn = dp->args.fc_conn; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Delete Connection callback " + "for assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id, + fc_conn->conn_id); + + if (dp->aq_conn) { + /* delete association */ + nvmf_fc_del_assoc_from_tgt_port(assoc); + nvmf_fc_ls_free_association(assoc); + } else { + /* remove connection from association's connection list */ + TAILQ_REMOVE(&assoc->fc_conns, fc_conn, assoc_link); + nvmf_fc_ls_free_connection(fc_conn); + } + + free(opd); +} + +static void +nvmf_fc_handle_xmt_ls_rsp_failure(struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_conn *fc_conn, + bool aq_conn) +{ + struct spdk_nvmf_fc_ls_del_conn_api_data *api_data; + struct nvmf_fc_ls_op_ctx *opd = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Transmit LS response failure " + "for assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id, + fc_conn->conn_id); + + + /* create context for delete connection API */ + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { /* hopefully this doesn't happen - if so, we leak the connection */ + SPDK_ERRLOG("Mem alloc failed for del conn op data"); + return; + } + + api_data = &opd->u.del_conn; + api_data->assoc = assoc; + api_data->ls_rqst = NULL; + api_data->aq_conn = aq_conn; + api_data->args.fc_conn = fc_conn; + api_data->args.send_abts = false; + api_data->args.hwqp = fc_conn->hwqp; + api_data->args.cb_info.cb_thread = spdk_get_thread(); + 
api_data->args.cb_info.cb_func = nvmf_fc_ls_rsp_fail_del_conn_cb; + api_data->args.cb_info.cb_data = opd; + + nvmf_fc_poller_api_func(api_data->args.hwqp, + SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION, + &api_data->args); +} + +/* callback from poller's ADD_Connection event */ +static void +nvmf_fc_ls_add_conn_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct nvmf_fc_ls_op_ctx *opd = + (struct nvmf_fc_ls_op_ctx *)cb_data; + struct spdk_nvmf_fc_ls_add_conn_api_data *dp = &opd->u.add_conn; + struct spdk_nvmf_fc_association *assoc = dp->assoc; + struct spdk_nvmf_fc_nport *tgtport = assoc->tgtport; + struct spdk_nvmf_fc_conn *fc_conn = dp->args.fc_conn; + struct spdk_nvmf_fc_ls_rqst *ls_rqst = dp->ls_rqst; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "add_conn_cb: assoc_id = 0x%lx, conn_id = 0x%lx\n", + assoc->assoc_id, fc_conn->conn_id); + + fc_conn->create_opd = NULL; + + if (assoc->assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* association is already being deleted - don't continue */ + free(opd); + return; + } + + if (dp->aq_conn) { + struct spdk_nvmf_fc_ls_cr_assoc_acc *assoc_acc = + (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + /* put connection and association ID in response */ + to_be64(&assoc_acc->conn_id.connection_id, fc_conn->conn_id); + assoc_acc->assoc_id.association_id = assoc_acc->conn_id.connection_id; + } else { + struct spdk_nvmf_fc_ls_cr_conn_acc *conn_acc = + (struct spdk_nvmf_fc_ls_cr_conn_acc *)ls_rqst->rspbuf.virt; + /* put connection ID in response */ + to_be64(&conn_acc->conn_id.connection_id, fc_conn->conn_id); + } + + /* send LS response */ + if (nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst) != 0) { + SPDK_ERRLOG("Send LS response for %s failed - cleaning up\n", + dp->aq_conn ? "association" : "connection"); + nvmf_fc_handle_xmt_ls_rsp_failure(assoc, fc_conn, + dp->aq_conn); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "LS response (conn_id 0x%lx) sent\n", fc_conn->conn_id); + } + + free(opd); +} + +void +nvmf_fc_ls_add_conn_failure( + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_ls_rqst *ls_rqst, + struct spdk_nvmf_fc_conn *fc_conn, + bool aq_conn) +{ + struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst; + struct spdk_nvmf_fc_ls_cr_assoc_acc *acc; + struct spdk_nvmf_fc_nport *tgtport = assoc->tgtport; + + if (fc_conn->create_opd) { + free(fc_conn->create_opd); + fc_conn->create_opd = NULL; + } + + rqst = (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt; + acc = (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + + /* send failure response */ + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, + FCNVME_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, + FCNVME_RJT_RC_INSUFF_RES, + FCNVME_RJT_EXP_NONE, 0); + + nvmf_fc_ls_free_connection(fc_conn); + if (aq_conn) { + nvmf_fc_del_assoc_from_tgt_port(assoc); + nvmf_fc_ls_free_association(assoc); + } + + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + + +static void +nvmf_fc_ls_add_conn_to_poller( + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_ls_rqst *ls_rqst, + struct spdk_nvmf_fc_conn *fc_conn, + bool aq_conn) +{ + struct nvmf_fc_ls_op_ctx *opd; + struct spdk_nvmf_fc_ls_add_conn_api_data *api_data; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Add Connection to poller for " + "assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id, + fc_conn->conn_id); + + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { + SPDK_ERRLOG("allocate api data for add conn op failed\n"); + nvmf_fc_ls_add_conn_failure(assoc, ls_rqst, fc_conn, aq_conn); + return; + } + + /* insert 
conn in association's connection list */ + api_data = &opd->u.add_conn; + assoc->conn_count++; + + api_data->args.fc_conn = fc_conn; + api_data->args.cb_info.cb_thread = spdk_get_thread(); + api_data->args.cb_info.cb_func = nvmf_fc_ls_add_conn_cb; + api_data->args.cb_info.cb_data = (void *)opd; + api_data->assoc = assoc; + api_data->ls_rqst = ls_rqst; + api_data->aq_conn = aq_conn; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "New QP callback called.\n"); + + /* Let the nvmf_tgt decide which pollgroup to use. */ + fc_conn->create_opd = opd; + spdk_nvmf_tgt_new_qpair(ls_rqst->nvmf_tgt, &fc_conn->qpair); +} + +/* Delete association functions */ + +static void +nvmf_fc_do_del_assoc_cbs(struct nvmf_fc_ls_op_ctx *opd, int ret) +{ + struct nvmf_fc_ls_op_ctx *nxt; + struct spdk_nvmf_fc_delete_assoc_api_data *dp; + + while (opd) { + dp = &opd->u.del_assoc; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "performing delete assoc. callback\n"); + dp->del_assoc_cb(dp->del_assoc_cb_data, ret); + + nxt = opd->next_op_ctx; + free(opd); + opd = nxt; + } +} + +static void +nvmf_fs_send_ls_disconnect_cb(void *hwqp, int32_t status, void *args) +{ + if (args) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "free disconnect buffers\n"); + nvmf_fc_free_srsr_bufs((struct spdk_nvmf_fc_srsr_bufs *)args); + } +} + +static void +nvmf_fc_del_all_conns_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct nvmf_fc_ls_op_ctx *opd = (struct nvmf_fc_ls_op_ctx *)cb_data; + struct spdk_nvmf_fc_delete_assoc_api_data *dp = &opd->u.del_assoc; + struct spdk_nvmf_fc_association *assoc = dp->assoc; + struct spdk_nvmf_fc_conn *fc_conn = dp->args.fc_conn; + + /* Assumption here is that there will be no error (i.e. ret=success). + * Since connections are deleted in parallel, nothing can be + * done anyway if there is an error because we need to complete + * all connection deletes and callback to caller */ + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "Delete all connections for assoc_id 0x%lx, conn_id = %lx\n", + assoc->assoc_id, fc_conn->conn_id); + + /* remove connection from association's connection list */ + TAILQ_REMOVE(&assoc->fc_conns, fc_conn, assoc_link); + nvmf_fc_ls_free_connection(fc_conn); + + if (--assoc->conn_count == 0) { + /* last connection - remove association from target port's association list */ + struct nvmf_fc_ls_op_ctx *cb_opd = (struct nvmf_fc_ls_op_ctx *)assoc->ls_del_op_ctx; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "remove assoc. 
%lx\n", assoc->assoc_id); + nvmf_fc_del_assoc_from_tgt_port(assoc); + + if (assoc->snd_disconn_bufs && + assoc->tgtport->fc_port->hw_port_status == SPDK_FC_PORT_ONLINE) { + + struct spdk_nvmf_fc_ls_disconnect_rqst *dc_rqst; + struct spdk_nvmf_fc_srsr_bufs *srsr_bufs; + + dc_rqst = (struct spdk_nvmf_fc_ls_disconnect_rqst *) + assoc->snd_disconn_bufs->rqst; + + bzero(dc_rqst, sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst)); + + /* fill in request descriptor */ + dc_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT; + to_be32(&dc_rqst->desc_list_len, + sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst) - + (2 * sizeof(uint32_t))); + + /* fill in disconnect command descriptor */ + to_be32(&dc_rqst->disconn_cmd.desc_tag, FCNVME_LSDESC_DISCONN_CMD); + to_be32(&dc_rqst->disconn_cmd.desc_len, + sizeof(struct spdk_nvmf_fc_lsdesc_disconn_cmd) - + (2 * sizeof(uint32_t))); + + /* fill in association id descriptor */ + to_be32(&dc_rqst->assoc_id.desc_tag, FCNVME_LSDESC_ASSOC_ID), + to_be32(&dc_rqst->assoc_id.desc_len, + sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id) - + (2 * sizeof(uint32_t))); + to_be64(&dc_rqst->assoc_id.association_id, assoc->assoc_id); + + srsr_bufs = assoc->snd_disconn_bufs; + assoc->snd_disconn_bufs = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Send LS disconnect\n"); + if (nvmf_fc_xmt_srsr_req(&assoc->tgtport->fc_port->ls_queue, + srsr_bufs, nvmf_fs_send_ls_disconnect_cb, + (void *)srsr_bufs)) { + SPDK_ERRLOG("Error sending LS disconnect\n"); + assoc->snd_disconn_bufs = srsr_bufs; + } + } + + nvmf_fc_ls_free_association(assoc); + + /* perform callbacks to all callers to delete association */ + nvmf_fc_do_del_assoc_cbs(cb_opd, 0); + + } + + free(opd); +} + +static void +nvmf_fc_kill_io_del_all_conns_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct nvmf_fc_ls_op_ctx *opd = (struct nvmf_fc_ls_op_ctx *)cb_data; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Callback after killing outstanding ABTS."); + /* + * NOTE: We should not access any connection or association related data + * structures here. 
+ */ + free(opd); +} + + +/* Disconnect/delete (association) request functions */ + +static int +_nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport, + uint64_t assoc_id, bool send_abts, bool backend_initiated, + spdk_nvmf_fc_del_assoc_cb del_assoc_cb, + void *cb_data, bool from_ls_rqst) +{ + + struct nvmf_fc_ls_op_ctx *opd, *opd_tail, *opd_head = NULL; + struct spdk_nvmf_fc_delete_assoc_api_data *api_data; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_association *assoc = + nvmf_fc_ls_find_assoc(tgtport, assoc_id); + struct spdk_nvmf_fc_port *fc_port = tgtport->fc_port; + enum spdk_nvmf_fc_object_state assoc_state; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Delete association, " + "assoc_id 0x%lx\n", assoc_id); + + if (!assoc) { + SPDK_ERRLOG("Delete association failed: %s\n", + validation_errors[VERR_NO_ASSOC]); + return VERR_NO_ASSOC; + } + + /* create cb context to put in association's list of + * callbacks to call when delete association is done */ + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { + SPDK_ERRLOG("Mem alloc failed for del assoc cb data"); + return -ENOMEM; + } + + api_data = &opd->u.del_assoc; + api_data->assoc = assoc; + api_data->from_ls_rqst = from_ls_rqst; + api_data->del_assoc_cb = del_assoc_cb; + api_data->del_assoc_cb_data = cb_data; + api_data->args.cb_info.cb_data = opd; + nvmf_fc_ls_append_del_cb_ctx(assoc, opd); + + assoc_state = assoc->assoc_state; + if ((assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) && + (fc_port->hw_port_status != SPDK_FC_PORT_QUIESCED)) { + /* association already being deleted */ + return 0; + } + + /* mark assoc. to be deleted */ + assoc->assoc_state = SPDK_NVMF_FC_OBJECT_TO_BE_DELETED; + + /* create a list of all connection to delete */ + TAILQ_FOREACH(fc_conn, &assoc->fc_conns, assoc_link) { + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { /* hopefully this doesn't happen */ + SPDK_ERRLOG("Mem alloc failed for del conn op data"); + while (opd_head) { /* free any contexts already allocated */ + opd = opd_head; + opd_head = opd->next_op_ctx; + free(opd); + } + return -ENOMEM; + } + + api_data = &opd->u.del_assoc; + api_data->args.fc_conn = fc_conn; + api_data->assoc = assoc; + api_data->args.send_abts = send_abts; + api_data->args.backend_initiated = backend_initiated; + api_data->args.hwqp = nvmf_fc_get_hwqp_from_conn_id( + assoc->tgtport->fc_port->io_queues, + assoc->tgtport->fc_port->num_io_queues, + fc_conn->conn_id); + api_data->args.cb_info.cb_thread = spdk_get_thread(); + if ((fc_port->hw_port_status == SPDK_FC_PORT_QUIESCED) && + (assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED)) { + /* + * If there are any connections deletes or IO abts that are + * stuck because of firmware reset, a second invocation of + * SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION will result in + * outstanding connections & requests being killed and + * their corresponding callbacks being executed. 
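+ * In that case nvmf_fc_kill_io_del_all_conns_cb (above) is used as the
+ * completion callback; it only frees the per-call context and does not
+ * touch association bookkeeping.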
+ */ + api_data->args.cb_info.cb_func = nvmf_fc_kill_io_del_all_conns_cb; + } else { + api_data->args.cb_info.cb_func = nvmf_fc_del_all_conns_cb; + } + api_data->args.cb_info.cb_data = opd; + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "conn_id = %lx\n", fc_conn->conn_id); + + if (!opd_head) { + opd_head = opd; + } else { + opd_tail->next_op_ctx = opd; + } + opd_tail = opd; + } + + /* make poller api calls to delete connetions */ + while (opd_head) { + opd = opd_head; + opd_head = opd->next_op_ctx; + api_data = &opd->u.del_assoc; + nvmf_fc_poller_api_func(api_data->args.hwqp, + SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION, + &api_data->args); + } + + return 0; +} + +static void +nvmf_fc_ls_disconnect_assoc_cb(void *cb_data, uint32_t err) +{ + struct nvmf_fc_ls_op_ctx *opd = (struct nvmf_fc_ls_op_ctx *)cb_data; + struct spdk_nvmf_fc_ls_disconn_assoc_api_data *dp = &opd->u.disconn_assoc; + struct spdk_nvmf_fc_nport *tgtport = dp->tgtport; + struct spdk_nvmf_fc_ls_rqst *ls_rqst = dp->ls_rqst; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Disconnect association callback begin " + "nport %d\n", tgtport->nport_hdl); + if (err != 0) { + /* send failure response */ + struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst = + (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_cr_assoc_acc *acc = + (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, + FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, + FCNVME_RJT_RC_UNAB, + FCNVME_RJT_EXP_NONE, + 0); + } + + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); + + free(opd); + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Disconnect association callback complete " + "nport %d err %d\n", tgtport->nport_hdl, err); +} + +static void +nvmf_fc_ls_disconnect_assoc(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst, uint64_t assoc_id) +{ + struct nvmf_fc_ls_op_ctx *opd; + struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst = + (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_cr_assoc_acc *acc = + (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + struct spdk_nvmf_fc_ls_disconn_assoc_api_data *api_data; + int ret; + uint8_t reason = 0; + + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { + /* send failure response */ + SPDK_ERRLOG("Allocate disconn assoc op data failed\n"); + reason = FCNVME_RJT_RC_INSUFF_RES; + goto send_rjt; + } + + api_data = &opd->u.disconn_assoc; + api_data->tgtport = tgtport; + api_data->ls_rqst = ls_rqst; + ret = _nvmf_fc_delete_association(tgtport, assoc_id, + false, false, + nvmf_fc_ls_disconnect_assoc_cb, + api_data, true); + if (!ret) { + return; + } + + /* delete association failed */ + switch (ret) { + case VERR_NO_ASSOC: + reason = FCNVME_RJT_RC_INV_ASSOC; + break; + case -ENOMEM: + reason = FCNVME_RJT_RC_INSUFF_RES; + break; + default: + reason = FCNVME_RJT_RC_LOGIC; + } + + free(opd); + +send_rjt: + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, + FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, reason, + FCNVME_RJT_EXP_NONE, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + +static int +nvmf_fc_ls_validate_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + + if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) { + return -EPERM; + } + + return 0; +} + +/* **************************** */ +/* LS Reqeust Handler Functions */ + +static void +nvmf_fc_ls_process_cass(uint32_t s_id, + struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst) +{ + struct 
spdk_nvmf_fc_ls_cr_assoc_rqst *rqst = + (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_cr_assoc_acc *acc = + (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + struct spdk_nvmf_fc_association *assoc; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_subsystem *subsystem = NULL; + const char *hostnqn = (const char *)rqst->assoc_cmd.hostnqn; + int errmsg_ind = 0; + uint8_t rc = FCNVME_RJT_RC_NONE; + uint8_t ec = FCNVME_RJT_EXP_NONE; + struct spdk_nvmf_transport *transport = spdk_nvmf_tgt_get_transport(ls_rqst->nvmf_tgt, + SPDK_NVME_TRANSPORT_NAME_FC); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "LS_CASS: ls_rqst_len=%d, desc_list_len=%d, cmd_len=%d, sq_size=%d, " + "Subnqn: %s, Hostnqn: %s, Tgtport nn:%lx, pn:%lx\n", + ls_rqst->rqst_len, from_be32(&rqst->desc_list_len), + from_be32(&rqst->assoc_cmd.desc_len), + from_be32(&rqst->assoc_cmd.sqsize), + rqst->assoc_cmd.subnqn, hostnqn, + tgtport->fc_nodename.u.wwn, tgtport->fc_portname.u.wwn); + + if (ls_rqst->rqst_len < FCNVME_LS_CA_CMD_MIN_LEN) { + SPDK_ERRLOG("assoc_cmd req len = %d, should be at least %d\n", + ls_rqst->rqst_len, FCNVME_LS_CA_CMD_MIN_LEN); + errmsg_ind = VERR_CR_ASSOC_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (from_be32(&rqst->desc_list_len) < + FCNVME_LS_CA_DESC_LIST_MIN_LEN) { + SPDK_ERRLOG("assoc_cmd desc list len = %d, should be at least %d\n", + from_be32(&rqst->desc_list_len), + FCNVME_LS_CA_DESC_LIST_MIN_LEN); + errmsg_ind = VERR_CR_ASSOC_RQST_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->assoc_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD)) { + errmsg_ind = VERR_CR_ASSOC_CMD; + rc = FCNVME_RJT_RC_INV_PARAM; + } else if (from_be32(&rqst->assoc_cmd.desc_len) < + FCNVME_LS_CA_DESC_MIN_LEN) { + SPDK_ERRLOG("assoc_cmd desc len = %d, should be at least %d\n", + from_be32(&rqst->assoc_cmd.desc_len), + FCNVME_LS_CA_DESC_MIN_LEN); + errmsg_ind = VERR_CR_ASSOC_CMD_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (!rqst->assoc_cmd.ersp_ratio || + (from_be16(&rqst->assoc_cmd.ersp_ratio) >= + from_be16(&rqst->assoc_cmd.sqsize))) { + errmsg_ind = VERR_ERSP_RATIO; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_ESRP; + } else if (from_be16(&rqst->assoc_cmd.sqsize) == 0 || + from_be16(&rqst->assoc_cmd.sqsize) > transport->opts.max_aq_depth) { + errmsg_ind = VERR_SQSIZE; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_SQ_SIZE; + } + + if (rc != FCNVME_RJT_RC_NONE) { + goto rjt_cass; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(ls_rqst->nvmf_tgt, rqst->assoc_cmd.subnqn); + if (subsystem == NULL) { + errmsg_ind = VERR_SUBNQN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_SUBNQN; + goto rjt_cass; + } + + if (nvmf_fc_ls_validate_host(subsystem, hostnqn)) { + errmsg_ind = VERR_HOSTNQN; + rc = FCNVME_RJT_RC_INV_HOST; + ec = FCNVME_RJT_EXP_INV_HOSTNQN; + goto rjt_cass; + } + + /* get new association */ + assoc = nvmf_fc_ls_new_association(s_id, tgtport, ls_rqst->rport, + &rqst->assoc_cmd, subsystem, + ls_rqst->rpi, transport); + if (!assoc) { + errmsg_ind = VERR_ASSOC_ALLOC_FAIL; + rc = FCNVME_RJT_RC_INSUFF_RES; + ec = FCNVME_RJT_EXP_NONE; + goto rjt_cass; + } + + /* alloc admin q (i.e. 
connection) */ + fc_conn = nvmf_fc_ls_new_connection(assoc, 0, + from_be16(&rqst->assoc_cmd.ersp_ratio), + ls_rqst->rpi, + from_be16(&rqst->assoc_cmd.sqsize), + tgtport); + if (!fc_conn) { + nvmf_fc_ls_free_association(assoc); + errmsg_ind = VERR_CONN_ALLOC_FAIL; + rc = FCNVME_RJT_RC_INSUFF_RES; + ec = FCNVME_RJT_EXP_NONE; + goto rjt_cass; + } + + /* format accept response */ + bzero(acc, sizeof(*acc)); + ls_rqst->rsp_len = sizeof(*acc); + + nvmf_fc_ls_format_rsp_hdr(acc, FCNVME_LS_ACC, + nvmf_fc_lsdesc_len( + sizeof(struct spdk_nvmf_fc_ls_cr_assoc_acc)), + FCNVME_LS_CREATE_ASSOCIATION); + to_be32(&acc->assoc_id.desc_tag, FCNVME_LSDESC_ASSOC_ID); + acc->assoc_id.desc_len = + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id)); + to_be32(&acc->conn_id.desc_tag, FCNVME_LSDESC_CONN_ID); + acc->conn_id.desc_len = + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_conn_id)); + + /* assign connection to HWQP poller - also sends response */ + nvmf_fc_ls_add_conn_to_poller(assoc, ls_rqst, fc_conn, true); + + return; + +rjt_cass: + SPDK_ERRLOG("Create Association LS failed: %s\n", validation_errors[errmsg_ind]); + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, rc, ec, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + +static void +nvmf_fc_ls_process_cioc(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst) +{ + struct spdk_nvmf_fc_ls_cr_conn_rqst *rqst = + (struct spdk_nvmf_fc_ls_cr_conn_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_cr_conn_acc *acc = + (struct spdk_nvmf_fc_ls_cr_conn_acc *)ls_rqst->rspbuf.virt; + struct spdk_nvmf_fc_association *assoc; + struct spdk_nvmf_fc_conn *fc_conn = NULL; + int errmsg_ind = 0; + uint8_t rc = FCNVME_RJT_RC_NONE; + uint8_t ec = FCNVME_RJT_EXP_NONE; + struct spdk_nvmf_transport *transport = spdk_nvmf_tgt_get_transport(ls_rqst->nvmf_tgt, + SPDK_NVME_TRANSPORT_NAME_FC); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "LS_CIOC: ls_rqst_len=%d, desc_list_len=%d, cmd_len=%d, " + "assoc_id=0x%lx, sq_size=%d, esrp=%d, Tgtport nn:%lx, pn:%lx\n", + ls_rqst->rqst_len, from_be32(&rqst->desc_list_len), + from_be32(&rqst->connect_cmd.desc_len), + from_be64(&rqst->assoc_id.association_id), + from_be32(&rqst->connect_cmd.sqsize), + from_be32(&rqst->connect_cmd.ersp_ratio), + tgtport->fc_nodename.u.wwn, tgtport->fc_portname.u.wwn); + + if (ls_rqst->rqst_len < sizeof(struct spdk_nvmf_fc_ls_cr_conn_rqst)) { + errmsg_ind = VERR_CR_CONN_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->desc_list_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_ls_cr_conn_rqst))) { + errmsg_ind = VERR_CR_CONN_RQST_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->assoc_id.desc_tag != + cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) { + errmsg_ind = VERR_ASSOC_ID; + rc = FCNVME_RJT_RC_INV_PARAM; + } else if (rqst->assoc_id.desc_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id))) { + errmsg_ind = VERR_ASSOC_ID_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->connect_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD)) { + errmsg_ind = VERR_CR_CONN_CMD; + rc = FCNVME_RJT_RC_INV_PARAM; + } else if (rqst->connect_cmd.desc_len != + nvmf_fc_lsdesc_len( + sizeof(struct spdk_nvmf_fc_lsdesc_cr_conn_cmd))) { + errmsg_ind = VERR_CR_CONN_CMD_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (!rqst->connect_cmd.ersp_ratio || + 
(from_be16(&rqst->connect_cmd.ersp_ratio) >= + from_be16(&rqst->connect_cmd.sqsize))) { + errmsg_ind = VERR_ERSP_RATIO; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_ESRP; + } else if (from_be16(&rqst->connect_cmd.sqsize) == 0 || + from_be16(&rqst->connect_cmd.sqsize) > transport->opts.max_queue_depth) { + errmsg_ind = VERR_SQSIZE; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_SQ_SIZE; + } + + if (rc != FCNVME_RJT_RC_NONE) { + goto rjt_cioc; + } + + /* find association */ + assoc = nvmf_fc_ls_find_assoc(tgtport, + from_be64(&rqst->assoc_id.association_id)); + if (!assoc) { + errmsg_ind = VERR_NO_ASSOC; + rc = FCNVME_RJT_RC_INV_ASSOC; + } else if (assoc->assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* association is being deleted - don't allow more connections */ + errmsg_ind = VERR_NO_ASSOC; + rc = FCNVME_RJT_RC_INV_ASSOC; + } else if (assoc->conn_count >= transport->opts.max_qpairs_per_ctrlr) { + errmsg_ind = VERR_CONN_TOO_MANY; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_Q_ID; + } + + if (rc != FCNVME_RJT_RC_NONE) { + goto rjt_cioc; + } + + fc_conn = nvmf_fc_ls_new_connection(assoc, from_be16(&rqst->connect_cmd.qid), + from_be16(&rqst->connect_cmd.ersp_ratio), + ls_rqst->rpi, + from_be16(&rqst->connect_cmd.sqsize), + tgtport); + if (!fc_conn) { + errmsg_ind = VERR_CONN_ALLOC_FAIL; + rc = FCNVME_RJT_RC_INSUFF_RES; + ec = FCNVME_RJT_EXP_NONE; + goto rjt_cioc; + } + + /* format accept response */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Formatting LS accept response for " + "assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id, + fc_conn->conn_id); + bzero(acc, sizeof(*acc)); + ls_rqst->rsp_len = sizeof(*acc); + nvmf_fc_ls_format_rsp_hdr(acc, FCNVME_LS_ACC, + nvmf_fc_lsdesc_len( + sizeof(struct spdk_nvmf_fc_ls_cr_conn_acc)), + FCNVME_LS_CREATE_CONNECTION); + to_be32(&acc->conn_id.desc_tag, FCNVME_LSDESC_CONN_ID); + acc->conn_id.desc_len = + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_conn_id)); + + /* assign connection to HWQP poller - also sends response */ + nvmf_fc_ls_add_conn_to_poller(assoc, ls_rqst, fc_conn, false); + + return; + +rjt_cioc: + SPDK_ERRLOG("Create Connection LS failed: %s\n", validation_errors[errmsg_ind]); + + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, rc, ec, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + +static void +nvmf_fc_ls_process_disc(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst) +{ + struct spdk_nvmf_fc_ls_disconnect_rqst *rqst = + (struct spdk_nvmf_fc_ls_disconnect_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_disconnect_acc *acc = + (struct spdk_nvmf_fc_ls_disconnect_acc *)ls_rqst->rspbuf.virt; + struct spdk_nvmf_fc_association *assoc; + int errmsg_ind = 0; + uint8_t rc = FCNVME_RJT_RC_NONE; + uint8_t ec = FCNVME_RJT_EXP_NONE; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "LS_DISC: ls_rqst_len=%d, desc_list_len=%d, cmd_len=%d," + "assoc_id=0x%lx\n", + ls_rqst->rqst_len, from_be32(&rqst->desc_list_len), + from_be32(&rqst->disconn_cmd.desc_len), + from_be64(&rqst->assoc_id.association_id)); + + if (ls_rqst->rqst_len < sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst)) { + errmsg_ind = VERR_DISCONN_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->desc_list_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst))) { + errmsg_ind = VERR_DISCONN_RQST_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->assoc_id.desc_tag != + 
cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) { + errmsg_ind = VERR_ASSOC_ID; + rc = FCNVME_RJT_RC_INV_PARAM; + } else if (rqst->assoc_id.desc_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id))) { + errmsg_ind = VERR_ASSOC_ID_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->disconn_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_DISCONN_CMD)) { + rc = FCNVME_RJT_RC_INV_PARAM; + errmsg_ind = VERR_DISCONN_CMD; + } else if (rqst->disconn_cmd.desc_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_disconn_cmd))) { + errmsg_ind = VERR_DISCONN_CMD_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } + + if (rc != FCNVME_RJT_RC_NONE) { + goto rjt_disc; + } + + /* match an active association */ + assoc = nvmf_fc_ls_find_assoc(tgtport, + from_be64(&rqst->assoc_id.association_id)); + if (!assoc) { + errmsg_ind = VERR_NO_ASSOC; + rc = FCNVME_RJT_RC_INV_ASSOC; + goto rjt_disc; + } + + /* format response */ + bzero(acc, sizeof(*acc)); + ls_rqst->rsp_len = sizeof(*acc); + + nvmf_fc_ls_format_rsp_hdr(acc, FCNVME_LS_ACC, + nvmf_fc_lsdesc_len( + sizeof(struct spdk_nvmf_fc_ls_disconnect_acc)), + FCNVME_LS_DISCONNECT); + + nvmf_fc_ls_disconnect_assoc(tgtport, ls_rqst, assoc->assoc_id); + return; + +rjt_disc: + SPDK_ERRLOG("Disconnect LS failed: %s\n", validation_errors[errmsg_ind]); + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, rc, ec, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + +/* ************************ */ +/* external functions */ + +void +nvmf_fc_ls_init(struct spdk_nvmf_fc_port *fc_port) +{ +} + +void +nvmf_fc_ls_fini(struct spdk_nvmf_fc_port *fc_port) +{ +} + +void +nvmf_fc_handle_ls_rqst(struct spdk_nvmf_fc_ls_rqst *ls_rqst) +{ + struct spdk_nvmf_fc_ls_rqst_w0 *w0 = + (struct spdk_nvmf_fc_ls_rqst_w0 *)ls_rqst->rqstbuf.virt; + uint32_t s_id = ls_rqst->s_id; + struct spdk_nvmf_fc_nport *tgtport = ls_rqst->nport; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "LS cmd=%d\n", w0->ls_cmd); + + switch (w0->ls_cmd) { + case FCNVME_LS_CREATE_ASSOCIATION: + nvmf_fc_ls_process_cass(s_id, tgtport, ls_rqst); + break; + case FCNVME_LS_CREATE_CONNECTION: + nvmf_fc_ls_process_cioc(tgtport, ls_rqst); + break; + case FCNVME_LS_DISCONNECT: + nvmf_fc_ls_process_disc(tgtport, ls_rqst); + break; + default: + SPDK_ERRLOG("Invalid LS cmd=%d\n", w0->ls_cmd); + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(ls_rqst->rspbuf.virt, + FCNVME_MAX_LS_BUFFER_SIZE, w0->ls_cmd, + FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); + } +} + +int +nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport, + uint64_t assoc_id, bool send_abts, bool backend_initiated, + spdk_nvmf_fc_del_assoc_cb del_assoc_cb, + void *cb_data) +{ + return _nvmf_fc_delete_association(tgtport, assoc_id, send_abts, backend_initiated, + del_assoc_cb, cb_data, false); +} + +static void +nvmf_fc_poller_api_cb_event(void *arg) +{ + struct spdk_nvmf_fc_poller_api_cb_info *cb_info = + (struct spdk_nvmf_fc_poller_api_cb_info *) arg; + + assert(cb_info != NULL); + cb_info->cb_func(cb_info->cb_data, cb_info->ret); +} + +static void +nvmf_fc_poller_api_perform_cb(struct spdk_nvmf_fc_poller_api_cb_info *cb_info, + enum spdk_nvmf_fc_poller_api_ret ret) +{ + if (cb_info->cb_func && cb_info->cb_thread) { + cb_info->ret = ret; + /* callback to master thread */ + spdk_thread_send_msg(cb_info->cb_thread, nvmf_fc_poller_api_cb_event, + (void *) cb_info); + } +} + +static void +nvmf_fc_poller_api_add_connection(void *arg) +{ + 
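+ /* This handler runs on the hwqp's thread, dispatched via spdk_thread_send_msg()
+  * from nvmf_fc_poller_api_func(). It rejects the add if a connection with the
+  * same conn_id is already on the hwqp's connection_list; otherwise it links the
+  * new fc_conn in, then reports the result back to the requesting thread through
+  * nvmf_fc_poller_api_perform_cb(). */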
enum spdk_nvmf_fc_poller_api_ret ret = SPDK_NVMF_FC_POLLER_API_SUCCESS; + struct spdk_nvmf_fc_poller_api_add_connection_args *conn_args = + (struct spdk_nvmf_fc_poller_api_add_connection_args *)arg; + struct spdk_nvmf_fc_conn *fc_conn; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Poller add connection, conn_id 0x%lx\n", + conn_args->fc_conn->conn_id); + + /* make sure connection is not already in poller's list */ + fc_conn = nvmf_fc_hwqp_find_fc_conn(conn_args->fc_conn->hwqp, + conn_args->fc_conn->conn_id); + if (fc_conn) { + SPDK_ERRLOG("duplicate connection found"); + ret = SPDK_NVMF_FC_POLLER_API_DUP_CONN_ID; + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, + "conn_id=%lx", fc_conn->conn_id); + TAILQ_INSERT_TAIL(&conn_args->fc_conn->hwqp->connection_list, + conn_args->fc_conn, link); + } + + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, ret); +} + +static void +nvmf_fc_poller_api_quiesce_queue(void *arg) +{ + struct spdk_nvmf_fc_poller_api_quiesce_queue_args *q_args = + (struct spdk_nvmf_fc_poller_api_quiesce_queue_args *) arg; + struct spdk_nvmf_fc_request *fc_req = NULL, *tmp; + + /* should be already, but make sure queue is quiesced */ + q_args->hwqp->state = SPDK_FC_HWQP_OFFLINE; + + /* + * Kill all the outstanding commands that are in the transfer state and + * in the process of being aborted. + * We can run into this situation if an adapter reset happens when an I_T Nexus delete + * is in progress. + */ + TAILQ_FOREACH_SAFE(fc_req, &q_args->hwqp->in_use_reqs, link, tmp) { + if (nvmf_fc_req_in_xfer(fc_req) && fc_req->is_aborted == true) { + nvmf_fc_poller_api_func(q_args->hwqp, SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE, + (void *)fc_req); + } + } + + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&q_args->cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS); +} + +static void +nvmf_fc_poller_api_activate_queue(void *arg) +{ + struct spdk_nvmf_fc_poller_api_quiesce_queue_args *q_args = + (struct spdk_nvmf_fc_poller_api_quiesce_queue_args *) arg; + + q_args->hwqp->state = SPDK_FC_HWQP_ONLINE; + + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&q_args->cb_info, 0); +} + +static void +nvmf_fc_disconnect_qpair_cb(void *ctx) +{ + struct spdk_nvmf_fc_poller_api_cb_info *cb_info = ctx; + /* perform callback */ + nvmf_fc_poller_api_perform_cb(cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS); +} + +static void +nvmf_fc_poller_conn_abort_done(void *hwqp, int32_t status, void *cb_args) +{ + struct spdk_nvmf_fc_poller_api_del_connection_args *conn_args = cb_args; + + if (conn_args->fc_request_cnt) { + conn_args->fc_request_cnt -= 1; + } + + if (!conn_args->fc_request_cnt) { + if (!TAILQ_EMPTY(&conn_args->hwqp->connection_list)) { + /* All the requests for this connection are aborted. */ + TAILQ_REMOVE(&conn_args->hwqp->connection_list, conn_args->fc_conn, link); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Connection deleted, conn_id 0x%lx\n", + conn_args->fc_conn->conn_id); + + if (!conn_args->backend_initiated) { + /* disconnect qpair from nvmf controller */ + spdk_nvmf_qpair_disconnect(&conn_args->fc_conn->qpair, + nvmf_fc_disconnect_qpair_cb, &conn_args->cb_info); + } + } else { + /* + * Duplicate connection delete can happen if one is + * coming in via an association disconnect and the other + * is initiated by a port reset. 
+ */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Duplicate conn delete."); + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS); + } + } +} + +static void +nvmf_fc_poller_api_del_connection(void *arg) +{ + struct spdk_nvmf_fc_poller_api_del_connection_args *conn_args = + (struct spdk_nvmf_fc_poller_api_del_connection_args *)arg; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_request *fc_req = NULL, *tmp; + struct spdk_nvmf_fc_hwqp *hwqp = conn_args->hwqp; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Poller delete connection, conn_id 0x%lx\n", + conn_args->fc_conn->conn_id); + + /* find the connection in poller's list */ + fc_conn = nvmf_fc_hwqp_find_fc_conn(hwqp, conn_args->fc_conn->conn_id); + if (!fc_conn) { + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, SPDK_NVMF_FC_POLLER_API_NO_CONN_ID); + return; + } + + conn_args->fc_request_cnt = 0; + + TAILQ_FOREACH_SAFE(fc_req, &hwqp->in_use_reqs, link, tmp) { + if (fc_req->fc_conn->conn_id == fc_conn->conn_id) { + if (nvmf_qpair_is_admin_queue(&fc_conn->qpair) && + (fc_req->req.cmd->nvme_cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST)) { + /* AER will be cleaned by spdk_nvmf_qpair_disconnect. */ + continue; + } + + conn_args->fc_request_cnt += 1; + nvmf_fc_request_abort(fc_req, conn_args->send_abts, + nvmf_fc_poller_conn_abort_done, + conn_args); + } + } + + if (!conn_args->fc_request_cnt) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Connection deleted.\n"); + TAILQ_REMOVE(&hwqp->connection_list, fc_conn, link); + + if (!conn_args->backend_initiated) { + /* disconnect qpair from nvmf controller */ + spdk_nvmf_qpair_disconnect(&fc_conn->qpair, nvmf_fc_disconnect_qpair_cb, + &conn_args->cb_info); + } + } +} + +static void +nvmf_fc_poller_abts_done(void *hwqp, int32_t status, void *cb_args) +{ + struct spdk_nvmf_fc_poller_api_abts_recvd_args *args = cb_args; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, + "ABTS poller done, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + args->ctx->rpi, args->ctx->oxid, args->ctx->rxid); + + nvmf_fc_poller_api_perform_cb(&args->cb_info, + SPDK_NVMF_FC_POLLER_API_SUCCESS); +} + +static void +nvmf_fc_poller_api_abts_received(void *arg) +{ + struct spdk_nvmf_fc_poller_api_abts_recvd_args *args = arg; + struct spdk_nvmf_fc_request *fc_req = NULL; + struct spdk_nvmf_fc_hwqp *hwqp = args->hwqp; + + TAILQ_FOREACH(fc_req, &hwqp->in_use_reqs, link) { + if ((fc_req->rpi == args->ctx->rpi) && + (fc_req->oxid == args->ctx->oxid)) { + nvmf_fc_request_abort(fc_req, false, + nvmf_fc_poller_abts_done, args); + return; + } + } + + nvmf_fc_poller_api_perform_cb(&args->cb_info, + SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND); +} + +static void +nvmf_fc_poller_api_queue_sync(void *arg) +{ + struct spdk_nvmf_fc_poller_api_queue_sync_args *args = arg; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, + "HWQP sync requested for u_id = 0x%lx\n", args->u_id); + + /* Add this args to hwqp sync_cb list */ + TAILQ_INSERT_TAIL(&args->hwqp->sync_cbs, args, link); +} + +static void +nvmf_fc_poller_api_queue_sync_done(void *arg) +{ + struct spdk_nvmf_fc_poller_api_queue_sync_done_args *args = arg; + struct spdk_nvmf_fc_hwqp *hwqp = args->hwqp; + uint64_t tag = args->tag; + struct spdk_nvmf_fc_poller_api_queue_sync_args *sync_args = NULL, *tmp = NULL; + + assert(args != NULL); + + TAILQ_FOREACH_SAFE(sync_args, &hwqp->sync_cbs, link, tmp) { + if (sync_args->u_id == tag) { + /* Queue successfully synced. 
Remove from cb list */ + TAILQ_REMOVE(&hwqp->sync_cbs, sync_args, link); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, + "HWQP sync done for u_id = 0x%lx\n", sync_args->u_id); + + /* Return the status to poller */ + nvmf_fc_poller_api_perform_cb(&sync_args->cb_info, + SPDK_NVMF_FC_POLLER_API_SUCCESS); + return; + } + } + + free(arg); + /* note: no callback from this api */ +} + +static void +nvmf_fc_poller_api_add_hwqp(void *arg) +{ + struct spdk_nvmf_fc_hwqp *hwqp = (struct spdk_nvmf_fc_hwqp *)arg; + + hwqp->lcore_id = spdk_env_get_current_core(); /* for tracing purposes only */ + TAILQ_INSERT_TAIL(&hwqp->fgroup->hwqp_list, hwqp, link); + /* note: no callback from this api */ +} + +static void +nvmf_fc_poller_api_remove_hwqp(void *arg) +{ + struct spdk_nvmf_fc_hwqp *hwqp = (struct spdk_nvmf_fc_hwqp *)arg; + struct spdk_nvmf_fc_poll_group *fgroup = hwqp->fgroup; + + TAILQ_REMOVE(&fgroup->hwqp_list, hwqp, link); + hwqp->fgroup = NULL; + /* note: no callback from this api */ +} + +enum spdk_nvmf_fc_poller_api_ret +nvmf_fc_poller_api_func(struct spdk_nvmf_fc_hwqp *hwqp, enum spdk_nvmf_fc_poller_api api, + void *api_args) { + switch (api) + { + case SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_add_connection, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_del_connection, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE: + /* quiesce q polling now, don't wait for poller to do it */ + hwqp->state = SPDK_FC_HWQP_OFFLINE; + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_quiesce_queue, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_ACTIVATE_QUEUE: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_activate_queue, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_abts_received, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_request_abort_complete, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_queue_sync, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC_DONE: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_queue_sync_done, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_ADD_HWQP: + spdk_thread_send_msg(hwqp->thread, nvmf_fc_poller_api_add_hwqp, (void *) hwqp); + break; + + case SPDK_NVMF_FC_POLLER_API_REMOVE_HWQP: + spdk_thread_send_msg(hwqp->thread, nvmf_fc_poller_api_remove_hwqp, (void *) hwqp); + break; + + case SPDK_NVMF_FC_POLLER_API_ADAPTER_EVENT: + case SPDK_NVMF_FC_POLLER_API_AEN: + default: + SPDK_ERRLOG("BAD ARG!"); + return SPDK_NVMF_FC_POLLER_API_INVALID_ARG; + } + + return SPDK_NVMF_FC_POLLER_API_SUCCESS; +} + +SPDK_LOG_REGISTER_COMPONENT("nvmf_fc_poller_api", SPDK_LOG_NVMF_FC_POLLER_API) +SPDK_LOG_REGISTER_COMPONENT("nvmf_fc_ls", SPDK_LOG_NVMF_FC_LS) diff --git a/src/spdk/lib/nvmf/nvmf.c b/src/spdk/lib/nvmf/nvmf.c new file mode 100644 index 000000000..73fa0742e --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf.c @@ -0,0 +1,1457 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2018-2019 Mellanox Technologies LTD. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/bit_array.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/nvmf.h" +#include "spdk/trace.h" +#include "spdk/endian.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "nvmf_internal.h" +#include "transport.h" + +SPDK_LOG_REGISTER_COMPONENT("nvmf", SPDK_LOG_NVMF) + +#define SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS 1024 + +static TAILQ_HEAD(, spdk_nvmf_tgt) g_nvmf_tgts = TAILQ_HEAD_INITIALIZER(g_nvmf_tgts); + +typedef void (*nvmf_qpair_disconnect_cpl)(void *ctx, int status); +static void nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf); + +/* supplied to a single call to nvmf_qpair_disconnect */ +struct nvmf_qpair_disconnect_ctx { + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_ctrlr *ctrlr; + nvmf_qpair_disconnect_cb cb_fn; + struct spdk_thread *thread; + void *ctx; + uint16_t qid; +}; + +/* + * There are several times when we need to iterate through the list of all qpairs and selectively delete them. + * In order to do this sequentially without overlap, we must provide a context to recover the next qpair from + * to enable calling nvmf_qpair_disconnect on the next desired qpair. + */ +struct nvmf_qpair_disconnect_many_ctx { + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + spdk_nvmf_poll_group_mod_done cpl_fn; + void *cpl_ctx; +}; + +static void +nvmf_qpair_set_state(struct spdk_nvmf_qpair *qpair, + enum spdk_nvmf_qpair_state state) +{ + assert(qpair != NULL); + assert(qpair->group->thread == spdk_get_thread()); + + qpair->state = state; +} + +static int +nvmf_poll_group_poll(void *ctx) +{ + struct spdk_nvmf_poll_group *group = ctx; + int rc; + int count = 0; + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + rc = nvmf_transport_poll_group_poll(tgroup); + if (rc < 0) { + return SPDK_POLLER_BUSY; + } + count += rc; + } + + return count > 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + +static int +nvmf_tgt_create_poll_group(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_poll_group *group = ctx_buf; + struct spdk_nvmf_transport *transport; + uint32_t sid; + + TAILQ_INIT(&group->tgroups); + TAILQ_INIT(&group->qpairs); + + TAILQ_FOREACH(transport, &tgt->transports, link) { + nvmf_poll_group_add_transport(group, transport); + } + + group->num_sgroups = tgt->max_subsystems; + group->sgroups = calloc(tgt->max_subsystems, sizeof(struct spdk_nvmf_subsystem_poll_group)); + if (!group->sgroups) { + return -ENOMEM; + } + + for (sid = 0; sid < tgt->max_subsystems; sid++) { + struct spdk_nvmf_subsystem *subsystem; + + subsystem = tgt->subsystems[sid]; + if (!subsystem) { + continue; + } + + if (nvmf_poll_group_add_subsystem(group, subsystem, NULL, NULL) != 0) { + nvmf_tgt_destroy_poll_group(io_device, ctx_buf); + return -1; + } + } + + pthread_mutex_lock(&tgt->mutex); + TAILQ_INSERT_TAIL(&tgt->poll_groups, group, link); + pthread_mutex_unlock(&tgt->mutex); + + group->poller = SPDK_POLLER_REGISTER(nvmf_poll_group_poll, group, 0); + group->thread = spdk_get_thread(); + + return 0; +} + +static void +nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_poll_group *group = ctx_buf; + struct spdk_nvmf_transport_poll_group *tgroup, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + uint32_t sid, nsid; + + pthread_mutex_lock(&tgt->mutex); + TAILQ_REMOVE(&tgt->poll_groups, group, link); + pthread_mutex_unlock(&tgt->mutex); + + TAILQ_FOREACH_SAFE(tgroup, &group->tgroups, link, tmp) { + TAILQ_REMOVE(&group->tgroups, tgroup, link); + nvmf_transport_poll_group_destroy(tgroup); + } + + for (sid = 0; sid < group->num_sgroups; sid++) { + sgroup = &group->sgroups[sid]; + + for (nsid = 0; nsid < sgroup->num_ns; nsid++) { + if (sgroup->ns_info[nsid].channel) { + spdk_put_io_channel(sgroup->ns_info[nsid].channel); + sgroup->ns_info[nsid].channel = NULL; + } + } + + free(sgroup->ns_info); + } + + free(group->sgroups); + + if (group->destroy_cb_fn) { + group->destroy_cb_fn(group->destroy_cb_arg, 0); + } +} + +static void +_nvmf_tgt_disconnect_next_qpair(void *ctx) +{ + struct spdk_nvmf_qpair *qpair; + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_poll_group *group = qpair_ctx->group; + struct spdk_io_channel *ch; + int rc = 0; + + qpair = TAILQ_FIRST(&group->qpairs); + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_tgt_disconnect_next_qpair, ctx); + } + + if (!qpair || rc != 0) { + /* When the refcount from the channels reaches 0, nvmf_tgt_destroy_poll_group will be called. 
*/ + ch = spdk_io_channel_from_ctx(group); + spdk_put_io_channel(ch); + free(qpair_ctx); + } +} + +static void +nvmf_tgt_destroy_poll_group_qpairs(struct spdk_nvmf_poll_group *group) +{ + struct nvmf_qpair_disconnect_many_ctx *ctx; + + ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx)); + + if (!ctx) { + SPDK_ERRLOG("Failed to allocate memory for destroy poll group ctx\n"); + return; + } + + spdk_poller_unregister(&group->poller); + + ctx->group = group; + _nvmf_tgt_disconnect_next_qpair(ctx); +} + +struct spdk_nvmf_tgt * +spdk_nvmf_tgt_create(struct spdk_nvmf_target_opts *opts) +{ + struct spdk_nvmf_tgt *tgt, *tmp_tgt; + + if (strnlen(opts->name, NVMF_TGT_NAME_MAX_LENGTH) == NVMF_TGT_NAME_MAX_LENGTH) { + SPDK_ERRLOG("Provided target name exceeds the max length of %u.\n", NVMF_TGT_NAME_MAX_LENGTH); + return NULL; + } + + TAILQ_FOREACH(tmp_tgt, &g_nvmf_tgts, link) { + if (!strncmp(opts->name, tmp_tgt->name, NVMF_TGT_NAME_MAX_LENGTH)) { + SPDK_ERRLOG("Provided target name must be unique.\n"); + return NULL; + } + } + + tgt = calloc(1, sizeof(*tgt)); + if (!tgt) { + return NULL; + } + + snprintf(tgt->name, NVMF_TGT_NAME_MAX_LENGTH, "%s", opts->name); + + if (!opts || !opts->max_subsystems) { + tgt->max_subsystems = SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS; + } else { + tgt->max_subsystems = opts->max_subsystems; + } + + tgt->discovery_genctr = 0; + TAILQ_INIT(&tgt->transports); + TAILQ_INIT(&tgt->poll_groups); + + tgt->subsystems = calloc(tgt->max_subsystems, sizeof(struct spdk_nvmf_subsystem *)); + if (!tgt->subsystems) { + free(tgt); + return NULL; + } + + pthread_mutex_init(&tgt->mutex, NULL); + + TAILQ_INSERT_HEAD(&g_nvmf_tgts, tgt, link); + + spdk_io_device_register(tgt, + nvmf_tgt_create_poll_group, + nvmf_tgt_destroy_poll_group, + sizeof(struct spdk_nvmf_poll_group), + tgt->name); + + return tgt; +} + +static void +nvmf_tgt_destroy_cb(void *io_device) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_transport *transport, *transport_tmp; + spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn; + void *destroy_cb_arg; + uint32_t i; + + if (tgt->subsystems) { + for (i = 0; i < tgt->max_subsystems; i++) { + if (tgt->subsystems[i]) { + nvmf_subsystem_remove_all_listeners(tgt->subsystems[i], true); + spdk_nvmf_subsystem_destroy(tgt->subsystems[i]); + } + } + free(tgt->subsystems); + } + + TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, transport_tmp) { + TAILQ_REMOVE(&tgt->transports, transport, link); + spdk_nvmf_transport_destroy(transport); + } + + destroy_cb_fn = tgt->destroy_cb_fn; + destroy_cb_arg = tgt->destroy_cb_arg; + + free(tgt); + + if (destroy_cb_fn) { + destroy_cb_fn(destroy_cb_arg, 0); + } +} + +void +spdk_nvmf_tgt_destroy(struct spdk_nvmf_tgt *tgt, + spdk_nvmf_tgt_destroy_done_fn cb_fn, + void *cb_arg) +{ + tgt->destroy_cb_fn = cb_fn; + tgt->destroy_cb_arg = cb_arg; + + TAILQ_REMOVE(&g_nvmf_tgts, tgt, link); + + spdk_io_device_unregister(tgt, nvmf_tgt_destroy_cb); +} + +const char * +spdk_nvmf_tgt_get_name(struct spdk_nvmf_tgt *tgt) +{ + return tgt->name; +} + +struct spdk_nvmf_tgt * +spdk_nvmf_get_tgt(const char *name) +{ + struct spdk_nvmf_tgt *tgt; + uint32_t num_targets = 0; + + TAILQ_FOREACH(tgt, &g_nvmf_tgts, link) { + if (name) { + if (!strncmp(tgt->name, name, NVMF_TGT_NAME_MAX_LENGTH)) { + return tgt; + } + } + num_targets++; + } + + /* + * special case. If there is only one target and + * no name was specified, return the only available + * target. If there is more than one target, name must + * be specified. 
+ */ + if (!name && num_targets == 1) { + return TAILQ_FIRST(&g_nvmf_tgts); + } + + return NULL; +} + +struct spdk_nvmf_tgt * +spdk_nvmf_get_first_tgt(void) +{ + return TAILQ_FIRST(&g_nvmf_tgts); +} + +struct spdk_nvmf_tgt * +spdk_nvmf_get_next_tgt(struct spdk_nvmf_tgt *prev) +{ + return TAILQ_NEXT(prev, link); +} + +static void +nvmf_write_subsystem_config_json(struct spdk_json_write_ctx *w, + struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_host *host; + struct spdk_nvmf_subsystem_listener *listener; + const struct spdk_nvme_transport_id *trid; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_ns_opts ns_opts; + uint32_t max_namespaces; + char uuid_str[SPDK_UUID_STRING_LEN]; + const char *adrfam; + + if (spdk_nvmf_subsystem_get_type(subsystem) != SPDK_NVMF_SUBTYPE_NVME) { + return; + } + + /* { */ + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_create_subsystem"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_named_bool(w, "allow_any_host", spdk_nvmf_subsystem_get_allow_any_host(subsystem)); + spdk_json_write_named_string(w, "serial_number", spdk_nvmf_subsystem_get_sn(subsystem)); + spdk_json_write_named_string(w, "model_number", spdk_nvmf_subsystem_get_mn(subsystem)); + + max_namespaces = spdk_nvmf_subsystem_get_max_namespaces(subsystem); + if (max_namespaces != 0) { + spdk_json_write_named_uint32(w, "max_namespaces", max_namespaces); + } + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + trid = spdk_nvmf_subsystem_listener_get_trid(listener); + + adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_listener"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + + /* "listen_address" : { */ + spdk_json_write_named_object_begin(w, "listen_address"); + + spdk_json_write_named_string(w, "trtype", trid->trstring); + if (adrfam) { + spdk_json_write_named_string(w, "adrfam", adrfam); + } + + spdk_json_write_named_string(w, "traddr", trid->traddr); + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + /* } "listen_address" */ + spdk_json_write_object_end(w); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } + + for (host = spdk_nvmf_subsystem_get_first_host(subsystem); host != NULL; + host = spdk_nvmf_subsystem_get_next_host(subsystem, host)) { + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_host"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_named_string(w, "host", spdk_nvmf_host_get_nqn(host)); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + spdk_nvmf_ns_get_opts(ns, &ns_opts, sizeof(ns_opts)); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", 
"nvmf_subsystem_add_ns"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + + /* "namespace" : { */ + spdk_json_write_named_object_begin(w, "namespace"); + + spdk_json_write_named_uint32(w, "nsid", spdk_nvmf_ns_get_id(ns)); + spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + + if (!spdk_mem_all_zero(ns_opts.nguid, sizeof(ns_opts.nguid))) { + SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(uint64_t) * 2, "size mismatch"); + spdk_json_write_named_string_fmt(w, "nguid", "%016"PRIX64"%016"PRIX64, from_be64(&ns_opts.nguid[0]), + from_be64(&ns_opts.nguid[8])); + } + + if (!spdk_mem_all_zero(ns_opts.eui64, sizeof(ns_opts.eui64))) { + SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(uint64_t), "size mismatch"); + spdk_json_write_named_string_fmt(w, "eui64", "%016"PRIX64, from_be64(&ns_opts.eui64)); + } + + if (!spdk_mem_all_zero(&ns_opts.uuid, sizeof(ns_opts.uuid))) { + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ns_opts.uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + } + + /* "namespace" */ + spdk_json_write_object_end(w); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } +} + +void +spdk_nvmf_tgt_write_config_json(struct spdk_json_write_ctx *w, struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_transport *transport; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_set_max_subsystems"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_uint32(w, "max_subsystems", tgt->max_subsystems); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + /* write transports */ + TAILQ_FOREACH(transport, &tgt->transports, link) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_create_transport"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "trtype", spdk_nvme_transport_id_trtype_str(transport->ops->type)); + spdk_json_write_named_uint32(w, "max_queue_depth", transport->opts.max_queue_depth); + spdk_json_write_named_uint32(w, "max_io_qpairs_per_ctrlr", + transport->opts.max_qpairs_per_ctrlr - 1); + spdk_json_write_named_uint32(w, "in_capsule_data_size", transport->opts.in_capsule_data_size); + spdk_json_write_named_uint32(w, "max_io_size", transport->opts.max_io_size); + spdk_json_write_named_uint32(w, "io_unit_size", transport->opts.io_unit_size); + spdk_json_write_named_uint32(w, "max_aq_depth", transport->opts.max_aq_depth); + if (transport->ops->type == SPDK_NVME_TRANSPORT_RDMA) { + spdk_json_write_named_uint32(w, "max_srq_depth", transport->opts.max_srq_depth); + } + spdk_json_write_named_uint32(w, "abort_timeout_sec", transport->opts.abort_timeout_sec); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + subsystem = spdk_nvmf_subsystem_get_first(tgt); + while (subsystem) { + nvmf_write_subsystem_config_json(w, subsystem); + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + } +} + +int +spdk_nvmf_tgt_listen(struct spdk_nvmf_tgt *tgt, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_transport *transport; + const char *trtype; + int rc; + + transport = spdk_nvmf_tgt_get_transport(tgt, trid->trstring); + if (!transport) { + trtype = spdk_nvme_transport_id_trtype_str(trid->trtype); + if (trtype != NULL) { + SPDK_ERRLOG("Unable to listen on 
transport %s. The transport must be created first.\n", trtype); + } else { + SPDK_ERRLOG("The specified trtype %d is unknown. Please make sure that it is properly registered.\n", + trid->trtype); + } + + return -EINVAL; + } + + rc = spdk_nvmf_transport_listen(transport, trid); + if (rc < 0) { + SPDK_ERRLOG("Unable to listen on address '%s'\n", trid->traddr); + } + + return rc; +} + +int +spdk_nvmf_tgt_stop_listen(struct spdk_nvmf_tgt *tgt, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_transport *transport; + const char *trtype; + int rc; + + transport = spdk_nvmf_tgt_get_transport(tgt, trid->trstring); + if (!transport) { + trtype = spdk_nvme_transport_id_trtype_str(trid->trtype); + if (trtype != NULL) { + SPDK_ERRLOG("Unable to stop listen on transport %s. The transport must be created first.\n", + trtype); + } else { + SPDK_ERRLOG("The specified trtype %d is unknown. Please make sure that it is properly registered.\n", + trid->trtype); + } + return -EINVAL; + } + + rc = spdk_nvmf_transport_stop_listen(transport, trid); + if (rc < 0) { + SPDK_ERRLOG("Failed to stop listening on address '%s'\n", trid->traddr); + return rc; + } + return 0; +} + +struct spdk_nvmf_tgt_add_transport_ctx { + struct spdk_nvmf_tgt *tgt; + struct spdk_nvmf_transport *transport; + spdk_nvmf_tgt_add_transport_done_fn cb_fn; + void *cb_arg; +}; + +static void +_nvmf_tgt_add_transport_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cb_fn(ctx->cb_arg, status); + + free(ctx); +} + +static void +_nvmf_tgt_add_transport(struct spdk_io_channel_iter *i) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch); + int rc; + + rc = nvmf_poll_group_add_transport(group, ctx->transport); + spdk_for_each_channel_continue(i, rc); +} + +void spdk_nvmf_tgt_add_transport(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_transport *transport, + spdk_nvmf_tgt_add_transport_done_fn cb_fn, + void *cb_arg) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx; + + if (spdk_nvmf_tgt_get_transport(tgt, transport->ops->name)) { + cb_fn(cb_arg, -EEXIST); + return; /* transport already created */ + } + + transport->tgt = tgt; + TAILQ_INSERT_TAIL(&tgt->transports, transport, link); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->tgt = tgt; + ctx->transport = transport; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(tgt, + _nvmf_tgt_add_transport, + ctx, + _nvmf_tgt_add_transport_done); +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_tgt_find_subsystem(struct spdk_nvmf_tgt *tgt, const char *subnqn) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + if (!subnqn) { + return NULL; + } + + /* Ensure that subnqn is null terminated */ + if (!memchr(subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1)) { + SPDK_ERRLOG("Connect SUBNQN is not null terminated\n"); + return NULL; + } + + for (sid = 0; sid < tgt->max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem == NULL) { + continue; + } + + if (strcmp(subnqn, subsystem->subnqn) == 0) { + return subsystem; + } + } + + return NULL; +} + +struct spdk_nvmf_transport * +spdk_nvmf_tgt_get_transport(struct spdk_nvmf_tgt *tgt, const char *transport_name) +{ + struct spdk_nvmf_transport *transport; + + TAILQ_FOREACH(transport, &tgt->transports, link) { + if 
(!strncasecmp(transport->ops->name, transport_name, SPDK_NVMF_TRSTRING_MAX_LEN)) { + return transport; + } + } + return NULL; +} + +struct nvmf_new_qpair_ctx { + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_poll_group *group; +}; + +static void +_nvmf_poll_group_add(void *_ctx) +{ + struct nvmf_new_qpair_ctx *ctx = _ctx; + struct spdk_nvmf_qpair *qpair = ctx->qpair; + struct spdk_nvmf_poll_group *group = ctx->group; + + free(_ctx); + + if (spdk_nvmf_poll_group_add(group, qpair) != 0) { + SPDK_ERRLOG("Unable to add the qpair to a poll group.\n"); + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + } +} + +void +spdk_nvmf_tgt_new_qpair(struct spdk_nvmf_tgt *tgt, struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_poll_group *group; + struct nvmf_new_qpair_ctx *ctx; + + group = spdk_nvmf_get_optimal_poll_group(qpair); + if (group == NULL) { + if (tgt->next_poll_group == NULL) { + tgt->next_poll_group = TAILQ_FIRST(&tgt->poll_groups); + if (tgt->next_poll_group == NULL) { + SPDK_ERRLOG("No poll groups exist.\n"); + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + return; + } + } + group = tgt->next_poll_group; + tgt->next_poll_group = TAILQ_NEXT(group, link); + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + SPDK_ERRLOG("Unable to send message to poll group.\n"); + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + return; + } + + ctx->qpair = qpair; + ctx->group = group; + + spdk_thread_send_msg(group->thread, _nvmf_poll_group_add, ctx); +} + +uint32_t +spdk_nvmf_tgt_accept(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_transport *transport, *tmp; + uint32_t count = 0; + + TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, tmp) { + count += nvmf_transport_accept(transport); + } + + return count; +} + +struct spdk_nvmf_poll_group * +spdk_nvmf_poll_group_create(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_io_channel *ch; + + ch = spdk_get_io_channel(tgt); + if (!ch) { + SPDK_ERRLOG("Unable to get I/O channel for target\n"); + return NULL; + } + + return spdk_io_channel_get_ctx(ch); +} + +void +spdk_nvmf_poll_group_destroy(struct spdk_nvmf_poll_group *group, + spdk_nvmf_poll_group_destroy_done_fn cb_fn, + void *cb_arg) +{ + assert(group->destroy_cb_fn == NULL); + group->destroy_cb_fn = cb_fn; + group->destroy_cb_arg = cb_arg; + + /* This function will put the io_channel associated with this poll group */ + nvmf_tgt_destroy_poll_group_qpairs(group); +} + +int +spdk_nvmf_poll_group_add(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + int rc = -1; + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_INIT(&qpair->outstanding); + qpair->group = group; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + rc = nvmf_transport_poll_group_add(tgroup, qpair); + break; + } + } + + /* We add the qpair to the group only it is succesfully added into the tgroup */ + if (rc == 0) { + TAILQ_INSERT_TAIL(&group->qpairs, qpair, link); + nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ACTIVE); + } + + return rc; +} + +static +void _nvmf_ctrlr_destruct(void *ctx) +{ + struct spdk_nvmf_ctrlr *ctrlr = ctx; + + nvmf_ctrlr_destruct(ctrlr); +} + +static void +_nvmf_transport_qpair_fini(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + + nvmf_transport_qpair_fini(qpair); +} + +static void +_nvmf_ctrlr_free_from_qpair(void *ctx) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx; + struct spdk_nvmf_ctrlr *ctrlr = qpair_ctx->ctrlr; + uint32_t count; + + spdk_bit_array_clear(ctrlr->qpair_mask, qpair_ctx->qid); + count = 
spdk_bit_array_count_set(ctrlr->qpair_mask); + if (count == 0) { + spdk_bit_array_free(&ctrlr->qpair_mask); + + spdk_thread_send_msg(ctrlr->subsys->thread, _nvmf_ctrlr_destruct, ctrlr); + } + + spdk_thread_send_msg(qpair_ctx->thread, _nvmf_transport_qpair_fini, qpair_ctx->qpair); + if (qpair_ctx->cb_fn) { + spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx); + } + free(qpair_ctx); +} + +void +spdk_nvmf_poll_group_remove(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvmf_transport_poll_group *tgroup; + struct spdk_nvmf_request *req, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc; + + nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ERROR); + + /* Find the tgroup and remove the qpair from the tgroup */ + TAILQ_FOREACH(tgroup, &qpair->group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + rc = nvmf_transport_poll_group_remove(tgroup, qpair); + if (rc && (rc != ENOTSUP)) { + SPDK_ERRLOG("Cannot remove qpair=%p from transport group=%p\n", + qpair, tgroup); + } + break; + } + } + + if (ctrlr) { + sgroup = &qpair->group->sgroups[ctrlr->subsys->id]; + TAILQ_FOREACH_SAFE(req, &sgroup->queued, link, tmp) { + if (req->qpair == qpair) { + TAILQ_REMOVE(&sgroup->queued, req, link); + if (nvmf_transport_req_free(req)) { + SPDK_ERRLOG("Transport request free error!\n"); + } + } + } + } + + TAILQ_REMOVE(&qpair->group->qpairs, qpair, link); + qpair->group = NULL; +} + +static void +_nvmf_qpair_destroy(void *ctx, int status) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx; + struct spdk_nvmf_qpair *qpair = qpair_ctx->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + assert(qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING); + qpair_ctx->qid = qpair->qid; + + spdk_nvmf_poll_group_remove(qpair); + + if (!ctrlr || !ctrlr->thread) { + nvmf_transport_qpair_fini(qpair); + if (qpair_ctx->cb_fn) { + spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx); + } + free(qpair_ctx); + return; + } + + qpair_ctx->ctrlr = ctrlr; + spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_free_from_qpair, qpair_ctx); +} + +int +spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_cb cb_fn, void *ctx) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx; + + /* If we get a qpair in the uninitialized state, we can just destroy it immediately */ + if (qpair->state == SPDK_NVMF_QPAIR_UNINITIALIZED) { + nvmf_transport_qpair_fini(qpair); + if (cb_fn) { + cb_fn(ctx); + } + return 0; + } + + /* The queue pair must be disconnected from the thread that owns it */ + assert(qpair->group->thread == spdk_get_thread()); + + if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) { + /* This can occur if the connection is killed by the target, + * which results in a notification that the connection + * died. Send a message to defer the processing of this + * callback. This allows the stack to unwind in the case + * where a bunch of connections are disconnected in + * a loop. 
*/ + if (cb_fn) { + spdk_thread_send_msg(qpair->group->thread, cb_fn, ctx); + } + return 0; + } + + assert(qpair->state == SPDK_NVMF_QPAIR_ACTIVE); + nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_DEACTIVATING); + + qpair_ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_ctx)); + if (!qpair_ctx) { + SPDK_ERRLOG("Unable to allocate context for nvmf_qpair_disconnect\n"); + return -ENOMEM; + } + + qpair_ctx->qpair = qpair; + qpair_ctx->cb_fn = cb_fn; + qpair_ctx->thread = qpair->group->thread; + qpair_ctx->ctx = ctx; + + /* Check for outstanding I/O */ + if (!TAILQ_EMPTY(&qpair->outstanding)) { + qpair->state_cb = _nvmf_qpair_destroy; + qpair->state_cb_arg = qpair_ctx; + nvmf_qpair_free_aer(qpair); + return 0; + } + + _nvmf_qpair_destroy(qpair_ctx, 0); + + return 0; +} + +int +spdk_nvmf_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_transport_qpair_get_peer_trid(qpair, trid); +} + +int +spdk_nvmf_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_transport_qpair_get_local_trid(qpair, trid); +} + +int +spdk_nvmf_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_transport_qpair_get_listen_trid(qpair, trid); +} + +int +nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == transport) { + /* Transport already in the poll group */ + return 0; + } + } + + tgroup = nvmf_transport_poll_group_create(transport); + if (!tgroup) { + SPDK_ERRLOG("Unable to create poll group for transport\n"); + return -1; + } + + tgroup->group = group; + TAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + + return 0; +} + +static int +poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_subsystem_poll_group *sgroup; + uint32_t new_num_ns, old_num_ns; + uint32_t i, j; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_registrant *reg, *tmp; + struct spdk_io_channel *ch; + struct spdk_nvmf_subsystem_pg_ns_info *ns_info; + struct spdk_nvmf_ctrlr *ctrlr; + bool ns_changed; + + /* Make sure our poll group has memory for this subsystem allocated */ + if (subsystem->id >= group->num_sgroups) { + return -ENOMEM; + } + + sgroup = &group->sgroups[subsystem->id]; + + /* Make sure the array of namespace information is the correct size */ + new_num_ns = subsystem->max_nsid; + old_num_ns = sgroup->num_ns; + + ns_changed = false; + + if (old_num_ns == 0) { + if (new_num_ns > 0) { + /* First allocation */ + sgroup->ns_info = calloc(new_num_ns, sizeof(struct spdk_nvmf_subsystem_pg_ns_info)); + if (!sgroup->ns_info) { + return -ENOMEM; + } + } + } else if (new_num_ns > old_num_ns) { + void *buf; + + /* Make the array larger */ + buf = realloc(sgroup->ns_info, new_num_ns * sizeof(struct spdk_nvmf_subsystem_pg_ns_info)); + if (!buf) { + return -ENOMEM; + } + + sgroup->ns_info = buf; + + /* Null out the new namespace information slots */ + for (i = old_num_ns; i < new_num_ns; i++) { + memset(&sgroup->ns_info[i], 0, sizeof(struct spdk_nvmf_subsystem_pg_ns_info)); + } + } else if (new_num_ns < old_num_ns) { + void *buf; + + /* Free the extra I/O channels */ + for (i = new_num_ns; i < old_num_ns; i++) { + ns_info = &sgroup->ns_info[i]; + + if (ns_info->channel) { + spdk_put_io_channel(ns_info->channel); + ns_info->channel = 
NULL; + } + } + + /* Make the array smaller */ + if (new_num_ns > 0) { + buf = realloc(sgroup->ns_info, new_num_ns * sizeof(struct spdk_nvmf_subsystem_pg_ns_info)); + if (!buf) { + return -ENOMEM; + } + sgroup->ns_info = buf; + } else { + free(sgroup->ns_info); + sgroup->ns_info = NULL; + } + } + + sgroup->num_ns = new_num_ns; + + /* Detect bdevs that were added or removed */ + for (i = 0; i < sgroup->num_ns; i++) { + ns = subsystem->ns[i]; + ns_info = &sgroup->ns_info[i]; + ch = ns_info->channel; + + if (ns == NULL && ch == NULL) { + /* Both NULL. Leave empty */ + } else if (ns == NULL && ch != NULL) { + /* There was a channel here, but the namespace is gone. */ + ns_changed = true; + spdk_put_io_channel(ch); + ns_info->channel = NULL; + } else if (ns != NULL && ch == NULL) { + /* A namespace appeared but there is no channel yet */ + ns_changed = true; + ch = spdk_bdev_get_io_channel(ns->desc); + if (ch == NULL) { + SPDK_ERRLOG("Could not allocate I/O channel.\n"); + return -ENOMEM; + } + ns_info->channel = ch; + } else if (spdk_uuid_compare(&ns_info->uuid, spdk_bdev_get_uuid(ns->bdev)) != 0) { + /* A namespace was here before, but was replaced by a new one. */ + ns_changed = true; + spdk_put_io_channel(ns_info->channel); + memset(ns_info, 0, sizeof(*ns_info)); + + ch = spdk_bdev_get_io_channel(ns->desc); + if (ch == NULL) { + SPDK_ERRLOG("Could not allocate I/O channel.\n"); + return -ENOMEM; + } + ns_info->channel = ch; + } else if (ns_info->num_blocks != spdk_bdev_get_num_blocks(ns->bdev)) { + /* Namespace is still there but size has changed */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Namespace resized: subsystem_id %d," + " nsid %u, pg %p, old %lu, new %lu\n", + subsystem->id, + ns->nsid, + group, + ns_info->num_blocks, + spdk_bdev_get_num_blocks(ns->bdev)); + ns_changed = true; + } + + if (ns == NULL) { + memset(ns_info, 0, sizeof(*ns_info)); + } else { + ns_info->uuid = *spdk_bdev_get_uuid(ns->bdev); + ns_info->num_blocks = spdk_bdev_get_num_blocks(ns->bdev); + ns_info->crkey = ns->crkey; + ns_info->rtype = ns->rtype; + if (ns->holder) { + ns_info->holder_id = ns->holder->hostid; + } + + memset(&ns_info->reg_hostid, 0, SPDK_NVMF_MAX_NUM_REGISTRANTS * sizeof(struct spdk_uuid)); + j = 0; + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + if (j >= SPDK_NVMF_MAX_NUM_REGISTRANTS) { + SPDK_ERRLOG("Maximum %u registrants can support.\n", SPDK_NVMF_MAX_NUM_REGISTRANTS); + return -EINVAL; + } + ns_info->reg_hostid[j++] = reg->hostid; + } + } + } + + if (ns_changed) { + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + if (ctrlr->admin_qpair->group == group) { + nvmf_ctrlr_async_event_ns_notice(ctrlr); + } + } + } + + return 0; +} + +int +nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem) +{ + return poll_group_update_subsystem(group, subsystem); +} + +int +nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + int rc = 0; + struct spdk_nvmf_subsystem_poll_group *sgroup = &group->sgroups[subsystem->id]; + + TAILQ_INIT(&sgroup->queued); + + rc = poll_group_update_subsystem(group, subsystem); + if (rc) { + nvmf_poll_group_remove_subsystem(group, subsystem, NULL, NULL); + goto fini; + } + + sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } + + return rc; +} + +static void +_nvmf_poll_group_remove_subsystem_cb(void *ctx, int status) +{ + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = 
ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_subsystem_poll_group *sgroup; + spdk_nvmf_poll_group_mod_done cpl_fn = NULL; + void *cpl_ctx = NULL; + uint32_t nsid; + + group = qpair_ctx->group; + subsystem = qpair_ctx->subsystem; + cpl_fn = qpair_ctx->cpl_fn; + cpl_ctx = qpair_ctx->cpl_ctx; + sgroup = &group->sgroups[subsystem->id]; + + if (status) { + goto fini; + } + + for (nsid = 0; nsid < sgroup->num_ns; nsid++) { + if (sgroup->ns_info[nsid].channel) { + spdk_put_io_channel(sgroup->ns_info[nsid].channel); + sgroup->ns_info[nsid].channel = NULL; + } + } + + sgroup->num_ns = 0; + free(sgroup->ns_info); + sgroup->ns_info = NULL; +fini: + free(qpair_ctx); + if (cpl_fn) { + cpl_fn(cpl_ctx, status); + } +} + +static void +_nvmf_subsystem_disconnect_next_qpair(void *ctx) +{ + struct spdk_nvmf_qpair *qpair; + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + int rc = 0; + + group = qpair_ctx->group; + subsystem = qpair_ctx->subsystem; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if ((qpair->ctrlr != NULL) && (qpair->ctrlr->subsys == subsystem)) { + break; + } + } + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, qpair_ctx); + } + + if (!qpair || rc != 0) { + _nvmf_poll_group_remove_subsystem_cb(ctx, rc); + } + return; +} + +void +nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup; + struct nvmf_qpair_disconnect_many_ctx *ctx; + int rc = 0; + + ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx)); + + if (!ctx) { + SPDK_ERRLOG("Unable to allocate memory for context to remove poll subsystem\n"); + goto fini; + } + + ctx->group = group; + ctx->subsystem = subsystem; + ctx->cpl_fn = cb_fn; + ctx->cpl_ctx = cb_arg; + + sgroup = &group->sgroups[subsystem->id]; + sgroup->state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if ((qpair->ctrlr != NULL) && (qpair->ctrlr->subsys == subsystem)) { + break; + } + } + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, ctx); + } else { + /* call the callback immediately. 
It will handle any channel iteration */ + _nvmf_poll_group_remove_subsystem_cb(ctx, 0); + } + + if (rc != 0) { + free(ctx); + goto fini; + } + + return; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + +void +nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc = 0; + + if (subsystem->id >= group->num_sgroups) { + rc = -1; + goto fini; + } + + sgroup = &group->sgroups[subsystem->id]; + if (sgroup == NULL) { + rc = -1; + goto fini; + } + + assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_ACTIVE); + sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSING; + + if (sgroup->io_outstanding > 0) { + sgroup->cb_fn = cb_fn; + sgroup->cb_arg = cb_arg; + return; + } + + assert(sgroup->io_outstanding == 0); + sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSED; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + +void +nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_request *req, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc = 0; + + if (subsystem->id >= group->num_sgroups) { + rc = -1; + goto fini; + } + + sgroup = &group->sgroups[subsystem->id]; + + assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSED); + + rc = poll_group_update_subsystem(group, subsystem); + if (rc) { + goto fini; + } + + sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + + /* Release all queued requests */ + TAILQ_FOREACH_SAFE(req, &sgroup->queued, link, tmp) { + TAILQ_REMOVE(&sgroup->queued, req, link); + spdk_nvmf_request_exec(req); + } +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + + +struct spdk_nvmf_poll_group * +spdk_nvmf_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_transport_poll_group *tgroup; + + tgroup = nvmf_transport_get_optimal_poll_group(qpair->transport, qpair); + + if (tgroup == NULL) { + return NULL; + } + + return tgroup->group; +} + +int +spdk_nvmf_poll_group_get_stat(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_poll_group_stat *stat) +{ + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + + if (tgt == NULL || stat == NULL) { + return -EINVAL; + } + + ch = spdk_get_io_channel(tgt); + group = spdk_io_channel_get_ctx(ch); + *stat = group->stat; + spdk_put_io_channel(ch); + return 0; +} diff --git a/src/spdk/lib/nvmf/nvmf_fc.h b/src/spdk/lib/nvmf/nvmf_fc.h new file mode 100644 index 000000000..10d3ef9cf --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_fc.h @@ -0,0 +1,999 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2018-2019 Broadcom. All Rights Reserved. + * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVMF_FC_H__ +#define __NVMF_FC_H__ + +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include "spdk/assert.h" +#include "spdk/nvme_spec.h" +#include "spdk/nvmf_fc_spec.h" +#include "spdk/thread.h" +#include "nvmf_internal.h" + +#define SPDK_NVMF_FC_TR_ADDR_LEN 64 +#define NVMF_FC_INVALID_CONN_ID UINT64_MAX + +#define SPDK_FC_HW_DUMP_REASON_STR_MAX_SIZE 256 +#define SPDK_MAX_NUM_OF_FC_PORTS 32 +#define SPDK_NVMF_PORT_ID_MAX_LEN 32 + +/* + * FC HWQP pointer + */ +typedef void *spdk_nvmf_fc_lld_hwqp_t; + +/* + * FC HW port states. + */ +enum spdk_fc_port_state { + SPDK_FC_PORT_OFFLINE = 0, + SPDK_FC_PORT_ONLINE = 1, + SPDK_FC_PORT_QUIESCED = 2, +}; + +enum spdk_fc_hwqp_state { + SPDK_FC_HWQP_OFFLINE = 0, + SPDK_FC_HWQP_ONLINE = 1, +}; + +/* + * NVMF FC Object state + * Add all the generic states of the object here. + * Specific object states can be added separately + */ +enum spdk_nvmf_fc_object_state { + SPDK_NVMF_FC_OBJECT_CREATED = 0, + SPDK_NVMF_FC_OBJECT_TO_BE_DELETED = 1, + SPDK_NVMF_FC_OBJECT_ZOMBIE = 2, /* Partial Create or Delete */ +}; + +/* + * FC request state + */ +enum spdk_nvmf_fc_request_state { + SPDK_NVMF_FC_REQ_INIT = 0, + SPDK_NVMF_FC_REQ_READ_BDEV, + SPDK_NVMF_FC_REQ_READ_XFER, + SPDK_NVMF_FC_REQ_READ_RSP, + SPDK_NVMF_FC_REQ_WRITE_BUFFS, + SPDK_NVMF_FC_REQ_WRITE_XFER, + SPDK_NVMF_FC_REQ_WRITE_BDEV, + SPDK_NVMF_FC_REQ_WRITE_RSP, + SPDK_NVMF_FC_REQ_NONE_BDEV, + SPDK_NVMF_FC_REQ_NONE_RSP, + SPDK_NVMF_FC_REQ_SUCCESS, + SPDK_NVMF_FC_REQ_FAILED, + SPDK_NVMF_FC_REQ_ABORTED, + SPDK_NVMF_FC_REQ_BDEV_ABORTED, + SPDK_NVMF_FC_REQ_PENDING, + SPDK_NVMF_FC_REQ_MAX_STATE, +}; + +/* + * Generic DMA buffer descriptor + */ +struct spdk_nvmf_fc_buffer_desc { + void *virt; + uint64_t phys; + size_t len; + + /* Internal */ + uint32_t buf_index; +}; + +/* + * ABTS hadling context + */ +struct spdk_nvmf_fc_abts_ctx { + bool handled; + uint16_t hwqps_responded; + uint16_t rpi; + uint16_t oxid; + uint16_t rxid; + struct spdk_nvmf_fc_nport *nport; + uint16_t nport_hdl; + uint8_t port_hdl; + void *abts_poller_args; + void *sync_poller_args; + int num_hwqps; + bool queue_synced; + uint64_t u_id; + struct spdk_nvmf_fc_hwqp *ls_hwqp; + uint16_t fcp_rq_id; +}; + +/* + * NVME FC transport errors + */ +struct spdk_nvmf_fc_errors { + uint32_t no_xchg; + uint32_t nport_invalid; + uint32_t unknown_frame; + uint32_t wqe_cmplt_err; + uint32_t wqe_write_err; + uint32_t rq_status_err; + uint32_t rq_buf_len_err; + uint32_t rq_id_err; + uint32_t rq_index_err; + uint32_t invalid_cq_type; + uint32_t invalid_cq_id; + uint32_t fc_req_buf_err; + uint32_t buf_alloc_err; + uint32_t unexpected_err; + uint32_t nvme_cmd_iu_err; + uint32_t nvme_cmd_xfer_err; + uint32_t queue_entry_invalid; + uint32_t invalid_conn_err; 
+ uint32_t fcp_rsp_failure; + uint32_t write_failed; + uint32_t read_failed; + uint32_t rport_invalid; + uint32_t num_aborted; + uint32_t num_abts_sent; +}; + +/* + * Send Single Request/Response Sequence. + */ +struct spdk_nvmf_fc_srsr_bufs { + void *rqst; + size_t rqst_len; + void *rsp; + size_t rsp_len; + uint16_t rpi; +}; + +/* + * Struct representing a nport + */ +struct spdk_nvmf_fc_nport { + + uint16_t nport_hdl; + uint8_t port_hdl; + uint32_t d_id; + enum spdk_nvmf_fc_object_state nport_state; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + + /* list of remote ports (i.e. initiators) connected to nport */ + TAILQ_HEAD(, spdk_nvmf_fc_remote_port_info) rem_port_list; + uint32_t rport_count; + + void *vendor_data; /* available for vendor use */ + + /* list of associations to nport */ + TAILQ_HEAD(, spdk_nvmf_fc_association) fc_associations; + uint32_t assoc_count; + struct spdk_nvmf_fc_port *fc_port; + TAILQ_ENTRY(spdk_nvmf_fc_nport) link; /* list of nports on a hw port. */ +}; + +/* + * NVMF FC Connection + */ +struct spdk_nvmf_fc_conn { + struct spdk_nvmf_qpair qpair; + struct spdk_nvme_transport_id trid; + + uint64_t conn_id; + struct spdk_nvmf_fc_hwqp *hwqp; + uint16_t esrp_ratio; + uint16_t rsp_count; + uint32_t rsn; + + /* The maximum number of I/O outstanding on this connection at one time */ + uint16_t max_queue_depth; + uint16_t max_rw_depth; + /* The current number of I/O outstanding on this connection. This number + * includes all I/O from the time the capsule is first received until it is + * completed. + */ + uint16_t cur_queue_depth; + + /* number of read/write requests that are outstanding */ + uint16_t cur_fc_rw_depth; + + struct spdk_nvmf_fc_association *fc_assoc; + + uint16_t rpi; + + /* for association's connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_link; + + /* for assocations's available connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_avail_link; + + /* for hwqp's connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) link; + + /* New QP create context. */ + struct nvmf_fc_ls_op_ctx *create_opd; +}; + +/* + * Structure for maintaining the FC exchanges + */ +struct spdk_nvmf_fc_xchg { + uint32_t xchg_id; /* The actual xchg identifier */ + + /* Internal */ + TAILQ_ENTRY(spdk_nvmf_fc_xchg) link; + bool active; + bool aborted; + bool send_abts; /* Valid if is_aborted is set. 
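Here "is_aborted" refers to the 'aborted' flag declared above.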
*/ +}; + +/* + * FC poll group structure + */ +struct spdk_nvmf_fc_poll_group { + struct spdk_nvmf_transport_poll_group group; + struct spdk_nvmf_tgt *nvmf_tgt; + uint32_t hwqp_count; /* number of hwqp's assigned to this pg */ + TAILQ_HEAD(, spdk_nvmf_fc_hwqp) hwqp_list; + + TAILQ_ENTRY(spdk_nvmf_fc_poll_group) link; +}; + +/* + * HWQP poller structure passed from Master thread + */ +struct spdk_nvmf_fc_hwqp { + enum spdk_fc_hwqp_state state; /* queue state (for poller) */ + uint32_t lcore_id; /* core hwqp is running on (for tracing purposes only) */ + struct spdk_thread *thread; /* thread hwqp is running on */ + uint32_t hwqp_id; /* A unique id (per physical port) for a hwqp */ + uint32_t rq_size; /* receive queue size */ + spdk_nvmf_fc_lld_hwqp_t queues; /* vendor HW queue set */ + struct spdk_nvmf_fc_port *fc_port; /* HW port structure for these queues */ + struct spdk_nvmf_fc_poll_group *fgroup; + + /* qpair (fc_connection) list */ + TAILQ_HEAD(, spdk_nvmf_fc_conn) connection_list; + uint32_t num_conns; /* number of connections to queue */ + + struct spdk_nvmf_fc_request *fc_reqs_buf; + TAILQ_HEAD(, spdk_nvmf_fc_request) free_reqs; + TAILQ_HEAD(, spdk_nvmf_fc_request) in_use_reqs; + + struct spdk_nvmf_fc_errors counters; + + /* Pending LS request waiting for FC resource */ + TAILQ_HEAD(, spdk_nvmf_fc_ls_rqst) ls_pending_queue; + + /* Sync req list */ + TAILQ_HEAD(, spdk_nvmf_fc_poller_api_queue_sync_args) sync_cbs; + + TAILQ_ENTRY(spdk_nvmf_fc_hwqp) link; + + void *context; /* Vendor specific context data */ +}; + +/* + * FC HW port. + */ +struct spdk_nvmf_fc_port { + uint8_t port_hdl; + enum spdk_fc_port_state hw_port_status; + uint16_t fcp_rq_id; + struct spdk_nvmf_fc_hwqp ls_queue; + + uint32_t num_io_queues; + struct spdk_nvmf_fc_hwqp *io_queues; + /* + * List of nports on this HW port. 
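+	 * Entries are added and removed with nvmf_fc_port_add_nport() and
+	 * nvmf_fc_port_remove_nport(), declared later in this header.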
+ */ + TAILQ_HEAD(, spdk_nvmf_fc_nport)nport_list; + int num_nports; + TAILQ_ENTRY(spdk_nvmf_fc_port) link; + + struct spdk_mempool *io_resource_pool; /* Pools to store bdev_io's for this port */ + void *port_ctx; +}; + +/* + * NVMF FC Request + */ +struct spdk_nvmf_fc_request { + struct spdk_nvmf_request req; + struct spdk_nvmf_fc_ersp_iu ersp; + uint32_t poller_lcore; /* for tracing purposes only */ + struct spdk_thread *poller_thread; + uint16_t buf_index; + struct spdk_nvmf_fc_xchg *xchg; + uint16_t oxid; + uint16_t rpi; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp; + int state; + uint32_t transfered_len; + bool is_aborted; + uint32_t magic; + uint32_t s_id; + uint32_t d_id; + TAILQ_ENTRY(spdk_nvmf_fc_request) link; + STAILQ_ENTRY(spdk_nvmf_fc_request) pending_link; + TAILQ_HEAD(, spdk_nvmf_fc_caller_ctx) abort_cbs; +}; + +SPDK_STATIC_ASSERT(!offsetof(struct spdk_nvmf_fc_request, req), + "FC request and NVMF request address don't match."); + + +/* + * NVMF FC Association + */ +struct spdk_nvmf_fc_association { + uint64_t assoc_id; + uint32_t s_id; + struct spdk_nvmf_fc_nport *tgtport; + struct spdk_nvmf_fc_remote_port_info *rport; + struct spdk_nvmf_subsystem *subsystem; + enum spdk_nvmf_fc_object_state assoc_state; + + char host_id[FCNVME_ASSOC_HOSTID_LEN]; + char host_nqn[SPDK_NVME_NQN_FIELD_SIZE]; + char sub_nqn[SPDK_NVME_NQN_FIELD_SIZE]; + + struct spdk_nvmf_fc_conn *aq_conn; /* connection for admin queue */ + + uint16_t conn_count; + TAILQ_HEAD(, spdk_nvmf_fc_conn) fc_conns; + + void *conns_buf; + TAILQ_HEAD(, spdk_nvmf_fc_conn) avail_fc_conns; + + TAILQ_ENTRY(spdk_nvmf_fc_association) link; + + /* for port's association free list */ + TAILQ_ENTRY(spdk_nvmf_fc_association) port_free_assoc_list_link; + + void *ls_del_op_ctx; /* delete assoc. 
callback list */ + + /* disconnect cmd buffers (sent to initiator) */ + struct spdk_nvmf_fc_srsr_bufs *snd_disconn_bufs; +}; + +/* + * FC Remote Port + */ +struct spdk_nvmf_fc_remote_port_info { + uint32_t s_id; + uint32_t rpi; + uint32_t assoc_count; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + enum spdk_nvmf_fc_object_state rport_state; + TAILQ_ENTRY(spdk_nvmf_fc_remote_port_info) link; +}; + +/* + * Poller API error codes + */ +enum spdk_nvmf_fc_poller_api_ret { + SPDK_NVMF_FC_POLLER_API_SUCCESS = 0, + SPDK_NVMF_FC_POLLER_API_ERROR, + SPDK_NVMF_FC_POLLER_API_INVALID_ARG, + SPDK_NVMF_FC_POLLER_API_NO_CONN_ID, + SPDK_NVMF_FC_POLLER_API_DUP_CONN_ID, + SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND, +}; + +/* + * Poller API definitions + */ +enum spdk_nvmf_fc_poller_api { + SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION, + SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION, + SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE, + SPDK_NVMF_FC_POLLER_API_ACTIVATE_QUEUE, + SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED, + SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE, + SPDK_NVMF_FC_POLLER_API_ADAPTER_EVENT, + SPDK_NVMF_FC_POLLER_API_AEN, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC_DONE, + SPDK_NVMF_FC_POLLER_API_ADD_HWQP, + SPDK_NVMF_FC_POLLER_API_REMOVE_HWQP, +}; + +/* + * Poller API callback function proto + */ +typedef void (*spdk_nvmf_fc_poller_api_cb)(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret); + +/* + * Poller API callback data + */ +struct spdk_nvmf_fc_poller_api_cb_info { + struct spdk_thread *cb_thread; + spdk_nvmf_fc_poller_api_cb cb_func; + void *cb_data; + enum spdk_nvmf_fc_poller_api_ret ret; +}; + +/* + * Poller API structures + */ +struct spdk_nvmf_fc_poller_api_add_connection_args { + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_del_connection_args { + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + bool send_abts; + /* internal */ + int fc_request_cnt; + bool backend_initiated; +}; + +struct spdk_nvmf_fc_poller_api_quiesce_queue_args { + void *ctx; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_activate_queue_args { + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_abts_recvd_args { + struct spdk_nvmf_fc_abts_ctx *ctx; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_queue_sync_done_args { + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + uint64_t tag; +}; + +/* + * NVMF LS request structure + */ +struct spdk_nvmf_fc_ls_rqst { + struct spdk_nvmf_fc_buffer_desc rqstbuf; + struct spdk_nvmf_fc_buffer_desc rspbuf; + uint32_t rqst_len; + uint32_t rsp_len; + uint32_t rpi; + struct spdk_nvmf_fc_xchg *xchg; + uint16_t oxid; + void *private_data; /* for LLD only (LS does not touch) */ + TAILQ_ENTRY(spdk_nvmf_fc_ls_rqst) ls_pending_link; + uint32_t s_id; + uint32_t d_id; + struct spdk_nvmf_fc_nport *nport; + struct spdk_nvmf_fc_remote_port_info *rport; + struct spdk_nvmf_tgt *nvmf_tgt; +}; + +/* + * RQ Buffer LS Overlay Structure + */ +#define FCNVME_LS_RSVD_SIZE (FCNVME_MAX_LS_BUFFER_SIZE - \ + (sizeof(struct spdk_nvmf_fc_ls_rqst) + FCNVME_MAX_LS_REQ_SIZE + FCNVME_MAX_LS_RSP_SIZE)) + +struct spdk_nvmf_fc_rq_buf_ls_request { + uint8_t rqst[FCNVME_MAX_LS_REQ_SIZE]; 
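+	/* the LS response area follows the request area within the same RQ buffer */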
+ uint8_t resp[FCNVME_MAX_LS_RSP_SIZE]; + struct spdk_nvmf_fc_ls_rqst ls_rqst; + uint8_t rsvd[FCNVME_LS_RSVD_SIZE]; +}; + +SPDK_STATIC_ASSERT(sizeof(struct spdk_nvmf_fc_rq_buf_ls_request) == + FCNVME_MAX_LS_BUFFER_SIZE, "LS RQ Buffer overflow"); + +/* Poller API structures (arguments and callback data */ +typedef void (*spdk_nvmf_fc_del_assoc_cb)(void *arg, uint32_t err); + +struct spdk_nvmf_fc_ls_add_conn_api_data { + struct spdk_nvmf_fc_poller_api_add_connection_args args; + struct spdk_nvmf_fc_ls_rqst *ls_rqst; + struct spdk_nvmf_fc_association *assoc; + bool aq_conn; /* true if adding connection for new association */ +}; + +/* Disconnect (connection) request functions */ +struct spdk_nvmf_fc_ls_del_conn_api_data { + struct spdk_nvmf_fc_poller_api_del_connection_args args; + struct spdk_nvmf_fc_ls_rqst *ls_rqst; + struct spdk_nvmf_fc_association *assoc; + bool aq_conn; /* true if deleting AQ connection */ +}; + +/* used by LS disconnect association cmd handling */ +struct spdk_nvmf_fc_ls_disconn_assoc_api_data { + struct spdk_nvmf_fc_nport *tgtport; + struct spdk_nvmf_fc_ls_rqst *ls_rqst; +}; + +/* used by delete association call */ +struct spdk_nvmf_fc_delete_assoc_api_data { + struct spdk_nvmf_fc_poller_api_del_connection_args args; + struct spdk_nvmf_fc_association *assoc; + bool from_ls_rqst; /* true = request came for LS */ + spdk_nvmf_fc_del_assoc_cb del_assoc_cb; + void *del_assoc_cb_data; +}; + +struct nvmf_fc_ls_op_ctx { + union { + struct spdk_nvmf_fc_ls_add_conn_api_data add_conn; + struct spdk_nvmf_fc_ls_del_conn_api_data del_conn; + struct spdk_nvmf_fc_ls_disconn_assoc_api_data disconn_assoc; + struct spdk_nvmf_fc_delete_assoc_api_data del_assoc; + } u; + struct nvmf_fc_ls_op_ctx *next_op_ctx; +}; + +struct spdk_nvmf_fc_poller_api_queue_sync_args { + uint64_t u_id; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + + /* Used internally by poller */ + TAILQ_ENTRY(spdk_nvmf_fc_poller_api_queue_sync_args) link; +}; + +/** + * Following defines and structures are used to pass messages between master thread + * and FCT driver. + */ +enum spdk_fc_event { + SPDK_FC_HW_PORT_INIT, + SPDK_FC_HW_PORT_ONLINE, + SPDK_FC_HW_PORT_OFFLINE, + SPDK_FC_HW_PORT_RESET, + SPDK_FC_NPORT_CREATE, + SPDK_FC_NPORT_DELETE, + SPDK_FC_IT_ADD, /* PRLI */ + SPDK_FC_IT_DELETE, /* PRLI */ + SPDK_FC_ABTS_RECV, + SPDK_FC_LINK_BREAK, + SPDK_FC_HW_PORT_DUMP, + SPDK_FC_UNRECOVERABLE_ERR, + SPDK_FC_EVENT_MAX, +}; + +/** + * Arguments for to dump assoc id + */ +struct spdk_nvmf_fc_dump_assoc_id_args { + uint8_t pport_handle; + uint16_t nport_handle; + uint32_t assoc_id; +}; + +/** + * Arguments for HW port init event. + */ +struct spdk_nvmf_fc_hw_port_init_args { + uint32_t ls_queue_size; + spdk_nvmf_fc_lld_hwqp_t ls_queue; + uint32_t io_queue_size; + uint32_t io_queue_cnt; + spdk_nvmf_fc_lld_hwqp_t *io_queues; + void *cb_ctx; + void *port_ctx; + uint8_t port_handle; + uint8_t nvme_aq_index; /* io_queue used for nvme admin queue */ + uint16_t fcp_rq_id; /* Base rq ID of SCSI queue */ +}; + +/** + * Arguments for HW port link break event. + */ +struct spdk_nvmf_hw_port_link_break_args { + uint8_t port_handle; + void *cb_ctx; +}; + +/** + * Arguments for HW port online event. + */ +struct spdk_nvmf_fc_hw_port_online_args { + uint8_t port_handle; + void *cb_ctx; +}; + +/** + * Arguments for HW port offline event. + */ +struct spdk_nvmf_fc_hw_port_offline_args { + uint8_t port_handle; + void *cb_ctx; +}; + +/** + * Arguments for n-port add event. 
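+ *
+ * A minimal usage sketch (illustrative only): a low-level driver fills this structure
+ * and queues it to the master thread with nvmf_fc_master_enqueue_event(). The handle
+ * and d_id values below are made up, and lld_event_done stands for a driver-supplied
+ * spdk_nvmf_fc_callback; it assumes the physical port was already set up via
+ * SPDK_FC_HW_PORT_INIT.
+ *
+ *     struct spdk_nvmf_fc_nport_create_args args = { 0 };
+ *
+ *     args.port_handle  = 0;
+ *     args.nport_handle = 1;
+ *     args.d_id         = 0x010203;
+ *     nvmf_fc_master_enqueue_event(SPDK_FC_NPORT_CREATE, &args, lld_event_done);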
+ */ +struct spdk_nvmf_fc_nport_create_args { + uint8_t port_handle; + uint16_t nport_handle; + struct spdk_uuid container_uuid; /* UUID of the nports container */ + struct spdk_uuid nport_uuid; /* Unique UUID for the nport */ + uint32_t d_id; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + uint32_t subsys_id; /* Subsystemid */ + char port_id[SPDK_NVMF_PORT_ID_MAX_LEN]; + void *cb_ctx; +}; + +/** + * Arguments for n-port delete event. + */ +struct spdk_nvmf_fc_nport_delete_args { + uint8_t port_handle; + uint32_t nport_handle; + uint32_t subsys_id; /* Subsystem id */ + void *cb_ctx; +}; + +/** + * Arguments for I_T add event. + */ +struct spdk_nvmf_fc_hw_i_t_add_args { + uint8_t port_handle; + uint32_t nport_handle; + uint16_t itn_handle; + uint32_t rpi; + uint32_t s_id; + uint32_t initiator_prli_info; + uint32_t target_prli_info; /* populated by the SPDK master */ + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + void *cb_ctx; +}; + +/** + * Arguments for I_T delete event. + */ +struct spdk_nvmf_fc_hw_i_t_delete_args { + uint8_t port_handle; + uint32_t nport_handle; + uint16_t itn_handle; /* Only used by FC LLD driver; unused in SPDK */ + uint32_t rpi; + uint32_t s_id; + void *cb_ctx; +}; + +/** + * Arguments for ABTS event. + */ +struct spdk_nvmf_fc_abts_args { + uint8_t port_handle; + uint32_t nport_handle; + uint32_t rpi; + uint16_t oxid, rxid; + void *cb_ctx; +}; + +/** + * Arguments for link break event. + */ +struct spdk_nvmf_fc_link_break_args { + uint8_t port_handle; +}; + +/** + * Arguments for port reset event. + */ +struct spdk_nvmf_fc_hw_port_reset_args { + uint8_t port_handle; + bool dump_queues; + char reason[SPDK_FC_HW_DUMP_REASON_STR_MAX_SIZE]; + uint32_t **dump_buf; + void *cb_ctx; +}; + +/** + * Arguments for unrecoverable error event + */ +struct spdk_nvmf_fc_unrecoverable_error_event_args { +}; + +/** + * Callback function to the FCT driver. + */ +typedef void (*spdk_nvmf_fc_callback)(uint8_t port_handle, + enum spdk_fc_event event_type, + void *arg, int err); + +/** + * Enqueue an FCT event to master thread + * + * \param event_type Type of the event. + * \param args Pointer to the argument structure. + * \param cb_func Callback function into fc driver. + * + * \return 0 on success, non-zero on failure. + */ +int +nvmf_fc_master_enqueue_event(enum spdk_fc_event event_type, + void *args, + spdk_nvmf_fc_callback cb_func); + +/* + * dump info + */ +struct spdk_nvmf_fc_queue_dump_info { + char *buffer; + int offset; +}; +#define SPDK_FC_HW_DUMP_BUF_SIZE (10 * 4096) + +static inline void +nvmf_fc_dump_buf_print(struct spdk_nvmf_fc_queue_dump_info *dump_info, char *fmt, ...) 
+{ + uint64_t buffer_size = SPDK_FC_HW_DUMP_BUF_SIZE; + int32_t avail = (int32_t)(buffer_size - dump_info->offset); + + if (avail > 0) { + va_list ap; + int32_t written; + + va_start(ap, fmt); + written = vsnprintf(dump_info->buffer + dump_info->offset, avail, fmt, ap); + if (written >= avail) { + dump_info->offset += avail; + } else { + dump_info->offset += written; + } + va_end(ap); + } +} + +/* + * NVMF FC caller callback definitions + */ +typedef void (*spdk_nvmf_fc_caller_cb)(void *hwqp, int32_t status, void *args); + +struct spdk_nvmf_fc_caller_ctx { + void *ctx; + spdk_nvmf_fc_caller_cb cb; + void *cb_args; + TAILQ_ENTRY(spdk_nvmf_fc_caller_ctx) link; +}; + +/* + * NVMF FC Exchange Info (for debug) + */ +struct spdk_nvmf_fc_xchg_info { + uint32_t xchg_base; + uint32_t xchg_total_count; + uint32_t xchg_avail_count; + uint32_t send_frame_xchg_id; + uint8_t send_frame_seqid; +}; + +/* + * NVMF FC inline and function prototypes + */ + +static inline struct spdk_nvmf_fc_request * +nvmf_fc_get_fc_req(struct spdk_nvmf_request *req) +{ + return (struct spdk_nvmf_fc_request *) + ((uintptr_t)req - offsetof(struct spdk_nvmf_fc_request, req)); +} + +static inline bool +nvmf_fc_is_port_dead(struct spdk_nvmf_fc_hwqp *hwqp) +{ + switch (hwqp->fc_port->hw_port_status) { + case SPDK_FC_PORT_QUIESCED: + return true; + default: + return false; + } +} + +static inline bool +nvmf_fc_req_in_xfer(struct spdk_nvmf_fc_request *fc_req) +{ + switch (fc_req->state) { + case SPDK_NVMF_FC_REQ_READ_XFER: + case SPDK_NVMF_FC_REQ_READ_RSP: + case SPDK_NVMF_FC_REQ_WRITE_XFER: + case SPDK_NVMF_FC_REQ_WRITE_RSP: + case SPDK_NVMF_FC_REQ_NONE_RSP: + return true; + default: + return false; + } +} + +static inline void +nvmf_fc_create_trid(struct spdk_nvme_transport_id *trid, uint64_t n_wwn, uint64_t p_wwn) +{ + spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_FC); + trid->adrfam = SPDK_NVMF_ADRFAM_FC; + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "none"); + snprintf(trid->traddr, sizeof(trid->traddr), "nn-0x%lx:pn-0x%lx", n_wwn, p_wwn); +} + +void nvmf_fc_ls_init(struct spdk_nvmf_fc_port *fc_port); + +void nvmf_fc_ls_fini(struct spdk_nvmf_fc_port *fc_port); + +void nvmf_fc_handle_ls_rqst(struct spdk_nvmf_fc_ls_rqst *ls_rqst); +void nvmf_fc_ls_add_conn_failure( + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_ls_rqst *ls_rqst, + struct spdk_nvmf_fc_conn *fc_conn, + bool aq_conn); + +void nvmf_fc_init_hwqp(struct spdk_nvmf_fc_port *fc_port, struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_init_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp); + +struct spdk_nvmf_fc_conn *nvmf_fc_hwqp_find_fc_conn(struct spdk_nvmf_fc_hwqp *hwqp, + uint64_t conn_id); + +void nvmf_fc_hwqp_reinit_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp, void *queues_curr); + +struct spdk_nvmf_fc_port *nvmf_fc_port_lookup(uint8_t port_hdl); + +bool nvmf_fc_port_is_offline(struct spdk_nvmf_fc_port *fc_port); + +int nvmf_fc_port_set_offline(struct spdk_nvmf_fc_port *fc_port); + +bool nvmf_fc_port_is_online(struct spdk_nvmf_fc_port *fc_port); + +int nvmf_fc_port_set_online(struct spdk_nvmf_fc_port *fc_port); + +int nvmf_fc_rport_set_state(struct spdk_nvmf_fc_remote_port_info *rport, + enum spdk_nvmf_fc_object_state state); + +void nvmf_fc_port_add(struct spdk_nvmf_fc_port *fc_port); + +int nvmf_fc_port_add_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport); + +int nvmf_fc_port_remove_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport); + +struct spdk_nvmf_fc_nport 
*nvmf_fc_nport_find(uint8_t port_hdl, uint16_t nport_hdl); + +int nvmf_fc_nport_set_state(struct spdk_nvmf_fc_nport *nport, + enum spdk_nvmf_fc_object_state state); + +bool nvmf_fc_nport_add_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port); + +bool nvmf_fc_nport_remove_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port); + +bool nvmf_fc_nport_has_no_rport(struct spdk_nvmf_fc_nport *nport); + +int nvmf_fc_assoc_set_state(struct spdk_nvmf_fc_association *assoc, + enum spdk_nvmf_fc_object_state state); + +int nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport, + uint64_t assoc_id, bool send_abts, bool backend_initiated, + spdk_nvmf_fc_del_assoc_cb del_assoc_cb, + void *cb_data); + +bool nvmf_ctrlr_is_on_nport(uint8_t port_hdl, uint16_t nport_hdl, + struct spdk_nvmf_ctrlr *ctrlr); + +void nvmf_fc_assign_queue_to_master_thread(struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_poll_group_add_hwqp(struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_poll_group_remove_hwqp(struct spdk_nvmf_fc_hwqp *hwqp); + +int nvmf_fc_hwqp_set_online(struct spdk_nvmf_fc_hwqp *hwqp); + +int nvmf_fc_hwqp_set_offline(struct spdk_nvmf_fc_hwqp *hwqp); + +uint32_t nvmf_fc_get_prli_service_params(void); + +void nvmf_fc_handle_abts_frame(struct spdk_nvmf_fc_nport *nport, uint16_t rpi, uint16_t oxid, + uint16_t rxid); + +void nvmf_fc_request_abort(struct spdk_nvmf_fc_request *fc_req, bool send_abts, + spdk_nvmf_fc_caller_cb cb, void *cb_args); + +struct spdk_nvmf_tgt *nvmf_fc_get_tgt(void); + +struct spdk_thread *nvmf_fc_get_master_thread(void); + +/* + * These functions are called by low level FC driver + */ + +static inline struct spdk_nvmf_fc_conn * +nvmf_fc_get_conn(struct spdk_nvmf_qpair *qpair) +{ + return (struct spdk_nvmf_fc_conn *) + ((uintptr_t)qpair - offsetof(struct spdk_nvmf_fc_conn, qpair)); +} + +static inline uint16_t +nvmf_fc_advance_conn_sqhead(struct spdk_nvmf_qpair *qpair) +{ + /* advance sq_head pointer - wrap if needed */ + qpair->sq_head = (qpair->sq_head == qpair->sq_head_max) ? + 0 : (qpair->sq_head + 1); + return qpair->sq_head; +} + +static inline bool +nvmf_fc_use_send_frame(struct spdk_nvmf_request *req) +{ + /* For now use for only keepalives. 
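Only Keep Alive commands arriving on the admin queue (qid 0) take the send-frame path, as checked below.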
*/ + if (req->qpair->qid == 0 && + (req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_KEEP_ALIVE)) { + return true; + } + return false; +} + +enum spdk_nvmf_fc_poller_api_ret nvmf_fc_poller_api_func( + struct spdk_nvmf_fc_hwqp *hwqp, + enum spdk_nvmf_fc_poller_api api, + void *api_args); + +int nvmf_fc_hwqp_process_frame(struct spdk_nvmf_fc_hwqp *hwqp, uint32_t buff_idx, + struct spdk_nvmf_fc_frame_hdr *frame, + struct spdk_nvmf_fc_buffer_desc *buffer, uint32_t plen); + +void nvmf_fc_hwqp_process_pending_reqs(struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_hwqp_process_pending_ls_rqsts(struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_request_set_state(struct spdk_nvmf_fc_request *fc_req, + enum spdk_nvmf_fc_request_state state); + +char *nvmf_fc_request_get_state_str(int state); + +void _nvmf_fc_request_free(struct spdk_nvmf_fc_request *fc_req); + +void nvmf_fc_request_abort_complete(void *arg1); + +bool nvmf_fc_send_ersp_required(struct spdk_nvmf_fc_request *fc_req, + uint32_t rsp_cnt, uint32_t xfer_len); + +int nvmf_fc_handle_rsp(struct spdk_nvmf_fc_request *req); + +#endif diff --git a/src/spdk/lib/nvmf/nvmf_internal.h b/src/spdk/lib/nvmf/nvmf_internal.h new file mode 100644 index 000000000..f1f3837d5 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_internal.h @@ -0,0 +1,371 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __NVMF_INTERNAL_H__ +#define __NVMF_INTERNAL_H__ + +#include "spdk/stdinc.h" + +#include "spdk/likely.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_cmd.h" +#include "spdk/nvmf_transport.h" +#include "spdk/nvmf_spec.h" +#include "spdk/assert.h" +#include "spdk/bdev.h" +#include "spdk/queue.h" +#include "spdk/util.h" +#include "spdk/thread.h" + +#define NVMF_MAX_ASYNC_EVENTS (4) + +enum spdk_nvmf_subsystem_state { + SPDK_NVMF_SUBSYSTEM_INACTIVE = 0, + SPDK_NVMF_SUBSYSTEM_ACTIVATING, + SPDK_NVMF_SUBSYSTEM_ACTIVE, + SPDK_NVMF_SUBSYSTEM_PAUSING, + SPDK_NVMF_SUBSYSTEM_PAUSED, + SPDK_NVMF_SUBSYSTEM_RESUMING, + SPDK_NVMF_SUBSYSTEM_DEACTIVATING, +}; + +struct spdk_nvmf_tgt { + char name[NVMF_TGT_NAME_MAX_LENGTH]; + + pthread_mutex_t mutex; + + uint64_t discovery_genctr; + + uint32_t max_subsystems; + + /* Array of subsystem pointers of size max_subsystems indexed by sid */ + struct spdk_nvmf_subsystem **subsystems; + + TAILQ_HEAD(, spdk_nvmf_transport) transports; + TAILQ_HEAD(, spdk_nvmf_poll_group) poll_groups; + + /* Used for round-robin assignment of connections to poll groups */ + struct spdk_nvmf_poll_group *next_poll_group; + + spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn; + void *destroy_cb_arg; + + TAILQ_ENTRY(spdk_nvmf_tgt) link; +}; + +struct spdk_nvmf_host { + char nqn[SPDK_NVMF_NQN_MAX_LEN + 1]; + TAILQ_ENTRY(spdk_nvmf_host) link; +}; + +struct spdk_nvmf_subsystem_listener { + struct spdk_nvmf_subsystem *subsystem; + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn; + void *cb_arg; + struct spdk_nvme_transport_id *trid; + struct spdk_nvmf_transport *transport; + TAILQ_ENTRY(spdk_nvmf_subsystem_listener) link; +}; + +/* Maximum number of registrants supported per namespace */ +#define SPDK_NVMF_MAX_NUM_REGISTRANTS 16 + +struct spdk_nvmf_registrant_info { + uint64_t rkey; + char host_uuid[SPDK_UUID_STRING_LEN]; +}; + +struct spdk_nvmf_reservation_info { + bool ptpl_activated; + enum spdk_nvme_reservation_type rtype; + uint64_t crkey; + char bdev_uuid[SPDK_UUID_STRING_LEN]; + char holder_uuid[SPDK_UUID_STRING_LEN]; + uint32_t num_regs; + struct spdk_nvmf_registrant_info registrants[SPDK_NVMF_MAX_NUM_REGISTRANTS]; +}; + +struct spdk_nvmf_subsystem_pg_ns_info { + struct spdk_io_channel *channel; + struct spdk_uuid uuid; + /* current reservation key, no reservation if the value is 0 */ + uint64_t crkey; + /* reservation type */ + enum spdk_nvme_reservation_type rtype; + /* Host ID which holds the reservation */ + struct spdk_uuid holder_id; + /* Host ID for the registrants with the namespace */ + struct spdk_uuid reg_hostid[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint64_t num_blocks; +}; + +typedef void(*spdk_nvmf_poll_group_mod_done)(void *cb_arg, int status); + +struct spdk_nvmf_subsystem_poll_group { + /* Array of namespace information for each namespace indexed by nsid - 1 */ + struct spdk_nvmf_subsystem_pg_ns_info *ns_info; + uint32_t num_ns; + + uint64_t io_outstanding; + spdk_nvmf_poll_group_mod_done cb_fn; + void *cb_arg; + + enum spdk_nvmf_subsystem_state state; + + TAILQ_HEAD(, spdk_nvmf_request) queued; +}; + +struct spdk_nvmf_registrant { + TAILQ_ENTRY(spdk_nvmf_registrant) link; + struct spdk_uuid hostid; + /* Registration key */ + uint64_t rkey; +}; + +struct spdk_nvmf_ns { + uint32_t nsid; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_nvmf_ns_opts opts; + /* reservation notificaton mask */ + uint32_t mask; + /* generation code */ + uint32_t gen; + /* registrants head */ + TAILQ_HEAD(, 
spdk_nvmf_registrant) registrants; + /* current reservation key */ + uint64_t crkey; + /* reservation type */ + enum spdk_nvme_reservation_type rtype; + /* current reservation holder, only valid if reservation type can only have one holder */ + struct spdk_nvmf_registrant *holder; + /* Persist Through Power Loss file which contains the persistent reservation */ + char *ptpl_file; + /* Persist Through Power Loss feature is enabled */ + bool ptpl_activated; +}; + +struct spdk_nvmf_ctrlr_feat { + union spdk_nvme_feat_arbitration arbitration; + union spdk_nvme_feat_power_management power_management; + union spdk_nvme_feat_error_recovery error_recovery; + union spdk_nvme_feat_volatile_write_cache volatile_write_cache; + union spdk_nvme_feat_number_of_queues number_of_queues; + union spdk_nvme_feat_write_atomicity write_atomicity; + union spdk_nvme_feat_async_event_configuration async_event_configuration; + union spdk_nvme_feat_keep_alive_timer keep_alive_timer; +}; + +/* + * NVMf reservation notificaton log page. + */ +struct spdk_nvmf_reservation_log { + struct spdk_nvme_reservation_notification_log log; + TAILQ_ENTRY(spdk_nvmf_reservation_log) link; + struct spdk_nvmf_ctrlr *ctrlr; +}; + +/* + * This structure represents an NVMe-oF controller, + * which is like a "session" in networking terms. + */ +struct spdk_nvmf_ctrlr { + uint16_t cntlid; + char hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1]; + struct spdk_nvmf_subsystem *subsys; + + struct spdk_nvmf_ctrlr_data cdata; + + struct spdk_nvmf_registers vcprop; + + struct spdk_nvmf_ctrlr_feat feat; + + struct spdk_nvmf_qpair *admin_qpair; + struct spdk_thread *thread; + struct spdk_bit_array *qpair_mask; + + struct spdk_nvmf_request *aer_req[NVMF_MAX_ASYNC_EVENTS]; + union spdk_nvme_async_event_completion notice_event; + union spdk_nvme_async_event_completion reservation_event; + uint8_t nr_aer_reqs; + struct spdk_uuid hostid; + + uint16_t changed_ns_list_count; + struct spdk_nvme_ns_list changed_ns_list; + uint64_t log_page_count; + uint8_t num_avail_log_pages; + TAILQ_HEAD(log_page_head, spdk_nvmf_reservation_log) log_head; + + /* Time to trigger keep-alive--poller_time = now_tick + period */ + uint64_t last_keep_alive_tick; + struct spdk_poller *keep_alive_poller; + + bool dif_insert_or_strip; + + TAILQ_ENTRY(spdk_nvmf_ctrlr) link; +}; + +struct spdk_nvmf_subsystem { + struct spdk_thread *thread; + uint32_t id; + enum spdk_nvmf_subsystem_state state; + + char subnqn[SPDK_NVMF_NQN_MAX_LEN + 1]; + enum spdk_nvmf_subtype subtype; + uint16_t next_cntlid; + bool allow_any_host; + bool allow_any_listener; + + struct spdk_nvmf_tgt *tgt; + + char sn[SPDK_NVME_CTRLR_SN_LEN + 1]; + char mn[SPDK_NVME_CTRLR_MN_LEN + 1]; + + /* Array of pointers to namespaces of size max_nsid indexed by nsid - 1 */ + struct spdk_nvmf_ns **ns; + uint32_t max_nsid; + /* This is the maximum allowed nsid to a subsystem */ + uint32_t max_allowed_nsid; + + TAILQ_HEAD(, spdk_nvmf_ctrlr) ctrlrs; + TAILQ_HEAD(, spdk_nvmf_host) hosts; + TAILQ_HEAD(, spdk_nvmf_subsystem_listener) listeners; + + TAILQ_ENTRY(spdk_nvmf_subsystem) entries; +}; + +int nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_transport *transport); +int nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem); +int nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void nvmf_poll_group_remove_subsystem(struct 
spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); + +void nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, const char *hostnqn, + struct iovec *iov, + uint32_t iovcnt, uint64_t offset, uint32_t length); + +void nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr); +int nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req); +int nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req); +int nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req); +bool nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr); +bool nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid); + +void nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata, + bool dif_insert_or_strip); +int nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_compare_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_compare_and_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *cmp_req, struct spdk_nvmf_request *write_req); +int nvmf_bdev_ctrlr_write_zeroes_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_flush_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_nvme_passthru_io(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +bool nvmf_bdev_ctrlr_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_nvme_cmd *cmd, + struct spdk_dif_ctx *dif_ctx); + +int nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_subsystem_remove_all_listeners(struct spdk_nvmf_subsystem *subsystem, + bool stop); +struct spdk_nvmf_ctrlr *nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem, + uint16_t cntlid); +struct spdk_nvmf_subsystem_listener *nvmf_subsystem_find_listener( + struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid); +struct spdk_nvmf_listener *nvmf_transport_find_listener( + struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid); + +int nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_ctrlr_async_event_reservation_notification(struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_ns_reservation_request(void *ctx); +void nvmf_ctrlr_reservation_notice_log(struct spdk_nvmf_ctrlr *ctrlr, + 
struct spdk_nvmf_ns *ns, + enum spdk_nvme_reservation_notification_log_page_type type); + +/* + * Abort aer is sent on a per controller basis and sends a completion for the aer to the host. + * This function should be called when attempting to recover in error paths when it is OK for + * the host to send a subsequent AER. + */ +void nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr); + +/* + * Free aer simply frees the rdma resources for the aer without informing the host. + * This function should be called when deleting a qpair when one wants to make sure + * the qpair is completely empty before freeing the request. The reason we free the + * AER without sending a completion is to prevent the host from sending another AER. + */ +void nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair); + +int nvmf_ctrlr_abort_request(struct spdk_nvmf_request *req); + +static inline struct spdk_nvmf_ns * +_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + /* NOTE: This implicitly also checks for 0, since 0 - 1 wraps around to UINT32_MAX. */ + if (spdk_unlikely(nsid - 1 >= subsystem->max_nsid)) { + return NULL; + } + + return subsystem->ns[nsid - 1]; +} + +static inline bool +nvmf_qpair_is_admin_queue(struct spdk_nvmf_qpair *qpair) +{ + return qpair->qid == 0; +} + +#endif /* __NVMF_INTERNAL_H__ */ diff --git a/src/spdk/lib/nvmf/nvmf_rpc.c b/src/spdk/lib/nvmf/nvmf_rpc.c new file mode 100644 index 000000000..5dc9f42f0 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_rpc.c @@ -0,0 +1,2012 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2018-2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/bdev.h" +#include "spdk/log.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/assert.h" + +#include "nvmf_internal.h" + +static int +json_write_hex_str(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + static const char hex_char[16] = "0123456789ABCDEF"; + const uint8_t *buf = data; + char *str, *out; + int rc; + + str = malloc(size * 2 + 1); + if (str == NULL) { + return -1; + } + + out = str; + while (size--) { + unsigned byte = *buf++; + + out[0] = hex_char[(byte >> 4) & 0xF]; + out[1] = hex_char[byte & 0xF]; + + out += 2; + } + *out = '\0'; + + rc = spdk_json_write_string(w, str); + free(str); + + return rc; +} + +static int +hex_nybble_to_num(char c) +{ + if (c >= '0' && c <= '9') { + return c - '0'; + } + + if (c >= 'a' && c <= 'f') { + return c - 'a' + 0xA; + } + + if (c >= 'A' && c <= 'F') { + return c - 'A' + 0xA; + } + + return -1; +} + +static int +hex_byte_to_num(const char *str) +{ + int hi, lo; + + hi = hex_nybble_to_num(str[0]); + if (hi < 0) { + return hi; + } + + lo = hex_nybble_to_num(str[1]); + if (lo < 0) { + return lo; + } + + return hi * 16 + lo; +} + +static int +decode_hex_string_be(const char *str, uint8_t *out, size_t size) +{ + size_t i; + + /* Decode a string in "ABCDEF012345" format to its binary representation */ + for (i = 0; i < size; i++) { + int num = hex_byte_to_num(str); + + if (num < 0) { + /* Invalid hex byte or end of string */ + return -1; + } + + out[i] = (uint8_t)num; + str += 2; + } + + if (i != size || *str != '\0') { + /* Length mismatch */ + return -1; + } + + return 0; +} + +static int +decode_ns_nguid(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + /* 16-byte NGUID */ + rc = decode_hex_string_be(str, out, 16); + } + + free(str); + return rc; +} + +static int +decode_ns_eui64(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + /* 8-byte EUI-64 */ + rc = decode_hex_string_be(str, out, 8); + } + + free(str); + return rc; +} + +static int +decode_ns_uuid(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + rc = spdk_uuid_parse(out, str); + } + + free(str); + return rc; +} + +struct rpc_get_subsystem { + char *tgt_name; +}; + +static const struct spdk_json_object_decoder rpc_get_subsystem_decoders[] = { + {"tgt_name", offsetof(struct rpc_get_subsystem, tgt_name), spdk_json_decode_string, true}, +}; + +static void +dump_nvmf_subsystem(struct spdk_json_write_ctx *w, struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_host *host; + struct spdk_nvmf_subsystem_listener *listener; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_name(w, "subtype"); + if (spdk_nvmf_subsystem_get_type(subsystem) == SPDK_NVMF_SUBTYPE_NVME) { + spdk_json_write_string(w, "NVMe"); + } else { + spdk_json_write_string(w, "Discovery"); + } + + spdk_json_write_named_array_begin(w, "listen_addresses"); + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + const struct spdk_nvme_transport_id *trid; + const char 
*adrfam; + + trid = spdk_nvmf_subsystem_listener_get_trid(listener); + + spdk_json_write_object_begin(w); + adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + if (adrfam == NULL) { + adrfam = "unknown"; + } + /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */ + spdk_json_write_named_string(w, "transport", trid->trstring); + spdk_json_write_named_string(w, "trtype", trid->trstring); + spdk_json_write_named_string(w, "adrfam", adrfam); + spdk_json_write_named_string(w, "traddr", trid->traddr); + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_json_write_named_bool(w, "allow_any_host", + spdk_nvmf_subsystem_get_allow_any_host(subsystem)); + + spdk_json_write_named_array_begin(w, "hosts"); + + for (host = spdk_nvmf_subsystem_get_first_host(subsystem); host != NULL; + host = spdk_nvmf_subsystem_get_next_host(subsystem, host)) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "nqn", spdk_nvmf_host_get_nqn(host)); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + if (spdk_nvmf_subsystem_get_type(subsystem) == SPDK_NVMF_SUBTYPE_NVME) { + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_ns_opts ns_opts; + uint32_t max_namespaces; + + spdk_json_write_named_string(w, "serial_number", spdk_nvmf_subsystem_get_sn(subsystem)); + + spdk_json_write_named_string(w, "model_number", spdk_nvmf_subsystem_get_mn(subsystem)); + + max_namespaces = spdk_nvmf_subsystem_get_max_namespaces(subsystem); + if (max_namespaces != 0) { + spdk_json_write_named_uint32(w, "max_namespaces", max_namespaces); + } + + spdk_json_write_named_array_begin(w, "namespaces"); + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + spdk_nvmf_ns_get_opts(ns, &ns_opts, sizeof(ns_opts)); + spdk_json_write_object_begin(w); + spdk_json_write_named_int32(w, "nsid", spdk_nvmf_ns_get_id(ns)); + spdk_json_write_named_string(w, "bdev_name", + spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + /* NOTE: "name" is kept for compatibility only - new code should use bdev_name. 
*/ + spdk_json_write_named_string(w, "name", + spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + + if (!spdk_mem_all_zero(ns_opts.nguid, sizeof(ns_opts.nguid))) { + spdk_json_write_name(w, "nguid"); + json_write_hex_str(w, ns_opts.nguid, sizeof(ns_opts.nguid)); + } + + if (!spdk_mem_all_zero(ns_opts.eui64, sizeof(ns_opts.eui64))) { + spdk_json_write_name(w, "eui64"); + json_write_hex_str(w, ns_opts.eui64, sizeof(ns_opts.eui64)); + } + + if (!spdk_mem_all_zero(&ns_opts.uuid, sizeof(ns_opts.uuid))) { + char uuid_str[SPDK_UUID_STRING_LEN]; + + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ns_opts.uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + } + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + } + spdk_json_write_object_end(w); +} + +static void +rpc_nvmf_get_subsystems(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_subsystem req = { 0 }; + struct spdk_json_write_ctx *w; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + if (params) { + if (spdk_json_decode_object(params, rpc_get_subsystem_decoders, + SPDK_COUNTOF(rpc_get_subsystem_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + return; + } + } + + tgt = spdk_nvmf_get_tgt(req.tgt_name); + if (!tgt) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + free(req.tgt_name); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + subsystem = spdk_nvmf_subsystem_get_first(tgt); + while (subsystem) { + dump_nvmf_subsystem(w, subsystem); + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + free(req.tgt_name); +} +SPDK_RPC_REGISTER("nvmf_get_subsystems", rpc_nvmf_get_subsystems, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_get_subsystems, get_nvmf_subsystems) + +struct rpc_subsystem_create { + char *nqn; + char *serial_number; + char *model_number; + char *tgt_name; + uint32_t max_namespaces; + bool allow_any_host; +}; + +static const struct spdk_json_object_decoder rpc_subsystem_create_decoders[] = { + {"nqn", offsetof(struct rpc_subsystem_create, nqn), spdk_json_decode_string}, + {"serial_number", offsetof(struct rpc_subsystem_create, serial_number), spdk_json_decode_string, true}, + {"model_number", offsetof(struct rpc_subsystem_create, model_number), spdk_json_decode_string, true}, + {"tgt_name", offsetof(struct rpc_subsystem_create, tgt_name), spdk_json_decode_string, true}, + {"max_namespaces", offsetof(struct rpc_subsystem_create, max_namespaces), spdk_json_decode_uint32, true}, + {"allow_any_host", offsetof(struct rpc_subsystem_create, allow_any_host), spdk_json_decode_bool, true}, +}; + +static void +rpc_nvmf_subsystem_started(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + + if (!status) { + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } else { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Subsystem %s start failed", + subsystem->subnqn); + spdk_nvmf_subsystem_destroy(subsystem); + } +} + +static void +rpc_nvmf_create_subsystem(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + 
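/*
	 * Handler for the "nvmf_create_subsystem" RPC: decode the parameters, create the
	 * subsystem on the requested target, apply the optional SN/MN/allow_any_host
	 * settings and start it asynchronously (rpc_nvmf_subsystem_started sends the
	 * JSON-RPC response). An illustrative request, with parameter names taken from
	 * the decoder table above and made-up values:
	 *
	 *   { "method": "nvmf_create_subsystem",
	 *     "params": { "nqn": "nqn.2016-06.io.spdk:cnode1",
	 *                 "serial_number": "SPDK00000000000001",
	 *                 "allow_any_host": true } }
	 */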
struct rpc_subsystem_create *req; + struct spdk_nvmf_subsystem *subsystem = NULL; + struct spdk_nvmf_tgt *tgt; + int rc = -1; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Memory allocation failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Memory allocation failed"); + return; + } + + if (spdk_json_decode_object(params, rpc_subsystem_create_decoders, + SPDK_COUNTOF(rpc_subsystem_create_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto cleanup; + } + + tgt = spdk_nvmf_get_tgt(req->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find target %s\n", req->tgt_name); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find target %s", req->tgt_name); + goto cleanup; + } + + subsystem = spdk_nvmf_subsystem_create(tgt, req->nqn, SPDK_NVMF_SUBTYPE_NVME, + req->max_namespaces); + if (!subsystem) { + SPDK_ERRLOG("Unable to create subsystem %s\n", req->nqn); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to create subsystem %s", req->nqn); + goto cleanup; + } + + if (req->serial_number) { + if (spdk_nvmf_subsystem_set_sn(subsystem, req->serial_number)) { + SPDK_ERRLOG("Subsystem %s: invalid serial number '%s'\n", req->nqn, req->serial_number); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid SN %s", req->serial_number); + goto cleanup; + } + } + + if (req->model_number) { + if (spdk_nvmf_subsystem_set_mn(subsystem, req->model_number)) { + SPDK_ERRLOG("Subsystem %s: invalid model number '%s'\n", req->nqn, req->model_number); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid MN %s", req->model_number); + goto cleanup; + } + } + + spdk_nvmf_subsystem_set_allow_any_host(subsystem, req->allow_any_host); + + rc = spdk_nvmf_subsystem_start(subsystem, + rpc_nvmf_subsystem_started, + request); + +cleanup: + free(req->nqn); + free(req->tgt_name); + free(req->serial_number); + free(req->model_number); + free(req); + + if (rc && subsystem) { + spdk_nvmf_subsystem_destroy(subsystem); + } +} +SPDK_RPC_REGISTER("nvmf_create_subsystem", rpc_nvmf_create_subsystem, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_create_subsystem, nvmf_subsystem_create) + +struct rpc_delete_subsystem { + char *nqn; + char *tgt_name; +}; + +static void +free_rpc_delete_subsystem(struct rpc_delete_subsystem *r) +{ + free(r->nqn); + free(r->tgt_name); +} + +static void +rpc_nvmf_subsystem_stopped(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + nvmf_subsystem_remove_all_listeners(subsystem, true); + spdk_nvmf_subsystem_destroy(subsystem); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static const struct spdk_json_object_decoder rpc_delete_subsystem_decoders[] = { + {"nqn", offsetof(struct rpc_delete_subsystem, nqn), spdk_json_decode_string}, + {"tgt_name", offsetof(struct rpc_delete_subsystem, tgt_name), spdk_json_decode_string, true}, +}; + +static void +rpc_nvmf_delete_subsystem(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_subsystem req = { 0 }; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + if 
(spdk_json_decode_object(params, rpc_delete_subsystem_decoders, + SPDK_COUNTOF(rpc_delete_subsystem_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.nqn == NULL) { + SPDK_ERRLOG("missing name param\n"); + goto invalid; + } + + tgt = spdk_nvmf_get_tgt(req.tgt_name); + if (!tgt) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + goto invalid_custom_response; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, req.nqn); + if (!subsystem) { + goto invalid; + } + + free_rpc_delete_subsystem(&req); + + spdk_nvmf_subsystem_stop(subsystem, + rpc_nvmf_subsystem_stopped, + request); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +invalid_custom_response: + free_rpc_delete_subsystem(&req); +} +SPDK_RPC_REGISTER("nvmf_delete_subsystem", rpc_nvmf_delete_subsystem, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_delete_subsystem, delete_nvmf_subsystem) + +struct rpc_listen_address { + char *transport; + char *adrfam; + char *traddr; + char *trsvcid; +}; + +#define RPC_MAX_LISTEN_ADDRESSES 255 +#define RPC_MAX_NAMESPACES 255 + +struct rpc_listen_addresses { + size_t num_listen_address; + struct rpc_listen_address addresses[RPC_MAX_LISTEN_ADDRESSES]; +}; + +static const struct spdk_json_object_decoder rpc_listen_address_decoders[] = { + /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */ + {"transport", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true}, + {"trtype", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true}, + {"adrfam", offsetof(struct rpc_listen_address, adrfam), spdk_json_decode_string, true}, + {"traddr", offsetof(struct rpc_listen_address, traddr), spdk_json_decode_string}, + {"trsvcid", offsetof(struct rpc_listen_address, trsvcid), spdk_json_decode_string}, +}; + +static int +decode_rpc_listen_address(const struct spdk_json_val *val, void *out) +{ + struct rpc_listen_address *req = (struct rpc_listen_address *)out; + if (spdk_json_decode_object(val, rpc_listen_address_decoders, + SPDK_COUNTOF(rpc_listen_address_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + return -1; + } + return 0; +} + +static void +free_rpc_listen_address(struct rpc_listen_address *r) +{ + free(r->transport); + free(r->adrfam); + free(r->traddr); + free(r->trsvcid); +} + +enum nvmf_rpc_listen_op { + NVMF_RPC_LISTEN_ADD, + NVMF_RPC_LISTEN_REMOVE, +}; + +struct nvmf_rpc_listener_ctx { + char *nqn; + char *tgt_name; + struct spdk_nvmf_tgt *tgt; + struct spdk_nvmf_subsystem *subsystem; + struct rpc_listen_address address; + + struct spdk_jsonrpc_request *request; + struct spdk_nvme_transport_id trid; + enum nvmf_rpc_listen_op op; + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_listener_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_listener_ctx, nqn), spdk_json_decode_string}, + {"listen_address", offsetof(struct nvmf_rpc_listener_ctx, address), decode_rpc_listen_address}, + {"tgt_name", offsetof(struct nvmf_rpc_listener_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +nvmf_rpc_listener_ctx_free(struct nvmf_rpc_listener_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->tgt_name); + free_rpc_listen_address(&ctx->address); + free(ctx); +} + +static void +nvmf_rpc_listen_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct 
nvmf_rpc_listener_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; + + request = ctx->request; + if (ctx->response_sent) { + /* If an error occurred, the response has already been sent. */ + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + nvmf_rpc_listener_ctx_free(ctx); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_subsystem_listen(void *cb_arg, int status) +{ + struct nvmf_rpc_listener_ctx *ctx = cb_arg; + + if (status) { + /* Destroy the listener that we just created. Ignore the error code because + * the RPC is failing already anyway. */ + spdk_nvmf_tgt_stop_listen(ctx->tgt, &ctx->trid); + + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_rpc_listen_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_listener_ctx_free(ctx); + /* Can't really do anything to recover here - subsystem will remain paused. */ + } +} + +static void +nvmf_rpc_listen_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_listener_ctx *ctx = cb_arg; + int rc; + + if (ctx->op == NVMF_RPC_LISTEN_ADD) { + if (!nvmf_subsystem_find_listener(subsystem, &ctx->trid)) { + rc = spdk_nvmf_tgt_listen(ctx->tgt, &ctx->trid); + if (rc == 0) { + spdk_nvmf_subsystem_add_listener(ctx->subsystem, &ctx->trid, nvmf_rpc_subsystem_listen, ctx); + return; + } + + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + } else if (ctx->op == NVMF_RPC_LISTEN_REMOVE) { + if (spdk_nvmf_subsystem_remove_listener(subsystem, &ctx->trid)) { + SPDK_ERRLOG("Unable to remove listener.\n"); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + spdk_nvmf_tgt_stop_listen(ctx->tgt, &ctx->trid); + } else { + SPDK_UNREACHABLE(); + } + + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_listen_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_listener_ctx_free(ctx); + /* Can't really do anything to recover here - subsystem will remain paused. 
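The listener add/remove itself has already been handled above; only the resume step failed.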
*/ + } +} + +static int +rpc_listen_address_to_trid(const struct rpc_listen_address *address, + struct spdk_nvme_transport_id *trid) +{ + size_t len; + + memset(trid, 0, sizeof(*trid)); + + if (spdk_nvme_transport_id_populate_trstring(trid, address->transport)) { + SPDK_ERRLOG("Invalid transport string: %s\n", address->transport); + return -EINVAL; + } + + if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, address->transport)) { + SPDK_ERRLOG("Invalid transport type: %s\n", address->transport); + return -EINVAL; + } + + if (address->adrfam) { + if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, address->adrfam)) { + SPDK_ERRLOG("Invalid adrfam: %s\n", address->adrfam); + return -EINVAL; + } + } else { + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + } + + len = strlen(address->traddr); + if (len > sizeof(trid->traddr) - 1) { + SPDK_ERRLOG("Transport address longer than %zu characters: %s\n", + sizeof(trid->traddr) - 1, address->traddr); + return -EINVAL; + } + memcpy(trid->traddr, address->traddr, len + 1); + + len = strlen(address->trsvcid); + if (len > sizeof(trid->trsvcid) - 1) { + SPDK_ERRLOG("Transport service id longer than %zu characters: %s\n", + sizeof(trid->trsvcid) - 1, address->trsvcid); + return -EINVAL; + } + memcpy(trid->trsvcid, address->trsvcid, len + 1); + + return 0; +} + +static void +rpc_nvmf_subsystem_add_listener(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_listener_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + ctx->request = request; + + if (spdk_json_decode_object(params, nvmf_rpc_listener_decoder, + SPDK_COUNTOF(nvmf_rpc_listener_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + ctx->tgt = tgt; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->subsystem = subsystem; + + if (rpc_listen_address_to_trid(&ctx->address, &ctx->trid)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->op = NVMF_RPC_LISTEN_ADD; + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_listener_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_add_listener", rpc_nvmf_subsystem_add_listener, + SPDK_RPC_RUNTIME); + +static void +rpc_nvmf_subsystem_remove_listener(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_listener_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + 
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + ctx->request = request; + + if (spdk_json_decode_object(params, nvmf_rpc_listener_decoder, + SPDK_COUNTOF(nvmf_rpc_listener_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + ctx->tgt = tgt; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->subsystem = subsystem; + + if (rpc_listen_address_to_trid(&ctx->address, &ctx->trid)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->op = NVMF_RPC_LISTEN_REMOVE; + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_listener_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_remove_listener", rpc_nvmf_subsystem_remove_listener, + SPDK_RPC_RUNTIME); + +struct spdk_nvmf_ns_params { + char *bdev_name; + char *ptpl_file; + uint32_t nsid; + char nguid[16]; + char eui64[8]; + struct spdk_uuid uuid; +}; + +struct rpc_namespaces { + size_t num_ns; + struct spdk_nvmf_ns_params ns_params[RPC_MAX_NAMESPACES]; +}; + + +static const struct spdk_json_object_decoder rpc_ns_params_decoders[] = { + {"nsid", offsetof(struct spdk_nvmf_ns_params, nsid), spdk_json_decode_uint32, true}, + {"bdev_name", offsetof(struct spdk_nvmf_ns_params, bdev_name), spdk_json_decode_string}, + {"ptpl_file", offsetof(struct spdk_nvmf_ns_params, ptpl_file), spdk_json_decode_string, true}, + {"nguid", offsetof(struct spdk_nvmf_ns_params, nguid), decode_ns_nguid, true}, + {"eui64", offsetof(struct spdk_nvmf_ns_params, eui64), decode_ns_eui64, true}, + {"uuid", offsetof(struct spdk_nvmf_ns_params, uuid), decode_ns_uuid, true}, +}; + +static int +decode_rpc_ns_params(const struct spdk_json_val *val, void *out) +{ + struct spdk_nvmf_ns_params *ns_params = out; + + return spdk_json_decode_object(val, rpc_ns_params_decoders, + SPDK_COUNTOF(rpc_ns_params_decoders), + ns_params); +} + +struct nvmf_rpc_ns_ctx { + char *nqn; + char *tgt_name; + struct spdk_nvmf_ns_params ns_params; + + struct spdk_jsonrpc_request *request; + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_ns_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_ns_ctx, nqn), spdk_json_decode_string}, + {"namespace", offsetof(struct nvmf_rpc_ns_ctx, ns_params), decode_rpc_ns_params}, + {"tgt_name", offsetof(struct nvmf_rpc_ns_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +nvmf_rpc_ns_ctx_free(struct nvmf_rpc_ns_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->tgt_name); + free(ctx->ns_params.bdev_name); + free(ctx->ns_params.ptpl_file); + free(ctx); +} + +static void +nvmf_rpc_ns_resumed(struct spdk_nvmf_subsystem *subsystem, + void 
*cb_arg, int status) +{ + struct nvmf_rpc_ns_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request = ctx->request; + uint32_t nsid = ctx->ns_params.nsid; + bool response_sent = ctx->response_sent; + struct spdk_json_write_ctx *w; + + nvmf_rpc_ns_ctx_free(ctx); + + if (response_sent) { + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_uint32(w, nsid); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_ns_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_ns_ctx *ctx = cb_arg; + struct spdk_nvmf_ns_opts ns_opts; + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(ctx->ns_params.bdev_name); + if (!bdev) { + SPDK_ERRLOG("No bdev with name %s\n", ctx->ns_params.bdev_name); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + goto resume; + } + + spdk_nvmf_ns_opts_get_defaults(&ns_opts, sizeof(ns_opts)); + ns_opts.nsid = ctx->ns_params.nsid; + + SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(ctx->ns_params.nguid), "size mismatch"); + memcpy(ns_opts.nguid, ctx->ns_params.nguid, sizeof(ns_opts.nguid)); + + SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(ctx->ns_params.eui64), "size mismatch"); + memcpy(ns_opts.eui64, ctx->ns_params.eui64, sizeof(ns_opts.eui64)); + + if (!spdk_mem_all_zero(&ctx->ns_params.uuid, sizeof(ctx->ns_params.uuid))) { + ns_opts.uuid = ctx->ns_params.uuid; + } + + ctx->ns_params.nsid = spdk_nvmf_subsystem_add_ns(subsystem, bdev, &ns_opts, sizeof(ns_opts), + ctx->ns_params.ptpl_file); + if (ctx->ns_params.nsid == 0) { + SPDK_ERRLOG("Unable to add namespace\n"); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + goto resume; + } + +resume: + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_ns_resumed, ctx)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_ns_ctx_free(ctx); + } +} + +static void +rpc_nvmf_subsystem_add_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_ns_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_ns_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_ns_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->response_sent = false; + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_ns_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal 
error"); + nvmf_rpc_ns_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_add_ns", rpc_nvmf_subsystem_add_ns, SPDK_RPC_RUNTIME) + +struct nvmf_rpc_remove_ns_ctx { + char *nqn; + char *tgt_name; + uint32_t nsid; + + struct spdk_jsonrpc_request *request; + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_remove_ns_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_remove_ns_ctx, nqn), spdk_json_decode_string}, + {"nsid", offsetof(struct nvmf_rpc_remove_ns_ctx, nsid), spdk_json_decode_uint32}, + {"tgt_name", offsetof(struct nvmf_rpc_remove_ns_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +nvmf_rpc_remove_ns_ctx_free(struct nvmf_rpc_remove_ns_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->tgt_name); + free(ctx); +} + +static void +nvmf_rpc_remove_ns_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_remove_ns_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request = ctx->request; + bool response_sent = ctx->response_sent; + struct spdk_json_write_ctx *w; + + nvmf_rpc_remove_ns_ctx_free(ctx); + + if (response_sent) { + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_remove_ns_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_remove_ns_ctx *ctx = cb_arg; + int ret; + + ret = spdk_nvmf_subsystem_remove_ns(subsystem, ctx->nsid); + if (ret < 0) { + SPDK_ERRLOG("Unable to remove namespace ID %u\n", ctx->nsid); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_remove_ns_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_remove_ns_ctx_free(ctx); + } +} + +static void +rpc_nvmf_subsystem_remove_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_remove_ns_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_remove_ns_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_remove_ns_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_remove_ns_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + 
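	/*
	 * Editor's note -- a sketch of the control flow shared by the mutating
	 * subsystem RPCs in this file (namespace, host and listener operations),
	 * not new code: the subsystem must be quiesced before it is modified,
	 * so each handler chains three steps through callbacks:
	 *
	 *   spdk_nvmf_subsystem_pause(subsystem, <op>_paused, ctx);
	 *       -> <op>_paused(): apply the change (e.g. spdk_nvmf_subsystem_remove_ns)
	 *   spdk_nvmf_subsystem_resume(subsystem, <op>_resumed, ctx);
	 *       -> <op>_resumed(): send the JSON-RPC result and free ctx
	 *
	 * Only a failure of the initial pause is reported synchronously, as in
	 * the surrounding branch; later failures set ctx->response_sent so the
	 * resumed callback does not answer the request twice.
	 */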
nvmf_rpc_remove_ns_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_remove_ns", rpc_nvmf_subsystem_remove_ns, SPDK_RPC_RUNTIME) + +enum nvmf_rpc_host_op { + NVMF_RPC_HOST_ADD, + NVMF_RPC_HOST_REMOVE, + NVMF_RPC_HOST_ALLOW_ANY, +}; + +struct nvmf_rpc_host_ctx { + struct spdk_jsonrpc_request *request; + + char *nqn; + char *host; + char *tgt_name; + + enum nvmf_rpc_host_op op; + + bool allow_any_host; + + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_host_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_host_ctx, nqn), spdk_json_decode_string}, + {"host", offsetof(struct nvmf_rpc_host_ctx, host), spdk_json_decode_string}, + {"tgt_name", offsetof(struct nvmf_rpc_host_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +nvmf_rpc_host_ctx_free(struct nvmf_rpc_host_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->host); + free(ctx->tgt_name); + free(ctx); +} + +static void +nvmf_rpc_host_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_host_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; + bool response_sent = ctx->response_sent; + + request = ctx->request; + nvmf_rpc_host_ctx_free(ctx); + + if (response_sent) { + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_host_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_host_ctx *ctx = cb_arg; + int rc = -1; + + switch (ctx->op) { + case NVMF_RPC_HOST_ADD: + rc = spdk_nvmf_subsystem_add_host(subsystem, ctx->host); + break; + case NVMF_RPC_HOST_REMOVE: + rc = spdk_nvmf_subsystem_remove_host(subsystem, ctx->host); + break; + case NVMF_RPC_HOST_ALLOW_ANY: + rc = spdk_nvmf_subsystem_set_allow_any_host(subsystem, ctx->allow_any_host); + break; + } + + if (rc != 0) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_host_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_host_ctx_free(ctx); + } +} + +static void +rpc_nvmf_subsystem_add_host(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_host_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_host_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_host_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->op = NVMF_RPC_HOST_ADD; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + 
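	/*
	 * Editor's note -- illustrative only, not part of the patch: the
	 * nvmf_subsystem_add_host and nvmf_subsystem_remove_host RPCs share
	 * nvmf_rpc_subsystem_host_decoder, so their parameters look like
	 * (both NQN values are made-up examples):
	 *
	 *   {
	 *     "nqn": "nqn.2016-06.io.spdk:cnode1",
	 *     "host": "nqn.2014-08.org.nvmexpress:uuid:11111111-2222-3333-4444-555555555555"
	 *   }
	 *
	 * with an optional "tgt_name" to address a non-default target.
	 */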
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_host_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_add_host", rpc_nvmf_subsystem_add_host, SPDK_RPC_RUNTIME) + +static void +rpc_nvmf_subsystem_remove_host(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_host_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_host_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_host_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->op = NVMF_RPC_HOST_REMOVE; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_host_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_remove_host", rpc_nvmf_subsystem_remove_host, + SPDK_RPC_RUNTIME) + + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_any_host_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_host_ctx, nqn), spdk_json_decode_string}, + {"allow_any_host", offsetof(struct nvmf_rpc_host_ctx, allow_any_host), spdk_json_decode_bool}, + {"tgt_name", offsetof(struct nvmf_rpc_host_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +rpc_nvmf_subsystem_allow_any_host(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_host_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_any_host_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_any_host_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->op = NVMF_RPC_HOST_ALLOW_ANY; 
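	/*
	 * Editor's note -- not part of the patch: all three host operations
	 * (NVMF_RPC_HOST_ADD, NVMF_RPC_HOST_REMOVE, NVMF_RPC_HOST_ALLOW_ANY)
	 * funnel into the same nvmf_rpc_host_paused callback, with ctx->op
	 * selecting the action. A minimal request for this RPC might look like
	 * (the NQN is a made-up example):
	 *
	 *   { "nqn": "nqn.2016-06.io.spdk:cnode1", "allow_any_host": true }
	 */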
+ ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_host_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_allow_any_host", rpc_nvmf_subsystem_allow_any_host, + SPDK_RPC_RUNTIME) + +struct nvmf_rpc_target_ctx { + char *name; + uint32_t max_subsystems; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_create_target_decoder[] = { + {"name", offsetof(struct nvmf_rpc_target_ctx, name), spdk_json_decode_string}, + {"max_subsystems", offsetof(struct nvmf_rpc_target_ctx, max_subsystems), spdk_json_decode_uint32, true}, +}; + +static void +rpc_nvmf_create_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_nvmf_target_opts opts; + struct nvmf_rpc_target_ctx ctx = {0}; + struct spdk_nvmf_tgt *tgt; + struct spdk_json_write_ctx *w; + + /* Decode parameters the first time to get the transport type */ + if (spdk_json_decode_object(params, nvmf_rpc_create_target_decoder, + SPDK_COUNTOF(nvmf_rpc_create_target_decoder), + &ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free(ctx.name); + return; + } + + snprintf(opts.name, NVMF_TGT_NAME_MAX_LENGTH, "%s", ctx.name); + opts.max_subsystems = ctx.max_subsystems; + + if (spdk_nvmf_get_tgt(opts.name) != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Target already exists."); + free(ctx.name); + return; + } + + tgt = spdk_nvmf_tgt_create(&opts); + + if (tgt == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to create the requested target."); + free(ctx.name); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, spdk_nvmf_tgt_get_name(tgt)); + spdk_jsonrpc_end_result(request, w); + free(ctx.name); +} +SPDK_RPC_REGISTER("nvmf_create_target", rpc_nvmf_create_target, SPDK_RPC_RUNTIME); + +static const struct spdk_json_object_decoder nvmf_rpc_destroy_target_decoder[] = { + {"name", offsetof(struct nvmf_rpc_target_ctx, name), spdk_json_decode_string}, +}; + +static void +nvmf_rpc_destroy_target_done(void *ctx, int status) +{ + struct spdk_jsonrpc_request *request = ctx; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_nvmf_delete_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_target_ctx ctx = {0}; + struct spdk_nvmf_tgt *tgt; + + /* Decode parameters the first time to get the transport type */ + if (spdk_json_decode_object(params, nvmf_rpc_destroy_target_decoder, + SPDK_COUNTOF(nvmf_rpc_destroy_target_decoder), + &ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free(ctx.name); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx.name); + + if (tgt == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + 
"The specified target doesn't exist, cannot delete it."); + free(ctx.name); + return; + } + + spdk_nvmf_tgt_destroy(tgt, nvmf_rpc_destroy_target_done, request); + free(ctx.name); +} +SPDK_RPC_REGISTER("nvmf_delete_target", rpc_nvmf_delete_target, SPDK_RPC_RUNTIME); + +static void +rpc_nvmf_get_targets(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_nvmf_tgt *tgt; + const char *name; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "nvmf_get_targets has no parameters."); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + tgt = spdk_nvmf_get_first_tgt(); + + while (tgt != NULL) { + name = spdk_nvmf_tgt_get_name(tgt); + spdk_json_write_string(w, name); + tgt = spdk_nvmf_get_next_tgt(tgt); + } + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("nvmf_get_targets", rpc_nvmf_get_targets, SPDK_RPC_RUNTIME); + +struct nvmf_rpc_create_transport_ctx { + char *trtype; + char *tgt_name; + struct spdk_nvmf_transport_opts opts; + struct spdk_jsonrpc_request *request; +}; + +/** + * `max_qpairs_per_ctrlr` represents both admin and IO qpairs, that confuses + * users when they configure a transport using RPC. So it was decided to + * deprecate `max_qpairs_per_ctrlr` RPC parameter and use `max_io_qpairs_per_ctrlr` + * But internal logic remains unchanged and SPDK expects that + * spdk_nvmf_transport_opts::max_qpairs_per_ctrlr includes an admin qpair. + * This function parses the number of IO qpairs and adds +1 for admin qpair. + */ +static int +nvmf_rpc_decode_max_io_qpairs(const struct spdk_json_val *val, void *out) +{ + uint16_t *i = out; + int rc; + + rc = spdk_json_number_to_uint16(val, i); + if (rc == 0) { + (*i)++; + } + + return rc; +} + +/** + * This function parses deprecated `max_qpairs_per_ctrlr` and warns the user to use + * the new parameter `max_io_qpairs_per_ctrlr` + */ +static int +nvmf_rpc_decode_max_qpairs(const struct spdk_json_val *val, void *out) +{ + uint16_t *i = out; + int rc; + + rc = spdk_json_number_to_uint16(val, i); + if (rc == 0) { + SPDK_WARNLOG("Parameter max_qpairs_per_ctrlr is deprecated, use max_io_qpairs_per_ctrlr instead.\n"); + } + + return rc; +} + +static const struct spdk_json_object_decoder nvmf_rpc_create_transport_decoder[] = { + { "trtype", offsetof(struct nvmf_rpc_create_transport_ctx, trtype), spdk_json_decode_string}, + { + "max_queue_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_queue_depth), + spdk_json_decode_uint16, true + }, + { + "max_qpairs_per_ctrlr", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_qpairs_per_ctrlr), + nvmf_rpc_decode_max_qpairs, true + }, + { + "max_io_qpairs_per_ctrlr", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_qpairs_per_ctrlr), + nvmf_rpc_decode_max_io_qpairs, true + }, + { + "in_capsule_data_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.in_capsule_data_size), + spdk_json_decode_uint32, true + }, + { + "max_io_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_io_size), + spdk_json_decode_uint32, true + }, + { + "io_unit_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.io_unit_size), + spdk_json_decode_uint32, true + }, + { + "max_aq_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_aq_depth), + spdk_json_decode_uint32, true + }, + { + "num_shared_buffers", offsetof(struct nvmf_rpc_create_transport_ctx, 
opts.num_shared_buffers), + spdk_json_decode_uint32, true + }, + { + "buf_cache_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.buf_cache_size), + spdk_json_decode_uint32, true + }, + { + "max_srq_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_srq_depth), + spdk_json_decode_uint32, true + }, + { + "no_srq", offsetof(struct nvmf_rpc_create_transport_ctx, opts.no_srq), + spdk_json_decode_bool, true + }, + { + "c2h_success", offsetof(struct nvmf_rpc_create_transport_ctx, opts.c2h_success), + spdk_json_decode_bool, true + }, + { + "dif_insert_or_strip", offsetof(struct nvmf_rpc_create_transport_ctx, opts.dif_insert_or_strip), + spdk_json_decode_bool, true + }, + { + "sock_priority", offsetof(struct nvmf_rpc_create_transport_ctx, opts.sock_priority), + spdk_json_decode_uint32, true + }, + { + "acceptor_backlog", offsetof(struct nvmf_rpc_create_transport_ctx, opts.acceptor_backlog), + spdk_json_decode_int32, true + }, + { + "abort_timeout_sec", offsetof(struct nvmf_rpc_create_transport_ctx, opts.abort_timeout_sec), + spdk_json_decode_uint32, true + }, + { + "tgt_name", offsetof(struct nvmf_rpc_create_transport_ctx, tgt_name), + spdk_json_decode_string, true + }, +}; + +static void +nvmf_rpc_create_transport_ctx_free(struct nvmf_rpc_create_transport_ctx *ctx) +{ + free(ctx->trtype); + free(ctx->tgt_name); + free(ctx); +} + +static void +nvmf_rpc_tgt_add_transport_done(void *cb_arg, int status) +{ + struct nvmf_rpc_create_transport_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; + + request = ctx->request; + nvmf_rpc_create_transport_ctx_free(ctx); + + if (status) { + SPDK_ERRLOG("Failed to add transport to tgt.(%d)\n", status); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to add transport to tgt.(%d)\n", + status); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_nvmf_create_transport(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_create_transport_ctx *ctx; + enum spdk_nvme_transport_type trtype; + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + /* Decode parameters the first time to get the transport type */ + if (spdk_json_decode_object(params, nvmf_rpc_create_transport_decoder, + SPDK_COUNTOF(nvmf_rpc_create_transport_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + if (spdk_nvme_transport_id_parse_trtype(&trtype, ctx->trtype)) { + SPDK_ERRLOG("Invalid transport type '%s'\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid transport type '%s'\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + /* Initialize all the transport options (based on transport type) and decode the + * parameters again to update 
any options passed in rpc create transport call. + */ + if (!spdk_nvmf_transport_opts_init(ctx->trtype, &ctx->opts)) { + /* This can happen if user specifies PCIE transport type which isn't valid for + * NVMe-oF. + */ + SPDK_ERRLOG("Invalid transport type '%s'\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid transport type '%s'\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_create_transport_decoder, + SPDK_COUNTOF(nvmf_rpc_create_transport_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + if (spdk_nvmf_tgt_get_transport(tgt, ctx->trtype)) { + SPDK_ERRLOG("Transport type '%s' already exists\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Transport type '%s' already exists\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + transport = spdk_nvmf_transport_create(ctx->trtype, &ctx->opts); + + if (!transport) { + SPDK_ERRLOG("Transport type '%s' create failed\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Transport type '%s' create failed\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + /* add transport to target */ + ctx->request = request; + spdk_nvmf_tgt_add_transport(tgt, transport, nvmf_rpc_tgt_add_transport_done, ctx); +} +SPDK_RPC_REGISTER("nvmf_create_transport", rpc_nvmf_create_transport, SPDK_RPC_RUNTIME) + +static void +dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *transport) +{ + const struct spdk_nvmf_transport_opts *opts = spdk_nvmf_get_transport_opts(transport); + spdk_nvme_transport_type_t type = spdk_nvmf_get_transport_type(transport); + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "trtype", spdk_nvmf_get_transport_name(transport)); + spdk_json_write_named_uint32(w, "max_queue_depth", opts->max_queue_depth); + spdk_json_write_named_uint32(w, "max_io_qpairs_per_ctrlr", opts->max_qpairs_per_ctrlr - 1); + spdk_json_write_named_uint32(w, "in_capsule_data_size", opts->in_capsule_data_size); + spdk_json_write_named_uint32(w, "max_io_size", opts->max_io_size); + spdk_json_write_named_uint32(w, "io_unit_size", opts->io_unit_size); + spdk_json_write_named_uint32(w, "max_aq_depth", opts->max_aq_depth); + spdk_json_write_named_uint32(w, "num_shared_buffers", opts->num_shared_buffers); + spdk_json_write_named_uint32(w, "buf_cache_size", opts->buf_cache_size); + spdk_json_write_named_bool(w, "dif_insert_or_strip", opts->dif_insert_or_strip); + if (type == SPDK_NVME_TRANSPORT_RDMA) { + spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth); + spdk_json_write_named_bool(w, "no_srq", opts->no_srq); + spdk_json_write_named_int32(w, "acceptor_backlog", opts->acceptor_backlog); + } else if (type == SPDK_NVME_TRANSPORT_TCP) { + spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success); + spdk_json_write_named_uint32(w, "sock_priority", opts->sock_priority); + } + spdk_json_write_named_uint32(w, "abort_timeout_sec", opts->abort_timeout_sec); + + spdk_json_write_object_end(w); +} + +struct rpc_get_transport { + char *tgt_name; +}; + +static const struct spdk_json_object_decoder rpc_get_transport_decoders[] = { + {"tgt_name", 
offsetof(struct rpc_get_transport, tgt_name), spdk_json_decode_string, true}, +}; + +static void +rpc_nvmf_get_transports(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_transport req = { 0 }; + struct spdk_json_write_ctx *w; + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_tgt *tgt; + + if (params) { + if (spdk_json_decode_object(params, rpc_get_transport_decoders, + SPDK_COUNTOF(rpc_get_transport_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + return; + } + } + + tgt = spdk_nvmf_get_tgt(req.tgt_name); + if (!tgt) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + free(req.tgt_name); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + transport = spdk_nvmf_transport_get_first(tgt); + while (transport) { + dump_nvmf_transport(w, transport); + transport = spdk_nvmf_transport_get_next(transport); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + free(req.tgt_name); +} +SPDK_RPC_REGISTER("nvmf_get_transports", rpc_nvmf_get_transports, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_get_transports, get_nvmf_transports) + +struct rpc_nvmf_get_stats_ctx { + char *tgt_name; + struct spdk_nvmf_tgt *tgt; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; +}; + +static const struct spdk_json_object_decoder rpc_get_stats_decoders[] = { + {"tgt_name", offsetof(struct rpc_nvmf_get_stats_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +free_get_stats_ctx(struct rpc_nvmf_get_stats_ctx *ctx) +{ + free(ctx->tgt_name); + free(ctx); +} + +static void +rpc_nvmf_get_stats_done(struct spdk_io_channel_iter *i, int status) +{ + struct rpc_nvmf_get_stats_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + spdk_json_write_array_end(ctx->w); + spdk_json_write_object_end(ctx->w); + spdk_jsonrpc_end_result(ctx->request, ctx->w); + free_get_stats_ctx(ctx); +} + +static void +write_nvmf_transport_stats(struct spdk_json_write_ctx *w, + struct spdk_nvmf_transport_poll_group_stat *stat) +{ + uint64_t i; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "trtype", + spdk_nvme_transport_id_trtype_str(stat->trtype)); + switch (stat->trtype) { + case SPDK_NVME_TRANSPORT_RDMA: + spdk_json_write_named_uint64(w, "pending_data_buffer", stat->rdma.pending_data_buffer); + spdk_json_write_named_array_begin(w, "devices"); + for (i = 0; i < stat->rdma.num_devices; ++i) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", stat->rdma.devices[i].name); + spdk_json_write_named_uint64(w, "polls", stat->rdma.devices[i].polls); + spdk_json_write_named_uint64(w, "completions", stat->rdma.devices[i].completions); + spdk_json_write_named_uint64(w, "requests", + stat->rdma.devices[i].requests); + spdk_json_write_named_uint64(w, "request_latency", + stat->rdma.devices[i].request_latency); + spdk_json_write_named_uint64(w, "pending_free_request", + stat->rdma.devices[i].pending_free_request); + spdk_json_write_named_uint64(w, "pending_rdma_read", + stat->rdma.devices[i].pending_rdma_read); + spdk_json_write_named_uint64(w, "pending_rdma_write", + stat->rdma.devices[i].pending_rdma_write); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + break; + default: + break; + } + spdk_json_write_object_end(w); +} + +static void 
+_rpc_nvmf_get_stats(struct spdk_io_channel_iter *i) +{ + struct rpc_nvmf_get_stats_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_poll_group_stat stat; + struct spdk_nvmf_transport_poll_group_stat *trstat; + int rc; + + if (0 == spdk_nvmf_poll_group_get_stat(ctx->tgt, &stat)) { + spdk_json_write_object_begin(ctx->w); + spdk_json_write_named_string(ctx->w, "name", spdk_thread_get_name(spdk_get_thread())); + spdk_json_write_named_uint32(ctx->w, "admin_qpairs", stat.admin_qpairs); + spdk_json_write_named_uint32(ctx->w, "io_qpairs", stat.io_qpairs); + spdk_json_write_named_uint64(ctx->w, "pending_bdev_io", stat.pending_bdev_io); + + spdk_json_write_named_array_begin(ctx->w, "transports"); + transport = spdk_nvmf_transport_get_first(ctx->tgt); + while (transport) { + rc = spdk_nvmf_transport_poll_group_get_stat(ctx->tgt, transport, &trstat); + if (0 == rc) { + write_nvmf_transport_stats(ctx->w, trstat); + spdk_nvmf_transport_poll_group_free_stat(transport, trstat); + } else if (-ENOTSUP != rc) { + SPDK_ERRLOG("Failed to get poll group statistics for transport %s, errno %d\n", + spdk_nvme_transport_id_trtype_str(spdk_nvmf_get_transport_type(transport)), + rc); + } + transport = spdk_nvmf_transport_get_next(transport); + } + spdk_json_write_array_end(ctx->w); + spdk_json_write_object_end(ctx->w); + } + + spdk_for_each_channel_continue(i, 0); +} + + +static void +rpc_nvmf_get_stats(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nvmf_get_stats_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Memory allocation error"); + return; + } + ctx->request = request; + + if (params) { + if (spdk_json_decode_object(params, rpc_get_stats_decoders, + SPDK_COUNTOF(rpc_get_stats_decoders), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_get_stats_ctx(ctx); + return; + } + } + + ctx->tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!ctx->tgt) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + free_get_stats_ctx(ctx); + return; + } + + ctx->w = spdk_jsonrpc_begin_result(ctx->request); + spdk_json_write_object_begin(ctx->w); + spdk_json_write_named_uint64(ctx->w, "tick_rate", spdk_get_ticks_hz()); + spdk_json_write_named_array_begin(ctx->w, "poll_groups"); + + spdk_for_each_channel(ctx->tgt, + _rpc_nvmf_get_stats, + ctx, + rpc_nvmf_get_stats_done); +} + +SPDK_RPC_REGISTER("nvmf_get_stats", rpc_nvmf_get_stats, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/nvmf/rdma.c b/src/spdk/lib/nvmf/rdma.c new file mode 100644 index 000000000..4a4de4374 --- /dev/null +++ b/src/spdk/lib/nvmf/rdma.c @@ -0,0 +1,4313 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/config.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/nvmf_transport.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" +#include "spdk_internal/rdma.h" + +#include "nvmf_internal.h" + +struct spdk_nvme_rdma_hooks g_nvmf_hooks = {}; +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma; + +/* + RDMA Connection Resource Defaults + */ +#define NVMF_DEFAULT_TX_SGE SPDK_NVMF_MAX_SGL_ENTRIES +#define NVMF_DEFAULT_RSP_SGE 1 +#define NVMF_DEFAULT_RX_SGE 2 + +/* The RDMA completion queue size */ +#define DEFAULT_NVMF_RDMA_CQ_SIZE 4096 +#define MAX_WR_PER_QP(queue_depth) (queue_depth * 3 + 2) + +/* Timeout for destroying defunct rqpairs */ +#define NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US 4000000 + +static int g_spdk_nvmf_ibv_query_mask = + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER | + IBV_QP_SQ_PSN | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_MAX_QP_RD_ATOMIC; + +enum spdk_nvmf_rdma_request_state { + /* The request is not currently in use */ + RDMA_REQUEST_STATE_FREE = 0, + + /* Initial state when request first received */ + RDMA_REQUEST_STATE_NEW, + + /* The request is queued until a data buffer is available. */ + RDMA_REQUEST_STATE_NEED_BUFFER, + + /* The request is waiting on RDMA queue depth availability + * to transfer data from the host to the controller. + */ + RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, + + /* The request is currently transferring data from the host to the controller. */ + RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + + /* The request is ready to execute at the block device */ + RDMA_REQUEST_STATE_READY_TO_EXECUTE, + + /* The request is currently executing at the block device */ + RDMA_REQUEST_STATE_EXECUTING, + + /* The request finished executing at the block device */ + RDMA_REQUEST_STATE_EXECUTED, + + /* The request is waiting on RDMA queue depth availability + * to transfer data from the controller to the host. 
+ */ + RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, + + /* The request is ready to send a completion */ + RDMA_REQUEST_STATE_READY_TO_COMPLETE, + + /* The request is currently transferring data from the controller to the host. */ + RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + + /* The request currently has an outstanding completion without an + * associated data transfer. + */ + RDMA_REQUEST_STATE_COMPLETING, + + /* The request completed and can be marked free. */ + RDMA_REQUEST_STATE_COMPLETED, + + /* Terminator */ + RDMA_REQUEST_NUM_STATES, +}; + +#define OBJECT_NVMF_RDMA_IO 0x40 + +#define TRACE_GROUP_NVMF_RDMA 0x4 +#define TRACE_RDMA_REQUEST_STATE_NEW SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0) +#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1) +#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2) +#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3) +#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4) +#define TRACE_RDMA_REQUEST_STATE_EXECUTING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5) +#define TRACE_RDMA_REQUEST_STATE_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6) +#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7) +#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8) +#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9) +#define TRACE_RDMA_REQUEST_STATE_COMPLETING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA) +#define TRACE_RDMA_REQUEST_STATE_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB) +#define TRACE_RDMA_QP_CREATE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC) +#define TRACE_RDMA_IBV_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD) +#define TRACE_RDMA_CM_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE) +#define TRACE_RDMA_QP_STATE_CHANGE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF) +#define TRACE_RDMA_QP_DISCONNECT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10) +#define TRACE_RDMA_QP_DESTROY SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x11) + +SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA) +{ + spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r'); + spdk_trace_register_description("RDMA_REQ_NEW", TRACE_RDMA_REQUEST_STATE_NEW, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_PENDING_C2H", + TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_PENDING_H2C", + TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_H2C", + TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", + TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_EXECUTING", + TRACE_RDMA_REQUEST_STATE_EXECUTING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_EXECUTED", + 
TRACE_RDMA_REQUEST_STATE_EXECUTED, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPL", + TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETING_C2H", + TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETING", + TRACE_RDMA_REQUEST_STATE_COMPLETING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETED", + TRACE_RDMA_REQUEST_STATE_COMPLETED, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + + spdk_trace_register_description("RDMA_QP_CREATE", TRACE_RDMA_QP_CREATE, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", TRACE_RDMA_IBV_ASYNC_EVENT, + OWNER_NONE, OBJECT_NONE, 0, 0, "type: "); + spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", TRACE_RDMA_CM_ASYNC_EVENT, + OWNER_NONE, OBJECT_NONE, 0, 0, "type: "); + spdk_trace_register_description("RDMA_QP_STATE_CHANGE", TRACE_RDMA_QP_STATE_CHANGE, + OWNER_NONE, OBJECT_NONE, 0, 1, "state: "); + spdk_trace_register_description("RDMA_QP_DISCONNECT", TRACE_RDMA_QP_DISCONNECT, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("RDMA_QP_DESTROY", TRACE_RDMA_QP_DESTROY, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); +} + +enum spdk_nvmf_rdma_wr_type { + RDMA_WR_TYPE_RECV, + RDMA_WR_TYPE_SEND, + RDMA_WR_TYPE_DATA, +}; + +struct spdk_nvmf_rdma_wr { + enum spdk_nvmf_rdma_wr_type type; +}; + +/* This structure holds commands as they are received off the wire. + * It must be dynamically paired with a full request object + * (spdk_nvmf_rdma_request) to service a request. It is separate + * from the request because RDMA does not appear to order + * completions, so occasionally we'll get a new incoming + * command when there aren't any free request objects. + */ +struct spdk_nvmf_rdma_recv { + struct ibv_recv_wr wr; + struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE]; + + struct spdk_nvmf_rdma_qpair *qpair; + + /* In-capsule data buffer */ + uint8_t *buf; + + struct spdk_nvmf_rdma_wr rdma_wr; + uint64_t receive_tsc; + + STAILQ_ENTRY(spdk_nvmf_rdma_recv) link; +}; + +struct spdk_nvmf_rdma_request_data { + struct spdk_nvmf_rdma_wr rdma_wr; + struct ibv_send_wr wr; + struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES]; +}; + +struct spdk_nvmf_rdma_request { + struct spdk_nvmf_request req; + + enum spdk_nvmf_rdma_request_state state; + + struct spdk_nvmf_rdma_recv *recv; + + struct { + struct spdk_nvmf_rdma_wr rdma_wr; + struct ibv_send_wr wr; + struct ibv_sge sgl[NVMF_DEFAULT_RSP_SGE]; + } rsp; + + struct spdk_nvmf_rdma_request_data data; + + uint32_t iovpos; + + uint32_t num_outstanding_data_wr; + uint64_t receive_tsc; + + STAILQ_ENTRY(spdk_nvmf_rdma_request) state_link; +}; + +enum spdk_nvmf_rdma_qpair_disconnect_flags { + RDMA_QP_DISCONNECTING = 1, + RDMA_QP_RECV_DRAINED = 1 << 1, + RDMA_QP_SEND_DRAINED = 1 << 2 +}; + +struct spdk_nvmf_rdma_resource_opts { + struct spdk_nvmf_rdma_qpair *qpair; + /* qp points either to an ibv_qp object or an ibv_srq object depending on the value of shared. 
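 * (Editor's note, not part of the patch: when "shared" is true the receive
 * resources built from these opts are posted to a poller-owned SRQ and are
 * shared by every qpair on that poller, instead of belonging to one qpair.)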
*/ + void *qp; + struct ibv_pd *pd; + uint32_t max_queue_depth; + uint32_t in_capsule_data_size; + bool shared; +}; + +struct spdk_nvmf_send_wr_list { + struct ibv_send_wr *first; + struct ibv_send_wr *last; +}; + +struct spdk_nvmf_recv_wr_list { + struct ibv_recv_wr *first; + struct ibv_recv_wr *last; +}; + +struct spdk_nvmf_rdma_resources { + /* Array of size "max_queue_depth" containing RDMA requests. */ + struct spdk_nvmf_rdma_request *reqs; + + /* Array of size "max_queue_depth" containing RDMA recvs. */ + struct spdk_nvmf_rdma_recv *recvs; + + /* Array of size "max_queue_depth" containing 64 byte capsules + * used for receive. + */ + union nvmf_h2c_msg *cmds; + struct ibv_mr *cmds_mr; + + /* Array of size "max_queue_depth" containing 16 byte completions + * to be sent back to the user. + */ + union nvmf_c2h_msg *cpls; + struct ibv_mr *cpls_mr; + + /* Array of size "max_queue_depth * InCapsuleDataSize" containing + * buffers to be used for in capsule data. + */ + void *bufs; + struct ibv_mr *bufs_mr; + + /* The list of pending recvs to transfer */ + struct spdk_nvmf_recv_wr_list recvs_to_post; + + /* Receives that are waiting for a request object */ + STAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue; + + /* Queue to track free requests */ + STAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue; +}; + +typedef void (*spdk_nvmf_rdma_qpair_ibv_event)(struct spdk_nvmf_rdma_qpair *rqpair); + +struct spdk_nvmf_rdma_ibv_event_ctx { + struct spdk_nvmf_rdma_qpair *rqpair; + spdk_nvmf_rdma_qpair_ibv_event cb_fn; + /* Link to other ibv events associated with this qpair */ + STAILQ_ENTRY(spdk_nvmf_rdma_ibv_event_ctx) link; +}; + +struct spdk_nvmf_rdma_qpair { + struct spdk_nvmf_qpair qpair; + + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poller *poller; + + struct spdk_rdma_qp *rdma_qp; + struct rdma_cm_id *cm_id; + struct ibv_srq *srq; + struct rdma_cm_id *listen_id; + + /* The maximum number of I/O outstanding on this connection at one time */ + uint16_t max_queue_depth; + + /* The maximum number of active RDMA READ and ATOMIC operations at one time */ + uint16_t max_read_depth; + + /* The maximum number of RDMA SEND operations at one time */ + uint32_t max_send_depth; + + /* The current number of outstanding WRs from this qpair's + * recv queue. Should not exceed device->attr.max_queue_depth. + */ + uint16_t current_recv_depth; + + /* The current number of active RDMA READ operations */ + uint16_t current_read_depth; + + /* The current number of posted WRs from this qpair's + * send queue. Should not exceed max_send_depth. + */ + uint32_t current_send_depth; + + /* The maximum number of SGEs per WR on the send queue */ + uint32_t max_send_sge; + + /* The maximum number of SGEs per WR on the recv queue */ + uint32_t max_recv_sge; + + struct spdk_nvmf_rdma_resources *resources; + + STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_read_queue; + + STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_write_queue; + + /* Number of requests not in the free state */ + uint32_t qd; + + TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link; + + STAILQ_ENTRY(spdk_nvmf_rdma_qpair) recv_link; + + STAILQ_ENTRY(spdk_nvmf_rdma_qpair) send_link; + + /* IBV queue pair attributes: they are used to manage + * qp state and recover from errors. + */ + enum ibv_qp_state ibv_state; + + uint32_t disconnect_flags; + + /* Poller registered in case the qpair doesn't properly + * complete the qpair destruct process and becomes defunct. 
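 * (Editor's note, not part of the patch: the poller appears to be armed with
 * NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US, defined above, so a qpair whose drain
 * never completes is still torn down after that timeout.)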
+ */ + + struct spdk_poller *destruct_poller; + + /* + * io_channel which is used to destroy qpair when it is removed from poll group + */ + struct spdk_io_channel *destruct_channel; + + /* List of ibv async events */ + STAILQ_HEAD(, spdk_nvmf_rdma_ibv_event_ctx) ibv_events; + + /* There are several ways a disconnect can start on a qpair + * and they are not all mutually exclusive. It is important + * that we only initialize one of these paths. + */ + bool disconnect_started; + /* Lets us know that we have received the last_wqe event. */ + bool last_wqe_reached; +}; + +struct spdk_nvmf_rdma_poller_stat { + uint64_t completions; + uint64_t polls; + uint64_t requests; + uint64_t request_latency; + uint64_t pending_free_request; + uint64_t pending_rdma_read; + uint64_t pending_rdma_write; +}; + +struct spdk_nvmf_rdma_poller { + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poll_group *group; + + int num_cqe; + int required_num_wr; + struct ibv_cq *cq; + + /* The maximum number of I/O outstanding on the shared receive queue at one time */ + uint16_t max_srq_depth; + + /* Shared receive queue */ + struct ibv_srq *srq; + + struct spdk_nvmf_rdma_resources *resources; + struct spdk_nvmf_rdma_poller_stat stat; + + TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs; + + STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_recv; + + STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_send; + + TAILQ_ENTRY(spdk_nvmf_rdma_poller) link; +}; + +struct spdk_nvmf_rdma_poll_group_stat { + uint64_t pending_data_buffer; +}; + +struct spdk_nvmf_rdma_poll_group { + struct spdk_nvmf_transport_poll_group group; + struct spdk_nvmf_rdma_poll_group_stat stat; + TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers; + TAILQ_ENTRY(spdk_nvmf_rdma_poll_group) link; + /* + * buffers which are split across multiple RDMA + * memory regions cannot be used by this transport. + */ + STAILQ_HEAD(, spdk_nvmf_transport_pg_cache_buf) retired_bufs; +}; + +struct spdk_nvmf_rdma_conn_sched { + struct spdk_nvmf_rdma_poll_group *next_admin_pg; + struct spdk_nvmf_rdma_poll_group *next_io_pg; +}; + +/* Assuming rdma_cm uses just one protection domain per ibv_context. 
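 * (Editor's note, not part of the patch: this assumption is why the
 * protection domain is cached once in spdk_nvmf_rdma_device below and
 * reused for the memory registrations and qpairs created on that device.)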
*/ +struct spdk_nvmf_rdma_device { + struct ibv_device_attr attr; + struct ibv_context *context; + + struct spdk_mem_map *map; + struct ibv_pd *pd; + + int num_srq; + + TAILQ_ENTRY(spdk_nvmf_rdma_device) link; +}; + +struct spdk_nvmf_rdma_port { + const struct spdk_nvme_transport_id *trid; + struct rdma_cm_id *id; + struct spdk_nvmf_rdma_device *device; + TAILQ_ENTRY(spdk_nvmf_rdma_port) link; +}; + +struct spdk_nvmf_rdma_transport { + struct spdk_nvmf_transport transport; + + struct spdk_nvmf_rdma_conn_sched conn_sched; + + struct rdma_event_channel *event_channel; + + struct spdk_mempool *data_wr_pool; + + pthread_mutex_t lock; + + /* fields used to poll RDMA/IB events */ + nfds_t npoll_fds; + struct pollfd *poll_fds; + + TAILQ_HEAD(, spdk_nvmf_rdma_device) devices; + TAILQ_HEAD(, spdk_nvmf_rdma_port) ports; + TAILQ_HEAD(, spdk_nvmf_rdma_poll_group) poll_groups; +}; + +static inline void +nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair); + +static bool +nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_request *rdma_req); + +static inline int +nvmf_rdma_check_ibv_state(enum ibv_qp_state state) +{ + switch (state) { + case IBV_QPS_RESET: + case IBV_QPS_INIT: + case IBV_QPS_RTR: + case IBV_QPS_RTS: + case IBV_QPS_SQD: + case IBV_QPS_SQE: + case IBV_QPS_ERR: + return 0; + default: + return -1; + } +} + +static inline enum spdk_nvme_media_error_status_code +nvmf_rdma_dif_error_to_compl_status(uint8_t err_type) { + enum spdk_nvme_media_error_status_code result; + switch (err_type) + { + case SPDK_DIF_REFTAG_ERROR: + result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR; + break; + case SPDK_DIF_APPTAG_ERROR: + result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR; + break; + case SPDK_DIF_GUARD_ERROR: + result = SPDK_NVME_SC_GUARD_CHECK_ERROR; + break; + default: + SPDK_UNREACHABLE(); + } + + return result; +} + +static enum ibv_qp_state +nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair) { + enum ibv_qp_state old_state, new_state; + struct ibv_qp_attr qp_attr; + struct ibv_qp_init_attr init_attr; + int rc; + + old_state = rqpair->ibv_state; + rc = ibv_query_qp(rqpair->rdma_qp->qp, &qp_attr, + g_spdk_nvmf_ibv_query_mask, &init_attr); + + if (rc) + { + SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n"); + return IBV_QPS_ERR + 1; + } + + new_state = qp_attr.qp_state; + rqpair->ibv_state = new_state; + qp_attr.ah_attr.port_num = qp_attr.port_num; + + rc = nvmf_rdma_check_ibv_state(new_state); + if (rc) + { + SPDK_ERRLOG("QP#%d: bad state updated: %u, maybe hardware issue\n", rqpair->qpair.qid, new_state); + /* + * IBV_QPS_UNKNOWN undefined if lib version smaller than libibverbs-1.1.8 + * IBV_QPS_UNKNOWN is the enum element after IBV_QPS_ERR + */ + return IBV_QPS_ERR + 1; + } + + if (old_state != new_state) + { + spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0, + (uintptr_t)rqpair->cm_id, new_state); + } + return new_state; +} + +static void +nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req, + struct spdk_nvmf_rdma_transport *rtransport) +{ + struct spdk_nvmf_rdma_request_data *data_wr; + struct ibv_send_wr *next_send_wr; + uint64_t req_wrid; + + rdma_req->num_outstanding_data_wr = 0; + data_wr = &rdma_req->data; + req_wrid = data_wr->wr.wr_id; + while (data_wr && data_wr->wr.wr_id == req_wrid) { + memset(data_wr->sgl, 0, sizeof(data_wr->wr.sg_list[0]) * data_wr->wr.num_sge); + data_wr->wr.num_sge = 0; + next_send_wr = data_wr->wr.next; + if (data_wr != &rdma_req->data) { + 
spdk_mempool_put(rtransport->data_wr_pool, data_wr); + } + data_wr = (!next_send_wr || next_send_wr == &rdma_req->rsp.wr) ? NULL : + SPDK_CONTAINEROF(next_send_wr, struct spdk_nvmf_rdma_request_data, wr); + } +} + +static void +nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req) +{ + SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->req.data_from_pool); + if (req->req.cmd) { + SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode); + } + if (req->recv) { + SPDK_ERRLOG("\t\tRequest recv wr_id%lu\n", req->recv->wr.wr_id); + } +} + +static void +nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair) +{ + int i; + + SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid); + for (i = 0; i < rqpair->max_queue_depth; i++) { + if (rqpair->resources->reqs[i].state != RDMA_REQUEST_STATE_FREE) { + nvmf_rdma_dump_request(&rqpair->resources->reqs[i]); + } + } +} + +static void +nvmf_rdma_resources_destroy(struct spdk_nvmf_rdma_resources *resources) +{ + if (resources->cmds_mr) { + ibv_dereg_mr(resources->cmds_mr); + } + + if (resources->cpls_mr) { + ibv_dereg_mr(resources->cpls_mr); + } + + if (resources->bufs_mr) { + ibv_dereg_mr(resources->bufs_mr); + } + + spdk_free(resources->cmds); + spdk_free(resources->cpls); + spdk_free(resources->bufs); + free(resources->reqs); + free(resources->recvs); + free(resources); +} + + +static struct spdk_nvmf_rdma_resources * +nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts) +{ + struct spdk_nvmf_rdma_resources *resources; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_rdma_recv *rdma_recv; + struct ibv_qp *qp; + struct ibv_srq *srq; + uint32_t i; + int rc; + + resources = calloc(1, sizeof(struct spdk_nvmf_rdma_resources)); + if (!resources) { + SPDK_ERRLOG("Unable to allocate resources for receive queue.\n"); + return NULL; + } + + resources->reqs = calloc(opts->max_queue_depth, sizeof(*resources->reqs)); + resources->recvs = calloc(opts->max_queue_depth, sizeof(*resources->recvs)); + resources->cmds = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cmds), + 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + resources->cpls = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cpls), + 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + + if (opts->in_capsule_data_size > 0) { + resources->bufs = spdk_zmalloc(opts->max_queue_depth * opts->in_capsule_data_size, + 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + } + + if (!resources->reqs || !resources->recvs || !resources->cmds || + !resources->cpls || (opts->in_capsule_data_size && !resources->bufs)) { + SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n"); + goto cleanup; + } + + resources->cmds_mr = ibv_reg_mr(opts->pd, resources->cmds, + opts->max_queue_depth * sizeof(*resources->cmds), + IBV_ACCESS_LOCAL_WRITE); + resources->cpls_mr = ibv_reg_mr(opts->pd, resources->cpls, + opts->max_queue_depth * sizeof(*resources->cpls), + 0); + + if (opts->in_capsule_data_size) { + resources->bufs_mr = ibv_reg_mr(opts->pd, resources->bufs, + opts->max_queue_depth * + opts->in_capsule_data_size, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + } + + if (!resources->cmds_mr || !resources->cpls_mr || + (opts->in_capsule_data_size && + !resources->bufs_mr)) { + goto cleanup; + } + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n", + resources->cmds, opts->max_queue_depth * sizeof(*resources->cmds), + resources->cmds_mr->lkey); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, 
"Completion Array: %p Length: %lx LKey: %x\n", + resources->cpls, opts->max_queue_depth * sizeof(*resources->cpls), + resources->cpls_mr->lkey); + if (resources->bufs && resources->bufs_mr) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n", + resources->bufs, opts->max_queue_depth * + opts->in_capsule_data_size, resources->bufs_mr->lkey); + } + + /* Initialize queues */ + STAILQ_INIT(&resources->incoming_queue); + STAILQ_INIT(&resources->free_queue); + + for (i = 0; i < opts->max_queue_depth; i++) { + struct ibv_recv_wr *bad_wr = NULL; + + rdma_recv = &resources->recvs[i]; + rdma_recv->qpair = opts->qpair; + + /* Set up memory to receive commands */ + if (resources->bufs) { + rdma_recv->buf = (void *)((uintptr_t)resources->bufs + (i * + opts->in_capsule_data_size)); + } + + rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV; + + rdma_recv->sgl[0].addr = (uintptr_t)&resources->cmds[i]; + rdma_recv->sgl[0].length = sizeof(resources->cmds[i]); + rdma_recv->sgl[0].lkey = resources->cmds_mr->lkey; + rdma_recv->wr.num_sge = 1; + + if (rdma_recv->buf && resources->bufs_mr) { + rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf; + rdma_recv->sgl[1].length = opts->in_capsule_data_size; + rdma_recv->sgl[1].lkey = resources->bufs_mr->lkey; + rdma_recv->wr.num_sge++; + } + + rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr; + rdma_recv->wr.sg_list = rdma_recv->sgl; + if (opts->shared) { + srq = (struct ibv_srq *)opts->qp; + rc = ibv_post_srq_recv(srq, &rdma_recv->wr, &bad_wr); + } else { + qp = (struct ibv_qp *)opts->qp; + rc = ibv_post_recv(qp, &rdma_recv->wr, &bad_wr); + } + if (rc) { + goto cleanup; + } + } + + for (i = 0; i < opts->max_queue_depth; i++) { + rdma_req = &resources->reqs[i]; + + if (opts->qpair != NULL) { + rdma_req->req.qpair = &opts->qpair->qpair; + } else { + rdma_req->req.qpair = NULL; + } + rdma_req->req.cmd = NULL; + + /* Set up memory to send responses */ + rdma_req->req.rsp = &resources->cpls[i]; + + rdma_req->rsp.sgl[0].addr = (uintptr_t)&resources->cpls[i]; + rdma_req->rsp.sgl[0].length = sizeof(resources->cpls[i]); + rdma_req->rsp.sgl[0].lkey = resources->cpls_mr->lkey; + + rdma_req->rsp.rdma_wr.type = RDMA_WR_TYPE_SEND; + rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp.rdma_wr; + rdma_req->rsp.wr.next = NULL; + rdma_req->rsp.wr.opcode = IBV_WR_SEND; + rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl; + rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl); + + /* Set up memory for data buffers */ + rdma_req->data.rdma_wr.type = RDMA_WR_TYPE_DATA; + rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data.rdma_wr; + rdma_req->data.wr.next = NULL; + rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->data.wr.sg_list = rdma_req->data.sgl; + rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl); + + /* Initialize request state to FREE */ + rdma_req->state = RDMA_REQUEST_STATE_FREE; + STAILQ_INSERT_TAIL(&resources->free_queue, rdma_req, state_link); + } + + return resources; + +cleanup: + nvmf_rdma_resources_destroy(resources); + return NULL; +} + +static void +nvmf_rdma_qpair_clean_ibv_events(struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_ibv_event_ctx *ctx, *tctx; + STAILQ_FOREACH_SAFE(ctx, &rqpair->ibv_events, link, tctx) { + ctx->rqpair = NULL; + /* Memory allocated for ctx is freed in nvmf_rdma_qpair_process_ibv_event */ + STAILQ_REMOVE(&rqpair->ibv_events, ctx, spdk_nvmf_rdma_ibv_event_ctx, link); + } +} + +static void +nvmf_rdma_qpair_destroy(struct 
spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; + struct ibv_recv_wr *bad_recv_wr = NULL; + int rc; + + spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0); + + spdk_poller_unregister(&rqpair->destruct_poller); + + if (rqpair->qd != 0) { + struct spdk_nvmf_qpair *qpair = &rqpair->qpair; + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + struct spdk_nvmf_rdma_request *req; + uint32_t i, max_req_count = 0; + + SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", rqpair->qd); + + if (rqpair->srq == NULL) { + nvmf_rdma_dump_qpair_contents(rqpair); + max_req_count = rqpair->max_queue_depth; + } else if (rqpair->poller && rqpair->resources) { + max_req_count = rqpair->poller->max_srq_depth; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Release incomplete requests\n"); + for (i = 0; i < max_req_count; i++) { + req = &rqpair->resources->reqs[i]; + if (req->req.qpair == qpair && req->state != RDMA_REQUEST_STATE_FREE) { + /* nvmf_rdma_request_process checks qpair ibv and internal state + * and completes a request */ + nvmf_rdma_request_process(rtransport, req); + } + } + assert(rqpair->qd == 0); + } + + if (rqpair->poller) { + TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link); + + if (rqpair->srq != NULL && rqpair->resources != NULL) { + /* Drop all received but unprocessed commands for this queue and return them to SRQ */ + STAILQ_FOREACH_SAFE(rdma_recv, &rqpair->resources->incoming_queue, link, recv_tmp) { + if (rqpair == rdma_recv->qpair) { + STAILQ_REMOVE(&rqpair->resources->incoming_queue, rdma_recv, spdk_nvmf_rdma_recv, link); + rc = ibv_post_srq_recv(rqpair->srq, &rdma_recv->wr, &bad_recv_wr); + if (rc) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + } + } + } + } + } + + if (rqpair->cm_id) { + if (rqpair->rdma_qp != NULL) { + spdk_rdma_qp_destroy(rqpair->rdma_qp); + rqpair->rdma_qp = NULL; + } + rdma_destroy_id(rqpair->cm_id); + + if (rqpair->poller != NULL && rqpair->srq == NULL) { + rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth); + } + } + + if (rqpair->srq == NULL && rqpair->resources != NULL) { + nvmf_rdma_resources_destroy(rqpair->resources); + } + + nvmf_rdma_qpair_clean_ibv_events(rqpair); + + if (rqpair->destruct_channel) { + spdk_put_io_channel(rqpair->destruct_channel); + rqpair->destruct_channel = NULL; + } + + free(rqpair); +} + +static int +nvmf_rdma_resize_cq(struct spdk_nvmf_rdma_qpair *rqpair, struct spdk_nvmf_rdma_device *device) +{ + struct spdk_nvmf_rdma_poller *rpoller; + int rc, num_cqe, required_num_wr; + + /* Enlarge CQ size dynamically */ + rpoller = rqpair->poller; + required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth); + num_cqe = rpoller->num_cqe; + if (num_cqe < required_num_wr) { + num_cqe = spdk_max(num_cqe * 2, required_num_wr); + num_cqe = spdk_min(num_cqe, device->attr.max_cqe); + } + + if (rpoller->num_cqe != num_cqe) { + if (required_num_wr > device->attr.max_cqe) { + SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n", + required_num_wr, device->attr.max_cqe); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe); + rc = ibv_resize_cq(rpoller->cq, num_cqe); + if (rc) { + SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno)); + return -1; + } + + rpoller->num_cqe = num_cqe; + } + + rpoller->required_num_wr = required_num_wr; + return 0; +} + 
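nvmf_rdma_resize_cq() above grows the poller's completion queue geometrically (doubling) whenever a new qpair's worst-case WR count would exceed the current CQ size, clamps the result to the device's max_cqe, and fails if the hard requirement itself cannot fit within that limit. A minimal, self-contained sketch of that sizing arithmetic follows; the helper name and parameters are illustrative only and are not part of SPDK or of this patch:

/*
 * Simplified sketch of the CQ sizing policy used by nvmf_rdma_resize_cq().
 * Hypothetical helper for illustration; names are not from the SPDK tree.
 */
static int
sketch_cq_size_for_new_qpair(int cur_num_cqe, int cur_required_wr,
			     int wr_per_new_qp, int device_max_cqe)
{
	int required_wr = cur_required_wr + wr_per_new_qp;
	int num_cqe = cur_num_cqe;

	if (num_cqe < required_wr) {
		/* Grow geometrically so that repeated qpair creation does not
		 * force a resize every time, then clamp to the device limit. */
		num_cqe = (num_cqe * 2 > required_wr) ? num_cqe * 2 : required_wr;
		if (num_cqe > device_max_cqe) {
			num_cqe = device_max_cqe;
		}
	}

	if (required_wr > device_max_cqe) {
		/* The hard requirement cannot be satisfied by this device. */
		return -1;
	}

	return num_cqe;
}

For example, a poller currently at 4096 CQEs that needs a few hundred more WRs would jump to 8192 (or to max_cqe if the device supports less), amortizing the cost of ibv_resize_cq() across many qpair creations.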
+static int +nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_rdma_resource_opts opts; + struct spdk_nvmf_rdma_device *device; + struct spdk_rdma_qp_init_attr qp_init_attr = {}; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + device = rqpair->device; + + qp_init_attr.qp_context = rqpair; + qp_init_attr.pd = device->pd; + qp_init_attr.send_cq = rqpair->poller->cq; + qp_init_attr.recv_cq = rqpair->poller->cq; + + if (rqpair->srq) { + qp_init_attr.srq = rqpair->srq; + } else { + qp_init_attr.cap.max_recv_wr = rqpair->max_queue_depth; + } + + /* SEND, READ, and WRITE operations */ + qp_init_attr.cap.max_send_wr = (uint32_t)rqpair->max_queue_depth * 2; + qp_init_attr.cap.max_send_sge = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_TX_SGE); + qp_init_attr.cap.max_recv_sge = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_RX_SGE); + + if (rqpair->srq == NULL && nvmf_rdma_resize_cq(rqpair, device) < 0) { + SPDK_ERRLOG("Failed to resize the completion queue. Cannot initialize qpair.\n"); + goto error; + } + + rqpair->rdma_qp = spdk_rdma_qp_create(rqpair->cm_id, &qp_init_attr); + if (!rqpair->rdma_qp) { + goto error; + } + + rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2), + qp_init_attr.cap.max_send_wr); + rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, qp_init_attr.cap.max_send_sge); + rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, qp_init_attr.cap.max_recv_sge); + spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair); + + if (rqpair->poller->srq == NULL) { + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + transport = &rtransport->transport; + + opts.qp = rqpair->rdma_qp->qp; + opts.pd = rqpair->cm_id->pd; + opts.qpair = rqpair; + opts.shared = false; + opts.max_queue_depth = rqpair->max_queue_depth; + opts.in_capsule_data_size = transport->opts.in_capsule_data_size; + + rqpair->resources = nvmf_rdma_resources_create(&opts); + + if (!rqpair->resources) { + SPDK_ERRLOG("Unable to allocate resources for receive queue.\n"); + rdma_destroy_qp(rqpair->cm_id); + goto error; + } + } else { + rqpair->resources = rqpair->poller->resources; + } + + rqpair->current_recv_depth = 0; + STAILQ_INIT(&rqpair->pending_rdma_read_queue); + STAILQ_INIT(&rqpair->pending_rdma_write_queue); + + return 0; + +error: + rdma_destroy_id(rqpair->cm_id); + rqpair->cm_id = NULL; + return -1; +} + +/* Append the given recv wr structure to the resource structs outstanding recvs list. */ +/* This function accepts either a single wr or the first wr in a linked list. 
*/ +static void +nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first) +{ + struct ibv_recv_wr *last; + + last = first; + while (last->next != NULL) { + last = last->next; + } + + if (rqpair->resources->recvs_to_post.first == NULL) { + rqpair->resources->recvs_to_post.first = first; + rqpair->resources->recvs_to_post.last = last; + if (rqpair->srq == NULL) { + STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_recv, rqpair, recv_link); + } + } else { + rqpair->resources->recvs_to_post.last->next = first; + rqpair->resources->recvs_to_post.last = last; + } +} + +static int +request_transfer_in(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + + qpair = req->qpair; + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); + assert(rdma_req != NULL); + + if (spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, &rdma_req->data.wr)) { + STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link); + } + + rqpair->current_read_depth += rdma_req->num_outstanding_data_wr; + rqpair->current_send_depth += rdma_req->num_outstanding_data_wr; + return 0; +} + +static int +request_transfer_out(struct spdk_nvmf_request *req, int *data_posted) +{ + int num_outstanding_data_wr = 0; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvme_cpl *rsp; + struct ibv_send_wr *first = NULL; + + *data_posted = 0; + qpair = req->qpair; + rsp = &req->rsp->nvme_cpl; + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + /* Advance our sq_head pointer */ + if (qpair->sq_head == qpair->sq_head_max) { + qpair->sq_head = 0; + } else { + qpair->sq_head++; + } + rsp->sqhd = qpair->sq_head; + + /* queue the capsule for the recv buffer */ + assert(rdma_req->recv != NULL); + + nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr); + + rdma_req->recv = NULL; + assert(rqpair->current_recv_depth > 0); + rqpair->current_recv_depth--; + + /* Build the response which consists of optional + * RDMA WRITEs to transfer data, plus an RDMA SEND + * containing the response. + */ + first = &rdma_req->rsp.wr; + + if (rsp->status.sc != SPDK_NVME_SC_SUCCESS) { + /* On failure, data was not read from the controller. So clear the + * number of outstanding data WRs to zero. 
+ */ + rdma_req->num_outstanding_data_wr = 0; + } else if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + first = &rdma_req->data.wr; + *data_posted = 1; + num_outstanding_data_wr = rdma_req->num_outstanding_data_wr; + } + if (spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, first)) { + STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link); + } + + /* +1 for the rsp wr */ + rqpair->current_send_depth += num_outstanding_data_wr + 1; + + return 0; +} + +static int +nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_accept_private_data accept_data; + struct rdma_conn_param ctrlr_event_data = {}; + int rc; + + accept_data.recfmt = 0; + accept_data.crqsize = rqpair->max_queue_depth; + + ctrlr_event_data.private_data = &accept_data; + ctrlr_event_data.private_data_len = sizeof(accept_data); + if (id->ps == RDMA_PS_TCP) { + ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */ + ctrlr_event_data.initiator_depth = rqpair->max_read_depth; + } + + /* Configure infinite retries for the initiator side qpair. + * When using a shared receive queue on the target side, + * we need to pass this value to the initiator to prevent the + * initiator side NIC from completing SEND requests back to the + * initiator with status rnr_retry_count_exceeded. */ + if (rqpair->srq != NULL) { + ctrlr_event_data.rnr_retry_count = 0x7; + } + + /* When qpair is created without use of rdma cm API, an additional + * information must be provided to initiator in the connection response: + * whether qpair is using SRQ and its qp_num + * Fields below are ignored by rdma cm if qpair has been + * created using rdma cm API. */ + ctrlr_event_data.srq = rqpair->srq ? 1 : 0; + ctrlr_event_data.qp_num = rqpair->rdma_qp->qp->qp_num; + + rc = spdk_rdma_qp_accept(rqpair->rdma_qp, &ctrlr_event_data); + if (rc) { + SPDK_ERRLOG("Error %d on spdk_rdma_qp_accept\n", errno); + } else { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n"); + } + + return rc; +} + +static void +nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error) +{ + struct spdk_nvmf_rdma_reject_private_data rej_data; + + rej_data.recfmt = 0; + rej_data.sts = error; + + rdma_reject(id, &rej_data, sizeof(rej_data)); +} + +static int +nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_qpair *rqpair = NULL; + struct spdk_nvmf_rdma_port *port; + struct rdma_conn_param *rdma_param = NULL; + const struct spdk_nvmf_rdma_request_private_data *private_data = NULL; + uint16_t max_queue_depth; + uint16_t max_read_depth; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + assert(event->id != NULL); /* Impossible. Can't even reject the connection. */ + assert(event->id->verbs != NULL); /* Impossible. No way to handle this. 
*/ + + rdma_param = &event->param.conn; + if (rdma_param->private_data == NULL || + rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) { + SPDK_ERRLOG("connect request: no private data provided\n"); + nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH); + return -1; + } + + private_data = rdma_param->private_data; + if (private_data->recfmt != 0) { + SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n"); + nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n", + event->id->verbs->device->name, event->id->verbs->device->dev_name); + + port = event->listen_id->context; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n", + event->listen_id, event->listen_id->verbs, port); + + /* Figure out the supported queue depth. This is a multi-step process + * that takes into account hardware maximums, host provided values, + * and our target's internal memory limits */ + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n"); + + /* Start with the maximum queue depth allowed by the target */ + max_queue_depth = rtransport->transport.opts.max_queue_depth; + max_read_depth = rtransport->transport.opts.max_queue_depth; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n", + rtransport->transport.opts.max_queue_depth); + + /* Next check the local NIC's hardware limitations */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, + "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n", + port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom); + max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr); + max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom); + + /* Next check the remote NIC's hardware limitations */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, + "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n", + rdma_param->initiator_depth, rdma_param->responder_resources); + if (rdma_param->initiator_depth > 0) { + max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth); + } + + /* Finally check for the host software requested values, which are + * optional. 
*/ + if (rdma_param->private_data != NULL && + rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize); + max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize); + max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1); + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n", + max_queue_depth, max_read_depth); + + rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair)); + if (rqpair == NULL) { + SPDK_ERRLOG("Could not allocate new connection.\n"); + nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + return -1; + } + + rqpair->device = port->device; + rqpair->max_queue_depth = max_queue_depth; + rqpair->max_read_depth = max_read_depth; + rqpair->cm_id = event->id; + rqpair->listen_id = event->listen_id; + rqpair->qpair.transport = transport; + STAILQ_INIT(&rqpair->ibv_events); + /* use qid from the private data to determine the qpair type + qid will be set to the appropriate value when the controller is created */ + rqpair->qpair.qid = private_data->qid; + + event->id->context = &rqpair->qpair; + + spdk_nvmf_tgt_new_qpair(transport->tgt, &rqpair->qpair); + + return 0; +} + +static int +nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct ibv_pd *pd = cb_ctx; + struct ibv_mr *mr; + int rc; + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + if (!g_nvmf_hooks.get_rkey) { + mr = ibv_reg_mr(pd, vaddr, size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (mr == NULL) { + SPDK_ERRLOG("ibv_reg_mr() failed\n"); + return -1; + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); + } + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, + g_nvmf_hooks.get_rkey(pd, vaddr, size)); + } + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: + if (!g_nvmf_hooks.get_rkey) { + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL); + if (mr) { + ibv_dereg_mr(mr); + } + } + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); + break; + default: + SPDK_UNREACHABLE(); + } + + return rc; +} + +static int +nvmf_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2) +{ + /* Two contiguous mappings will point to the same address which is the start of the RDMA MR. 
*/ + return addr_1 == addr_2; +} + +static inline void +nvmf_rdma_setup_wr(struct ibv_send_wr *wr, struct ibv_send_wr *next, + enum spdk_nvme_data_transfer xfer) +{ + if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + wr->opcode = IBV_WR_RDMA_WRITE; + wr->send_flags = 0; + wr->next = next; + } else if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + wr->opcode = IBV_WR_RDMA_READ; + wr->send_flags = IBV_SEND_SIGNALED; + wr->next = NULL; + } else { + assert(0); + } +} + +static int +nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_request *rdma_req, + uint32_t num_sgl_descriptors) +{ + struct spdk_nvmf_rdma_request_data *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES]; + struct spdk_nvmf_rdma_request_data *current_data_wr; + uint32_t i; + + if (num_sgl_descriptors > SPDK_NVMF_MAX_SGL_ENTRIES) { + SPDK_ERRLOG("Requested too much entries (%u), the limit is %u\n", + num_sgl_descriptors, SPDK_NVMF_MAX_SGL_ENTRIES); + return -EINVAL; + } + + if (spdk_mempool_get_bulk(rtransport->data_wr_pool, (void **)work_requests, num_sgl_descriptors)) { + return -ENOMEM; + } + + current_data_wr = &rdma_req->data; + + for (i = 0; i < num_sgl_descriptors; i++) { + nvmf_rdma_setup_wr(&current_data_wr->wr, &work_requests[i]->wr, rdma_req->req.xfer); + current_data_wr->wr.next = &work_requests[i]->wr; + current_data_wr = work_requests[i]; + current_data_wr->wr.sg_list = current_data_wr->sgl; + current_data_wr->wr.wr_id = rdma_req->data.wr.wr_id; + } + + nvmf_rdma_setup_wr(&current_data_wr->wr, &rdma_req->rsp.wr, rdma_req->req.xfer); + + return 0; +} + +static inline void +nvmf_rdma_setup_request(struct spdk_nvmf_rdma_request *rdma_req) +{ + struct ibv_send_wr *wr = &rdma_req->data.wr; + struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1; + + wr->wr.rdma.rkey = sgl->keyed.key; + wr->wr.rdma.remote_addr = sgl->address; + nvmf_rdma_setup_wr(wr, &rdma_req->rsp.wr, rdma_req->req.xfer); +} + +static inline void +nvmf_rdma_update_remote_addr(struct spdk_nvmf_rdma_request *rdma_req, uint32_t num_wrs) +{ + struct ibv_send_wr *wr = &rdma_req->data.wr; + struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1; + uint32_t i; + int j; + uint64_t remote_addr_offset = 0; + + for (i = 0; i < num_wrs; ++i) { + wr->wr.rdma.rkey = sgl->keyed.key; + wr->wr.rdma.remote_addr = sgl->address + remote_addr_offset; + for (j = 0; j < wr->num_sge; ++j) { + remote_addr_offset += wr->sg_list[j].length; + } + wr = wr->next; + } +} + +/* This function is used in the rare case that we have a buffer split over multiple memory regions. 
*/ +static int +nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf) +{ + struct spdk_nvmf_transport_poll_group *group = &rgroup->group; + struct spdk_nvmf_transport *transport = group->transport; + struct spdk_nvmf_transport_pg_cache_buf *old_buf; + void *new_buf; + + if (!(STAILQ_EMPTY(&group->buf_cache))) { + group->buf_cache_count--; + new_buf = STAILQ_FIRST(&group->buf_cache); + STAILQ_REMOVE_HEAD(&group->buf_cache, link); + assert(*buf != NULL); + } else { + new_buf = spdk_mempool_get(transport->data_buf_pool); + } + + if (*buf == NULL) { + return -ENOMEM; + } + + old_buf = *buf; + STAILQ_INSERT_HEAD(&rgroup->retired_bufs, old_buf, link); + *buf = new_buf; + return 0; +} + +static bool +nvmf_rdma_get_lkey(struct spdk_nvmf_rdma_device *device, struct iovec *iov, + uint32_t *_lkey) +{ + uint64_t translation_len; + uint32_t lkey; + + translation_len = iov->iov_len; + + if (!g_nvmf_hooks.get_rkey) { + lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, + (uint64_t)iov->iov_base, &translation_len))->lkey; + } else { + lkey = spdk_mem_map_translate(device->map, + (uint64_t)iov->iov_base, &translation_len); + } + + if (spdk_unlikely(translation_len < iov->iov_len)) { + return false; + } + + *_lkey = lkey; + return true; +} + +static bool +nvmf_rdma_fill_wr_sge(struct spdk_nvmf_rdma_device *device, + struct iovec *iov, struct ibv_send_wr **_wr, + uint32_t *_remaining_data_block, uint32_t *_offset, + uint32_t *_num_extra_wrs, + const struct spdk_dif_ctx *dif_ctx) +{ + struct ibv_send_wr *wr = *_wr; + struct ibv_sge *sg_ele = &wr->sg_list[wr->num_sge]; + uint32_t lkey = 0; + uint32_t remaining, data_block_size, md_size, sge_len; + + if (spdk_unlikely(!nvmf_rdma_get_lkey(device, iov, &lkey))) { + /* This is a very rare case that can occur when using DPDK version < 19.05 */ + SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. 
Removing it from circulation.\n"); + return false; + } + + if (spdk_likely(!dif_ctx)) { + sg_ele->lkey = lkey; + sg_ele->addr = (uintptr_t)(iov->iov_base); + sg_ele->length = iov->iov_len; + wr->num_sge++; + } else { + remaining = iov->iov_len - *_offset; + data_block_size = dif_ctx->block_size - dif_ctx->md_size; + md_size = dif_ctx->md_size; + + while (remaining) { + if (wr->num_sge >= SPDK_NVMF_MAX_SGL_ENTRIES) { + if (*_num_extra_wrs > 0 && wr->next) { + *_wr = wr->next; + wr = *_wr; + wr->num_sge = 0; + sg_ele = &wr->sg_list[wr->num_sge]; + (*_num_extra_wrs)--; + } else { + break; + } + } + sg_ele->lkey = lkey; + sg_ele->addr = (uintptr_t)((char *)iov->iov_base + *_offset); + sge_len = spdk_min(remaining, *_remaining_data_block); + sg_ele->length = sge_len; + remaining -= sge_len; + *_remaining_data_block -= sge_len; + *_offset += sge_len; + + sg_ele++; + wr->num_sge++; + + if (*_remaining_data_block == 0) { + /* skip metadata */ + *_offset += md_size; + /* Metadata that do not fit this IO buffer will be included in the next IO buffer */ + remaining -= spdk_min(remaining, md_size); + *_remaining_data_block = data_block_size; + } + + if (remaining == 0) { + /* By subtracting the size of the last IOV from the offset, we ensure that we skip + the remaining metadata bits at the beginning of the next buffer */ + *_offset -= iov->iov_len; + } + } + } + + return true; +} + +static int +nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_poll_group *rgroup, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req, + struct ibv_send_wr *wr, + uint32_t length, + uint32_t num_extra_wrs) +{ + struct spdk_nvmf_request *req = &rdma_req->req; + struct spdk_dif_ctx *dif_ctx = NULL; + uint32_t remaining_data_block = 0; + uint32_t offset = 0; + + if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) { + dif_ctx = &rdma_req->req.dif.dif_ctx; + remaining_data_block = dif_ctx->block_size - dif_ctx->md_size; + } + + wr->num_sge = 0; + + while (length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) { + while (spdk_unlikely(!nvmf_rdma_fill_wr_sge(device, &req->iov[rdma_req->iovpos], &wr, + &remaining_data_block, &offset, &num_extra_wrs, dif_ctx))) { + if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[rdma_req->iovpos]) == -ENOMEM) { + return -ENOMEM; + } + req->iov[rdma_req->iovpos].iov_base = (void *)((uintptr_t)(req->buffers[rdma_req->iovpos] + + NVMF_DATA_BUFFER_MASK) & + ~NVMF_DATA_BUFFER_MASK); + } + + length -= req->iov[rdma_req->iovpos].iov_len; + rdma_req->iovpos++; + } + + if (length) { + SPDK_ERRLOG("Not enough SG entries to hold data buffer\n"); + return -EINVAL; + } + + return 0; +} + +static inline uint32_t +nvmf_rdma_calc_num_wrs(uint32_t length, uint32_t io_unit_size, uint32_t block_size) +{ + /* estimate the number of SG entries and WRs needed to process the request */ + uint32_t num_sge = 0; + uint32_t i; + uint32_t num_buffers = SPDK_CEIL_DIV(length, io_unit_size); + + for (i = 0; i < num_buffers && length > 0; i++) { + uint32_t buffer_len = spdk_min(length, io_unit_size); + uint32_t num_sge_in_block = SPDK_CEIL_DIV(buffer_len, block_size); + + if (num_sge_in_block * block_size > buffer_len) { + ++num_sge_in_block; + } + num_sge += num_sge_in_block; + length -= buffer_len; + } + return SPDK_CEIL_DIV(num_sge, SPDK_NVMF_MAX_SGL_ENTRIES); +} + +static int +nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req, + uint32_t length) +{ + struct 
spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_request *req = &rdma_req->req; + struct ibv_send_wr *wr = &rdma_req->data.wr; + int rc; + uint32_t num_wrs = 1; + + rqpair = SPDK_CONTAINEROF(req->qpair, struct spdk_nvmf_rdma_qpair, qpair); + rgroup = rqpair->poller->group; + + /* rdma wr specifics */ + nvmf_rdma_setup_request(rdma_req); + + rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport, + length); + if (rc != 0) { + return rc; + } + + assert(req->iovcnt <= rqpair->max_send_sge); + + rdma_req->iovpos = 0; + + if (spdk_unlikely(req->dif.dif_insert_or_strip)) { + num_wrs = nvmf_rdma_calc_num_wrs(length, rtransport->transport.opts.io_unit_size, + req->dif.dif_ctx.block_size); + if (num_wrs > 1) { + rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_wrs - 1); + if (rc != 0) { + goto err_exit; + } + } + } + + rc = nvmf_rdma_fill_wr_sgl(rgroup, device, rdma_req, wr, length, num_wrs - 1); + if (spdk_unlikely(rc != 0)) { + goto err_exit; + } + + if (spdk_unlikely(num_wrs > 1)) { + nvmf_rdma_update_remote_addr(rdma_req, num_wrs); + } + + /* set the number of outstanding data WRs for this request. */ + rdma_req->num_outstanding_data_wr = num_wrs; + + return rc; + +err_exit: + spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport); + nvmf_rdma_request_free_data(rdma_req, rtransport); + req->iovcnt = 0; + return rc; +} + +static int +nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct ibv_send_wr *current_wr; + struct spdk_nvmf_request *req = &rdma_req->req; + struct spdk_nvme_sgl_descriptor *inline_segment, *desc; + uint32_t num_sgl_descriptors; + uint32_t lengths[SPDK_NVMF_MAX_SGL_ENTRIES]; + uint32_t i; + int rc; + + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + rgroup = rqpair->poller->group; + + inline_segment = &req->cmd->nvme_cmd.dptr.sgl1; + assert(inline_segment->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT); + assert(inline_segment->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET); + + num_sgl_descriptors = inline_segment->unkeyed.length / sizeof(struct spdk_nvme_sgl_descriptor); + assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES); + + if (nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1) != 0) { + return -ENOMEM; + } + + desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address; + for (i = 0; i < num_sgl_descriptors; i++) { + if (spdk_likely(!req->dif.dif_insert_or_strip)) { + lengths[i] = desc->keyed.length; + } else { + req->dif.orig_length += desc->keyed.length; + lengths[i] = spdk_dif_get_length_with_md(desc->keyed.length, &req->dif.dif_ctx); + req->dif.elba_length += lengths[i]; + } + desc++; + } + + rc = spdk_nvmf_request_get_buffers_multi(req, &rgroup->group, &rtransport->transport, + lengths, num_sgl_descriptors); + if (rc != 0) { + nvmf_rdma_request_free_data(rdma_req, rtransport); + return rc; + } + + /* The first WR must always be the embedded data WR. This is how we unwind them later. 
*/ + current_wr = &rdma_req->data.wr; + assert(current_wr != NULL); + + req->length = 0; + rdma_req->iovpos = 0; + desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address; + for (i = 0; i < num_sgl_descriptors; i++) { + /* The descriptors must be keyed data block descriptors with an address, not an offset. */ + if (spdk_unlikely(desc->generic.type != SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK || + desc->keyed.subtype != SPDK_NVME_SGL_SUBTYPE_ADDRESS)) { + rc = -EINVAL; + goto err_exit; + } + + current_wr->num_sge = 0; + + rc = nvmf_rdma_fill_wr_sgl(rgroup, device, rdma_req, current_wr, lengths[i], 0); + if (rc != 0) { + rc = -ENOMEM; + goto err_exit; + } + + req->length += desc->keyed.length; + current_wr->wr.rdma.rkey = desc->keyed.key; + current_wr->wr.rdma.remote_addr = desc->address; + current_wr = current_wr->next; + desc++; + } + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + /* Go back to the last descriptor in the list. */ + desc--; + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { + if (desc->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { + rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; + rdma_req->rsp.wr.imm_data = desc->keyed.key; + } + } +#endif + + rdma_req->num_outstanding_data_wr = num_sgl_descriptors; + + return 0; + +err_exit: + spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport); + nvmf_rdma_request_free_data(rdma_req, rtransport); + return rc; +} + +static int +nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvmf_request *req = &rdma_req->req; + struct spdk_nvme_cpl *rsp; + struct spdk_nvme_sgl_descriptor *sgl; + int rc; + uint32_t length; + + rsp = &req->rsp->nvme_cpl; + sgl = &req->cmd->nvme_cmd.dptr.sgl1; + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && + (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || + sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { + + length = sgl->keyed.length; + if (length > rtransport->transport.opts.max_io_size) { + SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", + length, rtransport->transport.opts.max_io_size); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { + if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { + rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; + rdma_req->rsp.wr.imm_data = sgl->keyed.key; + } + } +#endif + + /* fill request length and populate iovs */ + req->length = length; + + if (spdk_unlikely(req->dif.dif_insert_or_strip)) { + req->dif.orig_length = length; + length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx); + req->dif.elba_length = length; + } + + rc = nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req, length); + if (spdk_unlikely(rc < 0)) { + if (rc == -EINVAL) { + SPDK_ERRLOG("SGL length exceeds the max I/O size\n"); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + /* No available buffers. Queue this request up. */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. 
Queueing request %p\n", rdma_req); + return 0; + } + + /* backward compatible */ + req->data = req->iov[0].iov_base; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, + req->iovcnt); + + return 0; + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { + uint64_t offset = sgl->address; + uint32_t max_len = rtransport->transport.opts.in_capsule_data_size; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", + offset, sgl->unkeyed.length); + + if (offset > max_len) { + SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", + offset, max_len); + rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; + return -1; + } + max_len -= (uint32_t)offset; + + if (sgl->unkeyed.length > max_len) { + SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", + sgl->unkeyed.length, max_len); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + rdma_req->num_outstanding_data_wr = 0; + req->data = rdma_req->recv->buf + offset; + req->data_from_pool = false; + req->length = sgl->unkeyed.length; + + req->iov[0].iov_base = req->data; + req->iov[0].iov_len = req->length; + req->iovcnt = 1; + + return 0; + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { + + rc = nvmf_rdma_request_fill_iovs_multi_sgl(rtransport, device, rdma_req); + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req); + return 0; + } else if (rc == -EINVAL) { + SPDK_ERRLOG("Multi SGL element request length exceeds the max I/O size\n"); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + /* backward compatible */ + req->data = req->iov[0].iov_base; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, + req->iovcnt); + + return 0; + } + + SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", + sgl->generic.type, sgl->generic.subtype); + rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; + return -1; +} + +static void +_nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req, + struct spdk_nvmf_rdma_transport *rtransport) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_poll_group *rgroup; + + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + if (rdma_req->req.data_from_pool) { + rgroup = rqpair->poller->group; + + spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport); + } + nvmf_rdma_request_free_data(rdma_req, rtransport); + rdma_req->req.length = 0; + rdma_req->req.iovcnt = 0; + rdma_req->req.data = NULL; + rdma_req->rsp.wr.next = NULL; + rdma_req->data.wr.next = NULL; + memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif)); + rqpair->qd--; + + STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link); + rdma_req->state = RDMA_REQUEST_STATE_FREE; +} + +bool +nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; + int rc; + struct spdk_nvmf_rdma_recv *rdma_recv; + enum spdk_nvmf_rdma_request_state prev_state; + bool progress = false; + int data_posted; + uint32_t 
num_blocks; + + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + device = rqpair->device; + rgroup = rqpair->poller->group; + + assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); + + /* If the queue pair is in an error state, force the request to the completed state + * to release resources. */ + if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) { + STAILQ_REMOVE(&rgroup->group.pending_buf_queue, &rdma_req->req, spdk_nvmf_request, buf_link); + } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING) { + STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link); + } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING) { + STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); + } + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + } + + /* The loop here is to allow for several back-to-back state changes. */ + do { + prev_state = rdma_req->state; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state); + + switch (rdma_req->state) { + case RDMA_REQUEST_STATE_FREE: + /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_NEW: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + rdma_recv = rdma_req->recv; + + /* The first element of the SGL is the NVMe command */ + rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; + memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); + + if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + break; + } + + if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->req.dif.dif_ctx))) { + rdma_req->req.dif.dif_insert_or_strip = true; + } + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + rdma_req->rsp.wr.opcode = IBV_WR_SEND; + rdma_req->rsp.wr.imm_data = 0; +#endif + + /* The next state transition depends on the data transfer needs of this request. */ + rdma_req->req.xfer = spdk_nvmf_req_get_xfer(&rdma_req->req); + + /* If no data to transfer, ready to execute. */ + if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; + break; + } + + rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER; + STAILQ_INSERT_TAIL(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link); + break; + case RDMA_REQUEST_STATE_NEED_BUFFER: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); + + if (&rdma_req->req != STAILQ_FIRST(&rgroup->group.pending_buf_queue)) { + /* This request needs to wait in line to obtain a buffer */ + break; + } + + /* Try to get a data buffer */ + rc = nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); + if (rc < 0) { + STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + break; + } + + if (!rdma_req->req.data) { + /* No buffers available. 
*/ + rgroup->stat.pending_data_buffer++; + break; + } + + STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); + + /* If data is transferring from host to controller and the data didn't + * arrive using in capsule data, we need to do a transfer from the host. + */ + if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && + rdma_req->req.data_from_pool) { + STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link); + rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING; + break; + } + + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; + break; + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_read_queue)) { + /* This request needs to wait in line to perform RDMA */ + break; + } + if (rqpair->current_send_depth + rdma_req->num_outstanding_data_wr > rqpair->max_send_depth + || rqpair->current_read_depth + rdma_req->num_outstanding_data_wr > rqpair->max_read_depth) { + /* We can only have so many WRs outstanding. we have to wait until some finish. */ + rqpair->poller->stat.pending_rdma_read++; + break; + } + + /* We have already verified that this request is the head of the queue. */ + STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link); + + rc = request_transfer_in(&rdma_req->req); + if (!rc) { + rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER; + } else { + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + } + break; + case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_READY_TO_EXECUTE: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) { + if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + /* generate DIF for write operation */ + num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size); + assert(num_blocks > 0); + + rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt, + num_blocks, &rdma_req->req.dif.dif_ctx); + if (rc != 0) { + SPDK_ERRLOG("DIF generation failed\n"); + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + nvmf_rdma_start_disconnect(rqpair); + break; + } + } + + assert(rdma_req->req.dif.elba_length >= rdma_req->req.length); + /* set extended length before IO operation */ + rdma_req->req.length = rdma_req->req.dif.elba_length; + } + + rdma_req->state = RDMA_REQUEST_STATE_EXECUTING; + spdk_nvmf_request_exec(&rdma_req->req); + break; + case RDMA_REQUEST_STATE_EXECUTING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED + * to escape this state. 
*/ + break; + case RDMA_REQUEST_STATE_EXECUTED: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && + rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + STAILQ_INSERT_TAIL(&rqpair->pending_rdma_write_queue, rdma_req, state_link); + rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING; + } else { + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + } + if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) { + /* restore the original length */ + rdma_req->req.length = rdma_req->req.dif.orig_length; + + if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + struct spdk_dif_error error_blk; + + num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size); + + rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks, + &rdma_req->req.dif.dif_ctx, &error_blk); + if (rc) { + struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; + + SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", error_blk.err_type, + error_blk.err_offset); + rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR; + rsp->status.sc = nvmf_rdma_dif_error_to_compl_status(error_blk.err_type); + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); + } + } + } + break; + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_write_queue)) { + /* This request needs to wait in line to perform RDMA */ + break; + } + if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) > + rqpair->max_send_depth) { + /* We can only have so many WRs outstanding. we have to wait until some finish. + * +1 since each request has an additional wr in the resp. */ + rqpair->poller->stat.pending_rdma_write++; + break; + } + + /* We have already verified that this request is the head of the queue. */ + STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_write_queue, state_link); + + /* The data transfer will be kicked off from + * RDMA_REQUEST_STATE_READY_TO_COMPLETE state. + */ + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + break; + case RDMA_REQUEST_STATE_READY_TO_COMPLETE: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + rc = request_transfer_out(&rdma_req->req, &data_posted); + assert(rc == 0); /* No good way to handle this currently */ + if (rc) { + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + } else { + rdma_req->state = data_posted ? RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST : + RDMA_REQUEST_STATE_COMPLETING; + } + break; + case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_COMPLETING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED + * to escape this state. 
*/ + break; + case RDMA_REQUEST_STATE_COMPLETED: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + rqpair->poller->stat.request_latency += spdk_get_ticks() - rdma_req->receive_tsc; + _nvmf_rdma_request_free(rdma_req, rtransport); + break; + case RDMA_REQUEST_NUM_STATES: + default: + assert(0); + break; + } + + if (rdma_req->state != prev_state) { + progress = true; + } + } while (rdma_req->state != prev_state); + + return progress; +} + +/* Public API callbacks begin here */ + +#define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128 +#define SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH 4096 +#define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 128 +#define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 +#define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072 +#define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES) +#define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095 +#define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32 +#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false +#define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false +#define SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG 100 +#define SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC 1 + +static void +nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE; + opts->io_unit_size = SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE; + opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH; + opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS; + opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE; + opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH; + opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ; + opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP; + opts->acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG; + opts->abort_timeout_sec = SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC; +} + +const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = { + .notify_cb = nvmf_rdma_mem_notify, + .are_contiguous = nvmf_rdma_check_contiguous_entries +}; + +static int nvmf_rdma_destroy(struct spdk_nvmf_transport *transport); + +static struct spdk_nvmf_transport * +nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) +{ + int rc; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device, *tmp; + struct ibv_context **contexts; + uint32_t i; + int flag; + uint32_t sge_count; + uint32_t min_shared_buffers; + int max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES; + pthread_mutexattr_t attr; + + rtransport = calloc(1, sizeof(*rtransport)); + if (!rtransport) { + return NULL; + } + + if (pthread_mutexattr_init(&attr)) { + SPDK_ERRLOG("pthread_mutexattr_init() failed\n"); + free(rtransport); + return NULL; + } + + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) { + SPDK_ERRLOG("pthread_mutexattr_settype() failed\n"); + pthread_mutexattr_destroy(&attr); + free(rtransport); + return NULL; + } + + if (pthread_mutex_init(&rtransport->lock, &attr)) { + SPDK_ERRLOG("pthread_mutex_init() failed\n"); + pthread_mutexattr_destroy(&attr); + free(rtransport); + return NULL; + } + + pthread_mutexattr_destroy(&attr); + + TAILQ_INIT(&rtransport->devices); + TAILQ_INIT(&rtransport->ports); + TAILQ_INIT(&rtransport->poll_groups); + + 
rtransport->transport.ops = &spdk_nvmf_transport_rdma; + + SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n" + " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" + " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" + " in_capsule_data_size=%d, max_aq_depth=%d,\n" + " num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d," + " acceptor_backlog=%d, abort_timeout_sec=%d\n", + opts->max_queue_depth, + opts->max_io_size, + opts->max_qpairs_per_ctrlr - 1, + opts->io_unit_size, + opts->in_capsule_data_size, + opts->max_aq_depth, + opts->num_shared_buffers, + opts->max_srq_depth, + opts->no_srq, + opts->acceptor_backlog, + opts->abort_timeout_sec); + + /* I/O unit size cannot be larger than max I/O size */ + if (opts->io_unit_size > opts->max_io_size) { + opts->io_unit_size = opts->max_io_size; + } + + if (opts->acceptor_backlog <= 0) { + SPDK_ERRLOG("The acceptor backlog cannot be less than 1, setting to the default value of (%d).\n", + SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG); + opts->acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG; + } + + if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) { + SPDK_ERRLOG("The number of shared data buffers (%d) is less than" + "the minimum number required to guarantee that forward progress can be made (%d)\n", + opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2)); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size; + if (min_shared_buffers > opts->num_shared_buffers) { + SPDK_ERRLOG("There are not enough buffers to satisfy" + "per-poll group caches for each thread. (%" PRIu32 ")" + "supplied. (%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers); + SPDK_ERRLOG("Please specify a larger number of shared buffers\n"); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + sge_count = opts->max_io_size / opts->io_unit_size; + if (sge_count > NVMF_DEFAULT_TX_SGE) { + SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->event_channel = rdma_create_event_channel(); + if (rtransport->event_channel == NULL) { + SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + flag = fcntl(rtransport->event_channel->fd, F_GETFL); + if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + rtransport->event_channel->fd, spdk_strerror(errno)); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data", + opts->max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES, + sizeof(struct spdk_nvmf_rdma_request_data), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!rtransport->data_wr_pool) { + SPDK_ERRLOG("Unable to allocate work request pool for poll group\n"); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + i = 0; + rc = 0; + while (contexts[i] != NULL) { + device = calloc(1, sizeof(*device)); + if (!device) { + SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); + rc = -ENOMEM; + break; + } + device->context = contexts[i]; + rc = 
ibv_query_device(device->context, &device->attr); + if (rc < 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + free(device); + break; + + } + + max_device_sge = spdk_min(max_device_sge, device->attr.max_sge); + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) { + SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,"); + SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id); + } + + /** + * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE. + * The Soft-RoCE RXE driver does not currently support send with invalidate, + * but incorrectly reports that it does. There are changes making their way + * through the kernel now that will enable this feature. When they are merged, + * we can conditionally enable this feature. + * + * TODO: enable this for versions of the kernel rxe driver that support it. + */ + if (device->attr.vendor_id == 0) { + device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS); + } +#endif + + /* set up device context async ev fd as NON_BLOCKING */ + flag = fcntl(device->context->async_fd, F_GETFL); + rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); + if (rc < 0) { + SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); + free(device); + break; + } + + TAILQ_INSERT_TAIL(&rtransport->devices, device, link); + i++; + + if (g_nvmf_hooks.get_ibv_pd) { + device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context); + } else { + device->pd = ibv_alloc_pd(device->context); + } + + if (!device->pd) { + SPDK_ERRLOG("Unable to allocate protection domain.\n"); + rc = -ENOMEM; + break; + } + + assert(device->map == NULL); + + device->map = spdk_mem_map_alloc(0, &g_nvmf_rdma_map_ops, device->pd); + if (!device->map) { + SPDK_ERRLOG("Unable to allocate memory map for listen address\n"); + rc = -ENOMEM; + break; + } + + assert(device->map != NULL); + assert(device->pd != NULL); + } + rdma_free_devices(contexts); + + if (opts->io_unit_size * max_device_sge < opts->max_io_size) { + /* divide and round up. */ + opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge; + + /* round up to the nearest 4k. */ + opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK; + + opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE); + SPDK_NOTICELOG("Adjusting the io unit size to fit the device's maximum I/O size. 
New I/O unit size %u\n", + opts->io_unit_size); + } + + if (rc < 0) { + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + /* Set up poll descriptor array to monitor events from RDMA and IB + * in a single poll syscall + */ + rtransport->npoll_fds = i + 1; + i = 0; + rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); + if (rtransport->poll_fds == NULL) { + SPDK_ERRLOG("poll_fds allocation failed\n"); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->poll_fds[i].fd = rtransport->event_channel->fd; + rtransport->poll_fds[i++].events = POLLIN; + + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { + rtransport->poll_fds[i].fd = device->context->async_fd; + rtransport->poll_fds[i++].events = POLLIN; + } + + return &rtransport->transport; +} + +static int +nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_port *port, *port_tmp; + struct spdk_nvmf_rdma_device *device, *device_tmp; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { + TAILQ_REMOVE(&rtransport->ports, port, link); + rdma_destroy_id(port->id); + free(port); + } + + if (rtransport->poll_fds != NULL) { + free(rtransport->poll_fds); + } + + if (rtransport->event_channel != NULL) { + rdma_destroy_event_channel(rtransport->event_channel); + } + + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { + TAILQ_REMOVE(&rtransport->devices, device, link); + if (device->map) { + spdk_mem_map_free(&device->map); + } + if (device->pd) { + if (!g_nvmf_hooks.get_ibv_pd) { + ibv_dealloc_pd(device->pd); + } + } + free(device); + } + + if (rtransport->data_wr_pool != NULL) { + if (spdk_mempool_count(rtransport->data_wr_pool) != + (transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES)) { + SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n", + spdk_mempool_count(rtransport->data_wr_pool), + transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES); + } + } + + spdk_mempool_free(rtransport->data_wr_pool); + + pthread_mutex_destroy(&rtransport->lock); + free(rtransport); + + return 0; +} + +static int +nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, + struct spdk_nvme_transport_id *trid, + bool peer); + +static int +nvmf_rdma_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_port *port; + struct addrinfo *res; + struct addrinfo hints; + int family; + int rc; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + assert(rtransport->event_channel != NULL); + + pthread_mutex_lock(&rtransport->lock); + port = calloc(1, sizeof(*port)); + if (!port) { + SPDK_ERRLOG("Port allocation failed\n"); + pthread_mutex_unlock(&rtransport->lock); + return -ENOMEM; + } + + port->trid = trid; + + switch (trid->adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", trid->adrfam); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_flags = AI_NUMERICSERV; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + rc = getaddrinfo(trid->traddr, 
trid->trsvcid, &hints, &res); + if (rc) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); + if (rc < 0) { + SPDK_ERRLOG("rdma_create_id() failed\n"); + freeaddrinfo(res); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + rc = rdma_bind_addr(port->id, res->ai_addr); + freeaddrinfo(res); + + if (rc < 0) { + SPDK_ERRLOG("rdma_bind_addr() failed\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + if (!port->id->verbs) { + SPDK_ERRLOG("ibv_context is null\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -1; + } + + rc = rdma_listen(port->id, transport->opts.acceptor_backlog); + if (rc < 0) { + SPDK_ERRLOG("rdma_listen() failed\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + TAILQ_FOREACH(device, &rtransport->devices, link) { + if (device->context == port->id->verbs) { + port->device = device; + break; + } + } + if (!port->device) { + SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", + port->id->verbs); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + SPDK_NOTICELOG("*** NVMe/RDMA Target Listening on %s port %s ***\n", + trid->traddr, trid->trsvcid); + + TAILQ_INSERT_TAIL(&rtransport->ports, port, link); + pthread_mutex_unlock(&rtransport->lock); + return 0; +} + +static void +nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_port *port, *tmp; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + pthread_mutex_lock(&rtransport->lock); + TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { + if (spdk_nvme_transport_id_compare(port->trid, trid) == 0) { + TAILQ_REMOVE(&rtransport->ports, port, link); + rdma_destroy_id(port->id); + free(port); + break; + } + } + + pthread_mutex_unlock(&rtransport->lock); +} + +static void +nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_qpair *rqpair, bool drain) +{ + struct spdk_nvmf_request *req, *tmp; + struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; + struct spdk_nvmf_rdma_resources *resources; + + /* We process I/O in the data transfer pending queue at the highest priority. RDMA reads first */ + STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_read_queue, state_link, req_tmp) { + if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { + break; + } + } + + /* Then RDMA writes since reads have stronger restrictions than writes */ + STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_write_queue, state_link, req_tmp) { + if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { + break; + } + } + + /* The second highest priority is I/O waiting on memory buffers. 
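+ * Finally, newly arrived commands in the incoming queue are paired with free request structures and started below.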
*/ + STAILQ_FOREACH_SAFE(req, &rqpair->poller->group->group.pending_buf_queue, buf_link, tmp) { + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { + break; + } + } + + resources = rqpair->resources; + while (!STAILQ_EMPTY(&resources->free_queue) && !STAILQ_EMPTY(&resources->incoming_queue)) { + rdma_req = STAILQ_FIRST(&resources->free_queue); + STAILQ_REMOVE_HEAD(&resources->free_queue, state_link); + rdma_req->recv = STAILQ_FIRST(&resources->incoming_queue); + STAILQ_REMOVE_HEAD(&resources->incoming_queue, link); + + if (rqpair->srq != NULL) { + rdma_req->req.qpair = &rdma_req->recv->qpair->qpair; + rdma_req->recv->qpair->qd++; + } else { + rqpair->qd++; + } + + rdma_req->receive_tsc = rdma_req->recv->receive_tsc; + rdma_req->state = RDMA_REQUEST_STATE_NEW; + if (nvmf_rdma_request_process(rtransport, rdma_req) == false) { + break; + } + } + if (!STAILQ_EMPTY(&resources->incoming_queue) && STAILQ_EMPTY(&resources->free_queue)) { + rqpair->poller->stat.pending_free_request++; + } +} + +static void +_nvmf_rdma_qpair_disconnect(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); +} + +static void +_nvmf_rdma_try_disconnect(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + struct spdk_nvmf_poll_group *group; + + /* Read the group out of the qpair. This is normally set and accessed only from + * the thread that created the group. Here, we're not on that thread necessarily. + * The data member qpair->group begins it's life as NULL and then is assigned to + * a pointer and never changes. So fortunately reading this and checking for + * non-NULL is thread safe in the x86_64 memory model. */ + group = qpair->group; + + if (group == NULL) { + /* The qpair hasn't been assigned to a group yet, so we can't + * process a disconnect. Send a message to ourself and try again. */ + spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_try_disconnect, qpair); + return; + } + + spdk_thread_send_msg(group->thread, _nvmf_rdma_qpair_disconnect, qpair); +} + +static inline void +nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair) +{ + if (!__atomic_test_and_set(&rqpair->disconnect_started, __ATOMIC_RELAXED)) { + _nvmf_rdma_try_disconnect(&rqpair->qpair); + } +} + +static void nvmf_rdma_destroy_drained_qpair(void *ctx) +{ + struct spdk_nvmf_rdma_qpair *rqpair = ctx; + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, + struct spdk_nvmf_rdma_transport, transport); + + /* In non SRQ path, we will reach rqpair->max_queue_depth. In SRQ path, we will get the last_wqe event. 
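+ * In both cases every outstanding send must also have completed (current_send_depth == 0) before the qpair can be torn down.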
*/ + if (rqpair->current_send_depth != 0) { + return; + } + + if (rqpair->srq == NULL && rqpair->current_recv_depth != rqpair->max_queue_depth) { + return; + } + + if (rqpair->srq != NULL && rqpair->last_wqe_reached == false) { + return; + } + + nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); + + /* Qpair will be destroyed after nvmf layer closes this qpair */ + if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ERROR) { + return; + } + + nvmf_rdma_qpair_destroy(rqpair); +} + + +static int +nvmf_rdma_disconnect(struct rdma_cm_event *evt) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + + if (evt->id == NULL) { + SPDK_ERRLOG("disconnect request: missing cm_id\n"); + return -1; + } + + qpair = evt->id->context; + if (qpair == NULL) { + SPDK_ERRLOG("disconnect request: no active connection\n"); + return -1; + } + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); + + nvmf_rdma_start_disconnect(rqpair); + + return 0; +} + +#ifdef DEBUG +static const char *CM_EVENT_STR[] = { + "RDMA_CM_EVENT_ADDR_RESOLVED", + "RDMA_CM_EVENT_ADDR_ERROR", + "RDMA_CM_EVENT_ROUTE_RESOLVED", + "RDMA_CM_EVENT_ROUTE_ERROR", + "RDMA_CM_EVENT_CONNECT_REQUEST", + "RDMA_CM_EVENT_CONNECT_RESPONSE", + "RDMA_CM_EVENT_CONNECT_ERROR", + "RDMA_CM_EVENT_UNREACHABLE", + "RDMA_CM_EVENT_REJECTED", + "RDMA_CM_EVENT_ESTABLISHED", + "RDMA_CM_EVENT_DISCONNECTED", + "RDMA_CM_EVENT_DEVICE_REMOVAL", + "RDMA_CM_EVENT_MULTICAST_JOIN", + "RDMA_CM_EVENT_MULTICAST_ERROR", + "RDMA_CM_EVENT_ADDR_CHANGE", + "RDMA_CM_EVENT_TIMEWAIT_EXIT" +}; +#endif /* DEBUG */ + +static void +nvmf_rdma_disconnect_qpairs_on_port(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_port *port) +{ + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *rpoller; + struct spdk_nvmf_rdma_qpair *rqpair; + + TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) { + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) { + if (rqpair->listen_id == port->id) { + nvmf_rdma_start_disconnect(rqpair); + } + } + } + } +} + +static bool +nvmf_rdma_handle_cm_event_addr_change(struct spdk_nvmf_transport *transport, + struct rdma_cm_event *event) +{ + const struct spdk_nvme_transport_id *trid; + struct spdk_nvmf_rdma_port *port; + struct spdk_nvmf_rdma_transport *rtransport; + bool event_acked = false; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + TAILQ_FOREACH(port, &rtransport->ports, link) { + if (port->id == event->id) { + SPDK_ERRLOG("ADDR_CHANGE: IP %s:%s migrated\n", port->trid->traddr, port->trid->trsvcid); + rdma_ack_cm_event(event); + event_acked = true; + trid = port->trid; + break; + } + } + + if (event_acked) { + nvmf_rdma_disconnect_qpairs_on_port(rtransport, port); + + nvmf_rdma_stop_listen(transport, trid); + nvmf_rdma_listen(transport, trid); + } + + return event_acked; +} + +static void +nvmf_rdma_handle_cm_event_port_removal(struct spdk_nvmf_transport *transport, + struct rdma_cm_event *event) +{ + struct spdk_nvmf_rdma_port *port; + struct spdk_nvmf_rdma_transport *rtransport; + + port = event->id->context; + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + SPDK_NOTICELOG("Port %s:%s is being removed\n", port->trid->traddr, port->trid->trsvcid); + + nvmf_rdma_disconnect_qpairs_on_port(rtransport, port); + + rdma_ack_cm_event(event); + + while 
(spdk_nvmf_transport_stop_listen(transport, port->trid) == 0) { + ; + } +} + +static void +nvmf_process_cm_event(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct rdma_cm_event *event; + int rc; + bool event_acked; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + if (rtransport->event_channel == NULL) { + return; + } + + while (1) { + event_acked = false; + rc = rdma_get_cm_event(rtransport->event_channel, &event); + if (rc) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); + } + break; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); + + spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + case RDMA_CM_EVENT_ROUTE_ERROR: + /* No action required. The target never attempts to resolve routes. */ + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + rc = nvmf_rdma_connect(transport, event); + if (rc < 0) { + SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); + break; + } + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + /* The target never initiates a new connection. So this will not occur. */ + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + /* Can this happen? The docs say it can, but not sure what causes it. */ + break; + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + /* These only occur on the client side. */ + break; + case RDMA_CM_EVENT_ESTABLISHED: + /* TODO: Should we be waiting for this event anywhere? */ + break; + case RDMA_CM_EVENT_DISCONNECTED: + rc = nvmf_rdma_disconnect(event); + if (rc < 0) { + SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); + break; + } + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* In case of device removal, kernel IB part triggers IBV_EVENT_DEVICE_FATAL + * which triggers RDMA_CM_EVENT_DEVICE_REMOVAL on all cma_id’s. + * Once these events are sent to SPDK, we should release all IB resources and + * don't make attempts to call any ibv_query/modify/create functions. We can only call + * ibv_destory* functions to release user space memory allocated by IB. All kernel + * resources are already cleaned. */ + if (event->id->qp) { + /* If rdma_cm event has a valid `qp` pointer then the event refers to the + * corresponding qpair. Otherwise the event refers to a listening device */ + rc = nvmf_rdma_disconnect(event); + if (rc < 0) { + SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); + break; + } + } else { + nvmf_rdma_handle_cm_event_port_removal(transport, event); + event_acked = true; + } + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + /* Multicast is not used */ + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + event_acked = nvmf_rdma_handle_cm_event_addr_change(transport, event); + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* For now, do nothing. The target never re-uses queue pairs. 
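+ * There is therefore nothing to clean up when one of them exits the time-wait state.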
*/ + break; + default: + SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); + break; + } + if (!event_acked) { + rdma_ack_cm_event(event); + } + } +} + +static void +nvmf_rdma_handle_qp_fatal(struct spdk_nvmf_rdma_qpair *rqpair) +{ + nvmf_rdma_update_ibv_state(rqpair); + nvmf_rdma_start_disconnect(rqpair); +} + +static void +nvmf_rdma_handle_last_wqe_reached(struct spdk_nvmf_rdma_qpair *rqpair) +{ + rqpair->last_wqe_reached = true; + nvmf_rdma_destroy_drained_qpair(rqpair); +} + +static void +nvmf_rdma_handle_sq_drained(struct spdk_nvmf_rdma_qpair *rqpair) +{ + nvmf_rdma_start_disconnect(rqpair); +} + +static void +nvmf_rdma_qpair_process_ibv_event(void *ctx) +{ + struct spdk_nvmf_rdma_ibv_event_ctx *event_ctx = ctx; + + if (event_ctx->rqpair) { + STAILQ_REMOVE(&event_ctx->rqpair->ibv_events, event_ctx, spdk_nvmf_rdma_ibv_event_ctx, link); + if (event_ctx->cb_fn) { + event_ctx->cb_fn(event_ctx->rqpair); + } + } + free(event_ctx); +} + +static int +nvmf_rdma_send_qpair_async_event(struct spdk_nvmf_rdma_qpair *rqpair, + spdk_nvmf_rdma_qpair_ibv_event fn) +{ + struct spdk_nvmf_rdma_ibv_event_ctx *ctx; + struct spdk_thread *thr = NULL; + int rc; + + if (rqpair->qpair.group) { + thr = rqpair->qpair.group->thread; + } else if (rqpair->destruct_channel) { + thr = spdk_io_channel_get_thread(rqpair->destruct_channel); + } + + if (!thr) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "rqpair %p has no thread\n", rqpair); + return -EINVAL; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return -ENOMEM; + } + + ctx->rqpair = rqpair; + ctx->cb_fn = fn; + STAILQ_INSERT_TAIL(&rqpair->ibv_events, ctx, link); + + rc = spdk_thread_send_msg(thr, nvmf_rdma_qpair_process_ibv_event, ctx); + if (rc) { + STAILQ_REMOVE(&rqpair->ibv_events, ctx, spdk_nvmf_rdma_ibv_event_ctx, link); + free(ctx); + } + + return rc; +} + +static void +nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) +{ + int rc; + struct spdk_nvmf_rdma_qpair *rqpair = NULL; + struct ibv_async_event event; + + rc = ibv_get_async_event(device->context, &event); + + if (rc) { + SPDK_ERRLOG("Failed to get async_event (%d): %s\n", + errno, spdk_strerror(errno)); + return; + } + + switch (event.event_type) { + case IBV_EVENT_QP_FATAL: + rqpair = event.element.qp->qp_context; + SPDK_ERRLOG("Fatal event received for rqpair %p\n", rqpair); + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_qp_fatal); + if (rc) { + SPDK_WARNLOG("Failed to send QP_FATAL event. rqpair %p, err %d\n", rqpair, rc); + nvmf_rdma_handle_qp_fatal(rqpair); + } + break; + case IBV_EVENT_QP_LAST_WQE_REACHED: + /* This event only occurs for shared receive queues. */ + rqpair = event.element.qp->qp_context; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last WQE reached event received for rqpair %p\n", rqpair); + rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_last_wqe_reached); + if (rc) { + SPDK_WARNLOG("Failed to send LAST_WQE_REACHED event. rqpair %p, err %d\n", rqpair, rc); + rqpair->last_wqe_reached = true; + } + break; + case IBV_EVENT_SQ_DRAINED: + /* This event occurs frequently in both error and non-error states. + * Check if the qpair is in an error state before sending a message. 
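+ * Only a drain observed while the qpair is already in the error state (checked below via nvmf_rdma_update_ibv_state()) triggers the disconnect path.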
*/ + rqpair = event.element.qp->qp_context; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last sq drained event received for rqpair %p\n", rqpair); + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + if (nvmf_rdma_update_ibv_state(rqpair) == IBV_QPS_ERR) { + rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_sq_drained); + if (rc) { + SPDK_WARNLOG("Failed to send SQ_DRAINED event. rqpair %p, err %d\n", rqpair, rc); + nvmf_rdma_handle_sq_drained(rqpair); + } + } + break; + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + SPDK_NOTICELOG("Async event: %s\n", + ibv_event_type_str(event.event_type)); + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + nvmf_rdma_update_ibv_state(rqpair); + break; + case IBV_EVENT_CQ_ERR: + case IBV_EVENT_DEVICE_FATAL: + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_PORT_ERR: + case IBV_EVENT_LID_CHANGE: + case IBV_EVENT_PKEY_CHANGE: + case IBV_EVENT_SM_CHANGE: + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + case IBV_EVENT_CLIENT_REREGISTER: + case IBV_EVENT_GID_CHANGE: + default: + SPDK_NOTICELOG("Async event: %s\n", + ibv_event_type_str(event.event_type)); + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); + break; + } + ibv_ack_async_event(&event); +} + +static uint32_t +nvmf_rdma_accept(struct spdk_nvmf_transport *transport) +{ + int nfds, i = 0; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device, *tmp; + uint32_t count; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + count = nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); + + if (nfds <= 0) { + return 0; + } + + /* The first poll descriptor is RDMA CM event */ + if (rtransport->poll_fds[i++].revents & POLLIN) { + nvmf_process_cm_event(transport); + nfds--; + } + + if (nfds == 0) { + return count; + } + + /* Second and subsequent poll descriptors are IB async events */ + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { + if (rtransport->poll_fds[i++].revents & POLLIN) { + nvmf_process_ib_event(device); + nfds--; + } + } + /* check all flagged fd's have been served */ + assert(nfds == 0); + + return count; +} + +static void +nvmf_rdma_cdata_init(struct spdk_nvmf_transport *transport, struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr_data *cdata) +{ + cdata->nvmf_specific.msdbd = SPDK_NVMF_MAX_SGL_ENTRIES; + + /* Disable in-capsule data transfer for RDMA controller when dif_insert_or_strip is enabled + since in-capsule data only works with NVME drives that support SGL memory layout */ + if (transport->opts.dif_insert_or_strip) { + cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16; + } +} + +static void +nvmf_rdma_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + entry->trtype = SPDK_NVMF_TRTYPE_RDMA; + entry->adrfam = trid->adrfam; + entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED; + + spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); + spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); + + entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; + entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; + 
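+ /* The remaining TSAS field advertises RDMA CM as the connection management service, matching the rdma_cm listener set up in nvmf_rdma_listen(). */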
entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; +} + +static void +nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); + +static struct spdk_nvmf_transport_poll_group * +nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *poller; + struct spdk_nvmf_rdma_device *device; + struct ibv_srq_init_attr srq_init_attr; + struct spdk_nvmf_rdma_resource_opts opts; + int num_cqe; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + rgroup = calloc(1, sizeof(*rgroup)); + if (!rgroup) { + return NULL; + } + + TAILQ_INIT(&rgroup->pollers); + STAILQ_INIT(&rgroup->retired_bufs); + + pthread_mutex_lock(&rtransport->lock); + TAILQ_FOREACH(device, &rtransport->devices, link) { + poller = calloc(1, sizeof(*poller)); + if (!poller) { + SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); + nvmf_rdma_poll_group_destroy(&rgroup->group); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + + poller->device = device; + poller->group = rgroup; + + TAILQ_INIT(&poller->qpairs); + STAILQ_INIT(&poller->qpairs_pending_send); + STAILQ_INIT(&poller->qpairs_pending_recv); + + TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); + if (transport->opts.no_srq == false && device->num_srq < device->attr.max_srq) { + poller->max_srq_depth = transport->opts.max_srq_depth; + + device->num_srq++; + memset(&srq_init_attr, 0, sizeof(struct ibv_srq_init_attr)); + srq_init_attr.attr.max_wr = poller->max_srq_depth; + srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE); + poller->srq = ibv_create_srq(device->pd, &srq_init_attr); + if (!poller->srq) { + SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno); + nvmf_rdma_poll_group_destroy(&rgroup->group); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + + opts.qp = poller->srq; + opts.pd = device->pd; + opts.qpair = NULL; + opts.shared = true; + opts.max_queue_depth = poller->max_srq_depth; + opts.in_capsule_data_size = transport->opts.in_capsule_data_size; + + poller->resources = nvmf_rdma_resources_create(&opts); + if (!poller->resources) { + SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n"); + nvmf_rdma_poll_group_destroy(&rgroup->group); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + } + + /* + * When using an srq, we can limit the completion queue at startup. + * The following formula represents the calculation: + * num_cqe = num_recv + num_data_wr + num_send_wr. 
+ * where num_recv=num_data_wr=and num_send_wr=poller->max_srq_depth + */ + if (poller->srq) { + num_cqe = poller->max_srq_depth * 3; + } else { + num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE; + } + + poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0); + if (!poller->cq) { + SPDK_ERRLOG("Unable to create completion queue\n"); + nvmf_rdma_poll_group_destroy(&rgroup->group); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + poller->num_cqe = num_cqe; + } + + TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link); + if (rtransport->conn_sched.next_admin_pg == NULL) { + rtransport->conn_sched.next_admin_pg = rgroup; + rtransport->conn_sched.next_io_pg = rgroup; + } + + pthread_mutex_unlock(&rtransport->lock); + return &rgroup->group; +} + +static struct spdk_nvmf_transport_poll_group * +nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group **pg; + struct spdk_nvmf_transport_poll_group *result; + + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + + pthread_mutex_lock(&rtransport->lock); + + if (TAILQ_EMPTY(&rtransport->poll_groups)) { + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + + if (qpair->qid == 0) { + pg = &rtransport->conn_sched.next_admin_pg; + } else { + pg = &rtransport->conn_sched.next_io_pg; + } + + assert(*pg != NULL); + + result = &(*pg)->group; + + *pg = TAILQ_NEXT(*pg, link); + if (*pg == NULL) { + *pg = TAILQ_FIRST(&rtransport->poll_groups); + } + + pthread_mutex_unlock(&rtransport->lock); + + return result; +} + +static void +nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_rdma_poll_group *rgroup, *next_rgroup; + struct spdk_nvmf_rdma_poller *poller, *tmp; + struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; + struct spdk_nvmf_transport_pg_cache_buf *buf, *tmp_buf; + struct spdk_nvmf_rdma_transport *rtransport; + + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + if (!rgroup) { + return; + } + + /* free all retired buffers back to the transport so we don't short the mempool. */ + STAILQ_FOREACH_SAFE(buf, &rgroup->retired_bufs, link, tmp_buf) { + STAILQ_REMOVE(&rgroup->retired_bufs, buf, spdk_nvmf_transport_pg_cache_buf, link); + assert(group->transport != NULL); + spdk_mempool_put(group->transport->data_buf_pool, buf); + } + + TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { + TAILQ_REMOVE(&rgroup->pollers, poller, link); + + TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) { + nvmf_rdma_qpair_destroy(qpair); + } + + if (poller->srq) { + if (poller->resources) { + nvmf_rdma_resources_destroy(poller->resources); + } + ibv_destroy_srq(poller->srq); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Destroyed RDMA shared queue %p\n", poller->srq); + } + + if (poller->cq) { + ibv_destroy_cq(poller->cq); + } + + free(poller); + } + + if (rgroup->group.transport == NULL) { + /* Transport can be NULL when nvmf_rdma_poll_group_create() + * calls this function directly in a failure path. 
*/ + free(rgroup); + return; + } + + rtransport = SPDK_CONTAINEROF(rgroup->group.transport, struct spdk_nvmf_rdma_transport, transport); + + pthread_mutex_lock(&rtransport->lock); + next_rgroup = TAILQ_NEXT(rgroup, link); + TAILQ_REMOVE(&rtransport->poll_groups, rgroup, link); + if (next_rgroup == NULL) { + next_rgroup = TAILQ_FIRST(&rtransport->poll_groups); + } + if (rtransport->conn_sched.next_admin_pg == rgroup) { + rtransport->conn_sched.next_admin_pg = next_rgroup; + } + if (rtransport->conn_sched.next_io_pg == rgroup) { + rtransport->conn_sched.next_io_pg = next_rgroup; + } + pthread_mutex_unlock(&rtransport->lock); + + free(rgroup); +} + +static void +nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair) +{ + if (rqpair->cm_id != NULL) { + nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + } + nvmf_rdma_qpair_destroy(rqpair); +} + +static int +nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poller *poller; + int rc; + + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + device = rqpair->device; + + TAILQ_FOREACH(poller, &rgroup->pollers, link) { + if (poller->device == device) { + break; + } + } + + if (!poller) { + SPDK_ERRLOG("No poller found for device.\n"); + return -1; + } + + TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); + rqpair->poller = poller; + rqpair->srq = rqpair->poller->srq; + + rc = nvmf_rdma_qpair_initialize(qpair); + if (rc < 0) { + SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); + return -1; + } + + rc = nvmf_rdma_event_accept(rqpair->cm_id, rqpair); + if (rc) { + /* Try to reject, but we probably can't */ + nvmf_rdma_qpair_reject_connection(rqpair); + return -1; + } + + nvmf_rdma_update_ibv_state(rqpair); + + return 0; +} + +static int +nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + assert(group->transport->tgt != NULL); + + rqpair->destruct_channel = spdk_get_io_channel(group->transport->tgt); + + if (!rqpair->destruct_channel) { + SPDK_WARNLOG("failed to get io_channel, qpair %p\n", qpair); + return 0; + } + + /* Sanity check that we get io_channel on the correct thread */ + if (qpair->group) { + assert(qpair->group->thread == spdk_io_channel_get_thread(rqpair->destruct_channel)); + } + + return 0; +} + +static int +nvmf_rdma_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, + struct spdk_nvmf_rdma_qpair, qpair); + + /* + * AER requests are freed when a qpair is destroyed. The recv corresponding to that request + * needs to be returned to the shared receive queue or the poll group will eventually be + * starved of RECV structures. 
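+ * That is why the recv WR is re-posted to the SRQ below before the request itself is released.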
+ */ + if (rqpair->srq && rdma_req->recv) { + int rc; + struct ibv_recv_wr *bad_recv_wr; + + rc = ibv_post_srq_recv(rqpair->srq, &rdma_req->recv->wr, &bad_recv_wr); + if (rc) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + } + } + + _nvmf_rdma_request_free(rdma_req, rtransport); + return 0; +} + +static int +nvmf_rdma_request_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, + struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, + struct spdk_nvmf_rdma_qpair, qpair); + + if (rqpair->ibv_state != IBV_QPS_ERR) { + /* The connection is alive, so process the request as normal */ + rdma_req->state = RDMA_REQUEST_STATE_EXECUTED; + } else { + /* The connection is dead. Move the request directly to the completed state. */ + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + } + + nvmf_rdma_request_process(rtransport, rdma_req); + + return 0; +} + +static int +nvmf_rdma_destroy_defunct_qpair(void *ctx) +{ + struct spdk_nvmf_rdma_qpair *rqpair = ctx; + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, + struct spdk_nvmf_rdma_transport, transport); + + SPDK_INFOLOG(SPDK_LOG_RDMA, "QP#%d hasn't been drained as expected, manually destroy it\n", + rqpair->qpair.qid); + + nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); + nvmf_rdma_qpair_destroy(rqpair); + + return SPDK_POLLER_BUSY; +} + +static void +nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + if (rqpair->disconnect_flags & RDMA_QP_DISCONNECTING) { + return; + } + + rqpair->disconnect_flags |= RDMA_QP_DISCONNECTING; + + /* This happens only when the qpair is disconnected before + * it is added to the poll group. Since there is no poll group, + * the RDMA qp has not been initialized yet and the RDMA CM + * event has not yet been acknowledged, so we need to reject it. 
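+ * nvmf_rdma_qpair_reject_connection() below both sends the reject and destroys the qpair.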
+ */ + if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) { + nvmf_rdma_qpair_reject_connection(rqpair); + return; + } + + if (rqpair->rdma_qp) { + spdk_rdma_qp_disconnect(rqpair->rdma_qp); + } + + rqpair->destruct_poller = SPDK_POLLER_REGISTER(nvmf_rdma_destroy_defunct_qpair, (void *)rqpair, + NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US); +} + +static struct spdk_nvmf_rdma_qpair * +get_rdma_qpair_from_wc(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_wc *wc) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + /* @todo: improve QP search */ + TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) { + if (wc->qp_num == rqpair->rdma_qp->qp->qp_num) { + return rqpair; + } + } + SPDK_ERRLOG("Didn't find QP with qp_num %u\n", wc->qp_num); + return NULL; +} + +#ifdef DEBUG +static int +nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) +{ + return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || + rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; +} +#endif + +static void +_poller_reset_failed_recvs(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_recv_wr *bad_recv_wr, + int rc) +{ + struct spdk_nvmf_rdma_recv *rdma_recv; + struct spdk_nvmf_rdma_wr *bad_rdma_wr; + + SPDK_ERRLOG("Failed to post a recv for the poller %p with errno %d\n", rpoller, -rc); + while (bad_recv_wr != NULL) { + bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_recv_wr->wr_id; + rdma_recv = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); + + rdma_recv->qpair->current_recv_depth++; + bad_recv_wr = bad_recv_wr->next; + SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rdma_recv->qpair, -rc); + nvmf_rdma_start_disconnect(rdma_recv->qpair); + } +} + +static void +_qp_reset_failed_recvs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *bad_recv_wr, int rc) +{ + SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rqpair, -rc); + while (bad_recv_wr != NULL) { + bad_recv_wr = bad_recv_wr->next; + rqpair->current_recv_depth++; + } + nvmf_rdma_start_disconnect(rqpair); +} + +static void +_poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct ibv_recv_wr *bad_recv_wr; + int rc; + + if (rpoller->srq) { + if (rpoller->resources->recvs_to_post.first != NULL) { + rc = ibv_post_srq_recv(rpoller->srq, rpoller->resources->recvs_to_post.first, &bad_recv_wr); + if (rc) { + _poller_reset_failed_recvs(rpoller, bad_recv_wr, rc); + } + rpoller->resources->recvs_to_post.first = NULL; + rpoller->resources->recvs_to_post.last = NULL; + } + } else { + while (!STAILQ_EMPTY(&rpoller->qpairs_pending_recv)) { + rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_recv); + assert(rqpair->resources->recvs_to_post.first != NULL); + rc = ibv_post_recv(rqpair->rdma_qp->qp, rqpair->resources->recvs_to_post.first, &bad_recv_wr); + if (rc) { + _qp_reset_failed_recvs(rqpair, bad_recv_wr, rc); + } + rqpair->resources->recvs_to_post.first = NULL; + rqpair->resources->recvs_to_post.last = NULL; + STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_recv, recv_link); + } + } +} + +static void +_qp_reset_failed_sends(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc) +{ + struct spdk_nvmf_rdma_wr *bad_rdma_wr; + struct spdk_nvmf_rdma_request *prev_rdma_req = NULL, *cur_rdma_req = NULL; + + SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc); + for (; bad_wr != NULL; bad_wr = bad_wr->next) { + 
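+ /* Walk the chain starting at the first send WR that failed to post: each WR gives back one unit of send (and, for RDMA_READ, read) depth, and its owning request is transitioned out of its in-flight state. A request's data WRs and response WR are contiguous in this list, so a request already handled via an earlier WR is skipped. */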
bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id; + assert(rqpair->current_send_depth > 0); + rqpair->current_send_depth--; + switch (bad_rdma_wr->type) { + case RDMA_WR_TYPE_DATA: + cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); + if (bad_wr->opcode == IBV_WR_RDMA_READ) { + assert(rqpair->current_read_depth > 0); + rqpair->current_read_depth--; + } + break; + case RDMA_WR_TYPE_SEND: + cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); + break; + default: + SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair); + prev_rdma_req = cur_rdma_req; + continue; + } + + if (prev_rdma_req == cur_rdma_req) { + /* this request was handled by an earlier wr. i.e. we were performing an nvme read. */ + /* We only have to check against prev_wr since each requests wrs are contiguous in this list. */ + continue; + } + + switch (cur_rdma_req->state) { + case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + break; + case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: + case RDMA_REQUEST_STATE_COMPLETING: + cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + break; + default: + SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n", + cur_rdma_req->state, rqpair); + continue; + } + + nvmf_rdma_request_process(rtransport, cur_rdma_req); + prev_rdma_req = cur_rdma_req; + } + + if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { + /* Disconnect the connection. */ + nvmf_rdma_start_disconnect(rqpair); + } + +} + +static void +_poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct ibv_send_wr *bad_wr = NULL; + int rc; + + while (!STAILQ_EMPTY(&rpoller->qpairs_pending_send)) { + rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_send); + rc = spdk_rdma_qp_flush_send_wrs(rqpair->rdma_qp, &bad_wr); + + /* bad wr always points to the first wr that failed. */ + if (rc) { + _qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc); + } + STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_send, send_link); + } +} + +static int +nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller) +{ + struct ibv_wc wc[32]; + struct spdk_nvmf_rdma_wr *rdma_wr; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_rdma_recv *rdma_recv; + struct spdk_nvmf_rdma_qpair *rqpair; + int reaped, i; + int count = 0; + bool error = false; + uint64_t poll_tsc = spdk_get_ticks(); + + /* Poll for completing operations. */ + reaped = ibv_poll_cq(rpoller->cq, 32, wc); + if (reaped < 0) { + SPDK_ERRLOG("Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + return -1; + } + + rpoller->stat.polls++; + rpoller->stat.completions += reaped; + + for (i = 0; i < reaped; i++) { + + rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id; + + switch (rdma_wr->type) { + case RDMA_WR_TYPE_SEND: + rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + if (!wc[i].status) { + count++; + assert(wc[i].opcode == IBV_WC_SEND); + assert(nvmf_rdma_req_is_completing(rdma_req)); + } + + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + /* RDMA_WRITE operation completed. 
+1 since it was chained with rsp WR */ + rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1; + rdma_req->num_outstanding_data_wr = 0; + + nvmf_rdma_request_process(rtransport, rdma_req); + break; + case RDMA_WR_TYPE_RECV: + /* rdma_recv->qpair will be invalid if using an SRQ. In that case we have to get the qpair from the wc. */ + rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); + if (rpoller->srq != NULL) { + rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]); + /* It is possible that there are still some completions for destroyed QP + * associated with SRQ. We just ignore these late completions and re-post + * receive WRs back to SRQ. + */ + if (spdk_unlikely(NULL == rdma_recv->qpair)) { + struct ibv_recv_wr *bad_wr; + int rc; + + rdma_recv->wr.next = NULL; + rc = ibv_post_srq_recv(rpoller->srq, + &rdma_recv->wr, + &bad_wr); + if (rc) { + SPDK_ERRLOG("Failed to re-post recv WR to SRQ, err %d\n", rc); + } + continue; + } + } + rqpair = rdma_recv->qpair; + + assert(rqpair != NULL); + if (!wc[i].status) { + assert(wc[i].opcode == IBV_WC_RECV); + if (rqpair->current_recv_depth >= rqpair->max_queue_depth) { + nvmf_rdma_start_disconnect(rqpair); + break; + } + } + + rdma_recv->wr.next = NULL; + rqpair->current_recv_depth++; + rdma_recv->receive_tsc = poll_tsc; + rpoller->stat.requests++; + STAILQ_INSERT_TAIL(&rqpair->resources->incoming_queue, rdma_recv, link); + break; + case RDMA_WR_TYPE_DATA: + rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(rdma_req->num_outstanding_data_wr > 0); + + rqpair->current_send_depth--; + rdma_req->num_outstanding_data_wr--; + if (!wc[i].status) { + assert(wc[i].opcode == IBV_WC_RDMA_READ); + rqpair->current_read_depth--; + /* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */ + if (rdma_req->num_outstanding_data_wr == 0) { + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; + nvmf_rdma_request_process(rtransport, rdma_req); + } + } else { + /* If the data transfer fails still force the queue into the error state, + * if we were performing an RDMA_READ, we need to force the request into a + * completed state since it wasn't linked to a send. However, in the RDMA_WRITE + * case, we should wait for the SEND to complete. */ + if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) { + rqpair->current_read_depth--; + if (rdma_req->num_outstanding_data_wr == 0) { + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + } + } + } + break; + default: + SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); + continue; + } + + /* Handle error conditions */ + if (wc[i].status) { + if ((rdma_wr->type == RDMA_WR_TYPE_RECV && !rpoller->srq)) { + /* When we don't use SRQ and close a qpair, we will receive completions with error + * status for all posted ibv_recv_wrs. This is expected and we don't want to log + * an error in that case. */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Error on CQ %p, request 0x%lu, type %d, status: (%d): %s\n", + rpoller->cq, wc[i].wr_id, rdma_wr->type, wc[i].status, ibv_wc_status_str(wc[i].status)); + } else { + SPDK_ERRLOG("Error on CQ %p, request 0x%lu, type %d, status: (%d): %s\n", + rpoller->cq, wc[i].wr_id, rdma_wr->type, wc[i].status, ibv_wc_status_str(wc[i].status)); + } + + error = true; + + if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { + /* Disconnect the connection. 
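+ * A qpair that is already out of the ACTIVE state is instead handed to nvmf_rdma_destroy_drained_qpair() in the else branch.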
*/ + nvmf_rdma_start_disconnect(rqpair); + } else { + nvmf_rdma_destroy_drained_qpair(rqpair); + } + continue; + } + + nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); + + if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + nvmf_rdma_destroy_drained_qpair(rqpair); + } + } + + if (error == true) { + return -1; + } + + /* submit outstanding work requests. */ + _poller_submit_recvs(rtransport, rpoller); + _poller_submit_sends(rtransport, rpoller); + + return count; +} + +static int +nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *rpoller; + int count, rc; + + rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + + count = 0; + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + rc = nvmf_rdma_poller_poll(rtransport, rpoller); + if (rc < 0) { + return rc; + } + count += rc; + } + + return count; +} + +static int +nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, + struct spdk_nvme_transport_id *trid, + bool peer) +{ + struct sockaddr *saddr; + uint16_t port; + + spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_RDMA); + + if (peer) { + saddr = rdma_get_peer_addr(id); + } else { + saddr = rdma_get_local_addr(id); + } + switch (saddr->sa_family) { + case AF_INET: { + struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; + + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + inet_ntop(AF_INET, &saddr_in->sin_addr, + trid->traddr, sizeof(trid->traddr)); + if (peer) { + port = ntohs(rdma_get_dst_port(id)); + } else { + port = ntohs(rdma_get_src_port(id)); + } + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); + break; + } + case AF_INET6: { + struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; + trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; + inet_ntop(AF_INET6, &saddr_in->sin6_addr, + trid->traddr, sizeof(trid->traddr)); + if (peer) { + port = ntohs(rdma_get_dst_port(id)); + } else { + port = ntohs(rdma_get_src_port(id)); + } + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); + break; + } + default: + return -1; + + } + + return 0; +} + +static int +nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); +} + +static int +nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); +} + +static int +nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); +} + +void +spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) +{ + g_nvmf_hooks = *hooks; +} + +static void +nvmf_rdma_request_set_abort_status(struct spdk_nvmf_request *req, + struct spdk_nvmf_rdma_request *rdma_req_to_abort) +{ + rdma_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + rdma_req_to_abort->req.rsp->nvme_cpl.status.sc = 
SPDK_NVME_SC_ABORTED_BY_REQUEST; + + rdma_req_to_abort->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + + req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command was successfully aborted. */ +} + +static int +_nvmf_rdma_qpair_abort_request(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_rdma_request *rdma_req_to_abort = SPDK_CONTAINEROF( + req->req_to_abort, struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair, + struct spdk_nvmf_rdma_qpair, qpair); + int rc; + + spdk_poller_unregister(&req->poller); + + switch (rdma_req_to_abort->state) { + case RDMA_REQUEST_STATE_EXECUTING: + rc = nvmf_ctrlr_abort_request(req); + if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) { + return SPDK_POLLER_BUSY; + } + break; + + case RDMA_REQUEST_STATE_NEED_BUFFER: + STAILQ_REMOVE(&rqpair->poller->group->group.pending_buf_queue, + &rdma_req_to_abort->req, spdk_nvmf_request, buf_link); + + nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort); + break; + + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: + STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req_to_abort, + spdk_nvmf_rdma_request, state_link); + + nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort); + break; + + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: + STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req_to_abort, + spdk_nvmf_rdma_request, state_link); + + nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort); + break; + + case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + if (spdk_get_ticks() < req->timeout_tsc) { + req->poller = SPDK_POLLER_REGISTER(_nvmf_rdma_qpair_abort_request, req, 0); + return SPDK_POLLER_BUSY; + } + break; + + default: + break; + } + + spdk_nvmf_request_complete(req); + return SPDK_POLLER_BUSY; +} + +static void +nvmf_rdma_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_transport *transport; + uint16_t cid; + uint32_t i; + struct spdk_nvmf_rdma_request *rdma_req_to_abort = NULL; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + transport = &rtransport->transport; + + cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; + + for (i = 0; i < rqpair->max_queue_depth; i++) { + rdma_req_to_abort = &rqpair->resources->reqs[i]; + + if (rdma_req_to_abort->state != RDMA_REQUEST_STATE_FREE && + rdma_req_to_abort->req.cmd->nvme_cmd.cid == cid) { + break; + } + } + + if (rdma_req_to_abort == NULL) { + spdk_nvmf_request_complete(req); + return; + } + + req->req_to_abort = &rdma_req_to_abort->req; + req->timeout_tsc = spdk_get_ticks() + + transport->opts.abort_timeout_sec * spdk_get_ticks_hz(); + req->poller = NULL; + + _nvmf_rdma_qpair_abort_request(req); +} + +static int +nvmf_rdma_poll_group_get_stat(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_transport_poll_group_stat **stat) +{ + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_transport_poll_group *tgroup; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *rpoller; + struct spdk_nvmf_rdma_device_stat *device_stat; + uint64_t num_devices = 0; + + if (tgt == NULL || stat == NULL) { + return -EINVAL; + } + + ch = spdk_get_io_channel(tgt); + group = spdk_io_channel_get_ctx(ch);; + spdk_put_io_channel(ch); + TAILQ_FOREACH(tgroup, 
&group->tgroups, link) { + if (SPDK_NVME_TRANSPORT_RDMA == tgroup->transport->ops->type) { + *stat = calloc(1, sizeof(struct spdk_nvmf_transport_poll_group_stat)); + if (!*stat) { + SPDK_ERRLOG("Failed to allocate memory for NVMf RDMA statistics\n"); + return -ENOMEM; + } + (*stat)->trtype = SPDK_NVME_TRANSPORT_RDMA; + + rgroup = SPDK_CONTAINEROF(tgroup, struct spdk_nvmf_rdma_poll_group, group); + /* Count devices to allocate enough memory */ + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + ++num_devices; + } + (*stat)->rdma.devices = calloc(num_devices, sizeof(struct spdk_nvmf_rdma_device_stat)); + if (!(*stat)->rdma.devices) { + SPDK_ERRLOG("Failed to allocate NVMf RDMA devices statistics\n"); + free(*stat); + return -ENOMEM; + } + + (*stat)->rdma.pending_data_buffer = rgroup->stat.pending_data_buffer; + (*stat)->rdma.num_devices = num_devices; + num_devices = 0; + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + device_stat = &(*stat)->rdma.devices[num_devices++]; + device_stat->name = ibv_get_device_name(rpoller->device->context->device); + device_stat->polls = rpoller->stat.polls; + device_stat->completions = rpoller->stat.completions; + device_stat->requests = rpoller->stat.requests; + device_stat->request_latency = rpoller->stat.request_latency; + device_stat->pending_free_request = rpoller->stat.pending_free_request; + device_stat->pending_rdma_read = rpoller->stat.pending_rdma_read; + device_stat->pending_rdma_write = rpoller->stat.pending_rdma_write; + } + return 0; + } + } + return -ENOENT; +} + +static void +nvmf_rdma_poll_group_free_stat(struct spdk_nvmf_transport_poll_group_stat *stat) +{ + if (stat) { + free(stat->rdma.devices); + } + free(stat); +} + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { + .name = "RDMA", + .type = SPDK_NVME_TRANSPORT_RDMA, + .opts_init = nvmf_rdma_opts_init, + .create = nvmf_rdma_create, + .destroy = nvmf_rdma_destroy, + + .listen = nvmf_rdma_listen, + .stop_listen = nvmf_rdma_stop_listen, + .accept = nvmf_rdma_accept, + .cdata_init = nvmf_rdma_cdata_init, + + .listener_discover = nvmf_rdma_discover, + + .poll_group_create = nvmf_rdma_poll_group_create, + .get_optimal_poll_group = nvmf_rdma_get_optimal_poll_group, + .poll_group_destroy = nvmf_rdma_poll_group_destroy, + .poll_group_add = nvmf_rdma_poll_group_add, + .poll_group_remove = nvmf_rdma_poll_group_remove, + .poll_group_poll = nvmf_rdma_poll_group_poll, + + .req_free = nvmf_rdma_request_free, + .req_complete = nvmf_rdma_request_complete, + + .qpair_fini = nvmf_rdma_close_qpair, + .qpair_get_peer_trid = nvmf_rdma_qpair_get_peer_trid, + .qpair_get_local_trid = nvmf_rdma_qpair_get_local_trid, + .qpair_get_listen_trid = nvmf_rdma_qpair_get_listen_trid, + .qpair_abort_request = nvmf_rdma_qpair_abort_request, + + .poll_group_get_stat = nvmf_rdma_poll_group_get_stat, + .poll_group_free_stat = nvmf_rdma_poll_group_free_stat, +}; + +SPDK_NVMF_TRANSPORT_REGISTER(rdma, &spdk_nvmf_transport_rdma); +SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) diff --git a/src/spdk/lib/nvmf/spdk_nvmf.map b/src/spdk/lib/nvmf/spdk_nvmf.map new file mode 100644 index 000000000..994e7437b --- /dev/null +++ b/src/spdk/lib/nvmf/spdk_nvmf.map @@ -0,0 +1,118 @@ +{ + global: + + # public functions in nvmf.h + spdk_nvmf_tgt_create; + spdk_nvmf_tgt_destroy; + spdk_nvmf_tgt_get_name; + spdk_nvmf_get_tgt; + spdk_nvmf_get_first_tgt; + spdk_nvmf_get_next_tgt; + spdk_nvmf_tgt_write_config_json; + spdk_nvmf_tgt_listen; + spdk_nvmf_tgt_stop_listen; + spdk_nvmf_tgt_accept; + spdk_nvmf_poll_group_create; + 
spdk_nvmf_get_optimal_poll_group; + spdk_nvmf_poll_group_destroy; + spdk_nvmf_poll_group_add; + spdk_nvmf_poll_group_get_stat; + spdk_nvmf_qpair_disconnect; + spdk_nvmf_qpair_get_peer_trid; + spdk_nvmf_qpair_get_local_trid; + spdk_nvmf_qpair_get_listen_trid; + spdk_nvmf_subsystem_create; + spdk_nvmf_subsystem_destroy; + spdk_nvmf_subsystem_start; + spdk_nvmf_subsystem_stop; + spdk_nvmf_subsystem_pause; + spdk_nvmf_subsystem_resume; + spdk_nvmf_tgt_find_subsystem; + spdk_nvmf_subsystem_get_first; + spdk_nvmf_subsystem_get_next; + spdk_nvmf_subsystem_add_host; + spdk_nvmf_subsystem_remove_host; + spdk_nvmf_subsystem_set_allow_any_host; + spdk_nvmf_subsystem_get_allow_any_host; + spdk_nvmf_subsystem_host_allowed; + spdk_nvmf_subsystem_get_first_host; + spdk_nvmf_subsystem_get_next_host; + spdk_nvmf_host_get_nqn; + spdk_nvmf_subsystem_add_listener; + spdk_nvmf_subsystem_remove_listener; + spdk_nvmf_subsystem_listener_allowed; + spdk_nvmf_subsystem_get_first_listener; + spdk_nvmf_subsystem_get_next_listener; + spdk_nvmf_subsystem_listener_get_trid; + spdk_nvmf_subsystem_allow_any_listener; + spdk_nvmf_subsytem_any_listener_allowed; + spdk_nvmf_ns_opts_get_defaults; + spdk_nvmf_subsystem_add_ns; + spdk_nvmf_subsystem_remove_ns; + spdk_nvmf_subsystem_get_first_ns; + spdk_nvmf_subsystem_get_next_ns; + spdk_nvmf_subsystem_get_ns; + spdk_nvmf_subsystem_get_max_namespaces; + spdk_nvmf_ns_get_id; + spdk_nvmf_ns_get_bdev; + spdk_nvmf_ns_get_opts; + spdk_nvmf_subsystem_get_sn; + spdk_nvmf_subsystem_set_sn; + spdk_nvmf_subsystem_get_mn; + spdk_nvmf_subsystem_set_mn; + spdk_nvmf_subsystem_get_nqn; + spdk_nvmf_subsystem_get_type; + spdk_nvmf_subsystem_get_max_nsid; + spdk_nvmf_transport_opts_init; + spdk_nvmf_transport_create; + spdk_nvmf_transport_destroy; + spdk_nvmf_tgt_get_transport; + spdk_nvmf_transport_get_first; + spdk_nvmf_transport_get_next; + spdk_nvmf_get_transport_opts; + spdk_nvmf_get_transport_type; + spdk_nvmf_get_transport_name; + spdk_nvmf_tgt_add_transport; + spdk_nvmf_transport_listen; + spdk_nvmf_transport_stop_listen; + spdk_nvmf_transport_poll_group_get_stat; + spdk_nvmf_transport_poll_group_free_stat; + spdk_nvmf_rdma_init_hooks; + + # public functions in nvmf_cmd.h + spdk_nvmf_ctrlr_identify_ctrlr; + spdk_nvmf_ctrlr_identify_ns; + spdk_nvmf_set_custom_admin_cmd_hdlr; + spdk_nvmf_set_passthru_admin_cmd; + spdk_nvmf_bdev_ctrlr_nvme_passthru_admin; + spdk_nvmf_request_get_bdev; + spdk_nvmf_request_get_ctrlr; + spdk_nvmf_request_get_subsystem; + spdk_nvmf_request_get_data; + spdk_nvmf_request_get_cmd; + spdk_nvmf_request_get_response; + spdk_nvmf_request_get_req_to_abort; + spdk_nvmf_bdev_ctrlr_abort_cmd; + + # public functions in nvmf_transport.h + spdk_nvmf_transport_register; + spdk_nvmf_tgt_new_qpair; + spdk_nvmf_ctrlr_connect; + spdk_nvmf_ctrlr_data_init; + spdk_nvmf_ctrlr_get_regs; + spdk_nvmf_request_free_buffers; + spdk_nvmf_request_get_buffers; + spdk_nvmf_request_get_buffers_multi; + spdk_nvmf_request_get_dif_ctx; + spdk_nvmf_request_exec; + spdk_nvmf_request_exec_fabrics; + spdk_nvmf_request_free; + spdk_nvmf_request_complete; + spdk_nvmf_ctrlr_get_subsystem; + spdk_nvmf_ctrlr_get_id; + spdk_nvmf_req_get_xfer; + spdk_nvmf_poll_group_remove; + + + local: *; +}; diff --git a/src/spdk/lib/nvmf/subsystem.c b/src/spdk/lib/nvmf/subsystem.c new file mode 100644 index 000000000..ebe8d9a8e --- /dev/null +++ b/src/spdk/lib/nvmf/subsystem.c @@ -0,0 +1,2515 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. 
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/nvmf_spec.h" +#include "spdk/uuid.h" +#include "spdk/json.h" +#include "spdk/file.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk_internal/utf.h" + +#define MODEL_NUMBER_DEFAULT "SPDK bdev Controller" + +/* + * States for parsing valid domains in NQNs according to RFC 1034 + */ +enum spdk_nvmf_nqn_domain_states { + /* First character of a domain must be a letter */ + SPDK_NVMF_DOMAIN_ACCEPT_LETTER = 0, + + /* Subsequent characters can be any of letter, digit, or hyphen */ + SPDK_NVMF_DOMAIN_ACCEPT_LDH = 1, + + /* A domain label must end with either a letter or digit */ + SPDK_NVMF_DOMAIN_ACCEPT_ANY = 2 +}; + +/* Returns true if is a valid ASCII string as defined by the NVMe spec */ +static bool +nvmf_valid_ascii_string(const void *buf, size_t size) +{ + const uint8_t *str = buf; + size_t i; + + for (i = 0; i < size; i++) { + if (str[i] < 0x20 || str[i] > 0x7E) { + return false; + } + } + + return true; +} + +static bool +nvmf_valid_nqn(const char *nqn) +{ + size_t len; + struct spdk_uuid uuid_value; + uint32_t i; + int bytes_consumed; + uint32_t domain_label_length; + char *reverse_domain_end; + uint32_t reverse_domain_end_index; + enum spdk_nvmf_nqn_domain_states domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER; + + /* Check for length requirements */ + len = strlen(nqn); + if (len > SPDK_NVMF_NQN_MAX_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": length %zu > max %d\n", nqn, len, SPDK_NVMF_NQN_MAX_LEN); + return false; + } + + /* The nqn must be at least as long as SPDK_NVMF_NQN_MIN_LEN to contain the necessary prefix. 
*/ + if (len < SPDK_NVMF_NQN_MIN_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": length %zu < min %d\n", nqn, len, SPDK_NVMF_NQN_MIN_LEN); + return false; + } + + /* Check for discovery controller nqn */ + if (!strcmp(nqn, SPDK_NVMF_DISCOVERY_NQN)) { + return true; + } + + /* Check for equality with the generic nqn structure of the form "nqn.2014-08.org.nvmexpress:uuid:11111111-2222-3333-4444-555555555555" */ + if (!strncmp(nqn, SPDK_NVMF_NQN_UUID_PRE, SPDK_NVMF_NQN_UUID_PRE_LEN)) { + if (len != SPDK_NVMF_NQN_UUID_PRE_LEN + SPDK_NVMF_UUID_STRING_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not the correct length\n", nqn); + return false; + } + + if (spdk_uuid_parse(&uuid_value, &nqn[SPDK_NVMF_NQN_UUID_PRE_LEN])) { + SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not formatted correctly\n", nqn); + return false; + } + return true; + } + + /* If the nqn does not match the uuid structure, the next several checks validate the form "nqn.yyyy-mm.reverse.domain:user-string" */ + + if (strncmp(nqn, "nqn.", 4) != 0) { + SPDK_ERRLOG("Invalid NQN \"%s\": NQN must begin with \"nqn.\".\n", nqn); + return false; + } + + /* Check for yyyy-mm. */ + if (!(isdigit(nqn[4]) && isdigit(nqn[5]) && isdigit(nqn[6]) && isdigit(nqn[7]) && + nqn[8] == '-' && isdigit(nqn[9]) && isdigit(nqn[10]) && nqn[11] == '.')) { + SPDK_ERRLOG("Invalid date code in NQN \"%s\"\n", nqn); + return false; + } + + reverse_domain_end = strchr(nqn, ':'); + if (reverse_domain_end != NULL && (reverse_domain_end_index = reverse_domain_end - nqn) < len - 1) { + } else { + SPDK_ERRLOG("Invalid NQN \"%s\". NQN must contain user specified name with a ':' as a prefix.\n", + nqn); + return false; + } + + /* Check for valid reverse domain */ + domain_label_length = 0; + for (i = 12; i < reverse_domain_end_index; i++) { + if (domain_label_length > SPDK_DOMAIN_LABEL_MAX_LEN) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". At least one Label is too long.\n", nqn); + return false; + } + + switch (domain_state) { + + case SPDK_NVMF_DOMAIN_ACCEPT_LETTER: { + if (isalpha(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must start with a letter.\n", nqn); + return false; + } + } + + case SPDK_NVMF_DOMAIN_ACCEPT_LDH: { + if (isalpha(nqn[i]) || isdigit(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else if (nqn[i] == '-') { + if (i == reverse_domain_end_index - 1) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH; + domain_label_length++; + break; + } else if (nqn[i] == '.') { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n", + nqn); + return false; + } + } + + case SPDK_NVMF_DOMAIN_ACCEPT_ANY: { + if (isalpha(nqn[i]) || isdigit(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else if (nqn[i] == '-') { + if (i == reverse_domain_end_index - 1) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". 
Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH; + domain_label_length++; + break; + } else if (nqn[i] == '.') { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER; + domain_label_length = 0; + break; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n", + nqn); + return false; + } + } + } + } + + i = reverse_domain_end_index + 1; + while (i < len) { + bytes_consumed = utf8_valid(&nqn[i], &nqn[len]); + if (bytes_consumed <= 0) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only valid utf-8.\n", nqn); + return false; + } + + i += bytes_consumed; + } + return true; +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_create(struct spdk_nvmf_tgt *tgt, + const char *nqn, + enum spdk_nvmf_subtype type, + uint32_t num_ns) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + if (spdk_nvmf_tgt_find_subsystem(tgt, nqn)) { + SPDK_ERRLOG("Subsystem NQN '%s' already exists\n", nqn); + return NULL; + } + + if (!nvmf_valid_nqn(nqn)) { + return NULL; + } + + if (type == SPDK_NVMF_SUBTYPE_DISCOVERY && num_ns != 0) { + SPDK_ERRLOG("Discovery subsystem cannot have namespaces.\n"); + return NULL; + } + + /* Find a free subsystem id (sid) */ + for (sid = 0; sid < tgt->max_subsystems; sid++) { + if (tgt->subsystems[sid] == NULL) { + break; + } + } + if (sid >= tgt->max_subsystems) { + return NULL; + } + + subsystem = calloc(1, sizeof(struct spdk_nvmf_subsystem)); + if (subsystem == NULL) { + return NULL; + } + + subsystem->thread = spdk_get_thread(); + subsystem->state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + subsystem->tgt = tgt; + subsystem->id = sid; + subsystem->subtype = type; + subsystem->max_nsid = num_ns; + subsystem->max_allowed_nsid = num_ns; + subsystem->next_cntlid = 0; + snprintf(subsystem->subnqn, sizeof(subsystem->subnqn), "%s", nqn); + TAILQ_INIT(&subsystem->listeners); + TAILQ_INIT(&subsystem->hosts); + TAILQ_INIT(&subsystem->ctrlrs); + + if (num_ns != 0) { + subsystem->ns = calloc(num_ns, sizeof(struct spdk_nvmf_ns *)); + if (subsystem->ns == NULL) { + SPDK_ERRLOG("Namespace memory allocation failed\n"); + free(subsystem); + return NULL; + } + } + + memset(subsystem->sn, '0', sizeof(subsystem->sn) - 1); + subsystem->sn[sizeof(subsystem->sn) - 1] = '\0'; + + snprintf(subsystem->mn, sizeof(subsystem->mn), "%s", + MODEL_NUMBER_DEFAULT); + + tgt->subsystems[sid] = subsystem; + tgt->discovery_genctr++; + + return subsystem; +} + +static void +nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_host *host) +{ + TAILQ_REMOVE(&subsystem->hosts, host, link); + free(host); +} + +static void +_nvmf_subsystem_remove_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_subsystem_listener *listener, + bool stop) +{ + struct spdk_nvmf_transport *transport; + + if (stop) { + transport = spdk_nvmf_tgt_get_transport(subsystem->tgt, listener->trid->trstring); + if (transport != NULL) { + spdk_nvmf_transport_stop_listen(transport, listener->trid); + } + } + + TAILQ_REMOVE(&subsystem->listeners, listener, link); + free(listener); +} + +void +spdk_nvmf_subsystem_destroy(struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_host *host, *host_tmp; + struct spdk_nvmf_ctrlr *ctrlr, *ctrlr_tmp; + struct spdk_nvmf_ns *ns; + + if (!subsystem) { + return; + } + + assert(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "subsystem is %p\n", subsystem); + + 
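+	/*
+	 * Editorial note on the teardown that follows (not in the original patch):
+	 * listeners and allowed hosts are dropped first, any remaining controllers
+	 * are destructed, and each namespace is removed (releasing its bdev claim
+	 * and closing its descriptor) before the ns array, the target's subsystem
+	 * slot, and the subsystem structure itself are freed.
+	 */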
nvmf_subsystem_remove_all_listeners(subsystem, false); + + TAILQ_FOREACH_SAFE(host, &subsystem->hosts, link, host_tmp) { + nvmf_subsystem_remove_host(subsystem, host); + } + + TAILQ_FOREACH_SAFE(ctrlr, &subsystem->ctrlrs, link, ctrlr_tmp) { + nvmf_ctrlr_destruct(ctrlr); + } + + ns = spdk_nvmf_subsystem_get_first_ns(subsystem); + while (ns != NULL) { + struct spdk_nvmf_ns *next_ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns); + + spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid); + ns = next_ns; + } + + free(subsystem->ns); + + subsystem->tgt->subsystems[subsystem->id] = NULL; + subsystem->tgt->discovery_genctr++; + + free(subsystem); +} + +static int +nvmf_subsystem_set_state(struct spdk_nvmf_subsystem *subsystem, + enum spdk_nvmf_subsystem_state state) +{ + enum spdk_nvmf_subsystem_state actual_old_state, expected_old_state; + bool exchanged; + + switch (state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + expected_old_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVATING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_PAUSING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSING; + break; + case SPDK_NVMF_SUBSYSTEM_RESUMING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSED; + break; + case SPDK_NVMF_SUBSYSTEM_DEACTIVATING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + break; + default: + assert(false); + return -1; + } + + actual_old_state = expected_old_state; + exchanged = __atomic_compare_exchange_n(&subsystem->state, &actual_old_state, state, false, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); + if (spdk_unlikely(exchanged == false)) { + if (actual_old_state == SPDK_NVMF_SUBSYSTEM_RESUMING && + state == SPDK_NVMF_SUBSYSTEM_ACTIVE) { + expected_old_state = SPDK_NVMF_SUBSYSTEM_RESUMING; + } + /* This is for the case when activating the subsystem fails. 
*/ + if (actual_old_state == SPDK_NVMF_SUBSYSTEM_ACTIVATING && + state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING) { + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + } + actual_old_state = expected_old_state; + __atomic_compare_exchange_n(&subsystem->state, &actual_old_state, state, false, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); + } + assert(actual_old_state == expected_old_state); + return actual_old_state - expected_old_state; +} + +struct subsystem_state_change_ctx { + struct spdk_nvmf_subsystem *subsystem; + + enum spdk_nvmf_subsystem_state requested_state; + + spdk_nvmf_subsystem_state_change_done cb_fn; + void *cb_arg; +}; + +static void +subsystem_state_change_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_state_change_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (status == 0) { + status = nvmf_subsystem_set_state(ctx->subsystem, ctx->requested_state); + if (status) { + status = -1; + } + } + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status); + } + free(ctx); +} + +static void +subsystem_state_change_continue(void *ctx, int status) +{ + struct spdk_io_channel_iter *i = ctx; + spdk_for_each_channel_continue(i, status); +} + +static void +subsystem_state_change_on_pg(struct spdk_io_channel_iter *i) +{ + struct subsystem_state_change_ctx *ctx; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + + ctx = spdk_io_channel_iter_get_ctx(i); + ch = spdk_io_channel_iter_get_channel(i); + group = spdk_io_channel_get_ctx(ch); + + switch (ctx->requested_state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + nvmf_poll_group_remove_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_ACTIVATING) { + nvmf_poll_group_add_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + } else if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_RESUMING) { + nvmf_poll_group_resume_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + } + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + nvmf_poll_group_pause_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + break; + default: + assert(false); + break; + } +} + +static int +nvmf_subsystem_state_change(struct spdk_nvmf_subsystem *subsystem, + enum spdk_nvmf_subsystem_state requested_state, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + struct subsystem_state_change_ctx *ctx; + enum spdk_nvmf_subsystem_state intermediate_state; + int rc; + + switch (requested_state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + intermediate_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + if (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED) { + intermediate_state = SPDK_NVMF_SUBSYSTEM_RESUMING; + } else { + intermediate_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + } + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + intermediate_state = SPDK_NVMF_SUBSYSTEM_PAUSING; + break; + default: + assert(false); + return -EINVAL; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return -ENOMEM; + } + + rc = nvmf_subsystem_set_state(subsystem, intermediate_state); + if (rc) { + free(ctx); + return rc; + } + + ctx->subsystem = subsystem; + ctx->requested_state = requested_state; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(subsystem->tgt, + subsystem_state_change_on_pg, + ctx, + subsystem_state_change_done); + + return 0; +} + +int +spdk_nvmf_subsystem_start(struct spdk_nvmf_subsystem *subsystem, + 
spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_stop(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_INACTIVE, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_pause(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_PAUSED, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_resume(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg); +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_get_first(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + for (sid = 0; sid < tgt->max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem) { + return subsystem; + } + } + + return NULL; +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_get_next(struct spdk_nvmf_subsystem *subsystem) +{ + uint32_t sid; + struct spdk_nvmf_tgt *tgt; + + if (!subsystem) { + return NULL; + } + + tgt = subsystem->tgt; + + for (sid = subsystem->id + 1; sid < tgt->max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem) { + return subsystem; + } + } + + return NULL; +} + +static struct spdk_nvmf_host * +nvmf_subsystem_find_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host = NULL; + + TAILQ_FOREACH(host, &subsystem->hosts, link) { + if (strcmp(hostnqn, host->nqn) == 0) { + return host; + } + } + + return NULL; +} + +int +spdk_nvmf_subsystem_add_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host; + + if (!nvmf_valid_nqn(hostnqn)) { + return -EINVAL; + } + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + if (nvmf_subsystem_find_host(subsystem, hostnqn)) { + /* This subsystem already allows the specified host. 
*/ + return 0; + } + + host = calloc(1, sizeof(*host)); + if (!host) { + return -ENOMEM; + } + + snprintf(host->nqn, sizeof(host->nqn), "%s", hostnqn); + + TAILQ_INSERT_HEAD(&subsystem->hosts, host, link); + subsystem->tgt->discovery_genctr++; + + return 0; +} + +int +spdk_nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + host = nvmf_subsystem_find_host(subsystem, hostnqn); + if (host == NULL) { + return -ENOENT; + } + + nvmf_subsystem_remove_host(subsystem, host); + return 0; +} + +int +spdk_nvmf_subsystem_set_allow_any_host(struct spdk_nvmf_subsystem *subsystem, bool allow_any_host) +{ + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + subsystem->allow_any_host = allow_any_host; + + return 0; +} + +bool +spdk_nvmf_subsystem_get_allow_any_host(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->allow_any_host; +} + +bool +spdk_nvmf_subsystem_host_allowed(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + if (!hostnqn) { + return false; + } + + if (subsystem->allow_any_host) { + return true; + } + + return nvmf_subsystem_find_host(subsystem, hostnqn) != NULL; +} + +struct spdk_nvmf_host * +spdk_nvmf_subsystem_get_first_host(struct spdk_nvmf_subsystem *subsystem) +{ + return TAILQ_FIRST(&subsystem->hosts); +} + + +struct spdk_nvmf_host * +spdk_nvmf_subsystem_get_next_host(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_host *prev_host) +{ + return TAILQ_NEXT(prev_host, link); +} + +const char * +spdk_nvmf_host_get_nqn(const struct spdk_nvmf_host *host) +{ + return host->nqn; +} + +struct spdk_nvmf_subsystem_listener * +nvmf_subsystem_find_listener(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_subsystem_listener *listener; + + TAILQ_FOREACH(listener, &subsystem->listeners, link) { + if (spdk_nvme_transport_id_compare(listener->trid, trid) == 0) { + return listener; + } + } + + return NULL; +} + +/** + * Function to be called once the target is listening. + * + * \param ctx Context argument passed to this function. + * \param status 0 if it completed successfully, or negative errno if it failed. 
+ */ +static void +_nvmf_subsystem_add_listener_done(void *ctx, int status) +{ + struct spdk_nvmf_subsystem_listener *listener = ctx; + + if (status) { + listener->cb_fn(listener->cb_arg, status); + free(listener); + return; + } + + TAILQ_INSERT_HEAD(&listener->subsystem->listeners, listener, link); + listener->subsystem->tgt->discovery_genctr++; + listener->cb_fn(listener->cb_arg, status); +} + +void +spdk_nvmf_subsystem_add_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_transport_id *trid, + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn, + void *cb_arg) +{ + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_subsystem_listener *listener; + struct spdk_nvmf_listener *tr_listener; + + assert(cb_fn != NULL); + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + cb_fn(cb_arg, -EAGAIN); + return; + } + + if (nvmf_subsystem_find_listener(subsystem, trid)) { + /* Listener already exists in this subsystem */ + cb_fn(cb_arg, 0); + return; + } + + transport = spdk_nvmf_tgt_get_transport(subsystem->tgt, trid->trstring); + if (transport == NULL) { + SPDK_ERRLOG("Unknown transport type %d\n", trid->trtype); + cb_fn(cb_arg, -EINVAL); + return; + } + + tr_listener = nvmf_transport_find_listener(transport, trid); + if (!tr_listener) { + SPDK_ERRLOG("Cannot find transport listener for %s\n", trid->traddr); + cb_fn(cb_arg, -EINVAL); + return; + } + + listener = calloc(1, sizeof(*listener)); + if (!listener) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + listener->trid = &tr_listener->trid; + listener->transport = transport; + listener->cb_fn = cb_fn; + listener->cb_arg = cb_arg; + listener->subsystem = subsystem; + + if (transport->ops->listen_associate != NULL) { + transport->ops->listen_associate(transport, subsystem, trid, + _nvmf_subsystem_add_listener_done, + listener); + } else { + _nvmf_subsystem_add_listener_done(listener, 0); + } +} + +int +spdk_nvmf_subsystem_remove_listener(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_subsystem_listener *listener; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + listener = nvmf_subsystem_find_listener(subsystem, trid); + if (listener == NULL) { + return -ENOENT; + } + + _nvmf_subsystem_remove_listener(subsystem, listener, false); + + return 0; +} + +void +nvmf_subsystem_remove_all_listeners(struct spdk_nvmf_subsystem *subsystem, + bool stop) +{ + struct spdk_nvmf_subsystem_listener *listener, *listener_tmp; + + TAILQ_FOREACH_SAFE(listener, &subsystem->listeners, link, listener_tmp) { + _nvmf_subsystem_remove_listener(subsystem, listener, stop); + } +} + +bool +spdk_nvmf_subsystem_listener_allowed(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_subsystem_listener *listener; + + if (!strcmp(subsystem->subnqn, SPDK_NVMF_DISCOVERY_NQN)) { + return true; + } + + TAILQ_FOREACH(listener, &subsystem->listeners, link) { + if (spdk_nvme_transport_id_compare(listener->trid, trid) == 0) { + return true; + } + } + + return false; +} + +struct spdk_nvmf_subsystem_listener * +spdk_nvmf_subsystem_get_first_listener(struct spdk_nvmf_subsystem *subsystem) +{ + return TAILQ_FIRST(&subsystem->listeners); +} + +struct spdk_nvmf_subsystem_listener * +spdk_nvmf_subsystem_get_next_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_subsystem_listener *prev_listener) +{ + 
return TAILQ_NEXT(prev_listener, link); +} + +const struct spdk_nvme_transport_id * +spdk_nvmf_subsystem_listener_get_trid(struct spdk_nvmf_subsystem_listener *listener) +{ + return listener->trid; +} + +void +spdk_nvmf_subsystem_allow_any_listener(struct spdk_nvmf_subsystem *subsystem, + bool allow_any_listener) +{ + subsystem->allow_any_listener = allow_any_listener; +} + +bool +spdk_nvmf_subsytem_any_listener_allowed(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->allow_any_listener; +} + + +struct subsystem_update_ns_ctx { + struct spdk_nvmf_subsystem *subsystem; + + spdk_nvmf_subsystem_state_change_done cb_fn; + void *cb_arg; +}; + +static void +subsystem_update_ns_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_update_ns_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status); + } + free(ctx); +} + +static void +subsystem_update_ns_on_pg(struct spdk_io_channel_iter *i) +{ + int rc; + struct subsystem_update_ns_ctx *ctx; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_subsystem *subsystem; + + ctx = spdk_io_channel_iter_get_ctx(i); + group = spdk_io_channel_get_ctx(spdk_io_channel_iter_get_channel(i)); + subsystem = ctx->subsystem; + + rc = nvmf_poll_group_update_subsystem(group, subsystem); + spdk_for_each_channel_continue(i, rc); +} + +static int +nvmf_subsystem_update_ns(struct spdk_nvmf_subsystem *subsystem, spdk_channel_for_each_cpl cpl, + void *ctx) +{ + spdk_for_each_channel(subsystem->tgt, + subsystem_update_ns_on_pg, + ctx, + cpl); + + return 0; +} + +static void +nvmf_subsystem_ns_changed(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + struct spdk_nvmf_ctrlr *ctrlr; + + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + nvmf_ctrlr_ns_changed(ctrlr, nsid); + } +} + +int +spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_registrant *reg, *reg_tmp; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + assert(false); + return -1; + } + + if (nsid == 0 || nsid > subsystem->max_nsid) { + return -1; + } + + ns = subsystem->ns[nsid - 1]; + if (!ns) { + return -1; + } + + subsystem->ns[nsid - 1] = NULL; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, reg_tmp) { + TAILQ_REMOVE(&ns->registrants, reg, link); + free(reg); + } + spdk_bdev_module_release_bdev(ns->bdev); + spdk_bdev_close(ns->desc); + if (ns->ptpl_file) { + free(ns->ptpl_file); + } + free(ns); + + nvmf_subsystem_ns_changed(subsystem, nsid); + + return 0; +} + +static void +_nvmf_ns_hot_remove(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_nvmf_ns *ns = cb_arg; + int rc; + + rc = spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid); + if (rc != 0) { + SPDK_ERRLOG("Failed to make changes to NVME-oF subsystem with id: %u\n", subsystem->id); + } + + spdk_nvmf_subsystem_resume(subsystem, NULL, NULL); +} + +static void +nvmf_ns_hot_remove(void *remove_ctx) +{ + struct spdk_nvmf_ns *ns = remove_ctx; + int rc; + + rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_hot_remove, ns); + if (rc) { + SPDK_ERRLOG("Unable to pause subsystem to process namespace removal!\n"); + } +} + +static void +_nvmf_ns_resize(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) +{ + struct spdk_nvmf_ns *ns = cb_arg; + + nvmf_subsystem_ns_changed(subsystem, ns->opts.nsid); + spdk_nvmf_subsystem_resume(subsystem, NULL, NULL); +} + 
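The bdev event handlers here (_nvmf_ns_hot_remove()/nvmf_ns_hot_remove() above, nvmf_ns_resize() just below) all follow the same quiesce pattern: call spdk_nvmf_subsystem_pause() with a completion callback, touch the namespace list only from that callback once every poll group has stopped dispatching I/O for the subsystem, and always call spdk_nvmf_subsystem_resume() afterwards. A minimal sketch of that pattern is shown next, using only the public API exported in spdk_nvmf.map earlier in this patch; the helper names, the nsid-through-cb_arg encoding, and the error handling are illustrative assumptions, not part of this change.

    #include "spdk/stdinc.h"
    #include "spdk/log.h"
    #include "spdk/nvmf.h"

    /* Hypothetical completion callback: runs once the subsystem is paused. */
    static void
    remove_ns_paused_cb(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status)
    {
            uint32_t nsid = (uint32_t)(uintptr_t)cb_arg;

            if (status == 0) {
                    /* Poll groups are quiesced, so the ns array may be modified safely. */
                    if (spdk_nvmf_subsystem_remove_ns(subsystem, nsid) != 0) {
                            SPDK_ERRLOG("Failed to remove nsid %u\n", nsid);
                    }
            }

            /* Resume unconditionally, mirroring _nvmf_ns_hot_remove() above. */
            spdk_nvmf_subsystem_resume(subsystem, NULL, NULL);
    }

    /* Hypothetical entry point: remove a namespace only while quiesced. */
    static int
    remove_ns_quiesced(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid)
    {
            /* Three-argument pause, as declared in this version of the code. */
            return spdk_nvmf_subsystem_pause(subsystem, remove_ns_paused_cb,
                                             (void *)(uintptr_t)nsid);
    }

Encoding the nsid in cb_arg avoids a heap allocation for the context; the real handlers instead pass the struct spdk_nvmf_ns pointer they already hold as the callback argument.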
+static void +nvmf_ns_resize(void *event_ctx) +{ + struct spdk_nvmf_ns *ns = event_ctx; + int rc; + + rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_resize, ns); + if (rc) { + SPDK_ERRLOG("Unable to pause subsystem to process namespace resize!\n"); + } +} + +static void +nvmf_ns_event(enum spdk_bdev_event_type type, + struct spdk_bdev *bdev, + void *event_ctx) +{ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Bdev event: type %d, name %s, subsystem_id %d, ns_id %d\n", + type, + bdev->name, + ((struct spdk_nvmf_ns *)event_ctx)->subsystem->id, + ((struct spdk_nvmf_ns *)event_ctx)->nsid); + + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + nvmf_ns_hot_remove(event_ctx); + break; + case SPDK_BDEV_EVENT_RESIZE: + nvmf_ns_resize(event_ctx); + break; + default: + SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); + break; + } +} + +void +spdk_nvmf_ns_opts_get_defaults(struct spdk_nvmf_ns_opts *opts, size_t opts_size) +{ + /* All current fields are set to 0 by default. */ + memset(opts, 0, opts_size); +} + +/* Dummy bdev module used to to claim bdevs. */ +static struct spdk_bdev_module ns_bdev_module = { + .name = "NVMe-oF Target", +}; + +static int +nvmf_ns_load_reservation(const char *file, struct spdk_nvmf_reservation_info *info); +static int +nvmf_ns_reservation_restore(struct spdk_nvmf_ns *ns, struct spdk_nvmf_reservation_info *info); + +uint32_t +spdk_nvmf_subsystem_add_ns(struct spdk_nvmf_subsystem *subsystem, struct spdk_bdev *bdev, + const struct spdk_nvmf_ns_opts *user_opts, size_t opts_size, + const char *ptpl_file) +{ + struct spdk_nvmf_ns_opts opts; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_reservation_info info = {0}; + int rc; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return 0; + } + + if (spdk_bdev_get_md_size(bdev) != 0 && !spdk_bdev_is_md_interleaved(bdev)) { + SPDK_ERRLOG("Can't attach bdev with separate metadata.\n"); + return 0; + } + + spdk_nvmf_ns_opts_get_defaults(&opts, sizeof(opts)); + if (user_opts) { + memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size)); + } + + if (spdk_mem_all_zero(&opts.uuid, sizeof(opts.uuid))) { + opts.uuid = *spdk_bdev_get_uuid(bdev); + } + + if (opts.nsid == SPDK_NVME_GLOBAL_NS_TAG) { + SPDK_ERRLOG("Invalid NSID %" PRIu32 "\n", opts.nsid); + return 0; + } + + if (opts.nsid == 0) { + /* + * NSID not specified - find a free index. + * + * If no free slots are found, opts.nsid will be subsystem->max_nsid + 1, which will + * expand max_nsid if possible. + */ + for (opts.nsid = 1; opts.nsid <= subsystem->max_nsid; opts.nsid++) { + if (_nvmf_subsystem_get_ns(subsystem, opts.nsid) == NULL) { + break; + } + } + } + + if (_nvmf_subsystem_get_ns(subsystem, opts.nsid)) { + SPDK_ERRLOG("Requested NSID %" PRIu32 " already in use\n", opts.nsid); + return 0; + } + + if (opts.nsid > subsystem->max_nsid) { + struct spdk_nvmf_ns **new_ns_array; + + /* If MaxNamespaces was specified, we can't extend max_nsid beyond it. */ + if (subsystem->max_allowed_nsid > 0 && opts.nsid > subsystem->max_allowed_nsid) { + SPDK_ERRLOG("Can't extend NSID range above MaxNamespaces\n"); + return 0; + } + + /* If a controller is connected, we can't change NN. 
*/ + if (!TAILQ_EMPTY(&subsystem->ctrlrs)) { + SPDK_ERRLOG("Can't extend NSID range while controllers are connected\n"); + return 0; + } + + new_ns_array = realloc(subsystem->ns, sizeof(struct spdk_nvmf_ns *) * opts.nsid); + if (new_ns_array == NULL) { + SPDK_ERRLOG("Memory allocation error while resizing namespace array.\n"); + return 0; + } + + memset(new_ns_array + subsystem->max_nsid, 0, + sizeof(struct spdk_nvmf_ns *) * (opts.nsid - subsystem->max_nsid)); + subsystem->ns = new_ns_array; + subsystem->max_nsid = opts.nsid; + } + + ns = calloc(1, sizeof(*ns)); + if (ns == NULL) { + SPDK_ERRLOG("Namespace allocation failed\n"); + return 0; + } + + ns->bdev = bdev; + ns->opts = opts; + ns->subsystem = subsystem; + rc = spdk_bdev_open_ext(bdev->name, true, nvmf_ns_event, ns, &ns->desc); + if (rc != 0) { + SPDK_ERRLOG("Subsystem %s: bdev %s cannot be opened, error=%d\n", + subsystem->subnqn, spdk_bdev_get_name(bdev), rc); + free(ns); + return 0; + } + rc = spdk_bdev_module_claim_bdev(bdev, ns->desc, &ns_bdev_module); + if (rc != 0) { + spdk_bdev_close(ns->desc); + free(ns); + return 0; + } + subsystem->ns[opts.nsid - 1] = ns; + ns->nsid = opts.nsid; + TAILQ_INIT(&ns->registrants); + + if (ptpl_file) { + rc = nvmf_ns_load_reservation(ptpl_file, &info); + if (!rc) { + rc = nvmf_ns_reservation_restore(ns, &info); + if (rc) { + SPDK_ERRLOG("Subsystem restore reservation failed\n"); + subsystem->ns[opts.nsid - 1] = NULL; + spdk_bdev_close(ns->desc); + free(ns); + return 0; + } + } + ns->ptpl_file = strdup(ptpl_file); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Subsystem %s: bdev %s assigned nsid %" PRIu32 "\n", + spdk_nvmf_subsystem_get_nqn(subsystem), + spdk_bdev_get_name(bdev), + opts.nsid); + + nvmf_subsystem_ns_changed(subsystem, opts.nsid); + + return opts.nsid; +} + +static uint32_t +nvmf_subsystem_get_next_allocated_nsid(struct spdk_nvmf_subsystem *subsystem, + uint32_t prev_nsid) +{ + uint32_t nsid; + + if (prev_nsid >= subsystem->max_nsid) { + return 0; + } + + for (nsid = prev_nsid + 1; nsid <= subsystem->max_nsid; nsid++) { + if (subsystem->ns[nsid - 1]) { + return nsid; + } + } + + return 0; +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_first_ns(struct spdk_nvmf_subsystem *subsystem) +{ + uint32_t first_nsid; + + first_nsid = nvmf_subsystem_get_next_allocated_nsid(subsystem, 0); + return _nvmf_subsystem_get_ns(subsystem, first_nsid); +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_next_ns(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ns *prev_ns) +{ + uint32_t next_nsid; + + next_nsid = nvmf_subsystem_get_next_allocated_nsid(subsystem, prev_ns->opts.nsid); + return _nvmf_subsystem_get_ns(subsystem, next_nsid); +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + return _nvmf_subsystem_get_ns(subsystem, nsid); +} + +uint32_t +spdk_nvmf_ns_get_id(const struct spdk_nvmf_ns *ns) +{ + return ns->opts.nsid; +} + +struct spdk_bdev * +spdk_nvmf_ns_get_bdev(struct spdk_nvmf_ns *ns) +{ + return ns->bdev; +} + +void +spdk_nvmf_ns_get_opts(const struct spdk_nvmf_ns *ns, struct spdk_nvmf_ns_opts *opts, + size_t opts_size) +{ + memset(opts, 0, opts_size); + memcpy(opts, &ns->opts, spdk_min(sizeof(ns->opts), opts_size)); +} + +const char * +spdk_nvmf_subsystem_get_sn(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->sn; +} + +int +spdk_nvmf_subsystem_set_sn(struct spdk_nvmf_subsystem *subsystem, const char *sn) +{ + size_t len, max_len; + + max_len = sizeof(subsystem->sn) - 1; + len = strlen(sn); + if 
(len > max_len) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Invalid sn \"%s\": length %zu > max %zu\n", + sn, len, max_len); + return -1; + } + + if (!nvmf_valid_ascii_string(sn, len)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Non-ASCII sn\n"); + SPDK_LOGDUMP(SPDK_LOG_NVMF, "sn", sn, len); + return -1; + } + + snprintf(subsystem->sn, sizeof(subsystem->sn), "%s", sn); + + return 0; +} + +const char * +spdk_nvmf_subsystem_get_mn(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->mn; +} + +int +spdk_nvmf_subsystem_set_mn(struct spdk_nvmf_subsystem *subsystem, const char *mn) +{ + size_t len, max_len; + + if (mn == NULL) { + mn = MODEL_NUMBER_DEFAULT; + } + max_len = sizeof(subsystem->mn) - 1; + len = strlen(mn); + if (len > max_len) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Invalid mn \"%s\": length %zu > max %zu\n", + mn, len, max_len); + return -1; + } + + if (!nvmf_valid_ascii_string(mn, len)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Non-ASCII mn\n"); + SPDK_LOGDUMP(SPDK_LOG_NVMF, "mn", mn, len); + return -1; + } + + snprintf(subsystem->mn, sizeof(subsystem->mn), "%s", mn); + + return 0; +} + +const char * +spdk_nvmf_subsystem_get_nqn(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->subnqn; +} + +enum spdk_nvmf_subtype spdk_nvmf_subsystem_get_type(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->subtype; +} + +uint32_t +spdk_nvmf_subsystem_get_max_nsid(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->max_nsid; +} + +static uint16_t +nvmf_subsystem_gen_cntlid(struct spdk_nvmf_subsystem *subsystem) +{ + int count; + + /* + * In the worst case, we might have to try all CNTLID values between 1 and 0xFFF0 - 1 + * before we find one that is unused (or find that all values are in use). + */ + for (count = 0; count < 0xFFF0 - 1; count++) { + subsystem->next_cntlid++; + if (subsystem->next_cntlid >= 0xFFF0) { + /* The spec reserves cntlid values in the range FFF0h to FFFFh. */ + subsystem->next_cntlid = 1; + } + + /* Check if a controller with this cntlid currently exists. */ + if (nvmf_subsystem_get_ctrlr(subsystem, subsystem->next_cntlid) == NULL) { + /* Found unused cntlid */ + return subsystem->next_cntlid; + } + } + + /* All valid cntlid values are in use. 
*/ + return 0xFFFF; +} + +int +nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_ctrlr *ctrlr) +{ + ctrlr->cntlid = nvmf_subsystem_gen_cntlid(subsystem); + if (ctrlr->cntlid == 0xFFFF) { + /* Unable to get a cntlid */ + SPDK_ERRLOG("Reached max simultaneous ctrlrs\n"); + return -EBUSY; + } + + TAILQ_INSERT_TAIL(&subsystem->ctrlrs, ctrlr, link); + + return 0; +} + +void +nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr) +{ + assert(subsystem == ctrlr->subsys); + TAILQ_REMOVE(&subsystem->ctrlrs, ctrlr, link); +} + +struct spdk_nvmf_ctrlr * +nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem, uint16_t cntlid) +{ + struct spdk_nvmf_ctrlr *ctrlr; + + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + if (ctrlr->cntlid == cntlid) { + return ctrlr; + } + } + + return NULL; +} + +uint32_t +spdk_nvmf_subsystem_get_max_namespaces(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->max_allowed_nsid; +} + +struct _nvmf_ns_registrant { + uint64_t rkey; + char *host_uuid; +}; + +struct _nvmf_ns_registrants { + size_t num_regs; + struct _nvmf_ns_registrant reg[SPDK_NVMF_MAX_NUM_REGISTRANTS]; +}; + +struct _nvmf_ns_reservation { + bool ptpl_activated; + enum spdk_nvme_reservation_type rtype; + uint64_t crkey; + char *bdev_uuid; + char *holder_uuid; + struct _nvmf_ns_registrants regs; +}; + +static const struct spdk_json_object_decoder nvmf_ns_pr_reg_decoders[] = { + {"rkey", offsetof(struct _nvmf_ns_registrant, rkey), spdk_json_decode_uint64}, + {"host_uuid", offsetof(struct _nvmf_ns_registrant, host_uuid), spdk_json_decode_string}, +}; + +static int +nvmf_decode_ns_pr_reg(const struct spdk_json_val *val, void *out) +{ + struct _nvmf_ns_registrant *reg = out; + + return spdk_json_decode_object(val, nvmf_ns_pr_reg_decoders, + SPDK_COUNTOF(nvmf_ns_pr_reg_decoders), reg); +} + +static int +nvmf_decode_ns_pr_regs(const struct spdk_json_val *val, void *out) +{ + struct _nvmf_ns_registrants *regs = out; + + return spdk_json_decode_array(val, nvmf_decode_ns_pr_reg, regs->reg, + SPDK_NVMF_MAX_NUM_REGISTRANTS, ®s->num_regs, + sizeof(struct _nvmf_ns_registrant)); +} + +static const struct spdk_json_object_decoder nvmf_ns_pr_decoders[] = { + {"ptpl", offsetof(struct _nvmf_ns_reservation, ptpl_activated), spdk_json_decode_bool, true}, + {"rtype", offsetof(struct _nvmf_ns_reservation, rtype), spdk_json_decode_uint32, true}, + {"crkey", offsetof(struct _nvmf_ns_reservation, crkey), spdk_json_decode_uint64, true}, + {"bdev_uuid", offsetof(struct _nvmf_ns_reservation, bdev_uuid), spdk_json_decode_string}, + {"holder_uuid", offsetof(struct _nvmf_ns_reservation, holder_uuid), spdk_json_decode_string, true}, + {"registrants", offsetof(struct _nvmf_ns_reservation, regs), nvmf_decode_ns_pr_regs}, +}; + +static int +nvmf_ns_load_reservation(const char *file, struct spdk_nvmf_reservation_info *info) +{ + FILE *fd; + size_t json_size; + ssize_t values_cnt, rc; + void *json = NULL, *end; + struct spdk_json_val *values = NULL; + struct _nvmf_ns_reservation res = {}; + uint32_t i; + + fd = fopen(file, "r"); + /* It's not an error if the file does not exist */ + if (!fd) { + SPDK_NOTICELOG("File %s does not exist\n", file); + return -ENOENT; + } + + /* Load all persist file contents into a local buffer */ + json = spdk_posix_file_load(fd, &json_size); + fclose(fd); + if (!json) { + SPDK_ERRLOG("Load persit file %s failed\n", file); + return -ENOMEM; + } + + rc = spdk_json_parse(json, json_size, NULL, 0, &end, 0); + if (rc < 
0) { + SPDK_NOTICELOG("Parsing JSON configuration failed (%zd)\n", rc); + goto exit; + } + + values_cnt = rc; + values = calloc(values_cnt, sizeof(struct spdk_json_val)); + if (values == NULL) { + goto exit; + } + + rc = spdk_json_parse(json, json_size, values, values_cnt, &end, 0); + if (rc != values_cnt) { + SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); + goto exit; + } + + /* Decode json */ + if (spdk_json_decode_object(values, nvmf_ns_pr_decoders, + SPDK_COUNTOF(nvmf_ns_pr_decoders), + &res)) { + SPDK_ERRLOG("Invalid objects in the persist file %s\n", file); + rc = -EINVAL; + goto exit; + } + + if (res.regs.num_regs > SPDK_NVMF_MAX_NUM_REGISTRANTS) { + SPDK_ERRLOG("Can only support up to %u registrants\n", SPDK_NVMF_MAX_NUM_REGISTRANTS); + rc = -ERANGE; + goto exit; + } + + rc = 0; + info->ptpl_activated = res.ptpl_activated; + info->rtype = res.rtype; + info->crkey = res.crkey; + snprintf(info->bdev_uuid, sizeof(info->bdev_uuid), "%s", res.bdev_uuid); + snprintf(info->holder_uuid, sizeof(info->holder_uuid), "%s", res.holder_uuid); + info->num_regs = res.regs.num_regs; + for (i = 0; i < res.regs.num_regs; i++) { + info->registrants[i].rkey = res.regs.reg[i].rkey; + snprintf(info->registrants[i].host_uuid, sizeof(info->registrants[i].host_uuid), "%s", + res.regs.reg[i].host_uuid); + } + +exit: + free(json); + free(values); + free(res.bdev_uuid); + free(res.holder_uuid); + for (i = 0; i < res.regs.num_regs; i++) { + free(res.regs.reg[i].host_uuid); + } + + return rc; +} + +static bool +nvmf_ns_reservation_all_registrants_type(struct spdk_nvmf_ns *ns); + +static int +nvmf_ns_reservation_restore(struct spdk_nvmf_ns *ns, struct spdk_nvmf_reservation_info *info) +{ + uint32_t i; + struct spdk_nvmf_registrant *reg, *holder = NULL; + struct spdk_uuid bdev_uuid, holder_uuid; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "NSID %u, PTPL %u, Number of registrants %u\n", + ns->nsid, info->ptpl_activated, info->num_regs); + + /* it's not an error */ + if (!info->ptpl_activated || !info->num_regs) { + return 0; + } + + spdk_uuid_parse(&bdev_uuid, info->bdev_uuid); + if (spdk_uuid_compare(&bdev_uuid, spdk_bdev_get_uuid(ns->bdev))) { + SPDK_ERRLOG("Existing bdev UUID is not same with configuration file\n"); + return -EINVAL; + } + + ns->crkey = info->crkey; + ns->rtype = info->rtype; + ns->ptpl_activated = info->ptpl_activated; + spdk_uuid_parse(&holder_uuid, info->holder_uuid); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Bdev UUID %s\n", info->bdev_uuid); + if (info->rtype) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Holder UUID %s, RTYPE %u, RKEY 0x%"PRIx64"\n", + info->holder_uuid, info->rtype, info->crkey); + } + + for (i = 0; i < info->num_regs; i++) { + reg = calloc(1, sizeof(*reg)); + if (!reg) { + return -ENOMEM; + } + spdk_uuid_parse(®->hostid, info->registrants[i].host_uuid); + reg->rkey = info->registrants[i].rkey; + TAILQ_INSERT_TAIL(&ns->registrants, reg, link); + if (!spdk_uuid_compare(&holder_uuid, ®->hostid)) { + holder = reg; + } + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Registrant RKEY 0x%"PRIx64", Host UUID %s\n", + info->registrants[i].rkey, info->registrants[i].host_uuid); + } + + if (nvmf_ns_reservation_all_registrants_type(ns)) { + ns->holder = TAILQ_FIRST(&ns->registrants); + } else { + ns->holder = holder; + } + + return 0; +} + +static int +nvmf_ns_json_write_cb(void *cb_ctx, const void *data, size_t size) +{ + char *file = cb_ctx; + size_t rc; + FILE *fd; + + fd = fopen(file, "w"); + if (!fd) { + SPDK_ERRLOG("Can't open file %s for write\n", file); + return -ENOENT; + } + rc = fwrite(data, 1, size, 
fd); + fclose(fd); + + return rc == size ? 0 : -1; +} + +static int +nvmf_ns_reservation_update(const char *file, struct spdk_nvmf_reservation_info *info) +{ + struct spdk_json_write_ctx *w; + uint32_t i; + int rc = 0; + + w = spdk_json_write_begin(nvmf_ns_json_write_cb, (void *)file, 0); + if (w == NULL) { + return -ENOMEM; + } + /* clear the configuration file */ + if (!info->ptpl_activated) { + goto exit; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_bool(w, "ptpl", info->ptpl_activated); + spdk_json_write_named_uint32(w, "rtype", info->rtype); + spdk_json_write_named_uint64(w, "crkey", info->crkey); + spdk_json_write_named_string(w, "bdev_uuid", info->bdev_uuid); + spdk_json_write_named_string(w, "holder_uuid", info->holder_uuid); + + spdk_json_write_named_array_begin(w, "registrants"); + for (i = 0; i < info->num_regs; i++) { + spdk_json_write_object_begin(w); + spdk_json_write_named_uint64(w, "rkey", info->registrants[i].rkey); + spdk_json_write_named_string(w, "host_uuid", info->registrants[i].host_uuid); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + +exit: + rc = spdk_json_write_end(w); + return rc; +} + +static int +nvmf_ns_update_reservation_info(struct spdk_nvmf_ns *ns) +{ + struct spdk_nvmf_reservation_info info; + struct spdk_nvmf_registrant *reg, *tmp; + uint32_t i = 0; + + assert(ns != NULL); + + if (!ns->bdev || !ns->ptpl_file) { + return 0; + } + + memset(&info, 0, sizeof(info)); + spdk_uuid_fmt_lower(info.bdev_uuid, sizeof(info.bdev_uuid), spdk_bdev_get_uuid(ns->bdev)); + + if (ns->rtype) { + info.rtype = ns->rtype; + info.crkey = ns->crkey; + if (!nvmf_ns_reservation_all_registrants_type(ns)) { + assert(ns->holder != NULL); + spdk_uuid_fmt_lower(info.holder_uuid, sizeof(info.holder_uuid), &ns->holder->hostid); + } + } + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + spdk_uuid_fmt_lower(info.registrants[i].host_uuid, sizeof(info.registrants[i].host_uuid), + ®->hostid); + info.registrants[i++].rkey = reg->rkey; + } + + info.num_regs = i; + info.ptpl_activated = ns->ptpl_activated; + + return nvmf_ns_reservation_update(ns->ptpl_file, &info); +} + +static struct spdk_nvmf_registrant * +nvmf_ns_reservation_get_registrant(struct spdk_nvmf_ns *ns, + struct spdk_uuid *uuid) +{ + struct spdk_nvmf_registrant *reg, *tmp; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + if (!spdk_uuid_compare(®->hostid, uuid)) { + return reg; + } + } + + return NULL; +} + +/* Generate reservation notice log to registered HostID controllers */ +static void +nvmf_subsystem_gen_ctrlr_notification(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ns *ns, + struct spdk_uuid *hostid_list, + uint32_t num_hostid, + enum spdk_nvme_reservation_notification_log_page_type type) +{ + struct spdk_nvmf_ctrlr *ctrlr; + uint32_t i; + + for (i = 0; i < num_hostid; i++) { + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + if (!spdk_uuid_compare(&ctrlr->hostid, &hostid_list[i])) { + nvmf_ctrlr_reservation_notice_log(ctrlr, ns, type); + } + } + } +} + +/* Get all registrants' hostid other than the controller who issued the command */ +static uint32_t +nvmf_ns_reservation_get_all_other_hostid(struct spdk_nvmf_ns *ns, + struct spdk_uuid *hostid_list, + uint32_t max_num_hostid, + struct spdk_uuid *current_hostid) +{ + struct spdk_nvmf_registrant *reg, *tmp; + uint32_t num_hostid = 0; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + if (spdk_uuid_compare(®->hostid, current_hostid)) { + if (num_hostid == 
max_num_hostid) { + assert(false); + return max_num_hostid; + } + hostid_list[num_hostid++] = reg->hostid; + } + } + + return num_hostid; +} + +/* Calculate the unregistered HostID list according to list + * prior to execute preempt command and list after executing + * preempt command. + */ +static uint32_t +nvmf_ns_reservation_get_unregistered_hostid(struct spdk_uuid *old_hostid_list, + uint32_t old_num_hostid, + struct spdk_uuid *remaining_hostid_list, + uint32_t remaining_num_hostid) +{ + struct spdk_uuid temp_hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t i, j, num_hostid = 0; + bool found; + + if (!remaining_num_hostid) { + return old_num_hostid; + } + + for (i = 0; i < old_num_hostid; i++) { + found = false; + for (j = 0; j < remaining_num_hostid; j++) { + if (!spdk_uuid_compare(&old_hostid_list[i], &remaining_hostid_list[j])) { + found = true; + break; + } + } + if (!found) { + spdk_uuid_copy(&temp_hostid_list[num_hostid++], &old_hostid_list[i]); + } + } + + if (num_hostid) { + memcpy(old_hostid_list, temp_hostid_list, sizeof(struct spdk_uuid) * num_hostid); + } + + return num_hostid; +} + +/* current reservation type is all registrants or not */ +static bool +nvmf_ns_reservation_all_registrants_type(struct spdk_nvmf_ns *ns) +{ + return (ns->rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE_ALL_REGS || + ns->rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS); +} + +/* current registrant is reservation holder or not */ +static bool +nvmf_ns_reservation_registrant_is_holder(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_registrant *reg) +{ + if (!reg) { + return false; + } + + if (nvmf_ns_reservation_all_registrants_type(ns)) { + return true; + } + + return (ns->holder == reg); +} + +static int +nvmf_ns_reservation_add_registrant(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + uint64_t nrkey) +{ + struct spdk_nvmf_registrant *reg; + + reg = calloc(1, sizeof(*reg)); + if (!reg) { + return -ENOMEM; + } + + reg->rkey = nrkey; + /* set hostid for the registrant */ + spdk_uuid_copy(®->hostid, &ctrlr->hostid); + TAILQ_INSERT_TAIL(&ns->registrants, reg, link); + ns->gen++; + + return 0; +} + +static void +nvmf_ns_reservation_release_reservation(struct spdk_nvmf_ns *ns) +{ + ns->rtype = 0; + ns->crkey = 0; + ns->holder = NULL; +} + +/* release the reservation if the last registrant was removed */ +static void +nvmf_ns_reservation_check_release_on_remove_registrant(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_registrant *reg) +{ + struct spdk_nvmf_registrant *next_reg; + + /* no reservation holder */ + if (!ns->holder) { + assert(ns->rtype == 0); + return; + } + + next_reg = TAILQ_FIRST(&ns->registrants); + if (next_reg && nvmf_ns_reservation_all_registrants_type(ns)) { + /* the next valid registrant is the new holder now */ + ns->holder = next_reg; + } else if (nvmf_ns_reservation_registrant_is_holder(ns, reg)) { + /* release the reservation */ + nvmf_ns_reservation_release_reservation(ns); + } +} + +static void +nvmf_ns_reservation_remove_registrant(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_registrant *reg) +{ + TAILQ_REMOVE(&ns->registrants, reg, link); + nvmf_ns_reservation_check_release_on_remove_registrant(ns, reg); + free(reg); + ns->gen++; + return; +} + +static uint32_t +nvmf_ns_reservation_remove_registrants_by_key(struct spdk_nvmf_ns *ns, + uint64_t rkey) +{ + struct spdk_nvmf_registrant *reg, *tmp; + uint32_t count = 0; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + if (reg->rkey == rkey) { + nvmf_ns_reservation_remove_registrant(ns, reg); + count++; + } 
+ } + return count; +} + +static uint32_t +nvmf_ns_reservation_remove_all_other_registrants(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_registrant *reg) +{ + struct spdk_nvmf_registrant *reg_tmp, *reg_tmp2; + uint32_t count = 0; + + TAILQ_FOREACH_SAFE(reg_tmp, &ns->registrants, link, reg_tmp2) { + if (reg_tmp != reg) { + nvmf_ns_reservation_remove_registrant(ns, reg_tmp); + count++; + } + } + return count; +} + +static uint32_t +nvmf_ns_reservation_clear_all_registrants(struct spdk_nvmf_ns *ns) +{ + struct spdk_nvmf_registrant *reg, *reg_tmp; + uint32_t count = 0; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, reg_tmp) { + nvmf_ns_reservation_remove_registrant(ns, reg); + count++; + } + return count; +} + +static void +nvmf_ns_reservation_acquire_reservation(struct spdk_nvmf_ns *ns, uint64_t rkey, + enum spdk_nvme_reservation_type rtype, + struct spdk_nvmf_registrant *holder) +{ + ns->rtype = rtype; + ns->crkey = rkey; + assert(ns->holder == NULL); + ns->holder = holder; +} + +static bool +nvmf_ns_reservation_register(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + uint8_t rrega, iekey, cptpl, rtype; + struct spdk_nvme_reservation_register_data key; + struct spdk_nvmf_registrant *reg; + uint8_t status = SPDK_NVME_SC_SUCCESS; + bool update_sgroup = false; + struct spdk_uuid hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t num_hostid = 0; + int rc; + + rrega = cmd->cdw10_bits.resv_register.rrega; + iekey = cmd->cdw10_bits.resv_register.iekey; + cptpl = cmd->cdw10_bits.resv_register.cptpl; + + if (req->data && req->length >= sizeof(key)) { + memcpy(&key, req->data, sizeof(key)); + } else { + SPDK_ERRLOG("No key provided. Failing request.\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "REGISTER: RREGA %u, IEKEY %u, CPTPL %u, " + "NRKEY 0x%"PRIx64", NRKEY 0x%"PRIx64"\n", + rrega, iekey, cptpl, key.crkey, key.nrkey); + + if (cptpl == SPDK_NVME_RESERVE_PTPL_CLEAR_POWER_ON) { + /* Ture to OFF state, and need to be updated in the configuration file */ + if (ns->ptpl_activated) { + ns->ptpl_activated = 0; + update_sgroup = true; + } + } else if (cptpl == SPDK_NVME_RESERVE_PTPL_PERSIST_POWER_LOSS) { + if (ns->ptpl_file == NULL) { + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } else if (ns->ptpl_activated == 0) { + ns->ptpl_activated = 1; + update_sgroup = true; + } + } + + /* current Host Identifier has registrant or not */ + reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr->hostid); + + switch (rrega) { + case SPDK_NVME_RESERVE_REGISTER_KEY: + if (!reg) { + /* register new controller */ + if (key.nrkey == 0) { + SPDK_ERRLOG("Can't register zeroed new key\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + rc = nvmf_ns_reservation_add_registrant(ns, ctrlr, key.nrkey); + if (rc < 0) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + goto exit; + } + update_sgroup = true; + } else { + /* register with same key is not an error */ + if (reg->rkey != key.nrkey) { + SPDK_ERRLOG("The same host already register a " + "key with 0x%"PRIx64"\n", + reg->rkey); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + } + break; + case SPDK_NVME_RESERVE_UNREGISTER_KEY: + if (!reg || (!iekey && reg->rkey != key.crkey)) { + SPDK_ERRLOG("No registrant or current key doesn't match " + "with existing registrant key\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + + rtype = ns->rtype; + num_hostid = 
nvmf_ns_reservation_get_all_other_hostid(ns, hostid_list, + SPDK_NVMF_MAX_NUM_REGISTRANTS, + &ctrlr->hostid); + + nvmf_ns_reservation_remove_registrant(ns, reg); + + if (!ns->rtype && num_hostid && (rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE_REG_ONLY || + rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_REG_ONLY)) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + hostid_list, + num_hostid, + SPDK_NVME_RESERVATION_RELEASED); + } + update_sgroup = true; + break; + case SPDK_NVME_RESERVE_REPLACE_KEY: + if (!reg || (!iekey && reg->rkey != key.crkey)) { + SPDK_ERRLOG("No registrant or current key doesn't match " + "with existing registrant key\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + if (key.nrkey == 0) { + SPDK_ERRLOG("Can't register zeroed new key\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + reg->rkey = key.nrkey; + update_sgroup = true; + break; + default: + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + +exit: + if (update_sgroup) { + rc = nvmf_ns_update_reservation_info(ns); + if (rc != 0) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + return update_sgroup; +} + +static bool +nvmf_ns_reservation_acquire(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + uint8_t racqa, iekey, rtype; + struct spdk_nvme_reservation_acquire_data key; + struct spdk_nvmf_registrant *reg; + bool all_regs = false; + uint32_t count = 0; + bool update_sgroup = true; + struct spdk_uuid hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t num_hostid = 0; + struct spdk_uuid new_hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t new_num_hostid = 0; + bool reservation_released = false; + uint8_t status = SPDK_NVME_SC_SUCCESS; + + racqa = cmd->cdw10_bits.resv_acquire.racqa; + iekey = cmd->cdw10_bits.resv_acquire.iekey; + rtype = cmd->cdw10_bits.resv_acquire.rtype; + + if (req->data && req->length >= sizeof(key)) { + memcpy(&key, req->data, sizeof(key)); + } else { + SPDK_ERRLOG("No key provided. 
Failing request.\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ACQUIRE: RACQA %u, IEKEY %u, RTYPE %u, " + "NRKEY 0x%"PRIx64", PRKEY 0x%"PRIx64"\n", + racqa, iekey, rtype, key.crkey, key.prkey); + + if (iekey || rtype > SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS) { + SPDK_ERRLOG("Ignore existing key field set to 1\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + goto exit; + } + + reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr->hostid); + /* must be registrant and CRKEY must match */ + if (!reg || reg->rkey != key.crkey) { + SPDK_ERRLOG("No registrant or current key doesn't match " + "with existing registrant key\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + + all_regs = nvmf_ns_reservation_all_registrants_type(ns); + + switch (racqa) { + case SPDK_NVME_RESERVE_ACQUIRE: + /* it's not an error for the holder to acquire same reservation type again */ + if (nvmf_ns_reservation_registrant_is_holder(ns, reg) && ns->rtype == rtype) { + /* do nothing */ + update_sgroup = false; + } else if (ns->holder == NULL) { + /* fisrt time to acquire the reservation */ + nvmf_ns_reservation_acquire_reservation(ns, key.crkey, rtype, reg); + } else { + SPDK_ERRLOG("Invalid rtype or current registrant is not holder\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + break; + case SPDK_NVME_RESERVE_PREEMPT: + /* no reservation holder */ + if (!ns->holder) { + /* unregister with PRKEY */ + nvmf_ns_reservation_remove_registrants_by_key(ns, key.prkey); + break; + } + num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, hostid_list, + SPDK_NVMF_MAX_NUM_REGISTRANTS, + &ctrlr->hostid); + + /* only 1 reservation holder and reservation key is valid */ + if (!all_regs) { + /* preempt itself */ + if (nvmf_ns_reservation_registrant_is_holder(ns, reg) && + ns->crkey == key.prkey) { + ns->rtype = rtype; + reservation_released = true; + break; + } + + if (ns->crkey == key.prkey) { + nvmf_ns_reservation_remove_registrant(ns, ns->holder); + nvmf_ns_reservation_acquire_reservation(ns, key.crkey, rtype, reg); + reservation_released = true; + } else if (key.prkey != 0) { + nvmf_ns_reservation_remove_registrants_by_key(ns, key.prkey); + } else { + /* PRKEY is zero */ + SPDK_ERRLOG("Current PRKEY is zero\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + } else { + /* release all other registrants except for the current one */ + if (key.prkey == 0) { + nvmf_ns_reservation_remove_all_other_registrants(ns, reg); + assert(ns->holder == reg); + } else { + count = nvmf_ns_reservation_remove_registrants_by_key(ns, key.prkey); + if (count == 0) { + SPDK_ERRLOG("PRKEY doesn't match any registrant\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + } + } + break; + default: + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + break; + } + +exit: + if (update_sgroup && racqa == SPDK_NVME_RESERVE_PREEMPT) { + new_num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, new_hostid_list, + SPDK_NVMF_MAX_NUM_REGISTRANTS, + &ctrlr->hostid); + /* Preempt notification occurs on the unregistered controllers + * other than the controller who issued the command. 
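/*
 * [Editorial worked example, not part of the diff] Suppose hosts A, B and C are
 * registered and A issues PREEMPT with a PRKEY that matches only B's key. The
 * hostid_list captured before the switch is {B, C}; after B is removed the remaining
 * "other" hosts are {C}, so the unregistered list computed below is {B}. B then receives
 * SPDK_NVME_REGISTRATION_PREEMPTED, and C receives SPDK_NVME_RESERVATION_RELEASED only
 * if B happened to hold the reservation. On the host side the acquire payload for this
 * case would be filled along these lines (preempt_data and the 0xA/0xB keys are made up):
 */
struct spdk_nvme_reservation_acquire_data preempt_data = {
	.crkey = 0xA,	/* issuer's own registered key (CRKEY) */
	.prkey = 0xB,	/* key of the registrant being preempted (PRKEY) */
};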
+ */ + num_hostid = nvmf_ns_reservation_get_unregistered_hostid(hostid_list, + num_hostid, + new_hostid_list, + new_num_hostid); + if (num_hostid) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + hostid_list, + num_hostid, + SPDK_NVME_REGISTRATION_PREEMPTED); + + } + /* Reservation released notification occurs on the + * controllers which are the remaining registrants other than + * the controller who issued the command. + */ + if (reservation_released && new_num_hostid) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + new_hostid_list, + new_num_hostid, + SPDK_NVME_RESERVATION_RELEASED); + + } + } + if (update_sgroup && ns->ptpl_activated) { + if (nvmf_ns_update_reservation_info(ns)) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + return update_sgroup; +} + +static bool +nvmf_ns_reservation_release(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + uint8_t rrela, iekey, rtype; + struct spdk_nvmf_registrant *reg; + uint64_t crkey; + uint8_t status = SPDK_NVME_SC_SUCCESS; + bool update_sgroup = true; + struct spdk_uuid hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t num_hostid = 0; + + rrela = cmd->cdw10_bits.resv_release.rrela; + iekey = cmd->cdw10_bits.resv_release.iekey; + rtype = cmd->cdw10_bits.resv_release.rtype; + + if (req->data && req->length >= sizeof(crkey)) { + memcpy(&crkey, req->data, sizeof(crkey)); + } else { + SPDK_ERRLOG("No key provided. Failing request.\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "RELEASE: RRELA %u, IEKEY %u, RTYPE %u, " + "CRKEY 0x%"PRIx64"\n", rrela, iekey, rtype, crkey); + + if (iekey) { + SPDK_ERRLOG("Ignore existing key field set to 1\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + goto exit; + } + + reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr->hostid); + if (!reg || reg->rkey != crkey) { + SPDK_ERRLOG("No registrant or current key doesn't match " + "with existing registrant key\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + + num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, hostid_list, + SPDK_NVMF_MAX_NUM_REGISTRANTS, + &ctrlr->hostid); + + switch (rrela) { + case SPDK_NVME_RESERVE_RELEASE: + if (!ns->holder) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "RELEASE: no holder\n"); + update_sgroup = false; + goto exit; + } + if (ns->rtype != rtype) { + SPDK_ERRLOG("Type doesn't match\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + goto exit; + } + if (!nvmf_ns_reservation_registrant_is_holder(ns, reg)) { + /* not the reservation holder, this isn't an error */ + update_sgroup = false; + goto exit; + } + + rtype = ns->rtype; + nvmf_ns_reservation_release_reservation(ns); + + if (num_hostid && rtype != SPDK_NVME_RESERVE_WRITE_EXCLUSIVE && + rtype != SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + hostid_list, + num_hostid, + SPDK_NVME_RESERVATION_RELEASED); + } + break; + case SPDK_NVME_RESERVE_CLEAR: + nvmf_ns_reservation_clear_all_registrants(ns); + if (num_hostid) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + hostid_list, + num_hostid, + SPDK_NVME_RESERVATION_PREEMPTED); + } + break; + default: + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + goto exit; + } + +exit: + if (update_sgroup 
&& ns->ptpl_activated) { + if (nvmf_ns_update_reservation_info(ns)) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + return update_sgroup; +} + +static void +nvmf_ns_reservation_report(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_ctrlr *ctrlr_tmp; + struct spdk_nvmf_registrant *reg, *tmp; + struct spdk_nvme_reservation_status_extended_data *status_data; + struct spdk_nvme_registered_ctrlr_extended_data *ctrlr_data; + uint8_t *payload; + uint32_t len, count = 0; + uint32_t regctl = 0; + uint8_t status = SPDK_NVME_SC_SUCCESS; + + if (req->data == NULL) { + SPDK_ERRLOG("No data transfer specified for request. " + " Unable to transfer back response.\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + + if (!cmd->cdw11_bits.resv_report.eds) { + SPDK_ERRLOG("NVMeoF uses extended controller data structure, " + "please set EDS bit in cdw11 and try again\n"); + status = SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT; + goto exit; + } + + /* Get number of registerd controllers, one Host may have more than + * one controller based on different ports. + */ + TAILQ_FOREACH(ctrlr_tmp, &subsystem->ctrlrs, link) { + reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr_tmp->hostid); + if (reg) { + regctl++; + } + } + + len = sizeof(*status_data) + sizeof(*ctrlr_data) * regctl; + payload = calloc(1, len); + if (!payload) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + goto exit; + } + + status_data = (struct spdk_nvme_reservation_status_extended_data *)payload; + status_data->data.gen = ns->gen; + status_data->data.rtype = ns->rtype; + status_data->data.regctl = regctl; + status_data->data.ptpls = ns->ptpl_activated; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + assert(count <= regctl); + ctrlr_data = (struct spdk_nvme_registered_ctrlr_extended_data *) + (payload + sizeof(*status_data) + sizeof(*ctrlr_data) * count); + /* Set to 0xffffh for dynamic controller */ + ctrlr_data->cntlid = 0xffff; + ctrlr_data->rcsts.status = (ns->holder == reg) ? 
true : false; + ctrlr_data->rkey = reg->rkey; + spdk_uuid_copy((struct spdk_uuid *)ctrlr_data->hostid, ®->hostid); + count++; + } + + memcpy(req->data, payload, spdk_min(len, (cmd->cdw10 + 1) * sizeof(uint32_t))); + free(payload); + +exit: + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + return; +} + +static void +nvmf_ns_reservation_complete(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + + spdk_nvmf_request_complete(req); +} + +static void +_nvmf_ns_reservation_update_done(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)cb_arg; + struct spdk_nvmf_poll_group *group = req->qpair->group; + + spdk_thread_send_msg(group->thread, nvmf_ns_reservation_complete, req); +} + +void +nvmf_ns_reservation_request(void *ctx) +{ + struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)ctx; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct subsystem_update_ns_ctx *update_ctx; + uint32_t nsid; + struct spdk_nvmf_ns *ns; + bool update_sgroup = false; + + nsid = cmd->nsid; + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); + assert(ns != NULL); + + switch (cmd->opc) { + case SPDK_NVME_OPC_RESERVATION_REGISTER: + update_sgroup = nvmf_ns_reservation_register(ns, ctrlr, req); + break; + case SPDK_NVME_OPC_RESERVATION_ACQUIRE: + update_sgroup = nvmf_ns_reservation_acquire(ns, ctrlr, req); + break; + case SPDK_NVME_OPC_RESERVATION_RELEASE: + update_sgroup = nvmf_ns_reservation_release(ns, ctrlr, req); + break; + case SPDK_NVME_OPC_RESERVATION_REPORT: + nvmf_ns_reservation_report(ns, ctrlr, req); + break; + default: + break; + } + + /* update reservation information to subsystem's poll group */ + if (update_sgroup) { + update_ctx = calloc(1, sizeof(*update_ctx)); + if (update_ctx == NULL) { + SPDK_ERRLOG("Can't alloc subsystem poll group update context\n"); + goto update_done; + } + update_ctx->subsystem = ctrlr->subsys; + update_ctx->cb_fn = _nvmf_ns_reservation_update_done; + update_ctx->cb_arg = req; + + nvmf_subsystem_update_ns(ctrlr->subsys, subsystem_update_ns_done, update_ctx); + return; + } + +update_done: + _nvmf_ns_reservation_update_done(ctrlr->subsys, (void *)req, 0); +} diff --git a/src/spdk/lib/nvmf/tcp.c b/src/spdk/lib/nvmf/tcp.c new file mode 100644 index 000000000..391d4bcf1 --- /dev/null +++ b/src/spdk/lib/nvmf/tcp.c @@ -0,0 +1,2631 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/assert.h" +#include "spdk/thread.h" +#include "spdk/nvmf_transport.h" +#include "spdk/sock.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" +#include "spdk_internal/nvme_tcp.h" + +#include "nvmf_internal.h" + +#define NVMF_TCP_MAX_ACCEPT_SOCK_ONE_TIME 16 +#define SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY 6 + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_tcp; + +/* spdk nvmf related structure */ +enum spdk_nvmf_tcp_req_state { + + /* The request is not currently in use */ + TCP_REQUEST_STATE_FREE = 0, + + /* Initial state when request first received */ + TCP_REQUEST_STATE_NEW, + + /* The request is queued until a data buffer is available. */ + TCP_REQUEST_STATE_NEED_BUFFER, + + /* The request is currently transferring data from the host to the controller. */ + TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + + /* The request is waiting for the R2T send acknowledgement. */ + TCP_REQUEST_STATE_AWAITING_R2T_ACK, + + /* The request is ready to execute at the block device */ + TCP_REQUEST_STATE_READY_TO_EXECUTE, + + /* The request is currently executing at the block device */ + TCP_REQUEST_STATE_EXECUTING, + + /* The request finished executing at the block device */ + TCP_REQUEST_STATE_EXECUTED, + + /* The request is ready to send a completion */ + TCP_REQUEST_STATE_READY_TO_COMPLETE, + + /* The request is currently transferring final pdus from the controller to the host. */ + TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + + /* The request completed and can be marked free. 
*/ + TCP_REQUEST_STATE_COMPLETED, + + /* Terminator */ + TCP_REQUEST_NUM_STATES, +}; + +static const char *spdk_nvmf_tcp_term_req_fes_str[] = { + "Invalid PDU Header Field", + "PDU Sequence Error", + "Header Digiest Error", + "Data Transfer Out of Range", + "R2T Limit Exceeded", + "Unsupported parameter", +}; + +#define OBJECT_NVMF_TCP_IO 0x80 + +#define TRACE_GROUP_NVMF_TCP 0x5 +#define TRACE_TCP_REQUEST_STATE_NEW SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x0) +#define TRACE_TCP_REQUEST_STATE_NEED_BUFFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x1) +#define TRACE_TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x2) +#define TRACE_TCP_REQUEST_STATE_READY_TO_EXECUTE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x3) +#define TRACE_TCP_REQUEST_STATE_EXECUTING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x4) +#define TRACE_TCP_REQUEST_STATE_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x5) +#define TRACE_TCP_REQUEST_STATE_READY_TO_COMPLETE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x6) +#define TRACE_TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x7) +#define TRACE_TCP_REQUEST_STATE_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x8) +#define TRACE_TCP_FLUSH_WRITEBUF_START SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x9) +#define TRACE_TCP_FLUSH_WRITEBUF_DONE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0xA) +#define TRACE_TCP_READ_FROM_SOCKET_DONE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0xB) +#define TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0xC) + +SPDK_TRACE_REGISTER_FN(nvmf_tcp_trace, "nvmf_tcp", TRACE_GROUP_NVMF_TCP) +{ + spdk_trace_register_object(OBJECT_NVMF_TCP_IO, 'r'); + spdk_trace_register_description("TCP_REQ_NEW", + TRACE_TCP_REQUEST_STATE_NEW, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 1, 1, ""); + spdk_trace_register_description("TCP_REQ_NEED_BUFFER", + TRACE_TCP_REQUEST_STATE_NEED_BUFFER, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_TX_H_TO_C", + TRACE_TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_RDY_TO_EXECUTE", + TRACE_TCP_REQUEST_STATE_READY_TO_EXECUTE, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_EXECUTING", + TRACE_TCP_REQUEST_STATE_EXECUTING, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_EXECUTED", + TRACE_TCP_REQUEST_STATE_EXECUTED, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_RDY_TO_COMPLETE", + TRACE_TCP_REQUEST_STATE_READY_TO_COMPLETE, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_TRANSFER_C2H", + TRACE_TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_COMPLETED", + TRACE_TCP_REQUEST_STATE_COMPLETED, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_WRITE_START", + TRACE_TCP_FLUSH_WRITEBUF_START, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("TCP_WRITE_DONE", + TRACE_TCP_FLUSH_WRITEBUF_DONE, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("TCP_READ_DONE", + TRACE_TCP_READ_FROM_SOCKET_DONE, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("TCP_REQ_AWAIT_R2T_ACK", + TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); +} + +struct spdk_nvmf_tcp_req { + struct spdk_nvmf_request req; + 
struct spdk_nvme_cpl rsp; + struct spdk_nvme_cmd cmd; + + /* A PDU that can be used for sending responses. This is + * not the incoming PDU! */ + struct nvme_tcp_pdu *pdu; + + /* + * The PDU for a request may be used multiple times in serial over + * the request's lifetime. For example, first to send an R2T, then + * to send a completion. To catch mistakes where the PDU is used + * twice at the same time, add a debug flag here for init/fini. + */ + bool pdu_in_use; + + /* In-capsule data buffer */ + uint8_t *buf; + + bool has_incapsule_data; + + /* transfer_tag */ + uint16_t ttag; + + enum spdk_nvmf_tcp_req_state state; + + /* + * h2c_offset is used when we receive the h2c_data PDU. + */ + uint32_t h2c_offset; + + STAILQ_ENTRY(spdk_nvmf_tcp_req) link; + TAILQ_ENTRY(spdk_nvmf_tcp_req) state_link; +}; + +struct spdk_nvmf_tcp_qpair { + struct spdk_nvmf_qpair qpair; + struct spdk_nvmf_tcp_poll_group *group; + struct spdk_nvmf_tcp_port *port; + struct spdk_sock *sock; + + enum nvme_tcp_pdu_recv_state recv_state; + enum nvme_tcp_qpair_state state; + + /* PDU being actively received */ + struct nvme_tcp_pdu pdu_in_progress; + uint32_t recv_buf_size; + + /* This is a spare PDU used for sending special management + * operations. Primarily, this is used for the initial + * connection response and c2h termination request. */ + struct nvme_tcp_pdu mgmt_pdu; + + TAILQ_HEAD(, nvme_tcp_pdu) send_queue; + + /* Arrays of in-capsule buffers, requests, and pdus. + * Each array is 'resource_count' number of elements */ + void *bufs; + struct spdk_nvmf_tcp_req *reqs; + struct nvme_tcp_pdu *pdus; + uint32_t resource_count; + + /* Queues to track the requests in all states */ + TAILQ_HEAD(, spdk_nvmf_tcp_req) state_queue[TCP_REQUEST_NUM_STATES]; + /* Number of requests in each state */ + uint32_t state_cntr[TCP_REQUEST_NUM_STATES]; + + uint8_t cpda; + + bool host_hdgst_enable; + bool host_ddgst_enable; + + /* IP address */ + char initiator_addr[SPDK_NVMF_TRADDR_MAX_LEN]; + char target_addr[SPDK_NVMF_TRADDR_MAX_LEN]; + + /* IP port */ + uint16_t initiator_port; + uint16_t target_port; + + /* Timer used to destroy qpair after detecting transport error issue if initiator does + * not close the connection. 
+ */ + struct spdk_poller *timeout_poller; + + TAILQ_ENTRY(spdk_nvmf_tcp_qpair) link; +}; + +struct spdk_nvmf_tcp_poll_group { + struct spdk_nvmf_transport_poll_group group; + struct spdk_sock_group *sock_group; + + TAILQ_HEAD(, spdk_nvmf_tcp_qpair) qpairs; + TAILQ_HEAD(, spdk_nvmf_tcp_qpair) await_req; +}; + +struct spdk_nvmf_tcp_port { + const struct spdk_nvme_transport_id *trid; + struct spdk_sock *listen_sock; + TAILQ_ENTRY(spdk_nvmf_tcp_port) link; +}; + +struct spdk_nvmf_tcp_transport { + struct spdk_nvmf_transport transport; + + pthread_mutex_t lock; + + TAILQ_HEAD(, spdk_nvmf_tcp_port) ports; +}; + +static bool nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_req *tcp_req); + +static void +nvmf_tcp_req_set_state(struct spdk_nvmf_tcp_req *tcp_req, + enum spdk_nvmf_tcp_req_state state) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_tcp_qpair *tqpair; + + qpair = tcp_req->req.qpair; + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + + TAILQ_REMOVE(&tqpair->state_queue[tcp_req->state], tcp_req, state_link); + assert(tqpair->state_cntr[tcp_req->state] > 0); + tqpair->state_cntr[tcp_req->state]--; + + TAILQ_INSERT_TAIL(&tqpair->state_queue[state], tcp_req, state_link); + tqpair->state_cntr[state]++; + + tcp_req->state = state; +} + +static inline struct nvme_tcp_pdu * +nvmf_tcp_req_pdu_init(struct spdk_nvmf_tcp_req *tcp_req) +{ + assert(tcp_req->pdu_in_use == false); + tcp_req->pdu_in_use = true; + + memset(tcp_req->pdu, 0, sizeof(*tcp_req->pdu)); + tcp_req->pdu->qpair = SPDK_CONTAINEROF(tcp_req->req.qpair, struct spdk_nvmf_tcp_qpair, qpair); + + return tcp_req->pdu; +} + +static inline void +nvmf_tcp_req_pdu_fini(struct spdk_nvmf_tcp_req *tcp_req) +{ + tcp_req->pdu_in_use = false; +} + +static struct spdk_nvmf_tcp_req * +nvmf_tcp_req_get(struct spdk_nvmf_tcp_qpair *tqpair) +{ + struct spdk_nvmf_tcp_req *tcp_req; + + tcp_req = TAILQ_FIRST(&tqpair->state_queue[TCP_REQUEST_STATE_FREE]); + if (!tcp_req) { + return NULL; + } + + memset(&tcp_req->rsp, 0, sizeof(tcp_req->rsp)); + tcp_req->h2c_offset = 0; + tcp_req->has_incapsule_data = false; + tcp_req->req.dif.dif_insert_or_strip = false; + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_NEW); + return tcp_req; +} + +static void +nvmf_tcp_request_free(struct spdk_nvmf_tcp_req *tcp_req) +{ + struct spdk_nvmf_tcp_transport *ttransport; + + assert(tcp_req != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tcp_req=%p will be freed\n", tcp_req); + ttransport = SPDK_CONTAINEROF(tcp_req->req.qpair->transport, + struct spdk_nvmf_tcp_transport, transport); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_COMPLETED); + nvmf_tcp_req_process(ttransport, tcp_req); +} + +static int +nvmf_tcp_req_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_tcp_req *tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req); + + nvmf_tcp_request_free(tcp_req); + + return 0; +} + +static void +nvmf_tcp_drain_state_queue(struct spdk_nvmf_tcp_qpair *tqpair, + enum spdk_nvmf_tcp_req_state state) +{ + struct spdk_nvmf_tcp_req *tcp_req, *req_tmp; + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->state_queue[state], state_link, req_tmp) { + nvmf_tcp_request_free(tcp_req); + } +} + +static void +nvmf_tcp_cleanup_all_states(struct spdk_nvmf_tcp_qpair *tqpair) +{ + struct spdk_nvmf_tcp_req *tcp_req, *req_tmp; + + assert(TAILQ_EMPTY(&tqpair->send_queue)); + + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST); + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_NEW); 
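/*
 * [Editorial sketch, not part of the diff] Every tcp_req sits on exactly one
 * state_queue[] list and is counted once in state_cntr[], so the counters always sum to
 * resource_count. A hypothetical debug helper that checks this invariant, assuming only
 * the spdk_nvmf_tcp_qpair layout defined above:
 */
static void
tcp_qpair_assert_state_counters(struct spdk_nvmf_tcp_qpair *tqpair)
{
	uint32_t i, total = 0;

	for (i = 0; i < TCP_REQUEST_NUM_STATES; i++) {
		total += tqpair->state_cntr[i];
	}
	/* every request is in exactly one state at any point in time */
	assert(total == tqpair->resource_count);
}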
+ + /* Wipe the requests waiting for buffer from the global list */ + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->state_queue[TCP_REQUEST_STATE_NEED_BUFFER], state_link, + req_tmp) { + STAILQ_REMOVE(&tqpair->group->group.pending_buf_queue, &tcp_req->req, + spdk_nvmf_request, buf_link); + } + + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_NEED_BUFFER); + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_EXECUTING); + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_AWAITING_R2T_ACK); +} + +static void +nvmf_tcp_dump_qpair_req_contents(struct spdk_nvmf_tcp_qpair *tqpair) +{ + int i; + struct spdk_nvmf_tcp_req *tcp_req; + + SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", tqpair->qpair.qid); + for (i = 1; i < TCP_REQUEST_NUM_STATES; i++) { + SPDK_ERRLOG("\tNum of requests in state[%d] = %u\n", i, tqpair->state_cntr[i]); + TAILQ_FOREACH(tcp_req, &tqpair->state_queue[i], state_link) { + SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", tcp_req->req.data_from_pool); + SPDK_ERRLOG("\t\tRequest opcode: %d\n", tcp_req->req.cmd->nvmf_cmd.opcode); + } + } +} + +static void +nvmf_tcp_qpair_destroy(struct spdk_nvmf_tcp_qpair *tqpair) +{ + int err = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + + err = spdk_sock_close(&tqpair->sock); + assert(err == 0); + nvmf_tcp_cleanup_all_states(tqpair); + + if (tqpair->state_cntr[TCP_REQUEST_STATE_FREE] != tqpair->resource_count) { + SPDK_ERRLOG("tqpair(%p) free tcp request num is %u but should be %u\n", tqpair, + tqpair->state_cntr[TCP_REQUEST_STATE_FREE], + tqpair->resource_count); + err++; + } + + if (err > 0) { + nvmf_tcp_dump_qpair_req_contents(tqpair); + } + + spdk_dma_free(tqpair->pdus); + free(tqpair->reqs); + spdk_free(tqpair->bufs); + free(tqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Leave\n"); +} + +static int +nvmf_tcp_destroy(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_tcp_transport *ttransport; + + assert(transport != NULL); + ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport); + + pthread_mutex_destroy(&ttransport->lock); + free(ttransport); + return 0; +} + +static struct spdk_nvmf_transport * +nvmf_tcp_create(struct spdk_nvmf_transport_opts *opts) +{ + struct spdk_nvmf_tcp_transport *ttransport; + uint32_t sge_count; + uint32_t min_shared_buffers; + + ttransport = calloc(1, sizeof(*ttransport)); + if (!ttransport) { + return NULL; + } + + TAILQ_INIT(&ttransport->ports); + + ttransport->transport.ops = &spdk_nvmf_transport_tcp; + + SPDK_NOTICELOG("*** TCP Transport Init ***\n"); + + SPDK_INFOLOG(SPDK_LOG_NVMF_TCP, "*** TCP Transport Init ***\n" + " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" + " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" + " in_capsule_data_size=%d, max_aq_depth=%d\n" + " num_shared_buffers=%d, c2h_success=%d,\n" + " dif_insert_or_strip=%d, sock_priority=%d\n" + " abort_timeout_sec=%d\n", + opts->max_queue_depth, + opts->max_io_size, + opts->max_qpairs_per_ctrlr - 1, + opts->io_unit_size, + opts->in_capsule_data_size, + opts->max_aq_depth, + opts->num_shared_buffers, + opts->c2h_success, + opts->dif_insert_or_strip, + opts->sock_priority, + opts->abort_timeout_sec); + + if (opts->sock_priority > SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY) { + SPDK_ERRLOG("Unsupported socket_priority=%d, the current range is: 0 to %d\n" + "you can use man 7 socket to view the range of priority under SO_PRIORITY item\n", + opts->sock_priority, 
SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY); + free(ttransport); + return NULL; + } + + /* I/O unit size cannot be larger than max I/O size */ + if (opts->io_unit_size > opts->max_io_size) { + opts->io_unit_size = opts->max_io_size; + } + + sge_count = opts->max_io_size / opts->io_unit_size; + if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) { + SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); + free(ttransport); + return NULL; + } + + min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size; + if (min_shared_buffers > opts->num_shared_buffers) { + SPDK_ERRLOG("There are not enough buffers to satisfy" + "per-poll group caches for each thread. (%" PRIu32 ")" + "supplied. (%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers); + SPDK_ERRLOG("Please specify a larger number of shared buffers\n"); + nvmf_tcp_destroy(&ttransport->transport); + return NULL; + } + + pthread_mutex_init(&ttransport->lock, NULL); + + return &ttransport->transport; +} + +static int +nvmf_tcp_trsvcid_to_int(const char *trsvcid) +{ + unsigned long long ull; + char *end = NULL; + + ull = strtoull(trsvcid, &end, 10); + if (end == NULL || end == trsvcid || *end != '\0') { + return -1; + } + + /* Valid TCP/IP port numbers are in [0, 65535] */ + if (ull > 65535) { + return -1; + } + + return (int)ull; +} + +/** + * Canonicalize a listen address trid. + */ +static int +nvmf_tcp_canon_listen_trid(struct spdk_nvme_transport_id *canon_trid, + const struct spdk_nvme_transport_id *trid) +{ + int trsvcid_int; + + trsvcid_int = nvmf_tcp_trsvcid_to_int(trid->trsvcid); + if (trsvcid_int < 0) { + return -EINVAL; + } + + memset(canon_trid, 0, sizeof(*canon_trid)); + spdk_nvme_trid_populate_transport(canon_trid, SPDK_NVME_TRANSPORT_TCP); + canon_trid->adrfam = trid->adrfam; + snprintf(canon_trid->traddr, sizeof(canon_trid->traddr), "%s", trid->traddr); + snprintf(canon_trid->trsvcid, sizeof(canon_trid->trsvcid), "%d", trsvcid_int); + + return 0; +} + +/** + * Find an existing listening port. + * + * Caller must hold ttransport->lock. 
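/*
 * [Editorial note, not part of the diff] nvmf_tcp_trsvcid_to_int() above accepts only a
 * pure decimal string in [0, 65535], and nvmf_tcp_canon_listen_trid() re-prints the
 * accepted value with "%d" so that differently spelled but equal ports compare equal.
 * A few illustrative inputs:
 *
 *     nvmf_tcp_trsvcid_to_int("4420")    -> 4420
 *     nvmf_tcp_trsvcid_to_int("004420")  -> 4420  (later canonicalized to "4420")
 *     nvmf_tcp_trsvcid_to_int("4420x")   -> -1    (*end != '\0')
 *     nvmf_tcp_trsvcid_to_int("")        -> -1    (end == trsvcid)
 *     nvmf_tcp_trsvcid_to_int("70000")   -> -1    (> 65535)
 */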
+ */ +static struct spdk_nvmf_tcp_port * +nvmf_tcp_find_port(struct spdk_nvmf_tcp_transport *ttransport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_transport_id canon_trid; + struct spdk_nvmf_tcp_port *port; + + if (nvmf_tcp_canon_listen_trid(&canon_trid, trid) != 0) { + return NULL; + } + + TAILQ_FOREACH(port, &ttransport->ports, link) { + if (spdk_nvme_transport_id_compare(&canon_trid, port->trid) == 0) { + return port; + } + } + + return NULL; +} + +static int +nvmf_tcp_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_tcp_port *port; + int trsvcid_int; + uint8_t adrfam; + struct spdk_sock_opts opts; + + ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport); + + trsvcid_int = nvmf_tcp_trsvcid_to_int(trid->trsvcid); + if (trsvcid_int < 0) { + SPDK_ERRLOG("Invalid trsvcid '%s'\n", trid->trsvcid); + return -EINVAL; + } + + pthread_mutex_lock(&ttransport->lock); + port = calloc(1, sizeof(*port)); + if (!port) { + SPDK_ERRLOG("Port allocation failed\n"); + pthread_mutex_unlock(&ttransport->lock); + return -ENOMEM; + } + + port->trid = trid; + opts.opts_size = sizeof(opts); + spdk_sock_get_default_opts(&opts); + opts.priority = transport->opts.sock_priority; + port->listen_sock = spdk_sock_listen_ext(trid->traddr, trsvcid_int, + NULL, &opts); + if (port->listen_sock == NULL) { + SPDK_ERRLOG("spdk_sock_listen(%s, %d) failed: %s (%d)\n", + trid->traddr, trsvcid_int, + spdk_strerror(errno), errno); + free(port); + pthread_mutex_unlock(&ttransport->lock); + return -errno; + } + + if (spdk_sock_is_ipv4(port->listen_sock)) { + adrfam = SPDK_NVMF_ADRFAM_IPV4; + } else if (spdk_sock_is_ipv6(port->listen_sock)) { + adrfam = SPDK_NVMF_ADRFAM_IPV6; + } else { + SPDK_ERRLOG("Unhandled socket type\n"); + adrfam = 0; + } + + if (adrfam != trid->adrfam) { + SPDK_ERRLOG("Socket address family mismatch\n"); + spdk_sock_close(&port->listen_sock); + free(port); + pthread_mutex_unlock(&ttransport->lock); + return -EINVAL; + } + + SPDK_NOTICELOG("*** NVMe/TCP Target Listening on %s port %s ***\n", + trid->traddr, trid->trsvcid); + + TAILQ_INSERT_TAIL(&ttransport->ports, port, link); + pthread_mutex_unlock(&ttransport->lock); + return 0; +} + +static void +nvmf_tcp_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_tcp_port *port; + + ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Removing listen address %s port %s\n", + trid->traddr, trid->trsvcid); + + pthread_mutex_lock(&ttransport->lock); + port = nvmf_tcp_find_port(ttransport, trid); + if (port) { + TAILQ_REMOVE(&ttransport->ports, port, link); + spdk_sock_close(&port->listen_sock); + free(port); + } + + pthread_mutex_unlock(&ttransport->lock); +} + +static void nvmf_tcp_qpair_set_recv_state(struct spdk_nvmf_tcp_qpair *tqpair, + enum nvme_tcp_pdu_recv_state state); + +static void +nvmf_tcp_qpair_disconnect(struct spdk_nvmf_tcp_qpair *tqpair) +{ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Disconnecting qpair %p\n", tqpair); + + if (tqpair->state <= NVME_TCP_QPAIR_STATE_RUNNING) { + tqpair->state = NVME_TCP_QPAIR_STATE_EXITING; + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + spdk_poller_unregister(&tqpair->timeout_poller); + + /* This will end up calling nvmf_tcp_close_qpair */ + 
spdk_nvmf_qpair_disconnect(&tqpair->qpair, NULL, NULL); + } +} + +static void +_pdu_write_done(void *_pdu, int err) +{ + struct nvme_tcp_pdu *pdu = _pdu; + struct spdk_nvmf_tcp_qpair *tqpair = pdu->qpair; + + TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); + + if (err != 0) { + nvmf_tcp_qpair_disconnect(tqpair); + return; + } + + assert(pdu->cb_fn != NULL); + pdu->cb_fn(pdu->cb_arg); +} + +static void +nvmf_tcp_qpair_write_pdu(struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu, + nvme_tcp_qpair_xfer_complete_cb cb_fn, + void *cb_arg) +{ + int hlen; + uint32_t crc32c; + uint32_t mapped_length = 0; + ssize_t rc; + + assert(&tqpair->pdu_in_progress != pdu); + + hlen = pdu->hdr.common.hlen; + + /* Header Digest */ + if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->host_hdgst_enable) { + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + MAKE_DIGEST_WORD((uint8_t *)pdu->hdr.raw + hlen, crc32c); + } + + /* Data Digest */ + if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && tqpair->host_ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + MAKE_DIGEST_WORD(pdu->data_digest, crc32c); + } + + pdu->cb_fn = cb_fn; + pdu->cb_arg = cb_arg; + + pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, SPDK_COUNTOF(pdu->iov), pdu, + tqpair->host_hdgst_enable, tqpair->host_ddgst_enable, + &mapped_length); + pdu->sock_req.cb_fn = _pdu_write_done; + pdu->sock_req.cb_arg = pdu; + TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq); + if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP || + pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ) { + rc = spdk_sock_writev(tqpair->sock, pdu->iov, pdu->sock_req.iovcnt); + if (rc == mapped_length) { + _pdu_write_done(pdu, 0); + } else { + SPDK_ERRLOG("IC_RESP or TERM_REQ could not write to socket.\n"); + _pdu_write_done(pdu, -1); + } + } else { + spdk_sock_writev_async(tqpair->sock, &pdu->sock_req); + } +} + +static int +nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair) +{ + uint32_t i; + struct spdk_nvmf_transport_opts *opts; + uint32_t in_capsule_data_size; + + opts = &tqpair->qpair.transport->opts; + + in_capsule_data_size = opts->in_capsule_data_size; + if (opts->dif_insert_or_strip) { + in_capsule_data_size = SPDK_BDEV_BUF_SIZE_WITH_MD(in_capsule_data_size); + } + + tqpair->resource_count = opts->max_queue_depth; + + tqpair->mgmt_pdu.qpair = tqpair; + + tqpair->reqs = calloc(tqpair->resource_count, sizeof(*tqpair->reqs)); + if (!tqpair->reqs) { + SPDK_ERRLOG("Unable to allocate reqs on tqpair=%p\n", tqpair); + return -1; + } + + if (in_capsule_data_size) { + tqpair->bufs = spdk_zmalloc(tqpair->resource_count * in_capsule_data_size, 0x1000, + NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + if (!tqpair->bufs) { + SPDK_ERRLOG("Unable to allocate bufs on tqpair=%p.\n", tqpair); + return -1; + } + } + + tqpair->pdus = spdk_dma_malloc(tqpair->resource_count * sizeof(*tqpair->pdus), 0x1000, NULL); + if (!tqpair->pdus) { + SPDK_ERRLOG("Unable to allocate pdu pool on tqpair =%p.\n", tqpair); + return -1; + } + + for (i = 0; i < tqpair->resource_count; i++) { + struct spdk_nvmf_tcp_req *tcp_req = &tqpair->reqs[i]; + + tcp_req->ttag = i + 1; + tcp_req->req.qpair = &tqpair->qpair; + + tcp_req->pdu = &tqpair->pdus[i]; + tcp_req->pdu->qpair = tqpair; + + /* Set up memory to receive commands */ + if (tqpair->bufs) { + tcp_req->buf = (void *)((uintptr_t)tqpair->bufs + (i * in_capsule_data_size)); + } + + /* Set the cmdn and rsp */ + tcp_req->req.rsp = (union nvmf_c2h_msg *)&tcp_req->rsp; + 
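/*
 * [Editorial note, not part of the diff] The reqs, pdus and bufs arrays allocated above
 * are parallel: request i owns pdus[i], the in-capsule slice at
 * bufs + i * in_capsule_data_size, and transfer tag i + 1 (so ttags start at 1).
 * For example, with illustrative values max_queue_depth = 128 and
 * in_capsule_data_size = 4096, the qpair reserves 128 * 4096 = 512 KiB of in-capsule
 * buffer space and request 0 uses bytes [0, 4096) with ttag 1.
 */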
tcp_req->req.cmd = (union nvmf_h2c_msg *)&tcp_req->cmd; + + /* Initialize request state to FREE */ + tcp_req->state = TCP_REQUEST_STATE_FREE; + TAILQ_INSERT_TAIL(&tqpair->state_queue[tcp_req->state], tcp_req, state_link); + tqpair->state_cntr[TCP_REQUEST_STATE_FREE]++; + } + + tqpair->recv_buf_size = (in_capsule_data_size + sizeof(struct spdk_nvme_tcp_cmd) + 2 * + SPDK_NVME_TCP_DIGEST_LEN) * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR; + + return 0; +} + +static int +nvmf_tcp_qpair_init(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + int i; + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "New TCP Connection: %p\n", qpair); + + TAILQ_INIT(&tqpair->send_queue); + + /* Initialise request state queues of the qpair */ + for (i = TCP_REQUEST_STATE_FREE; i < TCP_REQUEST_NUM_STATES; i++) { + TAILQ_INIT(&tqpair->state_queue[i]); + } + + tqpair->host_hdgst_enable = true; + tqpair->host_ddgst_enable = true; + + return 0; +} + +static int +nvmf_tcp_qpair_sock_init(struct spdk_nvmf_tcp_qpair *tqpair) +{ + int rc; + + /* set low water mark */ + rc = spdk_sock_set_recvlowat(tqpair->sock, sizeof(struct spdk_nvme_tcp_common_pdu_hdr)); + if (rc != 0) { + SPDK_ERRLOG("spdk_sock_set_recvlowat() failed\n"); + return rc; + } + + return 0; +} + +static void +nvmf_tcp_handle_connect(struct spdk_nvmf_transport *transport, + struct spdk_nvmf_tcp_port *port, + struct spdk_sock *sock) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "New connection accepted on %s port %s\n", + port->trid->traddr, port->trid->trsvcid); + + tqpair = calloc(1, sizeof(struct spdk_nvmf_tcp_qpair)); + if (tqpair == NULL) { + SPDK_ERRLOG("Could not allocate new connection.\n"); + spdk_sock_close(&sock); + return; + } + + tqpair->sock = sock; + tqpair->state_cntr[TCP_REQUEST_STATE_FREE] = 0; + tqpair->port = port; + tqpair->qpair.transport = transport; + + rc = spdk_sock_getaddr(tqpair->sock, tqpair->target_addr, + sizeof(tqpair->target_addr), &tqpair->target_port, + tqpair->initiator_addr, sizeof(tqpair->initiator_addr), + &tqpair->initiator_port); + if (rc < 0) { + SPDK_ERRLOG("spdk_sock_getaddr() failed of tqpair=%p\n", tqpair); + nvmf_tcp_qpair_destroy(tqpair); + return; + } + + spdk_nvmf_tgt_new_qpair(transport->tgt, &tqpair->qpair); +} + +static uint32_t +nvmf_tcp_port_accept(struct spdk_nvmf_transport *transport, struct spdk_nvmf_tcp_port *port) +{ + struct spdk_sock *sock; + uint32_t count = 0; + int i; + + for (i = 0; i < NVMF_TCP_MAX_ACCEPT_SOCK_ONE_TIME; i++) { + sock = spdk_sock_accept(port->listen_sock); + if (sock == NULL) { + break; + } + count++; + nvmf_tcp_handle_connect(transport, port, sock); + } + + return count; +} + +static uint32_t +nvmf_tcp_accept(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_tcp_port *port; + uint32_t count = 0; + + ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport); + + TAILQ_FOREACH(port, &ttransport->ports, link) { + count += nvmf_tcp_port_accept(transport, port); + } + + return count; +} + +static void +nvmf_tcp_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + entry->trtype = SPDK_NVMF_TRTYPE_TCP; + entry->adrfam = trid->adrfam; + entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED; + + spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); + 
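/*
 * [Editorial note, not part of the diff] Discovery log page entries use fixed-width,
 * space-padded ASCII fields rather than NUL-terminated strings, which is why
 * spdk_strcpy_pad() is used here instead of snprintf(). For instance, "4420" fills the
 * 32-byte TRSVCID field as '4','4','2','0' followed by 28 spaces and no terminator
 * (trsvcid_field below is a made-up local used only for illustration):
 */
char trsvcid_field[32];
spdk_strcpy_pad(trsvcid_field, "4420", sizeof(trsvcid_field), ' ');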
spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); + + entry->tsas.tcp.sectype = SPDK_NVME_TCP_SECURITY_NONE; +} + +static struct spdk_nvmf_transport_poll_group * +nvmf_tcp_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + + tgroup = calloc(1, sizeof(*tgroup)); + if (!tgroup) { + return NULL; + } + + tgroup->sock_group = spdk_sock_group_create(&tgroup->group); + if (!tgroup->sock_group) { + goto cleanup; + } + + TAILQ_INIT(&tgroup->qpairs); + TAILQ_INIT(&tgroup->await_req); + + return &tgroup->group; + +cleanup: + free(tgroup); + return NULL; +} + +static struct spdk_nvmf_transport_poll_group * +nvmf_tcp_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + struct spdk_sock_group *group = NULL; + int rc; + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + rc = spdk_sock_get_optimal_sock_group(tqpair->sock, &group); + if (!rc && group != NULL) { + return spdk_sock_group_get_ctx(group); + } + + return NULL; +} + +static void +nvmf_tcp_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + + tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group); + spdk_sock_group_close(&tgroup->sock_group); + + free(tgroup); +} + +static void +nvmf_tcp_qpair_set_recv_state(struct spdk_nvmf_tcp_qpair *tqpair, + enum nvme_tcp_pdu_recv_state state) +{ + if (tqpair->recv_state == state) { + SPDK_ERRLOG("The recv state of tqpair=%p is same with the state(%d) to be set\n", + tqpair, state); + return; + } + + if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_REQ) { + /* When leaving the await req state, move the qpair to the main list */ + TAILQ_REMOVE(&tqpair->group->await_req, tqpair, link); + TAILQ_INSERT_TAIL(&tqpair->group->qpairs, tqpair, link); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tqpair(%p) recv state=%d\n", tqpair, state); + tqpair->recv_state = state; + + switch (state) { + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_REQ: + TAILQ_REMOVE(&tqpair->group->qpairs, tqpair, link); + TAILQ_INSERT_TAIL(&tqpair->group->await_req, tqpair, link); + break; + case NVME_TCP_PDU_RECV_STATE_ERROR: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + memset(&tqpair->pdu_in_progress, 0, sizeof(tqpair->pdu_in_progress)); + break; + default: + SPDK_ERRLOG("The state(%d) is invalid\n", state); + abort(); + break; + } +} + +static int +nvmf_tcp_qpair_handle_timeout(void *ctx) +{ + struct spdk_nvmf_tcp_qpair *tqpair = ctx; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_ERROR); + + SPDK_ERRLOG("No pdu coming for tqpair=%p within %d seconds\n", tqpair, + SPDK_NVME_TCP_QPAIR_EXIT_TIMEOUT); + + nvmf_tcp_qpair_disconnect(tqpair); + return SPDK_POLLER_BUSY; +} + +static void +nvmf_tcp_send_c2h_term_req_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_qpair *tqpair = (struct spdk_nvmf_tcp_qpair *)cb_arg; + + if (!tqpair->timeout_poller) { + tqpair->timeout_poller = SPDK_POLLER_REGISTER(nvmf_tcp_qpair_handle_timeout, tqpair, + SPDK_NVME_TCP_QPAIR_EXIT_TIMEOUT * 1000000); + } +} + +static void +nvmf_tcp_send_c2h_term_req(struct spdk_nvmf_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, + enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_term_req_hdr *c2h_term_req; + uint32_t c2h_term_req_hdr_len = 
sizeof(*c2h_term_req); + uint32_t copy_len; + + rsp_pdu = &tqpair->mgmt_pdu; + + c2h_term_req = &rsp_pdu->hdr.term_req; + c2h_term_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ; + c2h_term_req->common.hlen = c2h_term_req_hdr_len; + + if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + DSET32(&c2h_term_req->fei, error_offset); + } + + copy_len = spdk_min(pdu->hdr.common.hlen, SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE); + + /* Copy the error info into the buffer */ + memcpy((uint8_t *)rsp_pdu->hdr.raw + c2h_term_req_hdr_len, pdu->hdr.raw, copy_len); + nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + c2h_term_req_hdr_len, copy_len); + + /* Contain the header of the wrong received pdu */ + c2h_term_req->common.plen = c2h_term_req->common.hlen + copy_len; + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_send_c2h_term_req_complete, tqpair); +} + +static void +nvmf_tcp_capsule_cmd_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvmf_tcp_req *tcp_req; + + assert(pdu->psh_valid_bytes == pdu->psh_len); + assert(pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD); + + tcp_req = nvmf_tcp_req_get(tqpair); + if (!tcp_req) { + /* Directly return and make the allocation retry again */ + if (tqpair->state_cntr[TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST] > 0) { + return; + } + + /* The host sent more commands than the maximum queue depth. */ + SPDK_ERRLOG("Cannot allocate tcp_req on tqpair=%p\n", tqpair); + nvmf_tcp_qpair_disconnect(tqpair); + return; + } + + pdu->req = tcp_req; + assert(tcp_req->state == TCP_REQUEST_STATE_NEW); + nvmf_tcp_req_process(ttransport, tcp_req); +} + +static void +nvmf_tcp_capsule_cmd_payload_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvmf_tcp_req *tcp_req; + struct spdk_nvme_tcp_cmd *capsule_cmd; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + capsule_cmd = &pdu->hdr.capsule_cmd; + tcp_req = pdu->req; + assert(tcp_req != NULL); + if (capsule_cmd->common.pdo > SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET) { + SPDK_ERRLOG("Expected ICReq capsule_cmd pdu offset <= %d, got %c\n", + SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET, capsule_cmd->common.pdo); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdo); + goto err; + } + + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + nvmf_tcp_req_process(ttransport, tcp_req); + + return; +err: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static int +nvmf_tcp_find_req_in_state(struct spdk_nvmf_tcp_qpair *tqpair, + enum spdk_nvmf_tcp_req_state state, + uint16_t cid, uint16_t tag, + struct spdk_nvmf_tcp_req **req) +{ + struct spdk_nvmf_tcp_req *tcp_req = NULL; + + TAILQ_FOREACH(tcp_req, &tqpair->state_queue[state], state_link) { + if (tcp_req->req.cmd->nvme_cmd.cid != cid) { + continue; + } + + if (tcp_req->ttag == tag) { + *req = tcp_req; + return 0; + } + + *req = NULL; + return -1; + } + + /* Didn't find it, but not an error */ + *req = NULL; + return 0; +} + +static void +nvmf_tcp_h2c_data_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct 
spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvmf_tcp_req *tcp_req; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes = 0; + struct spdk_nvme_tcp_h2c_data_hdr *h2c_data; + int rc; + + h2c_data = &pdu->hdr.h2c_data; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tqpair=%p, r2t_info: datao=%u, datal=%u, cccid=%u, ttag=%u\n", + tqpair, h2c_data->datao, h2c_data->datal, h2c_data->cccid, h2c_data->ttag); + + rc = nvmf_tcp_find_req_in_state(tqpair, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + h2c_data->cccid, h2c_data->ttag, &tcp_req); + if (rc == 0 && tcp_req == NULL) { + rc = nvmf_tcp_find_req_in_state(tqpair, TCP_REQUEST_STATE_AWAITING_R2T_ACK, h2c_data->cccid, + h2c_data->ttag, &tcp_req); + } + + if (!tcp_req) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tcp_req is not found for tqpair=%p\n", tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER; + if (rc == 0) { + error_offset = offsetof(struct spdk_nvme_tcp_h2c_data_hdr, cccid); + } else { + error_offset = offsetof(struct spdk_nvme_tcp_h2c_data_hdr, ttag); + } + goto err; + } + + if (tcp_req->h2c_offset != h2c_data->datao) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, + "tcp_req(%p), tqpair=%p, expected data offset %u, but data offset is %u\n", + tcp_req, tqpair, tcp_req->h2c_offset, h2c_data->datao); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + goto err; + } + + if ((h2c_data->datao + h2c_data->datal) > tcp_req->req.length) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, + "tcp_req(%p), tqpair=%p, (datao=%u + datal=%u) execeeds requested length=%u\n", + tcp_req, tqpair, h2c_data->datao, h2c_data->datal, tcp_req->req.length); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + goto err; + } + + pdu->req = tcp_req; + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + pdu->dif_ctx = &tcp_req->req.dif.dif_ctx; + } + + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->req.iov, tcp_req->req.iovcnt, + h2c_data->datao, h2c_data->datal); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; + +err: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static void +nvmf_tcp_pdu_cmd_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_req *tcp_req = cb_arg; + nvmf_tcp_request_free(tcp_req); +} + +static void +nvmf_tcp_send_capsule_resp_pdu(struct spdk_nvmf_tcp_req *tcp_req, + struct spdk_nvmf_tcp_qpair *tqpair) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_rsp *capsule_resp; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter, tqpair=%p\n", tqpair); + + rsp_pdu = nvmf_tcp_req_pdu_init(tcp_req); + assert(rsp_pdu != NULL); + + capsule_resp = &rsp_pdu->hdr.capsule_resp; + capsule_resp->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP; + capsule_resp->common.plen = capsule_resp->common.hlen = sizeof(*capsule_resp); + capsule_resp->rccqe = tcp_req->req.rsp->nvme_cpl; + if (tqpair->host_hdgst_enable) { + capsule_resp->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + capsule_resp->common.plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_pdu_cmd_complete, tcp_req); +} + +static void +nvmf_tcp_pdu_c2h_data_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_req *tcp_req = cb_arg; + struct spdk_nvmf_tcp_qpair *tqpair = SPDK_CONTAINEROF(tcp_req->req.qpair, + struct spdk_nvmf_tcp_qpair, qpair); + + assert(tqpair != NULL); + if (tqpair->qpair.transport->opts.c2h_success) { + nvmf_tcp_request_free(tcp_req); + } else { + nvmf_tcp_req_pdu_fini(tcp_req); + 
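/*
 * [Editorial worked example, not part of the diff] For the bounds checks in
 * nvmf_tcp_h2c_data_hdr_handle() above: if a 16 KiB write arrives as two H2C DATA PDUs,
 * the target expects datao = 0 and then datao = 8192, each equal to tcp_req->h2c_offset
 * at the time, and any PDU where datao + datal > 16384 is answered with a C2H
 * termination request carrying FES "Data Transfer Out of Range". Condensed, the check is:
 *
 *     if (h2c_data->datao != tcp_req->h2c_offset ||
 *         h2c_data->datao + h2c_data->datal > tcp_req->req.length) {
 *             nvmf_tcp_send_c2h_term_req(tqpair, pdu,
 *                 SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0);
 *     }
 */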
nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair); + } +} + +static void +nvmf_tcp_r2t_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_req *tcp_req = cb_arg; + struct spdk_nvmf_tcp_transport *ttransport; + + nvmf_tcp_req_pdu_fini(tcp_req); + + ttransport = SPDK_CONTAINEROF(tcp_req->req.qpair->transport, + struct spdk_nvmf_tcp_transport, transport); + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + + if (tcp_req->h2c_offset == tcp_req->req.length) { + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + nvmf_tcp_req_process(ttransport, tcp_req); + } +} + +static void +nvmf_tcp_send_r2t_pdu(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_req *tcp_req) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_r2t_hdr *r2t; + + rsp_pdu = nvmf_tcp_req_pdu_init(tcp_req); + assert(rsp_pdu != NULL); + + r2t = &rsp_pdu->hdr.r2t; + r2t->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_R2T; + r2t->common.plen = r2t->common.hlen = sizeof(*r2t); + + if (tqpair->host_hdgst_enable) { + r2t->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + r2t->common.plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + r2t->cccid = tcp_req->req.cmd->nvme_cmd.cid; + r2t->ttag = tcp_req->ttag; + r2t->r2to = tcp_req->h2c_offset; + r2t->r2tl = tcp_req->req.length; + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_AWAITING_R2T_ACK); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, + "tcp_req(%p) on tqpair(%p), r2t_info: cccid=%u, ttag=%u, r2to=%u, r2tl=%u\n", + tcp_req, tqpair, r2t->cccid, r2t->ttag, r2t->r2to, r2t->r2tl); + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_r2t_complete, tcp_req); +} + +static void +nvmf_tcp_h2c_data_payload_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvmf_tcp_req *tcp_req; + + tcp_req = pdu->req; + assert(tcp_req != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + + tcp_req->h2c_offset += pdu->data_len; + + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + + /* Wait for all of the data to arrive AND for the initial R2T PDU send to be + * acknowledged before moving on. 
*/ + if (tcp_req->h2c_offset == tcp_req->req.length && + tcp_req->state == TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER) { + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + nvmf_tcp_req_process(ttransport, tcp_req); + } +} + +static void +nvmf_tcp_h2c_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *h2c_term_req) +{ + SPDK_ERRLOG("Error info of pdu(%p): %s\n", h2c_term_req, + spdk_nvmf_tcp_term_req_fes_str[h2c_term_req->fes]); + if ((h2c_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (h2c_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "The offset from the start of the PDU header is %u\n", + DGET32(h2c_term_req->fei)); + } +} + +static void +nvmf_tcp_h2c_term_req_hdr_handle(struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_term_req_hdr *h2c_term_req = &pdu->hdr.term_req; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + + if (h2c_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) { + SPDK_ERRLOG("Fatal Error Stauts(FES) is unknown for h2c_term_req pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes); + goto end; + } + + /* set the data buffer */ + nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + h2c_term_req->common.hlen, + h2c_term_req->common.plen - h2c_term_req->common.hlen); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; +end: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static void +nvmf_tcp_h2c_term_req_payload_handle(struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_term_req_hdr *h2c_term_req = &pdu->hdr.term_req; + + nvmf_tcp_h2c_term_req_dump(h2c_term_req); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); +} + +static void +nvmf_tcp_pdu_payload_handle(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_transport *ttransport) +{ + int rc = 0; + struct nvme_tcp_pdu *pdu; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + pdu = &tqpair->pdu_in_progress; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + /* check data digest if need */ + if (pdu->ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("Data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); + return; + + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD: + nvmf_tcp_capsule_cmd_payload_handle(ttransport, tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_H2C_DATA: + nvmf_tcp_h2c_data_payload_handle(ttransport, tqpair, pdu); + break; + + case SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ: + nvmf_tcp_h2c_term_req_payload_handle(tqpair, pdu); + break; + + default: + /* The code should not go to here */ + SPDK_ERRLOG("The code should not go to here\n"); + break; + } +} + +static void +nvmf_tcp_send_icresp_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_qpair *tqpair = cb_arg; + + tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING; +} + +static void +nvmf_tcp_icreq_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_qpair *tqpair, + 
struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_ic_req *ic_req = &pdu->hdr.ic_req; + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_ic_resp *ic_resp; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + /* Only PFV 0 is defined currently */ + if (ic_req->pfv != 0) { + SPDK_ERRLOG("Expected ICReq PFV %u, got %u\n", 0u, ic_req->pfv); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_req, pfv); + goto end; + } + + /* MAXR2T is 0's based */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "maxr2t =%u\n", (ic_req->maxr2t + 1u)); + + tqpair->host_hdgst_enable = ic_req->dgst.bits.hdgst_enable ? true : false; + if (!tqpair->host_hdgst_enable) { + tqpair->recv_buf_size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR; + } + + tqpair->host_ddgst_enable = ic_req->dgst.bits.ddgst_enable ? true : false; + if (!tqpair->host_ddgst_enable) { + tqpair->recv_buf_size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR; + } + + /* Now that we know whether digests are enabled, properly size the receive buffer */ + if (spdk_sock_set_recvbuf(tqpair->sock, tqpair->recv_buf_size) < 0) { + SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n", + tqpair, + tqpair->recv_buf_size); + /* Not fatal. */ + } + + tqpair->cpda = spdk_min(ic_req->hpda, SPDK_NVME_TCP_CPDA_MAX); + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "cpda of tqpair=(%p) is : %u\n", tqpair, tqpair->cpda); + + rsp_pdu = &tqpair->mgmt_pdu; + + ic_resp = &rsp_pdu->hdr.ic_resp; + ic_resp->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_RESP; + ic_resp->common.hlen = ic_resp->common.plen = sizeof(*ic_resp); + ic_resp->pfv = 0; + ic_resp->cpda = tqpair->cpda; + ic_resp->maxh2cdata = ttransport->transport.opts.max_io_size; + ic_resp->dgst.bits.hdgst_enable = tqpair->host_hdgst_enable ? 1 : 0; + ic_resp->dgst.bits.ddgst_enable = tqpair->host_ddgst_enable ? 
1 : 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "host_hdgst_enable: %u\n", tqpair->host_hdgst_enable); + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "host_ddgst_enable: %u\n", tqpair->host_ddgst_enable); + + tqpair->state = NVME_TCP_QPAIR_STATE_INITIALIZING; + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_send_icresp_complete, tqpair); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + return; +end: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static void +nvmf_tcp_pdu_psh_handle(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_transport *ttransport) +{ + struct nvme_tcp_pdu *pdu; + int rc; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + pdu = &tqpair->pdu_in_progress; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "pdu type of tqpair(%p) is %d\n", tqpair, + pdu->hdr.common.pdu_type); + /* check header digest if needed */ + if (pdu->has_hdgst) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Compare the header of pdu=%p on tqpair=%p\n", pdu, tqpair); + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c); + if (rc == 0) { + SPDK_ERRLOG("Header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); + return; + + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_IC_REQ: + nvmf_tcp_icreq_handle(ttransport, tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD: + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_REQ); + break; + case SPDK_NVME_TCP_PDU_TYPE_H2C_DATA: + nvmf_tcp_h2c_data_hdr_handle(ttransport, tqpair, pdu); + break; + + case SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ: + nvmf_tcp_h2c_term_req_hdr_handle(tqpair, pdu); + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->pdu_in_progress.hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = 1; + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); + break; + } +} + +static void +nvmf_tcp_pdu_ch_handle(struct spdk_nvmf_tcp_qpair *tqpair) +{ + struct nvme_tcp_pdu *pdu; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + uint8_t expected_hlen, pdo; + bool plen_error = false, pdo_error = false; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH); + pdu = &tqpair->pdu_in_progress; + + if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_REQ) { + if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) { + SPDK_ERRLOG("Already received ICreq PDU, and reject this pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + expected_hlen = sizeof(struct spdk_nvme_tcp_ic_req); + if (pdu->hdr.common.plen != expected_hlen) { + plen_error = true; + } + } else { + if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) { + SPDK_ERRLOG("The TCP/IP connection is not negotitated\n"); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD: + expected_hlen = sizeof(struct spdk_nvme_tcp_cmd); + pdo = pdu->hdr.common.pdo; + if ((tqpair->cpda != 0) && (pdo != ((tqpair->cpda + 1) << 2))) { + pdo_error = true; + break; + } + + if (pdu->hdr.common.plen < expected_hlen) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_H2C_DATA: + expected_hlen = 
sizeof(struct spdk_nvme_tcp_h2c_data_hdr); + pdo = pdu->hdr.common.pdo; + if ((tqpair->cpda != 0) && (pdo != ((tqpair->cpda + 1) << 2))) { + pdo_error = true; + break; + } + if (pdu->hdr.common.plen < expected_hlen) { + plen_error = true; + } + break; + + case SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ: + expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr); + if ((pdu->hdr.common.plen <= expected_hlen) || + (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) { + plen_error = true; + } + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", pdu->hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type); + goto err; + } + } + + if (pdu->hdr.common.hlen != expected_hlen) { + SPDK_ERRLOG("PDU type=0x%02x, Expected ICReq header length %u, got %u on tqpair=%p\n", + pdu->hdr.common.pdu_type, + expected_hlen, pdu->hdr.common.hlen, tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen); + goto err; + } else if (pdo_error) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdo); + } else if (plen_error) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen); + goto err; + } else { + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + nvme_tcp_pdu_calc_psh_len(&tqpair->pdu_in_progress, tqpair->host_hdgst_enable); + return; + } +err: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static int +nvmf_tcp_pdu_payload_insert_dif(struct nvme_tcp_pdu *pdu, uint32_t read_offset, + int read_len) +{ + int rc; + + rc = spdk_dif_generate_stream(pdu->data_iov, pdu->data_iovcnt, + read_offset, read_len, pdu->dif_ctx); + if (rc != 0) { + SPDK_ERRLOG("DIF generate failed\n"); + } + + return rc; +} + +static int +nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair) +{ + int rc = 0; + struct nvme_tcp_pdu *pdu; + enum nvme_tcp_pdu_recv_state prev_state; + uint32_t data_len; + struct spdk_nvmf_tcp_transport *ttransport = SPDK_CONTAINEROF(tqpair->qpair.transport, + struct spdk_nvmf_tcp_transport, transport); + + /* The loop here is to allow for several back-to-back state changes. 
*/ + do { + prev_state = tqpair->recv_state; + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tqpair(%p) recv pdu entering state %d\n", tqpair, prev_state); + + pdu = &tqpair->pdu_in_progress; + switch (tqpair->recv_state) { + /* Wait for the common header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + if (spdk_unlikely(tqpair->state == NVME_TCP_QPAIR_STATE_INITIALIZING)) { + return rc; + } + + rc = nvme_tcp_read_data(tqpair->sock, + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes, + (void *)&pdu->hdr.common + pdu->ch_valid_bytes); + if (rc < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "will disconnect tqpair=%p\n", tqpair); + return NVME_TCP_PDU_FATAL; + } else if (rc > 0) { + pdu->ch_valid_bytes += rc; + spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, 0, rc, 0, 0); + if (spdk_likely(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY)) { + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH); + } + } + + if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + /* The command header of this PDU has now been read from the socket. */ + nvmf_tcp_pdu_ch_handle(tqpair); + break; + /* Wait for the pdu specific header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + rc = nvme_tcp_read_data(tqpair->sock, + pdu->psh_len - pdu->psh_valid_bytes, + (void *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes); + if (rc < 0) { + return NVME_TCP_PDU_FATAL; + } else if (rc > 0) { + spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, + 0, rc, 0, 0); + pdu->psh_valid_bytes += rc; + } + + if (pdu->psh_valid_bytes < pdu->psh_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + /* All header(ch, psh, head digist) of this PDU has now been read from the socket. */ + nvmf_tcp_pdu_psh_handle(tqpair, ttransport); + break; + /* Wait for the req slot */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_REQ: + nvmf_tcp_capsule_cmd_hdr_handle(ttransport, tqpair, pdu); + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + /* check whether the data is valid, if not we just return */ + if (!pdu->data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + data_len = pdu->data_len; + /* data digest */ + if (spdk_unlikely((pdu->hdr.common.pdu_type != SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ) && + tqpair->host_ddgst_enable)) { + data_len += SPDK_NVME_TCP_DIGEST_LEN; + pdu->ddgst_enable = true; + } + + rc = nvme_tcp_read_payload_data(tqpair->sock, pdu); + if (rc < 0) { + return NVME_TCP_PDU_FATAL; + } + pdu->readv_offset += rc; + + if (spdk_unlikely(pdu->dif_ctx != NULL)) { + rc = nvmf_tcp_pdu_payload_insert_dif(pdu, pdu->readv_offset - rc, rc); + if (rc != 0) { + return NVME_TCP_PDU_FATAL; + } + } + + if (pdu->readv_offset < data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + /* All of this PDU has now been read from the socket. 
*/ + nvmf_tcp_pdu_payload_handle(tqpair, ttransport); + break; + case NVME_TCP_PDU_RECV_STATE_ERROR: + if (!spdk_sock_is_connected(tqpair->sock)) { + return NVME_TCP_PDU_FATAL; + } + break; + default: + assert(0); + SPDK_ERRLOG("code should not come to here"); + break; + } + } while (tqpair->recv_state != prev_state); + + return rc; +} + +static int +nvmf_tcp_req_parse_sgl(struct spdk_nvmf_tcp_req *tcp_req, + struct spdk_nvmf_transport *transport, + struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_request *req = &tcp_req->req; + struct spdk_nvme_cmd *cmd; + struct spdk_nvme_cpl *rsp; + struct spdk_nvme_sgl_descriptor *sgl; + uint32_t length; + + cmd = &req->cmd->nvme_cmd; + rsp = &req->rsp->nvme_cpl; + sgl = &cmd->dptr.sgl1; + + length = sgl->unkeyed.length; + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_TRANSPORT) { + if (length > transport->opts.max_io_size) { + SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", + length, transport->opts.max_io_size); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + /* fill request length and populate iovs */ + req->length = length; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Data requested length= 0x%x\n", length); + + if (spdk_unlikely(req->dif.dif_insert_or_strip)) { + req->dif.orig_length = length; + length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx); + req->dif.elba_length = length; + } + + if (spdk_nvmf_request_get_buffers(req, group, transport, length)) { + /* No available buffers. Queue this request up. */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "No available large data buffers. Queueing request %p\n", + tcp_req); + return 0; + } + + /* backward compatible */ + req->data = req->iov[0].iov_base; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Request %p took %d buffer/s from central pool, and data=%p\n", + tcp_req, req->iovcnt, req->data); + + return 0; + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { + uint64_t offset = sgl->address; + uint32_t max_len = transport->opts.in_capsule_data_size; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", + offset, length); + + if (offset > max_len) { + SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", + offset, max_len); + rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; + return -1; + } + max_len -= (uint32_t)offset; + + if (length > max_len) { + SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", + length, max_len); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + req->data = tcp_req->buf + offset; + req->data_from_pool = false; + req->length = length; + + if (spdk_unlikely(req->dif.dif_insert_or_strip)) { + length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx); + req->dif.elba_length = length; + } + + req->iov[0].iov_base = req->data; + req->iov[0].iov_len = length; + req->iovcnt = 1; + + return 0; + } + + SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", + sgl->generic.type, sgl->generic.subtype); + rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; + return -1; +} + +static inline enum spdk_nvme_media_error_status_code +nvmf_tcp_dif_error_to_compl_status(uint8_t err_type) { + enum spdk_nvme_media_error_status_code result; + + switch (err_type) + { + case SPDK_DIF_REFTAG_ERROR: + result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR; + break; + case 
SPDK_DIF_APPTAG_ERROR: + result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR; + break; + case SPDK_DIF_GUARD_ERROR: + result = SPDK_NVME_SC_GUARD_CHECK_ERROR; + break; + default: + SPDK_UNREACHABLE(); + break; + } + + return result; +} + +static void +nvmf_tcp_send_c2h_data(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_req *tcp_req) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_c2h_data_hdr *c2h_data; + uint32_t plen, pdo, alignment; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + + rsp_pdu = nvmf_tcp_req_pdu_init(tcp_req); + assert(rsp_pdu != NULL); + + c2h_data = &rsp_pdu->hdr.c2h_data; + c2h_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_C2H_DATA; + plen = c2h_data->common.hlen = sizeof(*c2h_data); + + if (tqpair->host_hdgst_enable) { + plen += SPDK_NVME_TCP_DIGEST_LEN; + c2h_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + } + + /* set the psh */ + c2h_data->cccid = tcp_req->req.cmd->nvme_cmd.cid; + c2h_data->datal = tcp_req->req.length; + c2h_data->datao = 0; + + /* set the padding */ + rsp_pdu->padding_len = 0; + pdo = plen; + if (tqpair->cpda) { + alignment = (tqpair->cpda + 1) << 2; + if (alignment > plen) { + rsp_pdu->padding_len = alignment - plen; + pdo = plen = alignment; + } + } + + c2h_data->common.pdo = pdo; + plen += c2h_data->datal; + if (tqpair->host_ddgst_enable) { + c2h_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + c2h_data->common.plen = plen; + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + rsp_pdu->dif_ctx = &tcp_req->req.dif.dif_ctx; + } + + nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->req.iov, tcp_req->req.iovcnt, + c2h_data->datao, c2h_data->datal); + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + struct spdk_nvme_cpl *rsp = &tcp_req->req.rsp->nvme_cpl; + struct spdk_dif_error err_blk = {}; + + rc = spdk_dif_verify_stream(rsp_pdu->data_iov, rsp_pdu->data_iovcnt, + 0, rsp_pdu->data_len, rsp_pdu->dif_ctx, &err_blk); + if (rc != 0) { + SPDK_ERRLOG("DIF error detected. 
type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR; + rsp->status.sc = nvmf_tcp_dif_error_to_compl_status(err_blk.err_type); + nvmf_tcp_req_pdu_fini(tcp_req); + nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair); + return; + } + } + + c2h_data->common.flags |= SPDK_NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; + if (tqpair->qpair.transport->opts.c2h_success) { + c2h_data->common.flags |= SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS; + } + + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_pdu_c2h_data_complete, tcp_req); +} + +static int +request_transfer_out(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_tcp_req *tcp_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_tcp_qpair *tqpair; + struct spdk_nvme_cpl *rsp; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + + qpair = req->qpair; + rsp = &req->rsp->nvme_cpl; + tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req); + + /* Advance our sq_head pointer */ + if (qpair->sq_head == qpair->sq_head_max) { + qpair->sq_head = 0; + } else { + qpair->sq_head++; + } + rsp->sqhd = qpair->sq_head; + + tqpair = SPDK_CONTAINEROF(tcp_req->req.qpair, struct spdk_nvmf_tcp_qpair, qpair); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST); + if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + nvmf_tcp_send_c2h_data(tqpair, tcp_req); + } else { + nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair); + } + + return 0; +} + +static void +nvmf_tcp_set_incapsule_data(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_req *tcp_req) +{ + struct nvme_tcp_pdu *pdu; + uint32_t plen = 0; + + pdu = &tqpair->pdu_in_progress; + plen = pdu->hdr.common.hlen; + + if (tqpair->host_hdgst_enable) { + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + if (pdu->hdr.common.plen != plen) { + tcp_req->has_incapsule_data = true; + } +} + +static bool +nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_req *tcp_req) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + int rc; + enum spdk_nvmf_tcp_req_state prev_state; + bool progress = false; + struct spdk_nvmf_transport *transport = &ttransport->transport; + struct spdk_nvmf_transport_poll_group *group; + + tqpair = SPDK_CONTAINEROF(tcp_req->req.qpair, struct spdk_nvmf_tcp_qpair, qpair); + group = &tqpair->group->group; + assert(tcp_req->state != TCP_REQUEST_STATE_FREE); + + /* If the qpair is not active, we need to abort the outstanding requests. */ + if (tqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + if (tcp_req->state == TCP_REQUEST_STATE_NEED_BUFFER) { + STAILQ_REMOVE(&group->pending_buf_queue, &tcp_req->req, spdk_nvmf_request, buf_link); + } + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_COMPLETED); + } + + /* The loop here is to allow for several back-to-back state changes. */ + do { + prev_state = tcp_req->state; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Request %p entering state %d on tqpair=%p\n", tcp_req, prev_state, + tqpair); + + switch (tcp_req->state) { + case TCP_REQUEST_STATE_FREE: + /* Some external code must kick a request into TCP_REQUEST_STATE_NEW + * to escape this state. 
*/ + break; + case TCP_REQUEST_STATE_NEW: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_NEW, 0, 0, (uintptr_t)tcp_req, 0); + + /* copy the cmd from the receive pdu */ + tcp_req->cmd = tqpair->pdu_in_progress.hdr.capsule_cmd.ccsqe; + + if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&tcp_req->req, &tcp_req->req.dif.dif_ctx))) { + tcp_req->req.dif.dif_insert_or_strip = true; + tqpair->pdu_in_progress.dif_ctx = &tcp_req->req.dif.dif_ctx; + } + + /* The next state transition depends on the data transfer needs of this request. */ + tcp_req->req.xfer = spdk_nvmf_req_get_xfer(&tcp_req->req); + + /* If no data to transfer, ready to execute. */ + if (tcp_req->req.xfer == SPDK_NVME_DATA_NONE) { + /* Reset the tqpair receving pdu state */ + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + break; + } + + nvmf_tcp_set_incapsule_data(tqpair, tcp_req); + + if (!tcp_req->has_incapsule_data) { + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + } + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_NEED_BUFFER); + STAILQ_INSERT_TAIL(&group->pending_buf_queue, &tcp_req->req, buf_link); + break; + case TCP_REQUEST_STATE_NEED_BUFFER: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_NEED_BUFFER, 0, 0, (uintptr_t)tcp_req, 0); + + assert(tcp_req->req.xfer != SPDK_NVME_DATA_NONE); + + if (!tcp_req->has_incapsule_data && (&tcp_req->req != STAILQ_FIRST(&group->pending_buf_queue))) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, + "Not the first element to wait for the buf for tcp_req(%p) on tqpair=%p\n", + tcp_req, tqpair); + /* This request needs to wait in line to obtain a buffer */ + break; + } + + /* Try to get a data buffer */ + rc = nvmf_tcp_req_parse_sgl(tcp_req, transport, group); + if (rc < 0) { + STAILQ_REMOVE_HEAD(&group->pending_buf_queue, buf_link); + /* Reset the tqpair receving pdu state */ + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_COMPLETE); + break; + } + + if (!tcp_req->req.data) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "No buffer allocated for tcp_req(%p) on tqpair(%p\n)", + tcp_req, tqpair); + /* No buffers available. */ + break; + } + + STAILQ_REMOVE(&group->pending_buf_queue, &tcp_req->req, spdk_nvmf_request, buf_link); + + /* If data is transferring from host to controller, we need to do a transfer from the host. */ + if (tcp_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + if (tcp_req->req.data_from_pool) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Sending R2T for tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair); + nvmf_tcp_send_r2t_pdu(tqpair, tcp_req); + } else { + struct nvme_tcp_pdu *pdu; + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + + pdu = &tqpair->pdu_in_progress; + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Not need to send r2t for tcp_req(%p) on tqpair=%p\n", tcp_req, + tqpair); + /* No need to send r2t, contained in the capsuled data */ + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->req.iov, tcp_req->req.iovcnt, + 0, tcp_req->req.length); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + } + break; + } + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + break; + case TCP_REQUEST_STATE_AWAITING_R2T_ACK: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK, 0, 0, (uintptr_t)tcp_req, 0); + /* The R2T completion or the h2c data incoming will kick it out of this state. 
*/ + break; + case TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + + spdk_trace_record(TRACE_TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, + (uintptr_t)tcp_req, 0); + /* Some external code must kick a request into TCP_REQUEST_STATE_READY_TO_EXECUTE + * to escape this state. */ + break; + case TCP_REQUEST_STATE_READY_TO_EXECUTE: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, (uintptr_t)tcp_req, 0); + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + assert(tcp_req->req.dif.elba_length >= tcp_req->req.length); + tcp_req->req.length = tcp_req->req.dif.elba_length; + } + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTING); + spdk_nvmf_request_exec(&tcp_req->req); + break; + case TCP_REQUEST_STATE_EXECUTING: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTING, 0, 0, (uintptr_t)tcp_req, 0); + /* Some external code must kick a request into TCP_REQUEST_STATE_EXECUTED + * to escape this state. */ + break; + case TCP_REQUEST_STATE_EXECUTED: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTED, 0, 0, (uintptr_t)tcp_req, 0); + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + tcp_req->req.length = tcp_req->req.dif.orig_length; + } + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_COMPLETE); + break; + case TCP_REQUEST_STATE_READY_TO_COMPLETE: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, (uintptr_t)tcp_req, 0); + rc = request_transfer_out(&tcp_req->req); + assert(rc == 0); /* No good way to handle this currently */ + break; + case TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, + (uintptr_t)tcp_req, + 0); + /* Some external code must kick a request into TCP_REQUEST_STATE_COMPLETED + * to escape this state. 
*/ + break; + case TCP_REQUEST_STATE_COMPLETED: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_COMPLETED, 0, 0, (uintptr_t)tcp_req, 0); + if (tcp_req->req.data_from_pool) { + spdk_nvmf_request_free_buffers(&tcp_req->req, group, transport); + } + tcp_req->req.length = 0; + tcp_req->req.iovcnt = 0; + tcp_req->req.data = NULL; + + nvmf_tcp_req_pdu_fini(tcp_req); + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_FREE); + break; + case TCP_REQUEST_NUM_STATES: + default: + assert(0); + break; + } + + if (tcp_req->state != prev_state) { + progress = true; + } + } while (tcp_req->state != prev_state); + + return progress; +} + +static void +nvmf_tcp_sock_cb(void *arg, struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_nvmf_tcp_qpair *tqpair = arg; + int rc; + + assert(tqpair != NULL); + rc = nvmf_tcp_sock_process(tqpair); + + /* If there was a new socket error, disconnect */ + if (rc < 0) { + nvmf_tcp_qpair_disconnect(tqpair); + } +} + +static int +nvmf_tcp_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + struct spdk_nvmf_tcp_qpair *tqpair; + int rc; + + tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group); + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + + rc = spdk_sock_group_add_sock(tgroup->sock_group, tqpair->sock, + nvmf_tcp_sock_cb, tqpair); + if (rc != 0) { + SPDK_ERRLOG("Could not add sock to sock_group: %s (%d)\n", + spdk_strerror(errno), errno); + return -1; + } + + rc = nvmf_tcp_qpair_sock_init(tqpair); + if (rc != 0) { + SPDK_ERRLOG("Cannot set sock opt for tqpair=%p\n", tqpair); + return -1; + } + + rc = nvmf_tcp_qpair_init(&tqpair->qpair); + if (rc < 0) { + SPDK_ERRLOG("Cannot init tqpair=%p\n", tqpair); + return -1; + } + + rc = nvmf_tcp_qpair_init_mem_resource(tqpair); + if (rc < 0) { + SPDK_ERRLOG("Cannot init memory resource info for tqpair=%p\n", tqpair); + return -1; + } + + tqpair->group = tgroup; + tqpair->state = NVME_TCP_QPAIR_STATE_INVALID; + TAILQ_INSERT_TAIL(&tgroup->qpairs, tqpair, link); + + return 0; +} + +static int +nvmf_tcp_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + struct spdk_nvmf_tcp_qpair *tqpair; + int rc; + + tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group); + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + + assert(tqpair->group == tgroup); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "remove tqpair=%p from the tgroup=%p\n", tqpair, tgroup); + if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_REQ) { + TAILQ_REMOVE(&tgroup->await_req, tqpair, link); + } else { + TAILQ_REMOVE(&tgroup->qpairs, tqpair, link); + } + + rc = spdk_sock_group_remove_sock(tgroup->sock_group, tqpair->sock); + if (rc != 0) { + SPDK_ERRLOG("Could not remove sock from sock_group: %s (%d)\n", + spdk_strerror(errno), errno); + } + + return rc; +} + +static int +nvmf_tcp_req_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_tcp_req *tcp_req; + + ttransport = SPDK_CONTAINEROF(req->qpair->transport, struct spdk_nvmf_tcp_transport, transport); + tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req); + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTED); + nvmf_tcp_req_process(ttransport, tcp_req); + + return 0; +} + +static void +nvmf_tcp_close_qpair(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_qpair 
*tqpair; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Qpair: %p\n", qpair); + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + tqpair->state = NVME_TCP_QPAIR_STATE_EXITED; + nvmf_tcp_qpair_destroy(tqpair); +} + +static int +nvmf_tcp_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + int rc; + struct spdk_nvmf_request *req, *req_tmp; + struct spdk_nvmf_tcp_req *tcp_req; + struct spdk_nvmf_tcp_qpair *tqpair, *tqpair_tmp; + struct spdk_nvmf_tcp_transport *ttransport = SPDK_CONTAINEROF(group->transport, + struct spdk_nvmf_tcp_transport, transport); + + tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group); + + if (spdk_unlikely(TAILQ_EMPTY(&tgroup->qpairs) && TAILQ_EMPTY(&tgroup->await_req))) { + return 0; + } + + STAILQ_FOREACH_SAFE(req, &group->pending_buf_queue, buf_link, req_tmp) { + tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req); + if (nvmf_tcp_req_process(ttransport, tcp_req) == false) { + break; + } + } + + rc = spdk_sock_group_poll(tgroup->sock_group); + if (rc < 0) { + SPDK_ERRLOG("Failed to poll sock_group=%p\n", tgroup->sock_group); + } + + TAILQ_FOREACH_SAFE(tqpair, &tgroup->await_req, link, tqpair_tmp) { + nvmf_tcp_sock_process(tqpair); + } + + return rc; +} + +static int +nvmf_tcp_qpair_get_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid, bool peer) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + uint16_t port; + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_TCP); + + if (peer) { + snprintf(trid->traddr, sizeof(trid->traddr), "%s", tqpair->initiator_addr); + port = tqpair->initiator_port; + } else { + snprintf(trid->traddr, sizeof(trid->traddr), "%s", tqpair->target_addr); + port = tqpair->target_port; + } + + if (spdk_sock_is_ipv4(tqpair->sock)) { + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + } else if (spdk_sock_is_ipv6(tqpair->sock)) { + trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; + } else { + return -1; + } + + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%d", port); + return 0; +} + +static int +nvmf_tcp_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_tcp_qpair_get_trid(qpair, trid, 0); +} + +static int +nvmf_tcp_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_tcp_qpair_get_trid(qpair, trid, 1); +} + +static int +nvmf_tcp_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_tcp_qpair_get_trid(qpair, trid, 0); +} + +static void +nvmf_tcp_req_set_abort_status(struct spdk_nvmf_request *req, + struct spdk_nvmf_tcp_req *tcp_req_to_abort) +{ + tcp_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + tcp_req_to_abort->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + + nvmf_tcp_req_set_state(tcp_req_to_abort, TCP_REQUEST_STATE_READY_TO_COMPLETE); + + req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command was successfully aborted. 
*/ +} + +static int +_nvmf_tcp_qpair_abort_request(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_tcp_req *tcp_req_to_abort = SPDK_CONTAINEROF(req->req_to_abort, + struct spdk_nvmf_tcp_req, req); + struct spdk_nvmf_tcp_qpair *tqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair, + struct spdk_nvmf_tcp_qpair, qpair); + int rc; + + spdk_poller_unregister(&req->poller); + + switch (tcp_req_to_abort->state) { + case TCP_REQUEST_STATE_EXECUTING: + rc = nvmf_ctrlr_abort_request(req); + if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) { + return SPDK_POLLER_BUSY; + } + break; + + case TCP_REQUEST_STATE_NEED_BUFFER: + STAILQ_REMOVE(&tqpair->group->group.pending_buf_queue, + &tcp_req_to_abort->req, spdk_nvmf_request, buf_link); + + nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort); + break; + + case TCP_REQUEST_STATE_AWAITING_R2T_ACK: + nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort); + break; + + case TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + if (spdk_get_ticks() < req->timeout_tsc) { + req->poller = SPDK_POLLER_REGISTER(_nvmf_tcp_qpair_abort_request, req, 0); + return SPDK_POLLER_BUSY; + } + break; + + default: + break; + } + + spdk_nvmf_request_complete(req); + return SPDK_POLLER_BUSY; +} + +static void +nvmf_tcp_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_transport *transport; + uint16_t cid; + uint32_t i; + struct spdk_nvmf_tcp_req *tcp_req_to_abort = NULL; + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + ttransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_tcp_transport, transport); + transport = &ttransport->transport; + + cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; + + for (i = 0; i < tqpair->resource_count; i++) { + tcp_req_to_abort = &tqpair->reqs[i]; + + if (tcp_req_to_abort->state != TCP_REQUEST_STATE_FREE && + tcp_req_to_abort->req.cmd->nvme_cmd.cid == cid) { + break; + } + } + + if (tcp_req_to_abort == NULL) { + spdk_nvmf_request_complete(req); + return; + } + + req->req_to_abort = &tcp_req_to_abort->req; + req->timeout_tsc = spdk_get_ticks() + + transport->opts.abort_timeout_sec * spdk_get_ticks_hz(); + req->poller = NULL; + + _nvmf_tcp_qpair_abort_request(req); +} + +#define SPDK_NVMF_TCP_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_TCP_DEFAULT_AQ_DEPTH 128 +#define SPDK_NVMF_TCP_DEFAULT_MAX_QPAIRS_PER_CTRLR 128 +#define SPDK_NVMF_TCP_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 +#define SPDK_NVMF_TCP_DEFAULT_MAX_IO_SIZE 131072 +#define SPDK_NVMF_TCP_DEFAULT_IO_UNIT_SIZE 131072 +#define SPDK_NVMF_TCP_DEFAULT_NUM_SHARED_BUFFERS 511 +#define SPDK_NVMF_TCP_DEFAULT_BUFFER_CACHE_SIZE 32 +#define SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION true +#define SPDK_NVMF_TCP_DEFAULT_DIF_INSERT_OR_STRIP false +#define SPDK_NVMF_TCP_DEFAULT_SOCK_PRIORITY 0 +#define SPDK_NVMF_TCP_DEFAULT_ABORT_TIMEOUT_SEC 1 + +static void +nvmf_tcp_opts_init(struct spdk_nvmf_transport_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_TCP_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_TCP_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_TCP_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_TCP_DEFAULT_MAX_IO_SIZE; + opts->io_unit_size = SPDK_NVMF_TCP_DEFAULT_IO_UNIT_SIZE; + opts->max_aq_depth = SPDK_NVMF_TCP_DEFAULT_AQ_DEPTH; + opts->num_shared_buffers = SPDK_NVMF_TCP_DEFAULT_NUM_SHARED_BUFFERS; + opts->buf_cache_size = 
SPDK_NVMF_TCP_DEFAULT_BUFFER_CACHE_SIZE; + opts->c2h_success = SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION; + opts->dif_insert_or_strip = SPDK_NVMF_TCP_DEFAULT_DIF_INSERT_OR_STRIP; + opts->sock_priority = SPDK_NVMF_TCP_DEFAULT_SOCK_PRIORITY; + opts->abort_timeout_sec = SPDK_NVMF_TCP_DEFAULT_ABORT_TIMEOUT_SEC; +} + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_tcp = { + .name = "TCP", + .type = SPDK_NVME_TRANSPORT_TCP, + .opts_init = nvmf_tcp_opts_init, + .create = nvmf_tcp_create, + .destroy = nvmf_tcp_destroy, + + .listen = nvmf_tcp_listen, + .stop_listen = nvmf_tcp_stop_listen, + .accept = nvmf_tcp_accept, + + .listener_discover = nvmf_tcp_discover, + + .poll_group_create = nvmf_tcp_poll_group_create, + .get_optimal_poll_group = nvmf_tcp_get_optimal_poll_group, + .poll_group_destroy = nvmf_tcp_poll_group_destroy, + .poll_group_add = nvmf_tcp_poll_group_add, + .poll_group_remove = nvmf_tcp_poll_group_remove, + .poll_group_poll = nvmf_tcp_poll_group_poll, + + .req_free = nvmf_tcp_req_free, + .req_complete = nvmf_tcp_req_complete, + + .qpair_fini = nvmf_tcp_close_qpair, + .qpair_get_local_trid = nvmf_tcp_qpair_get_local_trid, + .qpair_get_peer_trid = nvmf_tcp_qpair_get_peer_trid, + .qpair_get_listen_trid = nvmf_tcp_qpair_get_listen_trid, + .qpair_abort_request = nvmf_tcp_qpair_abort_request, +}; + +SPDK_NVMF_TRANSPORT_REGISTER(tcp, &spdk_nvmf_transport_tcp); +SPDK_LOG_REGISTER_COMPONENT("nvmf_tcp", SPDK_LOG_NVMF_TCP) diff --git a/src/spdk/lib/nvmf/transport.c b/src/spdk/lib/nvmf/transport.c new file mode 100644 index 000000000..11bb152df --- /dev/null +++ b/src/spdk/lib/nvmf/transport.c @@ -0,0 +1,572 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2018-2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/config.h" +#include "spdk/log.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_transport.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +#define MAX_MEMPOOL_NAME_LENGTH 40 + +struct nvmf_transport_ops_list_element { + struct spdk_nvmf_transport_ops ops; + TAILQ_ENTRY(nvmf_transport_ops_list_element) link; +}; + +TAILQ_HEAD(nvmf_transport_ops_list, nvmf_transport_ops_list_element) +g_spdk_nvmf_transport_ops = TAILQ_HEAD_INITIALIZER(g_spdk_nvmf_transport_ops); + +static inline const struct spdk_nvmf_transport_ops * +nvmf_get_transport_ops(const char *transport_name) +{ + struct nvmf_transport_ops_list_element *ops; + TAILQ_FOREACH(ops, &g_spdk_nvmf_transport_ops, link) { + if (strcasecmp(transport_name, ops->ops.name) == 0) { + return &ops->ops; + } + } + return NULL; +} + +void +spdk_nvmf_transport_register(const struct spdk_nvmf_transport_ops *ops) +{ + struct nvmf_transport_ops_list_element *new_ops; + + if (nvmf_get_transport_ops(ops->name) != NULL) { + SPDK_ERRLOG("Double registering nvmf transport type %s.\n", ops->name); + assert(false); + return; + } + + new_ops = calloc(1, sizeof(*new_ops)); + if (new_ops == NULL) { + SPDK_ERRLOG("Unable to allocate memory to register new transport type %s.\n", ops->name); + assert(false); + return; + } + + new_ops->ops = *ops; + + TAILQ_INSERT_TAIL(&g_spdk_nvmf_transport_ops, new_ops, link); +} + +const struct spdk_nvmf_transport_opts * +spdk_nvmf_get_transport_opts(struct spdk_nvmf_transport *transport) +{ + return &transport->opts; +} + +spdk_nvme_transport_type_t +spdk_nvmf_get_transport_type(struct spdk_nvmf_transport *transport) +{ + return transport->ops->type; +} + +const char * +spdk_nvmf_get_transport_name(struct spdk_nvmf_transport *transport) +{ + return transport->ops->name; +} + +struct spdk_nvmf_transport * +spdk_nvmf_transport_create(const char *transport_name, struct spdk_nvmf_transport_opts *opts) +{ + const struct spdk_nvmf_transport_ops *ops = NULL; + struct spdk_nvmf_transport *transport; + char spdk_mempool_name[MAX_MEMPOOL_NAME_LENGTH]; + int chars_written; + + ops = nvmf_get_transport_ops(transport_name); + if (!ops) { + SPDK_ERRLOG("Transport type '%s' unavailable.\n", transport_name); + return NULL; + } + + if (opts->max_aq_depth < SPDK_NVMF_MIN_ADMIN_MAX_SQ_SIZE) { + SPDK_ERRLOG("max_aq_depth %u is less than minimum defined by NVMf spec, use min value\n", + opts->max_aq_depth); + opts->max_aq_depth = SPDK_NVMF_MIN_ADMIN_MAX_SQ_SIZE; + } + + transport = ops->create(opts); + if (!transport) { + SPDK_ERRLOG("Unable to create new transport of type %s\n", transport_name); + return NULL; + } + + TAILQ_INIT(&transport->listeners); + + transport->ops = ops; + transport->opts = *opts; + chars_written = snprintf(spdk_mempool_name, MAX_MEMPOOL_NAME_LENGTH, "%s_%s_%s", "spdk_nvmf", + transport_name, "data"); + if (chars_written < 0) { + SPDK_ERRLOG("Unable to generate transport data buffer pool name.\n"); + ops->destroy(transport); + return NULL; + } + + transport->data_buf_pool = spdk_mempool_create(spdk_mempool_name, + opts->num_shared_buffers, + opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + + if (!transport->data_buf_pool) { + SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n"); + ops->destroy(transport); + return NULL; + } + + return transport; +} + +struct spdk_nvmf_transport * +spdk_nvmf_transport_get_first(struct spdk_nvmf_tgt *tgt) +{ + 
return TAILQ_FIRST(&tgt->transports); +} + +struct spdk_nvmf_transport * +spdk_nvmf_transport_get_next(struct spdk_nvmf_transport *transport) +{ + return TAILQ_NEXT(transport, link); +} + +int +spdk_nvmf_transport_destroy(struct spdk_nvmf_transport *transport) +{ + if (transport->data_buf_pool != NULL) { + if (spdk_mempool_count(transport->data_buf_pool) != + transport->opts.num_shared_buffers) { + SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n", + spdk_mempool_count(transport->data_buf_pool), + transport->opts.num_shared_buffers); + } + } + + spdk_mempool_free(transport->data_buf_pool); + + return transport->ops->destroy(transport); +} + +struct spdk_nvmf_listener * +nvmf_transport_find_listener(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + TAILQ_FOREACH(listener, &transport->listeners, link) { + if (spdk_nvme_transport_id_compare(&listener->trid, trid) == 0) { + return listener; + } + } + + return NULL; +} + +int +spdk_nvmf_transport_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + int rc; + + listener = nvmf_transport_find_listener(transport, trid); + if (!listener) { + listener = calloc(1, sizeof(*listener)); + if (!listener) { + return -ENOMEM; + } + + listener->ref = 1; + listener->trid = *trid; + TAILQ_INSERT_TAIL(&transport->listeners, listener, link); + + rc = transport->ops->listen(transport, &listener->trid); + if (rc != 0) { + TAILQ_REMOVE(&transport->listeners, listener, link); + free(listener); + } + return rc; + } + + ++listener->ref; + + return 0; +} + +int +spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + listener = nvmf_transport_find_listener(transport, trid); + if (!listener) { + return -ENOENT; + } + + if (--listener->ref == 0) { + TAILQ_REMOVE(&transport->listeners, listener, link); + transport->ops->stop_listen(transport, trid); + free(listener); + } + + return 0; +} + +uint32_t +nvmf_transport_accept(struct spdk_nvmf_transport *transport) +{ + return transport->ops->accept(transport); +} + +void +nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + transport->ops->listener_discover(transport, trid, entry); +} + +struct spdk_nvmf_transport_poll_group * +nvmf_transport_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_transport_poll_group *group; + struct spdk_nvmf_transport_pg_cache_buf *buf; + + group = transport->ops->poll_group_create(transport); + if (!group) { + return NULL; + } + group->transport = transport; + + STAILQ_INIT(&group->pending_buf_queue); + STAILQ_INIT(&group->buf_cache); + + if (transport->opts.buf_cache_size) { + group->buf_cache_count = 0; + group->buf_cache_size = transport->opts.buf_cache_size; + while (group->buf_cache_count < group->buf_cache_size) { + buf = (struct spdk_nvmf_transport_pg_cache_buf *)spdk_mempool_get(transport->data_buf_pool); + if (!buf) { + SPDK_NOTICELOG("Unable to reserve the full number of buffers for the pg buffer cache.\n"); + break; + } + STAILQ_INSERT_HEAD(&group->buf_cache, buf, link); + group->buf_cache_count++; + } + } + return group; +} + +struct spdk_nvmf_transport_poll_group * +nvmf_transport_get_optimal_poll_group(struct spdk_nvmf_transport *transport, + struct 
spdk_nvmf_qpair *qpair) +{ + if (transport->ops->get_optimal_poll_group) { + return transport->ops->get_optimal_poll_group(qpair); + } else { + return NULL; + } +} + +void +nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_transport_pg_cache_buf *buf, *tmp; + + if (!STAILQ_EMPTY(&group->pending_buf_queue)) { + SPDK_ERRLOG("Pending I/O list wasn't empty on poll group destruction\n"); + } + + STAILQ_FOREACH_SAFE(buf, &group->buf_cache, link, tmp) { + STAILQ_REMOVE(&group->buf_cache, buf, spdk_nvmf_transport_pg_cache_buf, link); + spdk_mempool_put(group->transport->data_buf_pool, buf); + } + group->transport->ops->poll_group_destroy(group); +} + +int +nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + if (qpair->transport) { + assert(qpair->transport == group->transport); + if (qpair->transport != group->transport) { + return -1; + } + } else { + qpair->transport = group->transport; + } + + return group->transport->ops->poll_group_add(group, qpair); +} + +int +nvmf_transport_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + int rc = ENOTSUP; + + assert(qpair->transport == group->transport); + if (group->transport->ops->poll_group_remove) { + rc = group->transport->ops->poll_group_remove(group, qpair); + } + + return rc; +} + +int +nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + return group->transport->ops->poll_group_poll(group); +} + +int +nvmf_transport_req_free(struct spdk_nvmf_request *req) +{ + return req->qpair->transport->ops->req_free(req); +} + +int +nvmf_transport_req_complete(struct spdk_nvmf_request *req) +{ + return req->qpair->transport->ops->req_complete(req); +} + +void +nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair) +{ + qpair->transport->ops->qpair_fini(qpair); +} + +int +nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_peer_trid(qpair, trid); +} + +int +nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_local_trid(qpair, trid); +} + +int +nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_listen_trid(qpair, trid); +} + +void +nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + qpair->transport->ops->qpair_abort_request(qpair, req); +} + +bool +spdk_nvmf_transport_opts_init(const char *transport_name, + struct spdk_nvmf_transport_opts *opts) +{ + const struct spdk_nvmf_transport_ops *ops; + + ops = nvmf_get_transport_ops(transport_name); + if (!ops) { + SPDK_ERRLOG("Transport type %s unavailable.\n", transport_name); + return false; + } + + ops->opts_init(opts); + return true; +} + +int +spdk_nvmf_transport_poll_group_get_stat(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_transport *transport, + struct spdk_nvmf_transport_poll_group_stat **stat) +{ + if (transport->ops->poll_group_get_stat) { + return transport->ops->poll_group_get_stat(tgt, stat); + } else { + return -ENOTSUP; + } +} + +void +spdk_nvmf_transport_poll_group_free_stat(struct spdk_nvmf_transport *transport, + struct spdk_nvmf_transport_poll_group_stat *stat) +{ + if (transport->ops->poll_group_free_stat) { + transport->ops->poll_group_free_stat(stat); + } 
+} + +void +spdk_nvmf_request_free_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport) +{ + uint32_t i; + + for (i = 0; i < req->iovcnt; i++) { + if (group->buf_cache_count < group->buf_cache_size) { + STAILQ_INSERT_HEAD(&group->buf_cache, + (struct spdk_nvmf_transport_pg_cache_buf *)req->buffers[i], + link); + group->buf_cache_count++; + } else { + spdk_mempool_put(transport->data_buf_pool, req->buffers[i]); + } + req->iov[i].iov_base = NULL; + req->buffers[i] = NULL; + req->iov[i].iov_len = 0; + } + req->data_from_pool = false; +} + +static inline int +nvmf_request_set_buffer(struct spdk_nvmf_request *req, void *buf, uint32_t length, + uint32_t io_unit_size) +{ + req->buffers[req->iovcnt] = buf; + req->iov[req->iovcnt].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) & + ~NVMF_DATA_BUFFER_MASK); + req->iov[req->iovcnt].iov_len = spdk_min(length, io_unit_size); + length -= req->iov[req->iovcnt].iov_len; + req->iovcnt++; + + return length; +} + +static int +nvmf_request_get_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport, + uint32_t length) +{ + uint32_t io_unit_size = transport->opts.io_unit_size; + uint32_t num_buffers; + uint32_t i = 0, j; + void *buffer, *buffers[NVMF_REQ_MAX_BUFFERS]; + + /* If the number of buffers is too large, then we know the I/O is larger than allowed. + * Fail it. + */ + num_buffers = SPDK_CEIL_DIV(length, io_unit_size); + if (num_buffers + req->iovcnt > NVMF_REQ_MAX_BUFFERS) { + return -EINVAL; + } + + while (i < num_buffers) { + if (!(STAILQ_EMPTY(&group->buf_cache))) { + group->buf_cache_count--; + buffer = STAILQ_FIRST(&group->buf_cache); + STAILQ_REMOVE_HEAD(&group->buf_cache, link); + assert(buffer != NULL); + + length = nvmf_request_set_buffer(req, buffer, length, io_unit_size); + i++; + } else { + if (spdk_mempool_get_bulk(transport->data_buf_pool, buffers, + num_buffers - i)) { + return -ENOMEM; + } + for (j = 0; j < num_buffers - i; j++) { + length = nvmf_request_set_buffer(req, buffers[j], length, io_unit_size); + } + i += num_buffers - i; + } + } + + assert(length == 0); + + req->data_from_pool = true; + return 0; +} + +int +spdk_nvmf_request_get_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport, + uint32_t length) +{ + int rc; + + req->iovcnt = 0; + + rc = nvmf_request_get_buffers(req, group, transport, length); + if (rc == -ENOMEM) { + spdk_nvmf_request_free_buffers(req, group, transport); + } + + return rc; +} + +int +spdk_nvmf_request_get_buffers_multi(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport, + uint32_t *lengths, uint32_t num_lengths) +{ + int rc = 0; + uint32_t i; + + req->iovcnt = 0; + + for (i = 0; i < num_lengths; i++) { + rc = nvmf_request_get_buffers(req, group, transport, lengths[i]); + if (rc != 0) { + goto err_exit; + } + } + + return 0; + +err_exit: + spdk_nvmf_request_free_buffers(req, group, transport); + return rc; +} diff --git a/src/spdk/lib/nvmf/transport.h b/src/spdk/lib/nvmf/transport.h new file mode 100644 index 000000000..38b5d8db3 --- /dev/null +++ b/src/spdk/lib/nvmf/transport.h @@ -0,0 +1,82 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NVMF_TRANSPORT_H +#define SPDK_NVMF_TRANSPORT_H + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_transport.h" + +uint32_t nvmf_transport_accept(struct spdk_nvmf_transport *transport); + +void nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry); + +struct spdk_nvmf_transport_poll_group *nvmf_transport_poll_group_create( + struct spdk_nvmf_transport *transport); +struct spdk_nvmf_transport_poll_group *nvmf_transport_get_optimal_poll_group( + struct spdk_nvmf_transport *transport, struct spdk_nvmf_qpair *qpair); + +void nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); + +int nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair); + +int nvmf_transport_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair); + +int nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); + +int nvmf_transport_req_free(struct spdk_nvmf_request *req); + +int nvmf_transport_req_complete(struct spdk_nvmf_request *req); + +void nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair); + +int nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +int nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +int nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +void nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req); + +#endif /* SPDK_NVMF_TRANSPORT_H */ diff --git a/src/spdk/lib/rdma/Makefile b/src/spdk/lib/rdma/Makefile new file mode 100644 index 000000000..e6374557d --- /dev/null +++ b/src/spdk/lib/rdma/Makefile @@ -0,0 +1,70 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. 
All rights reserved. +# Copyright (c) Mellanox Technologies LTD. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 1 +SO_MINOR := 0 + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_rdma.map) + +LIBNAME = rdma + +ifeq ($(CONFIG_RDMA_PROV),verbs) +C_SRCS = rdma_verbs.c +else ifeq ($(CONFIG_RDMA_PROV),mlx5_dv) +C_SRCS = rdma_mlx5_dv.c +LOCAL_SYS_LIBS += -lmlx5 +else +$(error Wrong RDMA provider specified: $(CONFIG_RDMA_PROV)) +endif + +LOCAL_SYS_LIBS += -libverbs -lrdmacm +#Attach only if FreeBSD and RDMA is specified with configure +ifeq ($(OS),FreeBSD) +# Mellanox - MLX4 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx4.*)","") +LOCAL_SYS_LIBS += -lmlx4 +endif +# Mellanox - MLX5 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx5.*)","") +LOCAL_SYS_LIBS += -lmlx5 +endif +# Chelsio HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libcxgb4.*)","") +LOCAL_SYS_LIBS += -lcxgb4 +endif +endif + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/rdma/rdma_mlx5_dv.c b/src/spdk/lib/rdma/rdma_mlx5_dv.c new file mode 100644 index 000000000..bae3afdda --- /dev/null +++ b/src/spdk/lib/rdma/rdma_mlx5_dv.c @@ -0,0 +1,316 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <rdma/rdma_cma.h> +#include <infiniband/mlx5dv.h> + +#include "spdk/stdinc.h" +#include "spdk/string.h" +#include "spdk/likely.h" + +#include "spdk_internal/rdma.h" +#include "spdk_internal/log.h" + +struct spdk_rdma_mlx5_dv_qp { + struct spdk_rdma_qp common; + struct ibv_qp_ex *qpex; +}; + +static int +rdma_mlx5_dv_init_qpair(struct spdk_rdma_mlx5_dv_qp *mlx5_qp) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, rc; + + qp_attr.qp_state = IBV_QPS_INIT; + rc = rdma_init_qp_attr(mlx5_qp->common.cm_id, &qp_attr, &qp_attr_mask); + if (rc) { + SPDK_ERRLOG("Failed to init attr IBV_QPS_INIT, errno %s (%d)\n", spdk_strerror(errno), errno); + return rc; + } + + rc = ibv_modify_qp(mlx5_qp->common.qp, &qp_attr, qp_attr_mask); + if (rc) { + SPDK_ERRLOG("ibv_modify_qp(IBV_QPS_INIT) failed, rc %d\n", rc); + return rc; + } + + qp_attr.qp_state = IBV_QPS_RTR; + rc = rdma_init_qp_attr(mlx5_qp->common.cm_id, &qp_attr, &qp_attr_mask); + if (rc) { + SPDK_ERRLOG("Failed to init attr IBV_QPS_RTR, errno %s (%d)\n", spdk_strerror(errno), errno); + return rc; + } + + rc = ibv_modify_qp(mlx5_qp->common.qp, &qp_attr, qp_attr_mask); + if (rc) { + SPDK_ERRLOG("ibv_modify_qp(IBV_QPS_RTR) failed, rc %d\n", rc); + return rc; + } + + qp_attr.qp_state = IBV_QPS_RTS; + rc = rdma_init_qp_attr(mlx5_qp->common.cm_id, &qp_attr, &qp_attr_mask); + if (rc) { + SPDK_ERRLOG("Failed to init attr IBV_QPS_RTR, errno %s (%d)\n", spdk_strerror(errno), errno); + return rc; + } + + rc = ibv_modify_qp(mlx5_qp->common.qp, &qp_attr, qp_attr_mask); + if (rc) { + SPDK_ERRLOG("ibv_modify_qp(IBV_QPS_RTS) failed, rc %d\n", rc); + } + + return rc; +} + +struct spdk_rdma_qp * +spdk_rdma_qp_create(struct rdma_cm_id *cm_id, struct spdk_rdma_qp_init_attr *qp_attr) +{ + assert(cm_id); + assert(qp_attr); + + struct ibv_qp *qp; + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + struct ibv_qp_init_attr_ex dv_qp_attr = { + .qp_context = qp_attr->qp_context, + .send_cq = qp_attr->send_cq, + .recv_cq = qp_attr->recv_cq, + .srq = qp_attr->srq, + .cap = qp_attr->cap, + .qp_type = IBV_QPT_RC, + .comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS, + .pd = qp_attr->pd ? 
qp_attr->pd : cm_id->pd + }; + + assert(dv_qp_attr.pd); + + mlx5_qp = calloc(1, sizeof(*mlx5_qp)); + if (!mlx5_qp) { + SPDK_ERRLOG("qp memory allocation failed\n"); + return NULL; + } + + qp = mlx5dv_create_qp(cm_id->verbs, &dv_qp_attr, NULL); + + if (!qp) { + SPDK_ERRLOG("Failed to create qpair, errno %s (%d)\n", spdk_strerror(errno), errno); + free(mlx5_qp); + return NULL; + } + + mlx5_qp->common.qp = qp; + mlx5_qp->common.cm_id = cm_id; + mlx5_qp->qpex = ibv_qp_to_qp_ex(qp); + + if (!mlx5_qp->qpex) { + spdk_rdma_qp_destroy(&mlx5_qp->common); + return NULL; + } + + qp_attr->cap = dv_qp_attr.cap; + + return &mlx5_qp->common; +} + +int +spdk_rdma_qp_accept(struct spdk_rdma_qp *spdk_rdma_qp, struct rdma_conn_param *conn_param) +{ + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + + assert(spdk_rdma_qp != NULL); + assert(spdk_rdma_qp->cm_id != NULL); + + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + /* NVMEoF target must move qpair to RTS state */ + if (rdma_mlx5_dv_init_qpair(mlx5_qp) != 0) { + SPDK_ERRLOG("Failed to initialize qpair\n"); + /* Set errno to be compliant with rdma_accept behaviour */ + errno = ECONNABORTED; + return -1; + } + + return rdma_accept(spdk_rdma_qp->cm_id, conn_param); +} + +int +spdk_rdma_qp_complete_connect(struct spdk_rdma_qp *spdk_rdma_qp) +{ + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + int rc; + + assert(spdk_rdma_qp); + + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + rc = rdma_mlx5_dv_init_qpair(mlx5_qp); + if (rc) { + SPDK_ERRLOG("Failed to initialize qpair\n"); + return rc; + } + + rc = rdma_establish(mlx5_qp->common.cm_id); + if (rc) { + SPDK_ERRLOG("rdma_establish failed, errno %s (%d)\n", spdk_strerror(errno), errno); + } + + return rc; +} + +void +spdk_rdma_qp_destroy(struct spdk_rdma_qp *spdk_rdma_qp) +{ + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + int rc; + + assert(spdk_rdma_qp != NULL); + + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + if (spdk_rdma_qp->send_wrs.first != NULL) { + SPDK_WARNLOG("Destroying qpair with queued Work Requests\n"); + } + + if (mlx5_qp->common.qp) { + rc = ibv_destroy_qp(mlx5_qp->common.qp); + if (rc) { + SPDK_ERRLOG("Failed to destroy ibv qp %p, rc %d\n", mlx5_qp->common.qp, rc); + } + } + + free(mlx5_qp); +} + +int +spdk_rdma_qp_disconnect(struct spdk_rdma_qp *spdk_rdma_qp) +{ + int rc = 0; + + assert(spdk_rdma_qp != NULL); + + if (spdk_rdma_qp->qp) { + struct ibv_qp_attr qp_attr = {.qp_state = IBV_QPS_ERR}; + + rc = ibv_modify_qp(spdk_rdma_qp->qp, &qp_attr, IBV_QP_STATE); + if (rc) { + SPDK_ERRLOG("Failed to modify ibv qp %p state to ERR, rc %d\n", spdk_rdma_qp->qp, rc); + return rc; + } + } + + if (spdk_rdma_qp->cm_id) { + rc = rdma_disconnect(spdk_rdma_qp->cm_id); + if (rc) { + SPDK_ERRLOG("rdma_disconnect failed, errno %s (%d)\n", spdk_strerror(errno), errno); + } + } + + return rc; +} + +bool +spdk_rdma_qp_queue_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr *first) +{ + struct ibv_send_wr *tmp; + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + bool is_first; + + assert(spdk_rdma_qp); + assert(first); + + is_first = spdk_rdma_qp->send_wrs.first == NULL; + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + if (is_first) { + ibv_wr_start(mlx5_qp->qpex); + spdk_rdma_qp->send_wrs.first = first; + } else { + spdk_rdma_qp->send_wrs.last->next = first; + } + + for (tmp = first; tmp != NULL; tmp = tmp->next) { + mlx5_qp->qpex->wr_id = tmp->wr_id; + mlx5_qp->qpex->wr_flags = tmp->send_flags; 
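spdk_rdma_qp_accept() and spdk_rdma_qp_complete_connect() above exist because, with the mlx5_dv provider, the QP is created by mlx5dv_create_qp() rather than by librdmacm, so the INIT -> RTR -> RTS transitions must be driven explicitly (rdma_mlx5_dv_init_qpair()) and the active side has to finish the handshake with rdma_establish(). A rough usage sketch against the RDMA CM event loop follows; the event handling is an assumption for illustration, not code from this patch.

#include <rdma/rdma_cma.h>
#include "spdk_internal/rdma.h"

static void
my_on_cm_event(struct rdma_cm_event *event, struct spdk_rdma_qp *rqp,
	       struct rdma_conn_param *conn_param)
{
	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		/* Passive (target) side: move the QP to RTS, then send the accept. */
		spdk_rdma_qp_accept(rqp, conn_param);
		break;
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
		/* Active side with an externally created QP: move it to RTS and
		 * complete the handshake (rdma_establish() under the hood here). */
		spdk_rdma_qp_complete_connect(rqp);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
		spdk_rdma_qp_disconnect(rqp);
		break;
	default:
		break;
	}
}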
+ + switch (tmp->opcode) { + case IBV_WR_SEND: + ibv_wr_send(mlx5_qp->qpex); + break; + case IBV_WR_SEND_WITH_INV: + ibv_wr_send_inv(mlx5_qp->qpex, tmp->invalidate_rkey); + break; + case IBV_WR_RDMA_READ: + ibv_wr_rdma_read(mlx5_qp->qpex, tmp->wr.rdma.rkey, tmp->wr.rdma.remote_addr); + break; + case IBV_WR_RDMA_WRITE: + ibv_wr_rdma_write(mlx5_qp->qpex, tmp->wr.rdma.rkey, tmp->wr.rdma.remote_addr); + break; + default: + SPDK_ERRLOG("Unexpected opcode %d\n", tmp->opcode); + assert(0); + } + + ibv_wr_set_sge_list(mlx5_qp->qpex, tmp->num_sge, tmp->sg_list); + + spdk_rdma_qp->send_wrs.last = tmp; + } + + return is_first; +} + +int +spdk_rdma_qp_flush_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr **bad_wr) +{ + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + int rc; + + assert(bad_wr); + assert(spdk_rdma_qp); + + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + if (spdk_unlikely(spdk_rdma_qp->send_wrs.first == NULL)) { + return 0; + } + + rc = ibv_wr_complete(mlx5_qp->qpex); + + if (spdk_unlikely(rc)) { + /* If ibv_wr_complete reports an error that means that no WRs are posted to NIC */ + *bad_wr = spdk_rdma_qp->send_wrs.first; + } + + spdk_rdma_qp->send_wrs.first = NULL; + + return rc; +} diff --git a/src/spdk/lib/rdma/rdma_verbs.c b/src/spdk/lib/rdma/rdma_verbs.c new file mode 100644 index 000000000..66be5bf60 --- /dev/null +++ b/src/spdk/lib/rdma/rdma_verbs.c @@ -0,0 +1,167 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
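Both providers expose the same queue/flush pair for the send path: spdk_rdma_qp_queue_send_wrs() chains work requests (translating them to the ibv_wr_* extended-QP calls in the mlx5_dv case) and returns true only for the first chain queued since the last flush, while spdk_rdma_qp_flush_send_wrs() rings the doorbell once, via ibv_wr_complete() here or ibv_post_send() in the plain verbs provider that follows. A short usage sketch with hypothetical names:

#include <infiniband/verbs.h>
#include "spdk_internal/rdma.h"

static void
my_submit_and_flush(struct spdk_rdma_qp *rqp, struct ibv_send_wr *wr_chain)
{
	struct ibv_send_wr *bad_wr = NULL;

	if (spdk_rdma_qp_queue_send_wrs(rqp, wr_chain)) {
		/* First WRs since the last flush - e.g. put the qpair on a
		 * "needs flush" list so the poller flushes it once per iteration. */
	}

	if (spdk_rdma_qp_flush_send_wrs(rqp, &bad_wr) != 0) {
		/* Nothing (mlx5_dv) or only part of the chain (verbs) was posted;
		 * bad_wr points at the first work request that was not posted. */
	}
}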
+ */ + +#include <rdma/rdma_cma.h> + +#include "spdk/stdinc.h" +#include "spdk/string.h" +#include "spdk/likely.h" + +#include "spdk_internal/rdma.h" +#include "spdk_internal/log.h" + +struct spdk_rdma_qp * +spdk_rdma_qp_create(struct rdma_cm_id *cm_id, struct spdk_rdma_qp_init_attr *qp_attr) +{ + struct spdk_rdma_qp *spdk_rdma_qp; + int rc; + struct ibv_qp_init_attr attr = { + .qp_context = qp_attr->qp_context, + .send_cq = qp_attr->send_cq, + .recv_cq = qp_attr->recv_cq, + .srq = qp_attr->srq, + .cap = qp_attr->cap, + .qp_type = IBV_QPT_RC + }; + + spdk_rdma_qp = calloc(1, sizeof(*spdk_rdma_qp)); + if (!spdk_rdma_qp) { + SPDK_ERRLOG("qp memory allocation failed\n"); + return NULL; + } + + rc = rdma_create_qp(cm_id, qp_attr->pd, &attr); + if (rc) { + SPDK_ERRLOG("Failed to create qp, errno %s (%d)\n", spdk_strerror(errno), errno); + free(spdk_rdma_qp); + return NULL; + } + + qp_attr->cap = attr.cap; + spdk_rdma_qp->qp = cm_id->qp; + spdk_rdma_qp->cm_id = cm_id; + + return spdk_rdma_qp; +} + +int +spdk_rdma_qp_accept(struct spdk_rdma_qp *spdk_rdma_qp, struct rdma_conn_param *conn_param) +{ + assert(spdk_rdma_qp != NULL); + assert(spdk_rdma_qp->cm_id != NULL); + + return rdma_accept(spdk_rdma_qp->cm_id, conn_param); +} + +int +spdk_rdma_qp_complete_connect(struct spdk_rdma_qp *spdk_rdma_qp) +{ + /* Nothing to be done for Verbs */ + return 0; +} + +void +spdk_rdma_qp_destroy(struct spdk_rdma_qp *spdk_rdma_qp) +{ + assert(spdk_rdma_qp != NULL); + + if (spdk_rdma_qp->send_wrs.first != NULL) { + SPDK_WARNLOG("Destroying qpair with queued Work Requests\n"); + } + + if (spdk_rdma_qp->qp) { + rdma_destroy_qp(spdk_rdma_qp->cm_id); + } + + free(spdk_rdma_qp); +} + +int +spdk_rdma_qp_disconnect(struct spdk_rdma_qp *spdk_rdma_qp) +{ + int rc = 0; + + assert(spdk_rdma_qp != NULL); + + if (spdk_rdma_qp->cm_id) { + rc = rdma_disconnect(spdk_rdma_qp->cm_id); + if (rc) { + SPDK_ERRLOG("rdma_disconnect failed, errno %s (%d)\n", spdk_strerror(errno), errno); + } + } + + return rc; +} + +bool +spdk_rdma_qp_queue_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr *first) +{ + struct ibv_send_wr *last; + + assert(spdk_rdma_qp); + assert(first); + + last = first; + while (last->next != NULL) { + last = last->next; + } + + if (spdk_rdma_qp->send_wrs.first == NULL) { + spdk_rdma_qp->send_wrs.first = first; + spdk_rdma_qp->send_wrs.last = last; + return true; + } else { + spdk_rdma_qp->send_wrs.last->next = first; + spdk_rdma_qp->send_wrs.last = last; + return false; + } +} + +int +spdk_rdma_qp_flush_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr **bad_wr) +{ + int rc; + + assert(spdk_rdma_qp); + assert(bad_wr); + + if (spdk_unlikely(!spdk_rdma_qp->send_wrs.first)) { + return 0; + } + + rc = ibv_post_send(spdk_rdma_qp->qp, spdk_rdma_qp->send_wrs.first, bad_wr); + + spdk_rdma_qp->send_wrs.first = NULL; + + return rc; +} diff --git a/src/spdk/lib/rdma/spdk_rdma.map b/src/spdk/lib/rdma/spdk_rdma.map new file mode 100644 index 000000000..9268a2191 --- /dev/null +++ b/src/spdk/lib/rdma/spdk_rdma.map @@ -0,0 +1,14 @@ +{ + global: + + # Public functions + spdk_rdma_qp_create; + spdk_rdma_qp_accept; + spdk_rdma_qp_complete_connect; + spdk_rdma_qp_destroy; + spdk_rdma_qp_disconnect; + spdk_rdma_qp_queue_send_wrs; + spdk_rdma_qp_flush_send_wrs; + + local: *; +}; diff --git a/src/spdk/lib/reduce/Makefile b/src/spdk/lib/reduce/Makefile new file mode 100644 index 000000000..fb417cd57 --- /dev/null +++ b/src/spdk/lib/reduce/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel 
Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = reduce.c +LIBNAME = reduce + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_reduce.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/reduce/reduce.c b/src/spdk/lib/reduce/reduce.c new file mode 100644 index 000000000..6188f6c6c --- /dev/null +++ b/src/spdk/lib/reduce/reduce.c @@ -0,0 +1,1625 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/reduce.h" +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/bit_array.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" + +#include "libpmem.h" + +/* Always round up the size of the PM region to the nearest cacheline. */ +#define REDUCE_PM_SIZE_ALIGNMENT 64 + +/* Offset into the backing device where the persistent memory file's path is stored. */ +#define REDUCE_BACKING_DEV_PATH_OFFSET 4096 + +#define REDUCE_EMPTY_MAP_ENTRY -1ULL + +#define REDUCE_NUM_VOL_REQUESTS 256 + +/* Structure written to offset 0 of both the pm file and the backing device. */ +struct spdk_reduce_vol_superblock { + uint8_t signature[8]; + struct spdk_reduce_vol_params params; + uint8_t reserved[4048]; +}; +SPDK_STATIC_ASSERT(sizeof(struct spdk_reduce_vol_superblock) == 4096, "size incorrect"); + +#define SPDK_REDUCE_SIGNATURE "SPDKREDU" +/* null terminator counts one */ +SPDK_STATIC_ASSERT(sizeof(SPDK_REDUCE_SIGNATURE) - 1 == + sizeof(((struct spdk_reduce_vol_superblock *)0)->signature), "size incorrect"); + +#define REDUCE_PATH_MAX 4096 + +#define REDUCE_ZERO_BUF_SIZE 0x100000 + +/** + * Describes a persistent memory file used to hold metadata associated with a + * compressed volume. + */ +struct spdk_reduce_pm_file { + char path[REDUCE_PATH_MAX]; + void *pm_buf; + int pm_is_pmem; + uint64_t size; +}; + +#define REDUCE_IO_READV 1 +#define REDUCE_IO_WRITEV 2 + +struct spdk_reduce_chunk_map { + uint32_t compressed_size; + uint32_t reserved; + uint64_t io_unit_index[0]; +}; + +struct spdk_reduce_vol_request { + /** + * Scratch buffer used for uncompressed chunk. This is used for: + * 1) source buffer for compression operations + * 2) destination buffer for decompression operations + * 3) data buffer when writing uncompressed chunk to disk + * 4) data buffer when reading uncompressed chunk from disk + */ + uint8_t *decomp_buf; + struct iovec *decomp_buf_iov; + + /** + * These are used to construct the iovecs that are sent to + * the decomp engine, they point to a mix of the scratch buffer + * and user buffer + */ + struct iovec decomp_iov[REDUCE_MAX_IOVECS + 2]; + int decomp_iovcnt; + + /** + * Scratch buffer used for compressed chunk. 
This is used for: + * 1) destination buffer for compression operations + * 2) source buffer for decompression operations + * 3) data buffer when writing compressed chunk to disk + * 4) data buffer when reading compressed chunk from disk + */ + uint8_t *comp_buf; + struct iovec *comp_buf_iov; + struct iovec *iov; + bool rmw; + struct spdk_reduce_vol *vol; + int type; + int reduce_errno; + int iovcnt; + int num_backing_ops; + uint32_t num_io_units; + bool chunk_is_compressed; + uint64_t offset; + uint64_t logical_map_index; + uint64_t length; + uint64_t chunk_map_index; + struct spdk_reduce_chunk_map *chunk; + spdk_reduce_vol_op_complete cb_fn; + void *cb_arg; + TAILQ_ENTRY(spdk_reduce_vol_request) tailq; + struct spdk_reduce_vol_cb_args backing_cb_args; +}; + +struct spdk_reduce_vol { + struct spdk_reduce_vol_params params; + uint32_t backing_io_units_per_chunk; + uint32_t backing_lba_per_io_unit; + uint32_t logical_blocks_per_chunk; + struct spdk_reduce_pm_file pm_file; + struct spdk_reduce_backing_dev *backing_dev; + struct spdk_reduce_vol_superblock *backing_super; + struct spdk_reduce_vol_superblock *pm_super; + uint64_t *pm_logical_map; + uint64_t *pm_chunk_maps; + + struct spdk_bit_array *allocated_chunk_maps; + struct spdk_bit_array *allocated_backing_io_units; + + struct spdk_reduce_vol_request *request_mem; + TAILQ_HEAD(, spdk_reduce_vol_request) free_requests; + TAILQ_HEAD(, spdk_reduce_vol_request) executing_requests; + TAILQ_HEAD(, spdk_reduce_vol_request) queued_requests; + + /* Single contiguous buffer used for all request buffers for this volume. */ + uint8_t *buf_mem; + struct iovec *buf_iov_mem; +}; + +static void _start_readv_request(struct spdk_reduce_vol_request *req); +static void _start_writev_request(struct spdk_reduce_vol_request *req); +static uint8_t *g_zero_buf; +static int g_vol_count = 0; + +/* + * Allocate extra metadata chunks and corresponding backing io units to account for + * outstanding IO in worst case scenario where logical map is completely allocated + * and no data can be compressed. We need extra chunks in this case to handle + * in-flight writes since reduce never writes data in place. + */ +#define REDUCE_NUM_EXTRA_CHUNKS 128 + +static void +_reduce_persist(struct spdk_reduce_vol *vol, const void *addr, size_t len) +{ + if (vol->pm_file.pm_is_pmem) { + pmem_persist(addr, len); + } else { + pmem_msync(addr, len); + } +} + +static uint64_t +_get_pm_logical_map_size(uint64_t vol_size, uint64_t chunk_size) +{ + uint64_t chunks_in_logical_map, logical_map_size; + + chunks_in_logical_map = vol_size / chunk_size; + logical_map_size = chunks_in_logical_map * sizeof(uint64_t); + + /* Round up to next cacheline. 
*/ + return spdk_divide_round_up(logical_map_size, REDUCE_PM_SIZE_ALIGNMENT) * + REDUCE_PM_SIZE_ALIGNMENT; +} + +static uint64_t +_get_total_chunks(uint64_t vol_size, uint64_t chunk_size) +{ + uint64_t num_chunks; + + num_chunks = vol_size / chunk_size; + num_chunks += REDUCE_NUM_EXTRA_CHUNKS; + + return num_chunks; +} + +static inline uint32_t +_reduce_vol_get_chunk_struct_size(uint64_t backing_io_units_per_chunk) +{ + return sizeof(struct spdk_reduce_chunk_map) + sizeof(uint64_t) * backing_io_units_per_chunk; +} + +static uint64_t +_get_pm_total_chunks_size(uint64_t vol_size, uint64_t chunk_size, uint64_t backing_io_unit_size) +{ + uint64_t io_units_per_chunk, num_chunks, total_chunks_size; + + num_chunks = _get_total_chunks(vol_size, chunk_size); + io_units_per_chunk = chunk_size / backing_io_unit_size; + + total_chunks_size = num_chunks * _reduce_vol_get_chunk_struct_size(io_units_per_chunk); + + return spdk_divide_round_up(total_chunks_size, REDUCE_PM_SIZE_ALIGNMENT) * + REDUCE_PM_SIZE_ALIGNMENT; +} + +static struct spdk_reduce_chunk_map * +_reduce_vol_get_chunk_map(struct spdk_reduce_vol *vol, uint64_t chunk_map_index) +{ + uintptr_t chunk_map_addr; + + assert(chunk_map_index < _get_total_chunks(vol->params.vol_size, vol->params.chunk_size)); + + chunk_map_addr = (uintptr_t)vol->pm_chunk_maps; + chunk_map_addr += chunk_map_index * + _reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk); + + return (struct spdk_reduce_chunk_map *)chunk_map_addr; +} + +static int +_validate_vol_params(struct spdk_reduce_vol_params *params) +{ + if (params->vol_size > 0) { + /** + * User does not pass in the vol size - it gets calculated by libreduce from + * values in this structure plus the size of the backing device. + */ + return -EINVAL; + } + + if (params->chunk_size == 0 || params->backing_io_unit_size == 0 || + params->logical_block_size == 0) { + return -EINVAL; + } + + /* Chunk size must be an even multiple of the backing io unit size. */ + if ((params->chunk_size % params->backing_io_unit_size) != 0) { + return -EINVAL; + } + + /* Chunk size must be an even multiple of the logical block size. */ + if ((params->chunk_size % params->logical_block_size) != 0) { + return -1; + } + + return 0; +} + +static uint64_t +_get_vol_size(uint64_t chunk_size, uint64_t backing_dev_size) +{ + uint64_t num_chunks; + + num_chunks = backing_dev_size / chunk_size; + if (num_chunks <= REDUCE_NUM_EXTRA_CHUNKS) { + return 0; + } + + num_chunks -= REDUCE_NUM_EXTRA_CHUNKS; + return num_chunks * chunk_size; +} + +static uint64_t +_get_pm_file_size(struct spdk_reduce_vol_params *params) +{ + uint64_t total_pm_size; + + total_pm_size = sizeof(struct spdk_reduce_vol_superblock); + total_pm_size += _get_pm_logical_map_size(params->vol_size, params->chunk_size); + total_pm_size += _get_pm_total_chunks_size(params->vol_size, params->chunk_size, + params->backing_io_unit_size); + return total_pm_size; +} + +const struct spdk_uuid * +spdk_reduce_vol_get_uuid(struct spdk_reduce_vol *vol) +{ + return &vol->params.uuid; +} + +static void +_initialize_vol_pm_pointers(struct spdk_reduce_vol *vol) +{ + uint64_t logical_map_size; + + /* Superblock is at the beginning of the pm file. */ + vol->pm_super = (struct spdk_reduce_vol_superblock *)vol->pm_file.pm_buf; + + /* Logical map immediately follows the super block. */ + vol->pm_logical_map = (uint64_t *)(vol->pm_super + 1); + + /* Chunks maps follow the logical map. 
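The sizing helpers above fix the persistent-memory file layout: a 4 KiB superblock, then the logical map (one 64-bit chunk-map index per chunk), then the chunk maps themselves (compressed_size plus one 64-bit backing-io-unit index per io unit), each region rounded up to REDUCE_PM_SIZE_ALIGNMENT, with REDUCE_NUM_EXTRA_CHUNKS (128) spare chunk maps so in-flight writes never run out. A worked example of the same arithmetic, as a standalone sketch (ALIGN_UP_64 stands in for the 64-byte rounding; not code from this patch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP_64(x) (((x) + 63) / 64 * 64)

int
main(void)
{
	uint64_t vol_size = 1ULL << 30;                 /* 1 GiB of logical space  */
	uint64_t chunk_size = 16 * 1024;                /* 16 KiB chunks           */
	uint64_t io_unit_size = 4 * 1024;               /* 4 KiB backing io units  */

	uint64_t chunks = vol_size / chunk_size;                      /* 65536        */
	uint64_t total_chunks = chunks + 128;                         /* extra chunks */
	uint64_t chunk_struct = 8 + (chunk_size / io_unit_size) * 8;  /* 40 bytes     */

	uint64_t logical_map = ALIGN_UP_64(chunks * 8);               /* 512 KiB      */
	uint64_t chunk_maps = ALIGN_UP_64(total_chunks * chunk_struct);
	uint64_t pm_file_size = 4096 + logical_map + chunk_maps;      /* superblock + maps */

	printf("pm file size: %" PRIu64 " bytes (~3 MiB)\n", pm_file_size); /* 3154944 */
	return 0;
}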
*/ + logical_map_size = _get_pm_logical_map_size(vol->params.vol_size, vol->params.chunk_size); + vol->pm_chunk_maps = (uint64_t *)((uint8_t *)vol->pm_logical_map + logical_map_size); +} + +/* We need 2 iovs during load - one for the superblock, another for the path */ +#define LOAD_IOV_COUNT 2 + +struct reduce_init_load_ctx { + struct spdk_reduce_vol *vol; + struct spdk_reduce_vol_cb_args backing_cb_args; + spdk_reduce_vol_op_with_handle_complete cb_fn; + void *cb_arg; + struct iovec iov[LOAD_IOV_COUNT]; + void *path; +}; + +static int +_allocate_vol_requests(struct spdk_reduce_vol *vol) +{ + struct spdk_reduce_vol_request *req; + int i; + + /* Allocate 2x since we need buffers for both read/write and compress/decompress + * intermediate buffers. + */ + vol->buf_mem = spdk_malloc(2 * REDUCE_NUM_VOL_REQUESTS * vol->params.chunk_size, + 64, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vol->buf_mem == NULL) { + return -ENOMEM; + } + + vol->request_mem = calloc(REDUCE_NUM_VOL_REQUESTS, sizeof(*req)); + if (vol->request_mem == NULL) { + spdk_free(vol->buf_mem); + vol->buf_mem = NULL; + return -ENOMEM; + } + + /* Allocate 2x since we need iovs for both read/write and compress/decompress intermediate + * buffers. + */ + vol->buf_iov_mem = calloc(REDUCE_NUM_VOL_REQUESTS, + 2 * sizeof(struct iovec) * vol->backing_io_units_per_chunk); + if (vol->buf_iov_mem == NULL) { + free(vol->request_mem); + spdk_free(vol->buf_mem); + vol->request_mem = NULL; + vol->buf_mem = NULL; + return -ENOMEM; + } + + for (i = 0; i < REDUCE_NUM_VOL_REQUESTS; i++) { + req = &vol->request_mem[i]; + TAILQ_INSERT_HEAD(&vol->free_requests, req, tailq); + req->decomp_buf_iov = &vol->buf_iov_mem[(2 * i) * vol->backing_io_units_per_chunk]; + req->decomp_buf = vol->buf_mem + (2 * i) * vol->params.chunk_size; + req->comp_buf_iov = &vol->buf_iov_mem[(2 * i + 1) * vol->backing_io_units_per_chunk]; + req->comp_buf = vol->buf_mem + (2 * i + 1) * vol->params.chunk_size; + } + + return 0; +} + +static void +_init_load_cleanup(struct spdk_reduce_vol *vol, struct reduce_init_load_ctx *ctx) +{ + if (ctx != NULL) { + spdk_free(ctx->path); + free(ctx); + } + + if (vol != NULL) { + if (vol->pm_file.pm_buf != NULL) { + pmem_unmap(vol->pm_file.pm_buf, vol->pm_file.size); + } + + spdk_free(vol->backing_super); + spdk_bit_array_free(&vol->allocated_chunk_maps); + spdk_bit_array_free(&vol->allocated_backing_io_units); + free(vol->request_mem); + free(vol->buf_iov_mem); + spdk_free(vol->buf_mem); + free(vol); + } +} + +static int +_alloc_zero_buff(void) +{ + int rc = 0; + + /* The zero buffer is shared between all volumnes and just used + * for reads so allocate one global instance here if not already + * allocated when another vol init'd or loaded. 
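_allocate_vol_requests() above carves one large DMA-able allocation into two chunk-sized scratch buffers per request, because a request may need the uncompressed image of a chunk (decomp_buf) and its compressed form (comp_buf) at the same time; the iovec arrays are interleaved the same way, one entry per backing io unit. The relationship, written out as a small consistency check against the structures defined in this file (a sketch, not part of the patch):

static void
check_request_layout(struct spdk_reduce_vol *vol, uint32_t i)
{
	struct spdk_reduce_vol_request *req = &vol->request_mem[i];

	assert(req->decomp_buf == vol->buf_mem + (2 * i) * vol->params.chunk_size);
	assert(req->comp_buf == vol->buf_mem + (2 * i + 1) * vol->params.chunk_size);
	assert(req->decomp_buf_iov ==
	       &vol->buf_iov_mem[(2 * i) * vol->backing_io_units_per_chunk]);
	assert(req->comp_buf_iov ==
	       &vol->buf_iov_mem[(2 * i + 1) * vol->backing_io_units_per_chunk]);
}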
+ */ + if (g_vol_count++ == 0) { + g_zero_buf = spdk_zmalloc(REDUCE_ZERO_BUF_SIZE, + 64, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + if (g_zero_buf == NULL) { + rc = -ENOMEM; + } + } + return rc; +} + +static void +_init_write_super_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_init_load_ctx *init_ctx = cb_arg; + int rc; + + rc = _allocate_vol_requests(init_ctx->vol); + if (rc != 0) { + init_ctx->cb_fn(init_ctx->cb_arg, NULL, rc); + _init_load_cleanup(init_ctx->vol, init_ctx); + return; + } + + rc = _alloc_zero_buff(); + if (rc != 0) { + init_ctx->cb_fn(init_ctx->cb_arg, NULL, rc); + _init_load_cleanup(init_ctx->vol, init_ctx); + return; + } + + init_ctx->cb_fn(init_ctx->cb_arg, init_ctx->vol, reduce_errno); + /* Only clean up the ctx - the vol has been passed to the application + * for use now that initialization was successful. + */ + _init_load_cleanup(NULL, init_ctx); +} + +static void +_init_write_path_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_init_load_ctx *init_ctx = cb_arg; + struct spdk_reduce_vol *vol = init_ctx->vol; + + init_ctx->iov[0].iov_base = vol->backing_super; + init_ctx->iov[0].iov_len = sizeof(*vol->backing_super); + init_ctx->backing_cb_args.cb_fn = _init_write_super_cpl; + init_ctx->backing_cb_args.cb_arg = init_ctx; + vol->backing_dev->writev(vol->backing_dev, init_ctx->iov, 1, + 0, sizeof(*vol->backing_super) / vol->backing_dev->blocklen, + &init_ctx->backing_cb_args); +} + +static int +_allocate_bit_arrays(struct spdk_reduce_vol *vol) +{ + uint64_t total_chunks, total_backing_io_units; + uint32_t i, num_metadata_io_units; + + total_chunks = _get_total_chunks(vol->params.vol_size, vol->params.chunk_size); + vol->allocated_chunk_maps = spdk_bit_array_create(total_chunks); + total_backing_io_units = total_chunks * (vol->params.chunk_size / vol->params.backing_io_unit_size); + vol->allocated_backing_io_units = spdk_bit_array_create(total_backing_io_units); + + if (vol->allocated_chunk_maps == NULL || vol->allocated_backing_io_units == NULL) { + return -ENOMEM; + } + + /* Set backing io unit bits associated with metadata. */ + num_metadata_io_units = (sizeof(*vol->backing_super) + REDUCE_PATH_MAX) / + vol->backing_dev->blocklen; + for (i = 0; i < num_metadata_io_units; i++) { + spdk_bit_array_set(vol->allocated_backing_io_units, i); + } + + return 0; +} + +void +spdk_reduce_vol_init(struct spdk_reduce_vol_params *params, + struct spdk_reduce_backing_dev *backing_dev, + const char *pm_file_dir, + spdk_reduce_vol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_reduce_vol *vol; + struct reduce_init_load_ctx *init_ctx; + uint64_t backing_dev_size; + size_t mapped_len; + int dir_len, max_dir_len, rc; + + /* We need to append a path separator and the UUID to the supplied + * path. + */ + max_dir_len = REDUCE_PATH_MAX - SPDK_UUID_STRING_LEN - 1; + dir_len = strnlen(pm_file_dir, max_dir_len); + /* Strip trailing slash if the user provided one - we will add it back + * later when appending the filename. 
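spdk_reduce_vol_init(), which begins here, is the public entry point for creating a compressed volume. A usage sketch follows; it sets only the spdk_reduce_vol_params fields this file validates (vol_size must stay 0 so libreduce derives it from the backing device), assumes the backing_dev already has blockcnt/blocklen and the readv/writev/unmap/compress/decompress callbacks wired up, and the "/tmp/pmem" directory and my_* names are hypothetical.

static void
my_init_done(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno)
{
	(void)cb_arg;
	if (reduce_errno != 0) {
		SPDK_ERRLOG("reduce vol init failed: %d\n", reduce_errno);
		return;
	}
	/* vol is ready for I/O; release it later with spdk_reduce_vol_unload(). */
	(void)vol;
}

static void
my_create_compressed_vol(struct spdk_reduce_backing_dev *backing_dev)
{
	struct spdk_reduce_vol_params params;

	memset(&params, 0, sizeof(params));
	params.vol_size = 0;                 /* derived from the backing device size */
	params.chunk_size = 16 * 1024;
	params.backing_io_unit_size = 4 * 1024;
	params.logical_block_size = 512;
	/* params.uuid left zeroed: spdk_reduce_vol_init() generates one. */

	spdk_reduce_vol_init(&params, backing_dev, "/tmp/pmem", my_init_done, NULL);
}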
+ */ + if (pm_file_dir[dir_len - 1] == '/') { + dir_len--; + } + if (dir_len == max_dir_len) { + SPDK_ERRLOG("pm_file_dir (%s) too long\n", pm_file_dir); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = _validate_vol_params(params); + if (rc != 0) { + SPDK_ERRLOG("invalid vol params\n"); + cb_fn(cb_arg, NULL, rc); + return; + } + + backing_dev_size = backing_dev->blockcnt * backing_dev->blocklen; + params->vol_size = _get_vol_size(params->chunk_size, backing_dev_size); + if (params->vol_size == 0) { + SPDK_ERRLOG("backing device is too small\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + if (backing_dev->readv == NULL || backing_dev->writev == NULL || + backing_dev->unmap == NULL) { + SPDK_ERRLOG("backing_dev function pointer not specified\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + vol = calloc(1, sizeof(*vol)); + if (vol == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + TAILQ_INIT(&vol->free_requests); + TAILQ_INIT(&vol->executing_requests); + TAILQ_INIT(&vol->queued_requests); + + vol->backing_super = spdk_zmalloc(sizeof(*vol->backing_super), 0, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vol->backing_super == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + _init_load_cleanup(vol, NULL); + return; + } + + init_ctx = calloc(1, sizeof(*init_ctx)); + if (init_ctx == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + _init_load_cleanup(vol, NULL); + return; + } + + init_ctx->path = spdk_zmalloc(REDUCE_PATH_MAX, 0, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (init_ctx->path == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + _init_load_cleanup(vol, init_ctx); + return; + } + + if (spdk_mem_all_zero(¶ms->uuid, sizeof(params->uuid))) { + spdk_uuid_generate(¶ms->uuid); + } + + memcpy(vol->pm_file.path, pm_file_dir, dir_len); + vol->pm_file.path[dir_len] = '/'; + spdk_uuid_fmt_lower(&vol->pm_file.path[dir_len + 1], SPDK_UUID_STRING_LEN, + ¶ms->uuid); + vol->pm_file.size = _get_pm_file_size(params); + vol->pm_file.pm_buf = pmem_map_file(vol->pm_file.path, vol->pm_file.size, + PMEM_FILE_CREATE | PMEM_FILE_EXCL, 0600, + &mapped_len, &vol->pm_file.pm_is_pmem); + if (vol->pm_file.pm_buf == NULL) { + SPDK_ERRLOG("could not pmem_map_file(%s): %s\n", + vol->pm_file.path, strerror(errno)); + cb_fn(cb_arg, NULL, -errno); + _init_load_cleanup(vol, init_ctx); + return; + } + + if (vol->pm_file.size != mapped_len) { + SPDK_ERRLOG("could not map entire pmem file (size=%" PRIu64 " mapped=%" PRIu64 ")\n", + vol->pm_file.size, mapped_len); + cb_fn(cb_arg, NULL, -ENOMEM); + _init_load_cleanup(vol, init_ctx); + return; + } + + vol->backing_io_units_per_chunk = params->chunk_size / params->backing_io_unit_size; + vol->logical_blocks_per_chunk = params->chunk_size / params->logical_block_size; + vol->backing_lba_per_io_unit = params->backing_io_unit_size / backing_dev->blocklen; + memcpy(&vol->params, params, sizeof(*params)); + + vol->backing_dev = backing_dev; + + rc = _allocate_bit_arrays(vol); + if (rc != 0) { + cb_fn(cb_arg, NULL, rc); + _init_load_cleanup(vol, init_ctx); + return; + } + + memcpy(vol->backing_super->signature, SPDK_REDUCE_SIGNATURE, + sizeof(vol->backing_super->signature)); + memcpy(&vol->backing_super->params, params, sizeof(*params)); + + _initialize_vol_pm_pointers(vol); + + memcpy(vol->pm_super, vol->backing_super, sizeof(*vol->backing_super)); + /* Writing 0xFF's is equivalent of filling it all with SPDK_EMPTY_MAP_ENTRY. + * Note that this writes 0xFF to not just the logical map but the chunk maps as well. 
+ */ + memset(vol->pm_logical_map, 0xFF, vol->pm_file.size - sizeof(*vol->backing_super)); + _reduce_persist(vol, vol->pm_file.pm_buf, vol->pm_file.size); + + init_ctx->vol = vol; + init_ctx->cb_fn = cb_fn; + init_ctx->cb_arg = cb_arg; + + memcpy(init_ctx->path, vol->pm_file.path, REDUCE_PATH_MAX); + init_ctx->iov[0].iov_base = init_ctx->path; + init_ctx->iov[0].iov_len = REDUCE_PATH_MAX; + init_ctx->backing_cb_args.cb_fn = _init_write_path_cpl; + init_ctx->backing_cb_args.cb_arg = init_ctx; + /* Write path to offset 4K on backing device - just after where the super + * block will be written. We wait until this is committed before writing the + * super block to guarantee we don't get the super block written without the + * the path if the system crashed in the middle of a write operation. + */ + vol->backing_dev->writev(vol->backing_dev, init_ctx->iov, 1, + REDUCE_BACKING_DEV_PATH_OFFSET / vol->backing_dev->blocklen, + REDUCE_PATH_MAX / vol->backing_dev->blocklen, + &init_ctx->backing_cb_args); +} + +static void destroy_load_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno); + +static void +_load_read_super_and_path_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_init_load_ctx *load_ctx = cb_arg; + struct spdk_reduce_vol *vol = load_ctx->vol; + uint64_t backing_dev_size; + uint64_t i, num_chunks, logical_map_index; + struct spdk_reduce_chunk_map *chunk; + size_t mapped_len; + uint32_t j; + int rc; + + rc = _alloc_zero_buff(); + if (rc) { + goto error; + } + + if (memcmp(vol->backing_super->signature, + SPDK_REDUCE_SIGNATURE, + sizeof(vol->backing_super->signature)) != 0) { + /* This backing device isn't a libreduce backing device. */ + rc = -EILSEQ; + goto error; + } + + /* If the cb_fn is destroy_load_cb, it means we are wanting to destroy this compress bdev. + * So don't bother getting the volume ready to use - invoke the callback immediately + * so destroy_load_cb can delete the metadata off of the block device and delete the + * persistent memory file if it exists. 
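The init path above deliberately writes its two pieces of backing-device metadata in a fixed order: the persistent-memory file path goes to byte offset 4096 (REDUCE_BACKING_DEV_PATH_OFFSET) first, and only after that write completes is the superblock written at offset 0, so a crash in between can never leave a superblock that points at an unknown pm file. The resulting layout, expressed as LBAs the way the writev calls compute them (a sketch reusing the constants from this file):

static void
show_metadata_layout(uint32_t blocklen)
{
	/* Byte 0..4095:    struct spdk_reduce_vol_superblock (written second)  */
	/* Byte 4096..8191: pm file path, REDUCE_PATH_MAX bytes (written first) */
	uint64_t super_lba = 0;
	uint64_t super_lba_count = sizeof(struct spdk_reduce_vol_superblock) / blocklen;
	uint64_t path_lba = REDUCE_BACKING_DEV_PATH_OFFSET / blocklen;
	uint64_t path_lba_count = REDUCE_PATH_MAX / blocklen;

	printf("superblock: LBA %" PRIu64 " (+%" PRIu64 "), path: LBA %" PRIu64 " (+%" PRIu64 ")\n",
	       super_lba, super_lba_count, path_lba, path_lba_count);
}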
+ */ + memcpy(vol->pm_file.path, load_ctx->path, sizeof(vol->pm_file.path)); + if (load_ctx->cb_fn == (*destroy_load_cb)) { + load_ctx->cb_fn(load_ctx->cb_arg, vol, 0); + _init_load_cleanup(NULL, load_ctx); + return; + } + + memcpy(&vol->params, &vol->backing_super->params, sizeof(vol->params)); + vol->backing_io_units_per_chunk = vol->params.chunk_size / vol->params.backing_io_unit_size; + vol->logical_blocks_per_chunk = vol->params.chunk_size / vol->params.logical_block_size; + vol->backing_lba_per_io_unit = vol->params.backing_io_unit_size / vol->backing_dev->blocklen; + + rc = _allocate_bit_arrays(vol); + if (rc != 0) { + goto error; + } + + backing_dev_size = vol->backing_dev->blockcnt * vol->backing_dev->blocklen; + if (_get_vol_size(vol->params.chunk_size, backing_dev_size) < vol->params.vol_size) { + SPDK_ERRLOG("backing device size %" PRIi64 " smaller than expected\n", + backing_dev_size); + rc = -EILSEQ; + goto error; + } + + vol->pm_file.size = _get_pm_file_size(&vol->params); + vol->pm_file.pm_buf = pmem_map_file(vol->pm_file.path, 0, 0, 0, &mapped_len, + &vol->pm_file.pm_is_pmem); + if (vol->pm_file.pm_buf == NULL) { + SPDK_ERRLOG("could not pmem_map_file(%s): %s\n", vol->pm_file.path, strerror(errno)); + rc = -errno; + goto error; + } + + if (vol->pm_file.size != mapped_len) { + SPDK_ERRLOG("could not map entire pmem file (size=%" PRIu64 " mapped=%" PRIu64 ")\n", + vol->pm_file.size, mapped_len); + rc = -ENOMEM; + goto error; + } + + rc = _allocate_vol_requests(vol); + if (rc != 0) { + goto error; + } + + _initialize_vol_pm_pointers(vol); + + num_chunks = vol->params.vol_size / vol->params.chunk_size; + for (i = 0; i < num_chunks; i++) { + logical_map_index = vol->pm_logical_map[i]; + if (logical_map_index == REDUCE_EMPTY_MAP_ENTRY) { + continue; + } + spdk_bit_array_set(vol->allocated_chunk_maps, logical_map_index); + chunk = _reduce_vol_get_chunk_map(vol, logical_map_index); + for (j = 0; j < vol->backing_io_units_per_chunk; j++) { + if (chunk->io_unit_index[j] != REDUCE_EMPTY_MAP_ENTRY) { + spdk_bit_array_set(vol->allocated_backing_io_units, chunk->io_unit_index[j]); + } + } + } + + load_ctx->cb_fn(load_ctx->cb_arg, vol, 0); + /* Only clean up the ctx - the vol has been passed to the application + * for use now that volume load was successful. 
+ */ + _init_load_cleanup(NULL, load_ctx); + return; + +error: + load_ctx->cb_fn(load_ctx->cb_arg, NULL, rc); + _init_load_cleanup(vol, load_ctx); +} + +void +spdk_reduce_vol_load(struct spdk_reduce_backing_dev *backing_dev, + spdk_reduce_vol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_reduce_vol *vol; + struct reduce_init_load_ctx *load_ctx; + + if (backing_dev->readv == NULL || backing_dev->writev == NULL || + backing_dev->unmap == NULL) { + SPDK_ERRLOG("backing_dev function pointer not specified\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + vol = calloc(1, sizeof(*vol)); + if (vol == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + TAILQ_INIT(&vol->free_requests); + TAILQ_INIT(&vol->executing_requests); + TAILQ_INIT(&vol->queued_requests); + + vol->backing_super = spdk_zmalloc(sizeof(*vol->backing_super), 64, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vol->backing_super == NULL) { + _init_load_cleanup(vol, NULL); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + vol->backing_dev = backing_dev; + + load_ctx = calloc(1, sizeof(*load_ctx)); + if (load_ctx == NULL) { + _init_load_cleanup(vol, NULL); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + load_ctx->path = spdk_zmalloc(REDUCE_PATH_MAX, 64, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (load_ctx->path == NULL) { + _init_load_cleanup(vol, load_ctx); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + load_ctx->vol = vol; + load_ctx->cb_fn = cb_fn; + load_ctx->cb_arg = cb_arg; + + load_ctx->iov[0].iov_base = vol->backing_super; + load_ctx->iov[0].iov_len = sizeof(*vol->backing_super); + load_ctx->iov[1].iov_base = load_ctx->path; + load_ctx->iov[1].iov_len = REDUCE_PATH_MAX; + load_ctx->backing_cb_args.cb_fn = _load_read_super_and_path_cpl; + load_ctx->backing_cb_args.cb_arg = load_ctx; + vol->backing_dev->readv(vol->backing_dev, load_ctx->iov, LOAD_IOV_COUNT, 0, + (sizeof(*vol->backing_super) + REDUCE_PATH_MAX) / + vol->backing_dev->blocklen, + &load_ctx->backing_cb_args); +} + +void +spdk_reduce_vol_unload(struct spdk_reduce_vol *vol, + spdk_reduce_vol_op_complete cb_fn, void *cb_arg) +{ + if (vol == NULL) { + /* This indicates a programming error. */ + assert(false); + cb_fn(cb_arg, -EINVAL); + return; + } + + if (--g_vol_count == 0) { + spdk_free(g_zero_buf); + } + assert(g_vol_count >= 0); + _init_load_cleanup(vol, NULL); + cb_fn(cb_arg, 0); +} + +struct reduce_destroy_ctx { + spdk_reduce_vol_op_complete cb_fn; + void *cb_arg; + struct spdk_reduce_vol *vol; + struct spdk_reduce_vol_superblock *super; + struct iovec iov; + struct spdk_reduce_vol_cb_args backing_cb_args; + int reduce_errno; + char pm_path[REDUCE_PATH_MAX]; +}; + +static void +destroy_unload_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_destroy_ctx *destroy_ctx = cb_arg; + + if (destroy_ctx->reduce_errno == 0) { + if (unlink(destroy_ctx->pm_path)) { + SPDK_ERRLOG("%s could not be unlinked: %s\n", + destroy_ctx->pm_path, strerror(errno)); + } + } + + /* Even if the unload somehow failed, we still pass the destroy_ctx + * reduce_errno since that indicates whether or not the volume was + * actually destroyed. 
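spdk_reduce_vol_load() and spdk_reduce_vol_unload() above are the counterparts to init for an existing volume, and spdk_reduce_vol_destroy() reuses the load path via destroy_load_cb to zero the superblock and unlink the pm file. A usage sketch with hypothetical callback names:

static void
my_unload_done(void *cb_arg, int reduce_errno)
{
	(void)cb_arg;
	(void)reduce_errno;	/* 0 on success */
}

static void
my_load_done(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno)
{
	(void)cb_arg;
	if (reduce_errno != 0) {
		/* e.g. -EILSEQ: the backing device carries no libreduce superblock */
		return;
	}

	/* ... issue I/O against vol ..., then release it: */
	spdk_reduce_vol_unload(vol, my_unload_done, NULL);
}

static void
my_reopen_compressed_vol(struct spdk_reduce_backing_dev *backing_dev)
{
	spdk_reduce_vol_load(backing_dev, my_load_done, NULL);
}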
+ */ + destroy_ctx->cb_fn(destroy_ctx->cb_arg, destroy_ctx->reduce_errno); + spdk_free(destroy_ctx->super); + free(destroy_ctx); +} + +static void +_destroy_zero_super_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_destroy_ctx *destroy_ctx = cb_arg; + struct spdk_reduce_vol *vol = destroy_ctx->vol; + + destroy_ctx->reduce_errno = reduce_errno; + spdk_reduce_vol_unload(vol, destroy_unload_cpl, destroy_ctx); +} + +static void +destroy_load_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno) +{ + struct reduce_destroy_ctx *destroy_ctx = cb_arg; + + if (reduce_errno != 0) { + destroy_ctx->cb_fn(destroy_ctx->cb_arg, reduce_errno); + spdk_free(destroy_ctx->super); + free(destroy_ctx); + return; + } + + destroy_ctx->vol = vol; + memcpy(destroy_ctx->pm_path, vol->pm_file.path, sizeof(destroy_ctx->pm_path)); + destroy_ctx->iov.iov_base = destroy_ctx->super; + destroy_ctx->iov.iov_len = sizeof(*destroy_ctx->super); + destroy_ctx->backing_cb_args.cb_fn = _destroy_zero_super_cpl; + destroy_ctx->backing_cb_args.cb_arg = destroy_ctx; + vol->backing_dev->writev(vol->backing_dev, &destroy_ctx->iov, 1, 0, + sizeof(*destroy_ctx->super) / vol->backing_dev->blocklen, + &destroy_ctx->backing_cb_args); +} + +void +spdk_reduce_vol_destroy(struct spdk_reduce_backing_dev *backing_dev, + spdk_reduce_vol_op_complete cb_fn, void *cb_arg) +{ + struct reduce_destroy_ctx *destroy_ctx; + + destroy_ctx = calloc(1, sizeof(*destroy_ctx)); + if (destroy_ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + destroy_ctx->super = spdk_zmalloc(sizeof(*destroy_ctx->super), 64, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (destroy_ctx->super == NULL) { + free(destroy_ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + destroy_ctx->cb_fn = cb_fn; + destroy_ctx->cb_arg = cb_arg; + spdk_reduce_vol_load(backing_dev, destroy_load_cb, destroy_ctx); +} + +static bool +_request_spans_chunk_boundary(struct spdk_reduce_vol *vol, uint64_t offset, uint64_t length) +{ + uint64_t start_chunk, end_chunk; + + start_chunk = offset / vol->logical_blocks_per_chunk; + end_chunk = (offset + length - 1) / vol->logical_blocks_per_chunk; + + return (start_chunk != end_chunk); +} + +typedef void (*reduce_request_fn)(void *_req, int reduce_errno); + +static void +_reduce_vol_complete_req(struct spdk_reduce_vol_request *req, int reduce_errno) +{ + struct spdk_reduce_vol_request *next_req; + struct spdk_reduce_vol *vol = req->vol; + + req->cb_fn(req->cb_arg, reduce_errno); + TAILQ_REMOVE(&vol->executing_requests, req, tailq); + + TAILQ_FOREACH(next_req, &vol->queued_requests, tailq) { + if (next_req->logical_map_index == req->logical_map_index) { + TAILQ_REMOVE(&vol->queued_requests, next_req, tailq); + if (next_req->type == REDUCE_IO_READV) { + _start_readv_request(next_req); + } else { + assert(next_req->type == REDUCE_IO_WRITEV); + _start_writev_request(next_req); + } + break; + } + } + + TAILQ_INSERT_HEAD(&vol->free_requests, req, tailq); +} + +static void +_write_write_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + struct spdk_reduce_vol *vol = req->vol; + uint64_t old_chunk_map_index; + struct spdk_reduce_chunk_map *old_chunk; + uint32_t i; + + if (reduce_errno != 0) { + req->reduce_errno = reduce_errno; + } + + assert(req->num_backing_ops > 0); + if (--req->num_backing_ops > 0) { + return; + } + + if (req->reduce_errno != 0) { + _reduce_vol_complete_req(req, req->reduce_errno); + return; + } + + old_chunk_map_index = vol->pm_logical_map[req->logical_map_index]; + if 
(old_chunk_map_index != REDUCE_EMPTY_MAP_ENTRY) { + old_chunk = _reduce_vol_get_chunk_map(vol, old_chunk_map_index); + for (i = 0; i < vol->backing_io_units_per_chunk; i++) { + if (old_chunk->io_unit_index[i] == REDUCE_EMPTY_MAP_ENTRY) { + break; + } + assert(spdk_bit_array_get(vol->allocated_backing_io_units, old_chunk->io_unit_index[i]) == true); + spdk_bit_array_clear(vol->allocated_backing_io_units, old_chunk->io_unit_index[i]); + old_chunk->io_unit_index[i] = REDUCE_EMPTY_MAP_ENTRY; + } + spdk_bit_array_clear(vol->allocated_chunk_maps, old_chunk_map_index); + } + + /* + * We don't need to persist the clearing of the old chunk map here. The old chunk map + * becomes invalid after we update the logical map, since the old chunk map will no + * longer have a reference to it in the logical map. + */ + + /* Persist the new chunk map. This must be persisted before we update the logical map. */ + _reduce_persist(vol, req->chunk, + _reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk)); + + vol->pm_logical_map[req->logical_map_index] = req->chunk_map_index; + + _reduce_persist(vol, &vol->pm_logical_map[req->logical_map_index], sizeof(uint64_t)); + + _reduce_vol_complete_req(req, 0); +} + +static void +_issue_backing_ops(struct spdk_reduce_vol_request *req, struct spdk_reduce_vol *vol, + reduce_request_fn next_fn, bool is_write) +{ + struct iovec *iov; + uint8_t *buf; + uint32_t i; + + if (req->chunk_is_compressed) { + iov = req->comp_buf_iov; + buf = req->comp_buf; + } else { + iov = req->decomp_buf_iov; + buf = req->decomp_buf; + } + + req->num_backing_ops = req->num_io_units; + req->backing_cb_args.cb_fn = next_fn; + req->backing_cb_args.cb_arg = req; + for (i = 0; i < req->num_io_units; i++) { + iov[i].iov_base = buf + i * vol->params.backing_io_unit_size; + iov[i].iov_len = vol->params.backing_io_unit_size; + if (is_write) { + vol->backing_dev->writev(vol->backing_dev, &iov[i], 1, + req->chunk->io_unit_index[i] * vol->backing_lba_per_io_unit, + vol->backing_lba_per_io_unit, &req->backing_cb_args); + } else { + vol->backing_dev->readv(vol->backing_dev, &iov[i], 1, + req->chunk->io_unit_index[i] * vol->backing_lba_per_io_unit, + vol->backing_lba_per_io_unit, &req->backing_cb_args); + } + } +} + +static void +_reduce_vol_write_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn, + uint32_t compressed_size) +{ + struct spdk_reduce_vol *vol = req->vol; + uint32_t i; + uint64_t chunk_offset, remainder, total_len = 0; + uint8_t *buf; + int j; + + req->chunk_map_index = spdk_bit_array_find_first_clear(vol->allocated_chunk_maps, 0); + + /* TODO: fail if no chunk map found - but really this should not happen if we + * size the number of requests similarly to number of extra chunk maps + */ + assert(req->chunk_map_index != UINT32_MAX); + spdk_bit_array_set(vol->allocated_chunk_maps, req->chunk_map_index); + + req->chunk = _reduce_vol_get_chunk_map(vol, req->chunk_map_index); + req->num_io_units = spdk_divide_round_up(compressed_size, + vol->params.backing_io_unit_size); + req->chunk_is_compressed = (req->num_io_units != vol->backing_io_units_per_chunk); + req->chunk->compressed_size = + req->chunk_is_compressed ? compressed_size : vol->params.chunk_size; + + /* if the chunk is uncompressed we need to copy the data from the host buffers. 
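The tail of _write_write_done() above encodes the crash-consistency rule of the write path: the freshly written chunk map is persisted before the logical map entry is flipped to point at it, and only then is the old chunk released, and only in the in-memory bitmaps, because once the logical map has moved nothing references the old map any more. Condensed into one place as a sketch that simply mirrors the code above (it adds nothing new; the point is the ordering):

static void
commit_new_chunk(struct spdk_reduce_vol *vol, struct spdk_reduce_vol_request *req)
{
	/* 1. The compressed data has already landed in the new backing io units. */

	/* 2. Make the new chunk map durable first ...                            */
	_reduce_persist(vol, req->chunk,
			_reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk));

	/* 3. ... then flip and persist the logical map entry - the commit point. */
	vol->pm_logical_map[req->logical_map_index] = req->chunk_map_index;
	_reduce_persist(vol, &vol->pm_logical_map[req->logical_map_index], sizeof(uint64_t));

	/* 4. Release the old chunk map and io units in the DRAM bitmaps only; no
	 *    persist is needed since nothing references them after step 3.       */
}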
*/ + if (req->chunk_is_compressed == false) { + chunk_offset = req->offset % vol->logical_blocks_per_chunk; + buf = req->decomp_buf; + total_len = chunk_offset * vol->params.logical_block_size; + + /* zero any offset into chunk */ + if (req->rmw == false && chunk_offset) { + memset(buf, 0, total_len); + } + buf += total_len; + + /* copy the data */ + for (j = 0; j < req->iovcnt; j++) { + memcpy(buf, req->iov[j].iov_base, req->iov[j].iov_len); + buf += req->iov[j].iov_len; + total_len += req->iov[j].iov_len; + } + + /* zero any remainder */ + remainder = vol->params.chunk_size - total_len; + total_len += remainder; + if (req->rmw == false && remainder) { + memset(buf, 0, remainder); + } + assert(total_len == vol->params.chunk_size); + } + + for (i = 0; i < req->num_io_units; i++) { + req->chunk->io_unit_index[i] = spdk_bit_array_find_first_clear(vol->allocated_backing_io_units, 0); + /* TODO: fail if no backing block found - but really this should also not + * happen (see comment above). + */ + assert(req->chunk->io_unit_index[i] != UINT32_MAX); + spdk_bit_array_set(vol->allocated_backing_io_units, req->chunk->io_unit_index[i]); + } + + _issue_backing_ops(req, vol, next_fn, true /* write */); +} + +static void +_write_compress_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + + /* Negative reduce_errno indicates failure for compression operations. + * Just write the uncompressed data instead. Force this to happen + * by just passing the full chunk size to _reduce_vol_write_chunk. + * When it sees the data couldn't be compressed, it will just write + * the uncompressed buffer to disk. + */ + if (reduce_errno < 0) { + reduce_errno = req->vol->params.chunk_size; + } + + /* Positive reduce_errno indicates number of bytes in compressed buffer. 
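The compress/decompress hooks called above follow a simple completion contract: the backing device (typically the compress bdev's compression engine) invokes the supplied spdk_reduce_vol_cb_args callback with the number of output bytes on success, or a negative errno on failure, which is why _write_compress_done() treats a negative value as "store the chunk uncompressed". A mock compress callback illustrating the contract (a sketch; the signature is inferred from the call sites in this file, and a real engine would actually compress rather than copy):

static void
my_backing_compress(struct spdk_reduce_backing_dev *dev,
		    struct iovec *src_iov, int src_iovcnt,
		    struct iovec *dst_iov, int dst_iovcnt,
		    struct spdk_reduce_vol_cb_args *args)
{
	uint8_t *dst = dst_iov[0].iov_base;
	uint64_t total = 0;
	int i;

	(void)dev;
	(void)dst_iovcnt;

	/* Stand-in "compression": copy the source iovecs into the destination
	 * buffer. Reporting total == chunk_size makes libreduce store the chunk
	 * uncompressed, exactly as it does after a real compression failure.    */
	for (i = 0; i < src_iovcnt; i++) {
		memcpy(dst + total, src_iov[i].iov_base, src_iov[i].iov_len);
		total += src_iov[i].iov_len;
	}

	args->cb_fn(args->cb_arg, (int)total);
}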
*/ + _reduce_vol_write_chunk(req, _write_write_done, (uint32_t)reduce_errno); +} + +static void +_reduce_vol_compress_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn) +{ + struct spdk_reduce_vol *vol = req->vol; + + req->backing_cb_args.cb_fn = next_fn; + req->backing_cb_args.cb_arg = req; + req->comp_buf_iov[0].iov_base = req->comp_buf; + req->comp_buf_iov[0].iov_len = vol->params.chunk_size; + vol->backing_dev->compress(vol->backing_dev, + &req->decomp_iov[0], req->decomp_iovcnt, req->comp_buf_iov, 1, + &req->backing_cb_args); +} + +static void +_reduce_vol_decompress_chunk_scratch(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn) +{ + struct spdk_reduce_vol *vol = req->vol; + + req->backing_cb_args.cb_fn = next_fn; + req->backing_cb_args.cb_arg = req; + req->comp_buf_iov[0].iov_base = req->comp_buf; + req->comp_buf_iov[0].iov_len = req->chunk->compressed_size; + req->decomp_buf_iov[0].iov_base = req->decomp_buf; + req->decomp_buf_iov[0].iov_len = vol->params.chunk_size; + vol->backing_dev->decompress(vol->backing_dev, + req->comp_buf_iov, 1, req->decomp_buf_iov, 1, + &req->backing_cb_args); +} + +static void +_reduce_vol_decompress_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn) +{ + struct spdk_reduce_vol *vol = req->vol; + uint64_t chunk_offset, remainder = 0; + uint64_t ttl_len = 0; + int i; + + req->decomp_iovcnt = 0; + chunk_offset = req->offset % vol->logical_blocks_per_chunk; + + if (chunk_offset) { + /* first iov point to our scratch buffer for any offset into the chunk */ + req->decomp_iov[0].iov_base = req->decomp_buf; + req->decomp_iov[0].iov_len = chunk_offset * vol->params.logical_block_size; + ttl_len += req->decomp_iov[0].iov_len; + req->decomp_iovcnt = 1; + } + + /* now the user data iov, direct to the user buffer */ + for (i = 0; i < req->iovcnt; i++) { + req->decomp_iov[i + req->decomp_iovcnt].iov_base = req->iov[i].iov_base; + req->decomp_iov[i + req->decomp_iovcnt].iov_len = req->iov[i].iov_len; + ttl_len += req->decomp_iov[i + req->decomp_iovcnt].iov_len; + } + req->decomp_iovcnt += req->iovcnt; + + /* send the rest of the chunk to our scratch buffer */ + remainder = vol->params.chunk_size - ttl_len; + if (remainder) { + req->decomp_iov[req->decomp_iovcnt].iov_base = req->decomp_buf + ttl_len; + req->decomp_iov[req->decomp_iovcnt].iov_len = remainder; + ttl_len += req->decomp_iov[req->decomp_iovcnt].iov_len; + req->decomp_iovcnt++; + } + assert(ttl_len == vol->params.chunk_size); + + req->backing_cb_args.cb_fn = next_fn; + req->backing_cb_args.cb_arg = req; + req->comp_buf_iov[0].iov_base = req->comp_buf; + req->comp_buf_iov[0].iov_len = req->chunk->compressed_size; + vol->backing_dev->decompress(vol->backing_dev, + req->comp_buf_iov, 1, &req->decomp_iov[0], req->decomp_iovcnt, + &req->backing_cb_args); +} + +static void +_write_decompress_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + struct spdk_reduce_vol *vol = req->vol; + uint64_t chunk_offset, remainder, ttl_len = 0; + int i; + + /* Negative reduce_errno indicates failure for compression operations. */ + if (reduce_errno < 0) { + _reduce_vol_complete_req(req, reduce_errno); + return; + } + + /* Positive reduce_errno indicates number of bytes in decompressed + * buffer. This should equal the chunk size - otherwise that's another + * type of failure. 
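+ * Anything other than exactly the chunk size is reported back to the caller as -EIO.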
+ */ + if ((uint32_t)reduce_errno != vol->params.chunk_size) { + _reduce_vol_complete_req(req, -EIO); + return; + } + + req->decomp_iovcnt = 0; + chunk_offset = req->offset % vol->logical_blocks_per_chunk; + + if (chunk_offset) { + req->decomp_iov[0].iov_base = req->decomp_buf; + req->decomp_iov[0].iov_len = chunk_offset * vol->params.logical_block_size; + ttl_len += req->decomp_iov[0].iov_len; + req->decomp_iovcnt = 1; + } + + for (i = 0; i < req->iovcnt; i++) { + req->decomp_iov[i + req->decomp_iovcnt].iov_base = req->iov[i].iov_base; + req->decomp_iov[i + req->decomp_iovcnt].iov_len = req->iov[i].iov_len; + ttl_len += req->decomp_iov[i + req->decomp_iovcnt].iov_len; + } + req->decomp_iovcnt += req->iovcnt; + + remainder = vol->params.chunk_size - ttl_len; + if (remainder) { + req->decomp_iov[req->decomp_iovcnt].iov_base = req->decomp_buf + ttl_len; + req->decomp_iov[req->decomp_iovcnt].iov_len = remainder; + ttl_len += req->decomp_iov[req->decomp_iovcnt].iov_len; + req->decomp_iovcnt++; + } + assert(ttl_len == vol->params.chunk_size); + + _reduce_vol_compress_chunk(req, _write_compress_done); +} + +static void +_write_read_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + + if (reduce_errno != 0) { + req->reduce_errno = reduce_errno; + } + + assert(req->num_backing_ops > 0); + if (--req->num_backing_ops > 0) { + return; + } + + if (req->reduce_errno != 0) { + _reduce_vol_complete_req(req, req->reduce_errno); + return; + } + + if (req->chunk_is_compressed) { + _reduce_vol_decompress_chunk_scratch(req, _write_decompress_done); + } else { + _write_decompress_done(req, req->chunk->compressed_size); + } +} + +static void +_read_decompress_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + struct spdk_reduce_vol *vol = req->vol; + + /* Negative reduce_errno indicates failure for compression operations. */ + if (reduce_errno < 0) { + _reduce_vol_complete_req(req, reduce_errno); + return; + } + + /* Positive reduce_errno indicates number of bytes in decompressed + * buffer. This should equal the chunk size - otherwise that's another + * type of failure. + */ + if ((uint32_t)reduce_errno != vol->params.chunk_size) { + _reduce_vol_complete_req(req, -EIO); + return; + } + + _reduce_vol_complete_req(req, 0); +} + +static void +_read_read_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + uint64_t chunk_offset; + uint8_t *buf; + int i; + + if (reduce_errno != 0) { + req->reduce_errno = reduce_errno; + } + + assert(req->num_backing_ops > 0); + if (--req->num_backing_ops > 0) { + return; + } + + if (req->reduce_errno != 0) { + _reduce_vol_complete_req(req, req->reduce_errno); + return; + } + + if (req->chunk_is_compressed) { + _reduce_vol_decompress_chunk(req, _read_decompress_done); + } else { + + /* If the chunk was compressed, the data would have been sent to the + * host buffers by the decompression operation, if not we need to memcpy here. 
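+ * The copy starts at this request's offset within the chunk in the decomp scratch buffer and walks the caller's iovecs in order.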
+ */ + chunk_offset = req->offset % req->vol->logical_blocks_per_chunk; + buf = req->decomp_buf + chunk_offset * req->vol->params.logical_block_size; + for (i = 0; i < req->iovcnt; i++) { + memcpy(req->iov[i].iov_base, buf, req->iov[i].iov_len); + buf += req->iov[i].iov_len; + } + + _read_decompress_done(req, req->chunk->compressed_size); + } +} + +static void +_reduce_vol_read_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn) +{ + struct spdk_reduce_vol *vol = req->vol; + + req->chunk_map_index = vol->pm_logical_map[req->logical_map_index]; + assert(req->chunk_map_index != UINT32_MAX); + + req->chunk = _reduce_vol_get_chunk_map(vol, req->chunk_map_index); + req->num_io_units = spdk_divide_round_up(req->chunk->compressed_size, + vol->params.backing_io_unit_size); + req->chunk_is_compressed = (req->num_io_units != vol->backing_io_units_per_chunk); + + _issue_backing_ops(req, vol, next_fn, false /* read */); +} + +static bool +_iov_array_is_valid(struct spdk_reduce_vol *vol, struct iovec *iov, int iovcnt, + uint64_t length) +{ + uint64_t size = 0; + int i; + + if (iovcnt > REDUCE_MAX_IOVECS) { + return false; + } + + for (i = 0; i < iovcnt; i++) { + size += iov[i].iov_len; + } + + return size == (length * vol->params.logical_block_size); +} + +static bool +_check_overlap(struct spdk_reduce_vol *vol, uint64_t logical_map_index) +{ + struct spdk_reduce_vol_request *req; + + TAILQ_FOREACH(req, &vol->executing_requests, tailq) { + if (logical_map_index == req->logical_map_index) { + return true; + } + } + + return false; +} + +static void +_start_readv_request(struct spdk_reduce_vol_request *req) +{ + TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq); + _reduce_vol_read_chunk(req, _read_read_done); +} + +void +spdk_reduce_vol_readv(struct spdk_reduce_vol *vol, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_reduce_vol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_reduce_vol_request *req; + uint64_t logical_map_index; + bool overlapped; + int i; + + if (length == 0) { + cb_fn(cb_arg, 0); + return; + } + + if (_request_spans_chunk_boundary(vol, offset, length)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + if (!_iov_array_is_valid(vol, iov, iovcnt, length)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + logical_map_index = offset / vol->logical_blocks_per_chunk; + overlapped = _check_overlap(vol, logical_map_index); + + if (!overlapped && vol->pm_logical_map[logical_map_index] == REDUCE_EMPTY_MAP_ENTRY) { + /* + * This chunk hasn't been allocated. So treat the data as all + * zeroes for this chunk - do the memset and immediately complete + * the operation. 
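+ * No request structure is taken from the free list on this path, so it cannot fail with -ENOMEM.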
+ */ + for (i = 0; i < iovcnt; i++) { + memset(iov[i].iov_base, 0, iov[i].iov_len); + } + cb_fn(cb_arg, 0); + return; + } + + req = TAILQ_FIRST(&vol->free_requests); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_REMOVE(&vol->free_requests, req, tailq); + req->type = REDUCE_IO_READV; + req->vol = vol; + req->iov = iov; + req->iovcnt = iovcnt; + req->offset = offset; + req->logical_map_index = logical_map_index; + req->length = length; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + if (!overlapped) { + _start_readv_request(req); + } else { + TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq); + } +} + +static void +_start_writev_request(struct spdk_reduce_vol_request *req) +{ + struct spdk_reduce_vol *vol = req->vol; + uint64_t chunk_offset, ttl_len = 0; + uint64_t remainder = 0; + uint32_t lbsize; + int i; + + TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq); + if (vol->pm_logical_map[req->logical_map_index] != REDUCE_EMPTY_MAP_ENTRY) { + if ((req->length * vol->params.logical_block_size) < vol->params.chunk_size) { + /* Read old chunk, then overwrite with data from this write + * operation. + */ + req->rmw = true; + _reduce_vol_read_chunk(req, _write_read_done); + return; + } + } + + lbsize = vol->params.logical_block_size; + req->decomp_iovcnt = 0; + req->rmw = false; + + /* Note: point to our zero buf for offset into the chunk. */ + chunk_offset = req->offset % vol->logical_blocks_per_chunk; + if (chunk_offset != 0) { + ttl_len += chunk_offset * lbsize; + req->decomp_iov[0].iov_base = g_zero_buf; + req->decomp_iov[0].iov_len = ttl_len; + req->decomp_iovcnt = 1; + } + + /* now the user data iov, direct from the user buffer */ + for (i = 0; i < req->iovcnt; i++) { + req->decomp_iov[i + req->decomp_iovcnt].iov_base = req->iov[i].iov_base; + req->decomp_iov[i + req->decomp_iovcnt].iov_len = req->iov[i].iov_len; + ttl_len += req->decomp_iov[i + req->decomp_iovcnt].iov_len; + } + req->decomp_iovcnt += req->iovcnt; + + remainder = vol->params.chunk_size - ttl_len; + if (remainder) { + req->decomp_iov[req->decomp_iovcnt].iov_base = g_zero_buf; + req->decomp_iov[req->decomp_iovcnt].iov_len = remainder; + ttl_len += req->decomp_iov[req->decomp_iovcnt].iov_len; + req->decomp_iovcnt++; + } + assert(ttl_len == req->vol->params.chunk_size); + + _reduce_vol_compress_chunk(req, _write_compress_done); +} + +void +spdk_reduce_vol_writev(struct spdk_reduce_vol *vol, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_reduce_vol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_reduce_vol_request *req; + uint64_t logical_map_index; + bool overlapped; + + if (length == 0) { + cb_fn(cb_arg, 0); + return; + } + + if (_request_spans_chunk_boundary(vol, offset, length)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + if (!_iov_array_is_valid(vol, iov, iovcnt, length)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + logical_map_index = offset / vol->logical_blocks_per_chunk; + overlapped = _check_overlap(vol, logical_map_index); + + req = TAILQ_FIRST(&vol->free_requests); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_REMOVE(&vol->free_requests, req, tailq); + req->type = REDUCE_IO_WRITEV; + req->vol = vol; + req->iov = iov; + req->iovcnt = iovcnt; + req->offset = offset; + req->logical_map_index = logical_map_index; + req->length = length; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + if (!overlapped) { + _start_writev_request(req); + } else { + TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq); + } +} + +const struct 
spdk_reduce_vol_params * +spdk_reduce_vol_get_params(struct spdk_reduce_vol *vol) +{ + return &vol->params; +} + +void spdk_reduce_vol_print_info(struct spdk_reduce_vol *vol) +{ + uint64_t logical_map_size, num_chunks, ttl_chunk_sz; + uint32_t struct_size; + uint64_t chunk_map_size; + + SPDK_NOTICELOG("vol info:\n"); + SPDK_NOTICELOG("\tvol->params.backing_io_unit_size = 0x%x\n", vol->params.backing_io_unit_size); + SPDK_NOTICELOG("\tvol->params.logical_block_size = 0x%x\n", vol->params.logical_block_size); + SPDK_NOTICELOG("\tvol->params.chunk_size = 0x%x\n", vol->params.chunk_size); + SPDK_NOTICELOG("\tvol->params.vol_size = 0x%" PRIx64 "\n", vol->params.vol_size); + num_chunks = _get_total_chunks(vol->params.vol_size, vol->params.chunk_size); + SPDK_NOTICELOG("\ttotal chunks (including extra) = 0x%" PRIx64 "\n", num_chunks); + SPDK_NOTICELOG("\ttotal chunks (excluding extra) = 0x%" PRIx64 "\n", + vol->params.vol_size / vol->params.chunk_size); + ttl_chunk_sz = _get_pm_total_chunks_size(vol->params.vol_size, vol->params.chunk_size, + vol->params.backing_io_unit_size); + SPDK_NOTICELOG("\ttotal_chunks_size = 0x%" PRIx64 "\n", ttl_chunk_sz); + struct_size = _reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk); + SPDK_NOTICELOG("\tchunk_struct_size = 0x%x\n", struct_size); + + SPDK_NOTICELOG("pmem info:\n"); + SPDK_NOTICELOG("\tvol->pm_file.size = 0x%" PRIx64 "\n", vol->pm_file.size); + SPDK_NOTICELOG("\tvol->pm_file.pm_buf = %p\n", (void *)vol->pm_file.pm_buf); + SPDK_NOTICELOG("\tvol->pm_super = %p\n", (void *)vol->pm_super); + SPDK_NOTICELOG("\tvol->pm_logical_map = %p\n", (void *)vol->pm_logical_map); + logical_map_size = _get_pm_logical_map_size(vol->params.vol_size, + vol->params.chunk_size); + SPDK_NOTICELOG("\tlogical_map_size = 0x%" PRIx64 "\n", logical_map_size); + SPDK_NOTICELOG("\tvol->pm_chunk_maps = %p\n", (void *)vol->pm_chunk_maps); + chunk_map_size = _get_pm_total_chunks_size(vol->params.vol_size, vol->params.chunk_size, + vol->params.backing_io_unit_size); + SPDK_NOTICELOG("\tchunk_map_size = 0x%" PRIx64 "\n", chunk_map_size); +} + +SPDK_LOG_REGISTER_COMPONENT("reduce", SPDK_LOG_REDUCE) diff --git a/src/spdk/lib/reduce/spdk_reduce.map b/src/spdk/lib/reduce/spdk_reduce.map new file mode 100644 index 000000000..c53792710 --- /dev/null +++ b/src/spdk/lib/reduce/spdk_reduce.map @@ -0,0 +1,16 @@ +{ + global: + + # public functions + spdk_reduce_vol_get_uuid; + spdk_reduce_vol_init; + spdk_reduce_vol_load; + spdk_reduce_vol_unload; + spdk_reduce_vol_destroy; + spdk_reduce_vol_readv; + spdk_reduce_vol_writev; + spdk_reduce_vol_get_params; + spdk_reduce_vol_print_info; + + local: *; +}; diff --git a/src/spdk/lib/rocksdb/env_spdk.cc b/src/spdk/lib/rocksdb/env_spdk.cc new file mode 100644 index 000000000..8695acca6 --- /dev/null +++ b/src/spdk/lib/rocksdb/env_spdk.cc @@ -0,0 +1,798 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "rocksdb/env.h" +#include <set> +#include <iostream> +#include <stdexcept> + +extern "C" { +#include "spdk/env.h" +#include "spdk/event.h" +#include "spdk/blob.h" +#include "spdk/blobfs.h" +#include "spdk/blob_bdev.h" +#include "spdk/log.h" +#include "spdk/thread.h" +#include "spdk/bdev.h" + +#include "spdk_internal/thread.h" +} + +namespace rocksdb +{ + +struct spdk_filesystem *g_fs = NULL; +struct spdk_bs_dev *g_bs_dev; +uint32_t g_lcore = 0; +std::string g_bdev_name; +volatile bool g_spdk_ready = false; +volatile bool g_spdk_start_failure = false; + +void SpdkInitializeThread(void); + +class SpdkThreadCtx +{ +public: + struct spdk_fs_thread_ctx *channel; + + SpdkThreadCtx(void) : channel(NULL) + { + SpdkInitializeThread(); + } + + ~SpdkThreadCtx(void) + { + if (channel) { + spdk_fs_free_thread_ctx(channel); + channel = NULL; + } + } + +private: + SpdkThreadCtx(const SpdkThreadCtx &); + SpdkThreadCtx &operator=(const SpdkThreadCtx &); +}; + +thread_local SpdkThreadCtx g_sync_args; + +static void +set_channel() +{ + struct spdk_thread *thread; + + if (g_fs != NULL && g_sync_args.channel == NULL) { + thread = spdk_thread_create("spdK_rocksdb", NULL); + spdk_set_thread(thread); + g_sync_args.channel = spdk_fs_alloc_thread_ctx(g_fs); + } +} + +static void +__call_fn(void *arg1, void *arg2) +{ + fs_request_fn fn; + + fn = (fs_request_fn)arg1; + fn(arg2); +} + +static void +__send_request(fs_request_fn fn, void *arg) +{ + struct spdk_event *event; + + event = spdk_event_allocate(g_lcore, __call_fn, (void *)fn, arg); + spdk_event_call(event); +} + +static std::string +sanitize_path(const std::string &input, const std::string &mount_directory) +{ + int index = 0; + std::string name; + std::string input_tmp; + + input_tmp = input.substr(mount_directory.length(), input.length()); + for (const char &c : input_tmp) { + if (index == 0) { + if (c != '/') { + name = name.insert(index, 1, '/'); + index++; + } + name = name.insert(index, 1, c); + index++; + } else { + if (name[index - 1] == '/' && c == '/') { + continue; + } else { + name = name.insert(index, 1, c); + index++; + } + } + } + + if (name[name.size() - 1] == '/') { + name = name.erase(name.size() - 1, 1); + } + return name; +} + +class SpdkSequentialFile : public SequentialFile +{ + struct spdk_file *mFile; + uint64_t mOffset; +public: + SpdkSequentialFile(struct spdk_file *file) : mFile(file), mOffset(0) {} + virtual ~SpdkSequentialFile(); + + virtual Status Read(size_t n, Slice *result, char *scratch) override; + virtual Status 
Skip(uint64_t n) override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +SpdkSequentialFile::~SpdkSequentialFile(void) +{ + set_channel(); + spdk_file_close(mFile, g_sync_args.channel); +} + +Status +SpdkSequentialFile::Read(size_t n, Slice *result, char *scratch) +{ + int64_t ret; + + set_channel(); + ret = spdk_file_read(mFile, g_sync_args.channel, scratch, mOffset, n); + if (ret >= 0) { + mOffset += ret; + *result = Slice(scratch, ret); + return Status::OK(); + } else { + errno = -ret; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } +} + +Status +SpdkSequentialFile::Skip(uint64_t n) +{ + mOffset += n; + return Status::OK(); +} + +Status +SpdkSequentialFile::InvalidateCache(__attribute__((unused)) size_t offset, + __attribute__((unused)) size_t length) +{ + return Status::OK(); +} + +class SpdkRandomAccessFile : public RandomAccessFile +{ + struct spdk_file *mFile; +public: + SpdkRandomAccessFile(struct spdk_file *file) : mFile(file) {} + virtual ~SpdkRandomAccessFile(); + + virtual Status Read(uint64_t offset, size_t n, Slice *result, char *scratch) const override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +SpdkRandomAccessFile::~SpdkRandomAccessFile(void) +{ + set_channel(); + spdk_file_close(mFile, g_sync_args.channel); +} + +Status +SpdkRandomAccessFile::Read(uint64_t offset, size_t n, Slice *result, char *scratch) const +{ + int64_t rc; + + set_channel(); + rc = spdk_file_read(mFile, g_sync_args.channel, scratch, offset, n); + if (rc >= 0) { + *result = Slice(scratch, n); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } +} + +Status +SpdkRandomAccessFile::InvalidateCache(__attribute__((unused)) size_t offset, + __attribute__((unused)) size_t length) +{ + return Status::OK(); +} + +class SpdkWritableFile : public WritableFile +{ + struct spdk_file *mFile; + uint64_t mSize; + +public: + SpdkWritableFile(struct spdk_file *file) : mFile(file), mSize(0) {} + ~SpdkWritableFile() + { + if (mFile != NULL) { + Close(); + } + } + + virtual void SetIOPriority(Env::IOPriority pri) + { + if (pri == Env::IO_HIGH) { + spdk_file_set_priority(mFile, SPDK_FILE_PRIORITY_HIGH); + } + } + + virtual Status Truncate(uint64_t size) override + { + int rc; + + set_channel(); + rc = spdk_file_truncate(mFile, g_sync_args.channel, size); + if (!rc) { + mSize = size; + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual Status Close() override + { + set_channel(); + spdk_file_close(mFile, g_sync_args.channel); + mFile = NULL; + return Status::OK(); + } + virtual Status Append(const Slice &data) override; + virtual Status Flush() override + { + return Status::OK(); + } + virtual Status Sync() override + { + int rc; + + set_channel(); + rc = spdk_file_sync(mFile, g_sync_args.channel); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual Status Fsync() override + { + int rc; + + set_channel(); + rc = spdk_file_sync(mFile, g_sync_args.channel); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual bool IsSyncThreadSafe() const override + { + return true; + } + virtual uint64_t GetFileSize() override + { + return mSize; + } + virtual Status 
InvalidateCache(__attribute__((unused)) size_t offset, + __attribute__((unused)) size_t length) override + { + return Status::OK(); + } + virtual Status Allocate(uint64_t offset, uint64_t len) override + { + int rc; + + set_channel(); + rc = spdk_file_truncate(mFile, g_sync_args.channel, offset + len); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual Status RangeSync(__attribute__((unused)) uint64_t offset, + __attribute__((unused)) uint64_t nbytes) override + { + int rc; + + /* + * SPDK BlobFS does not have a range sync operation yet, so just sync + * the whole file. + */ + set_channel(); + rc = spdk_file_sync(mFile, g_sync_args.channel); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual size_t GetUniqueId(char *id, size_t max_size) const override + { + int rc; + + rc = spdk_file_get_id(mFile, id, max_size); + if (rc < 0) { + return 0; + } else { + return rc; + } + } +}; + +Status +SpdkWritableFile::Append(const Slice &data) +{ + int64_t rc; + + set_channel(); + rc = spdk_file_write(mFile, g_sync_args.channel, (void *)data.data(), mSize, data.size()); + if (rc >= 0) { + mSize += data.size(); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } +} + +class SpdkDirectory : public Directory +{ +public: + SpdkDirectory() {} + ~SpdkDirectory() {} + Status Fsync() override + { + return Status::OK(); + } +}; + +class SpdkAppStartException : public std::runtime_error +{ +public: + SpdkAppStartException(std::string mess): std::runtime_error(mess) {} +}; + +class SpdkEnv : public EnvWrapper +{ +private: + pthread_t mSpdkTid; + std::string mDirectory; + std::string mConfig; + std::string mBdev; + +public: + SpdkEnv(Env *base_env, const std::string &dir, const std::string &conf, + const std::string &bdev, uint64_t cache_size_in_mb); + + virtual ~SpdkEnv(); + + virtual Status NewSequentialFile(const std::string &fname, + unique_ptr<SequentialFile> *result, + const EnvOptions &options) override + { + if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { + struct spdk_file *file; + int rc; + + std::string name = sanitize_path(fname, mDirectory); + set_channel(); + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, + name.c_str(), 0, &file); + if (rc == 0) { + result->reset(new SpdkSequentialFile(file)); + return Status::OK(); + } else { + /* Myrocks engine uses errno(ENOENT) as one + * special condition, for the purpose to + * support MySQL, set the errno to right value. 
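+ * spdk_fs_open_file() returns a negated errno on failure, so negate it back before building the Status.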
+ */ + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } else { + return EnvWrapper::NewSequentialFile(fname, result, options); + } + } + + virtual Status NewRandomAccessFile(const std::string &fname, + unique_ptr<RandomAccessFile> *result, + const EnvOptions &options) override + { + if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { + std::string name = sanitize_path(fname, mDirectory); + struct spdk_file *file; + int rc; + + set_channel(); + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, + name.c_str(), 0, &file); + if (rc == 0) { + result->reset(new SpdkRandomAccessFile(file)); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } else { + return EnvWrapper::NewRandomAccessFile(fname, result, options); + } + } + + virtual Status NewWritableFile(const std::string &fname, + unique_ptr<WritableFile> *result, + const EnvOptions &options) override + { + if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { + std::string name = sanitize_path(fname, mDirectory); + struct spdk_file *file; + int rc; + + set_channel(); + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, name.c_str(), + SPDK_BLOBFS_OPEN_CREATE, &file); + if (rc == 0) { + result->reset(new SpdkWritableFile(file)); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } else { + return EnvWrapper::NewWritableFile(fname, result, options); + } + } + + virtual Status ReuseWritableFile(const std::string &fname, + const std::string &old_fname, + unique_ptr<WritableFile> *result, + const EnvOptions &options) override + { + return EnvWrapper::ReuseWritableFile(fname, old_fname, result, options); + } + + virtual Status NewDirectory(__attribute__((unused)) const std::string &name, + unique_ptr<Directory> *result) override + { + result->reset(new SpdkDirectory()); + return Status::OK(); + } + virtual Status FileExists(const std::string &fname) override + { + struct spdk_file_stat stat; + int rc; + std::string name = sanitize_path(fname, mDirectory); + + set_channel(); + rc = spdk_fs_file_stat(g_fs, g_sync_args.channel, name.c_str(), &stat); + if (rc == 0) { + return Status::OK(); + } + return EnvWrapper::FileExists(fname); + } + virtual Status RenameFile(const std::string &src, const std::string &t) override + { + int rc; + std::string src_name = sanitize_path(src, mDirectory); + std::string target_name = sanitize_path(t, mDirectory); + + set_channel(); + rc = spdk_fs_rename_file(g_fs, g_sync_args.channel, + src_name.c_str(), target_name.c_str()); + if (rc == -ENOENT) { + return EnvWrapper::RenameFile(src, t); + } + return Status::OK(); + } + virtual Status LinkFile(__attribute__((unused)) const std::string &src, + __attribute__((unused)) const std::string &t) override + { + return Status::NotSupported("SpdkEnv does not support LinkFile"); + } + virtual Status GetFileSize(const std::string &fname, uint64_t *size) override + { + struct spdk_file_stat stat; + int rc; + std::string name = sanitize_path(fname, mDirectory); + + set_channel(); + rc = spdk_fs_file_stat(g_fs, g_sync_args.channel, name.c_str(), &stat); + if (rc == -ENOENT) { + return EnvWrapper::GetFileSize(fname, size); + } + *size = stat.size; + return Status::OK(); + } + virtual Status DeleteFile(const std::string &fname) override + { + int rc; + std::string name = sanitize_path(fname, mDirectory); + + set_channel(); + rc = spdk_fs_delete_file(g_fs, g_sync_args.channel, name.c_str()); + if (rc == -ENOENT) { + return 
EnvWrapper::DeleteFile(fname); + } + return Status::OK(); + } + virtual Status LockFile(const std::string &fname, FileLock **lock) override + { + std::string name = sanitize_path(fname, mDirectory); + int64_t rc; + + set_channel(); + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, name.c_str(), + SPDK_BLOBFS_OPEN_CREATE, (struct spdk_file **)lock); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } + virtual Status UnlockFile(FileLock *lock) override + { + set_channel(); + spdk_file_close((struct spdk_file *)lock, g_sync_args.channel); + return Status::OK(); + } + virtual Status GetChildren(const std::string &dir, + std::vector<std::string> *result) override + { + std::string::size_type pos; + std::set<std::string> dir_and_file_set; + std::string full_path; + std::string filename; + std::string dir_name; + + if (dir.find("archive") != std::string::npos) { + return Status::OK(); + } + if (dir.compare(0, mDirectory.length(), mDirectory) == 0) { + spdk_fs_iter iter; + struct spdk_file *file; + dir_name = sanitize_path(dir, mDirectory); + + iter = spdk_fs_iter_first(g_fs); + while (iter != NULL) { + file = spdk_fs_iter_get_file(iter); + full_path = spdk_file_get_name(file); + if (strncmp(dir_name.c_str(), full_path.c_str(), dir_name.length())) { + iter = spdk_fs_iter_next(iter); + continue; + } + pos = full_path.find("/", dir_name.length() + 1); + + if (pos != std::string::npos) { + filename = full_path.substr(dir_name.length() + 1, pos - dir_name.length() - 1); + } else { + filename = full_path.substr(dir_name.length() + 1); + } + dir_and_file_set.insert(filename); + iter = spdk_fs_iter_next(iter); + } + + for (auto &s : dir_and_file_set) { + result->push_back(s); + } + + result->push_back("."); + result->push_back(".."); + + return Status::OK(); + } + return EnvWrapper::GetChildren(dir, result); + } +}; + +/* The thread local constructor doesn't work for the main thread, since + * the filesystem hasn't been loaded yet. So we break out this + * SpdkInitializeThread function, so that the main thread can explicitly + * call it after the filesystem has been loaded. 
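+ * Calling it on a thread that already has a channel frees the old channel before allocating a new one.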
+ */ +void SpdkInitializeThread(void) +{ + struct spdk_thread *thread; + + if (g_fs != NULL) { + if (g_sync_args.channel) { + spdk_fs_free_thread_ctx(g_sync_args.channel); + } + thread = spdk_thread_create("spdk_rocksdb", NULL); + spdk_set_thread(thread); + g_sync_args.channel = spdk_fs_alloc_thread_ctx(g_fs); + } +} + +static void +fs_load_cb(__attribute__((unused)) void *ctx, + struct spdk_filesystem *fs, int fserrno) +{ + if (fserrno == 0) { + g_fs = fs; + } + g_spdk_ready = true; +} + +static void +rocksdb_run(__attribute__((unused)) void *arg1) +{ + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(g_bdev_name.c_str()); + + if (bdev == NULL) { + SPDK_ERRLOG("bdev %s not found\n", g_bdev_name.c_str()); + exit(1); + } + + g_lcore = spdk_env_get_first_core(); + + g_bs_dev = spdk_bdev_create_bs_dev(bdev, NULL, NULL); + printf("using bdev %s\n", g_bdev_name.c_str()); + spdk_fs_load(g_bs_dev, __send_request, fs_load_cb, NULL); +} + +static void +fs_unload_cb(__attribute__((unused)) void *ctx, + __attribute__((unused)) int fserrno) +{ + assert(fserrno == 0); + + spdk_app_stop(0); +} + +static void +rocksdb_shutdown(void) +{ + if (g_fs != NULL) { + spdk_fs_unload(g_fs, fs_unload_cb, NULL); + } else { + fs_unload_cb(NULL, 0); + } +} + +static void * +initialize_spdk(void *arg) +{ + struct spdk_app_opts *opts = (struct spdk_app_opts *)arg; + int rc; + + rc = spdk_app_start(opts, rocksdb_run, NULL); + /* + * TODO: Revisit for case of internal failure of + * spdk_app_start(), itself. At this time, it's known + * the only application's use of spdk_app_stop() passes + * a zero; i.e. no fail (non-zero) cases so here we + * assume there was an internal failure and flag it + * so we can throw an exception. + */ + if (rc) { + g_spdk_start_failure = true; + } else { + spdk_app_fini(); + delete opts; + } + pthread_exit(NULL); + +} + +SpdkEnv::SpdkEnv(Env *base_env, const std::string &dir, const std::string &conf, + const std::string &bdev, uint64_t cache_size_in_mb) + : EnvWrapper(base_env), mDirectory(dir), mConfig(conf), mBdev(bdev) +{ + struct spdk_app_opts *opts = new struct spdk_app_opts; + + spdk_app_opts_init(opts); + opts->name = "rocksdb"; + opts->config_file = mConfig.c_str(); + opts->shutdown_cb = rocksdb_shutdown; + + spdk_fs_set_cache_size(cache_size_in_mb); + g_bdev_name = mBdev; + + pthread_create(&mSpdkTid, NULL, &initialize_spdk, opts); + while (!g_spdk_ready && !g_spdk_start_failure) + ; + if (g_spdk_start_failure) { + delete opts; + throw SpdkAppStartException("spdk_app_start() unable to start rocksdb_run()"); + } + + SpdkInitializeThread(); +} + +SpdkEnv::~SpdkEnv() +{ + /* This is a workaround for rocksdb test, we close the files if the rocksdb not + * do the work before the test quit. 
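+ * In other words: if RocksDB exits without closing all of its files, close them here so the filesystem can be unloaded cleanly.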
+ */ + if (g_fs != NULL) { + spdk_fs_iter iter; + struct spdk_file *file; + + if (!g_sync_args.channel) { + SpdkInitializeThread(); + } + + iter = spdk_fs_iter_first(g_fs); + while (iter != NULL) { + file = spdk_fs_iter_get_file(iter); + spdk_file_close(file, g_sync_args.channel); + iter = spdk_fs_iter_next(iter); + } + } + + spdk_app_start_shutdown(); + pthread_join(mSpdkTid, NULL); +} + +Env *NewSpdkEnv(Env *base_env, const std::string &dir, const std::string &conf, + const std::string &bdev, uint64_t cache_size_in_mb) +{ + try { + SpdkEnv *spdk_env = new SpdkEnv(base_env, dir, conf, bdev, cache_size_in_mb); + if (g_fs != NULL) { + return spdk_env; + } else { + delete spdk_env; + return NULL; + } + } catch (SpdkAppStartException &e) { + SPDK_ERRLOG("NewSpdkEnv: exception caught: %s", e.what()); + return NULL; + } catch (...) { + SPDK_ERRLOG("NewSpdkEnv: default exception caught"); + return NULL; + } +} + +} // namespace rocksdb diff --git a/src/spdk/lib/rocksdb/spdk.rocksdb.mk b/src/spdk/lib/rocksdb/spdk.rocksdb.mk new file mode 100644 index 000000000..fe498cc39 --- /dev/null +++ b/src/spdk/lib/rocksdb/spdk.rocksdb.mk @@ -0,0 +1,70 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# This snippet will be included into the RocksDB Makefile + +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.app.mk +include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk + +CXXFLAGS += -I$(SPDK_DIR)/include -Iinclude/ + +# The SPDK makefiles turn this on, but RocksDB won't compile with it. So +# turn it off after including the SPDK makefiles. +CXXFLAGS += -Wno-missing-declarations + +# The SPDK Makefiles may turn these options on but we do not want to enable +# them for the RocksDB source files. 
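+# Coverage instrumentation is turned off unconditionally; each sanitizer is only turned off when the matching CONFIG flag enabled it in the first place.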
+CXXFLAGS += -fno-profile-arcs -fno-test-coverage +ifeq ($(CONFIG_UBSAN),y) +CXXFLAGS += -fno-sanitize=undefined +endif +ifeq ($(CONFIG_ASAN),y) +CXXFLAGS += -fno-sanitize=address +endif + +SPDK_LIB_LIST = $(ALL_MODULES_LIST) +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) +SPDK_LIB_LIST += bdev accel event util conf trace log jsonrpc json rpc sock thread notify +SPDK_LIB_LIST += bdev_rpc blobfs_bdev + +AM_LINK += $(SPDK_LIB_LINKER_ARGS) $(ENV_LINKER_ARGS) +AM_LINK += $(SYS_LIBS) + +ifeq ($(CONFIG_UBSAN),y) +AM_LINK += -fsanitize=undefined +endif + +ifeq ($(CONFIG_COVERAGE),y) +AM_LINK += -fprofile-arcs -ftest-coverage +endif diff --git a/src/spdk/lib/rpc/Makefile b/src/spdk/lib/rpc/Makefile new file mode 100644 index 000000000..ead36f6ba --- /dev/null +++ b/src/spdk/lib/rpc/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = rpc.c +LIBNAME = rpc + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_rpc.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/rpc/rpc.c b/src/spdk/lib/rpc/rpc.c new file mode 100644 index 000000000..7182f41e9 --- /dev/null +++ b/src/spdk/lib/rpc/rpc.c @@ -0,0 +1,392 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/file.h> + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/version.h" + +static struct sockaddr_un g_rpc_listen_addr_unix = {}; +static char g_rpc_lock_path[sizeof(g_rpc_listen_addr_unix.sun_path) + sizeof(".lock")]; +static int g_rpc_lock_fd = -1; + +static struct spdk_jsonrpc_server *g_jsonrpc_server = NULL; +static uint32_t g_rpc_state; +static bool g_rpcs_correct = true; + +struct spdk_rpc_method { + const char *name; + spdk_rpc_method_handler func; + SLIST_ENTRY(spdk_rpc_method) slist; + uint32_t state_mask; + bool is_deprecated; + struct spdk_rpc_method *is_alias_of; + bool deprecation_warning_printed; +}; + +static SLIST_HEAD(, spdk_rpc_method) g_rpc_methods = SLIST_HEAD_INITIALIZER(g_rpc_methods); + +void +spdk_rpc_set_state(uint32_t state) +{ + g_rpc_state = state; +} + +uint32_t +spdk_rpc_get_state(void) +{ + return g_rpc_state; +} + +static struct spdk_rpc_method * +_get_rpc_method(const struct spdk_json_val *method) +{ + struct spdk_rpc_method *m; + + SLIST_FOREACH(m, &g_rpc_methods, slist) { + if (spdk_json_strequal(method, m->name)) { + return m; + } + } + + return NULL; +} + +static struct spdk_rpc_method * +_get_rpc_method_raw(const char *method) +{ + struct spdk_json_val method_val; + + method_val.type = SPDK_JSON_VAL_STRING; + method_val.len = strlen(method); + method_val.start = (char *)method; + + return _get_rpc_method(&method_val); +} + +static void +jsonrpc_handler(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *method, + const struct spdk_json_val *params) +{ + struct spdk_rpc_method *m; + + assert(method != NULL); + + m = _get_rpc_method(method); + if (m == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_METHOD_NOT_FOUND, "Method not found"); + return; + } + + if (m->is_alias_of != NULL) { + if (m->is_deprecated && !m->deprecation_warning_printed) { + SPDK_WARNLOG("RPC method %s is deprecated. 
Use %s instead.\n", m->name, m->is_alias_of->name); + m->deprecation_warning_printed = true; + } + m = m->is_alias_of; + } + + if ((m->state_mask & g_rpc_state) == g_rpc_state) { + m->func(request, params); + } else { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE, + "Method is allowed in any state in the mask (%"PRIx32")," + " but current state is (%"PRIx32")", + m->state_mask, g_rpc_state); + } +} + +int +spdk_rpc_listen(const char *listen_addr) +{ + int rc; + + memset(&g_rpc_listen_addr_unix, 0, sizeof(g_rpc_listen_addr_unix)); + + g_rpc_listen_addr_unix.sun_family = AF_UNIX; + rc = snprintf(g_rpc_listen_addr_unix.sun_path, + sizeof(g_rpc_listen_addr_unix.sun_path), + "%s", listen_addr); + if (rc < 0 || (size_t)rc >= sizeof(g_rpc_listen_addr_unix.sun_path)) { + SPDK_ERRLOG("RPC Listen address Unix socket path too long\n"); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + return -1; + } + + rc = snprintf(g_rpc_lock_path, sizeof(g_rpc_lock_path), "%s.lock", + g_rpc_listen_addr_unix.sun_path); + if (rc < 0 || (size_t)rc >= sizeof(g_rpc_lock_path)) { + SPDK_ERRLOG("RPC lock path too long\n"); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + g_rpc_lock_path[0] = '\0'; + return -1; + } + + g_rpc_lock_fd = open(g_rpc_lock_path, O_RDONLY | O_CREAT, 0600); + if (g_rpc_lock_fd == -1) { + SPDK_ERRLOG("Cannot open lock file %s: %s\n", + g_rpc_lock_path, spdk_strerror(errno)); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + g_rpc_lock_path[0] = '\0'; + return -1; + } + + rc = flock(g_rpc_lock_fd, LOCK_EX | LOCK_NB); + if (rc != 0) { + SPDK_ERRLOG("RPC Unix domain socket path %s in use. Specify another.\n", + g_rpc_listen_addr_unix.sun_path); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + g_rpc_lock_path[0] = '\0'; + return -1; + } + + /* + * Since we acquired the lock, it is safe to delete the Unix socket file + * if it still exists from a previous process. 
+ */ + unlink(g_rpc_listen_addr_unix.sun_path); + + g_jsonrpc_server = spdk_jsonrpc_server_listen(AF_UNIX, 0, + (struct sockaddr *)&g_rpc_listen_addr_unix, + sizeof(g_rpc_listen_addr_unix), + jsonrpc_handler); + if (g_jsonrpc_server == NULL) { + SPDK_ERRLOG("spdk_jsonrpc_server_listen() failed\n"); + close(g_rpc_lock_fd); + g_rpc_lock_fd = -1; + unlink(g_rpc_lock_path); + g_rpc_lock_path[0] = '\0'; + return -1; + } + + return 0; +} + +void +spdk_rpc_accept(void) +{ + spdk_jsonrpc_server_poll(g_jsonrpc_server); +} + +void +spdk_rpc_register_method(const char *method, spdk_rpc_method_handler func, uint32_t state_mask) +{ + struct spdk_rpc_method *m; + + m = _get_rpc_method_raw(method); + if (m != NULL) { + SPDK_ERRLOG("duplicate RPC %s registered...\n", method); + g_rpcs_correct = false; + return; + } + + m = calloc(1, sizeof(struct spdk_rpc_method)); + assert(m != NULL); + + m->name = strdup(method); + assert(m->name != NULL); + + m->func = func; + m->state_mask = state_mask; + + /* TODO: use a hash table or sorted list */ + SLIST_INSERT_HEAD(&g_rpc_methods, m, slist); +} + +void +spdk_rpc_register_alias_deprecated(const char *method, const char *alias) +{ + struct spdk_rpc_method *m, *base; + + base = _get_rpc_method_raw(method); + if (base == NULL) { + SPDK_ERRLOG("cannot create alias %s - method %s does not exist\n", + alias, method); + g_rpcs_correct = false; + return; + } + + if (base->is_alias_of != NULL) { + SPDK_ERRLOG("cannot create alias %s of alias %s\n", alias, method); + g_rpcs_correct = false; + return; + } + + m = calloc(1, sizeof(struct spdk_rpc_method)); + assert(m != NULL); + + m->name = strdup(alias); + assert(m->name != NULL); + + m->is_alias_of = base; + m->is_deprecated = true; + m->state_mask = base->state_mask; + + /* TODO: use a hash table or sorted list */ + SLIST_INSERT_HEAD(&g_rpc_methods, m, slist); +} + +bool +spdk_rpc_verify_methods(void) +{ + return g_rpcs_correct; +} + +int +spdk_rpc_is_method_allowed(const char *method, uint32_t state_mask) +{ + struct spdk_rpc_method *m; + + SLIST_FOREACH(m, &g_rpc_methods, slist) { + if (strcmp(m->name, method) != 0) { + continue; + } + + if ((m->state_mask & state_mask) == state_mask) { + return 0; + } else { + return -EPERM; + } + } + + return -ENOENT; +} + +void +spdk_rpc_close(void) +{ + if (g_jsonrpc_server) { + if (g_rpc_listen_addr_unix.sun_path[0]) { + /* Delete the Unix socket file */ + unlink(g_rpc_listen_addr_unix.sun_path); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + } + + spdk_jsonrpc_server_shutdown(g_jsonrpc_server); + g_jsonrpc_server = NULL; + + if (g_rpc_lock_fd != -1) { + close(g_rpc_lock_fd); + g_rpc_lock_fd = -1; + } + + if (g_rpc_lock_path[0]) { + unlink(g_rpc_lock_path); + g_rpc_lock_path[0] = '\0'; + } + } +} + +struct rpc_get_methods { + bool current; + bool include_aliases; +}; + +static const struct spdk_json_object_decoder rpc_get_methods_decoders[] = { + {"current", offsetof(struct rpc_get_methods, current), spdk_json_decode_bool, true}, + {"include_aliases", offsetof(struct rpc_get_methods, include_aliases), spdk_json_decode_bool, true}, +}; + +static void +rpc_get_methods(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct rpc_get_methods req = {}; + struct spdk_json_write_ctx *w; + struct spdk_rpc_method *m; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_get_methods_decoders, + SPDK_COUNTOF(rpc_get_methods_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, 
SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + SLIST_FOREACH(m, &g_rpc_methods, slist) { + if (m->is_alias_of != NULL && !req.include_aliases) { + continue; + } + if (req.current && ((m->state_mask & g_rpc_state) != g_rpc_state)) { + continue; + } + spdk_json_write_string(w, m->name); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("rpc_get_methods", rpc_get_methods, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(rpc_get_methods, get_rpc_methods) + +static void +rpc_spdk_get_version(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "spdk_get_version method requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + + spdk_json_write_named_string_fmt(w, "version", "%s", SPDK_VERSION_STRING); + spdk_json_write_named_object_begin(w, "fields"); + spdk_json_write_named_uint32(w, "major", SPDK_VERSION_MAJOR); + spdk_json_write_named_uint32(w, "minor", SPDK_VERSION_MINOR); + spdk_json_write_named_uint32(w, "patch", SPDK_VERSION_PATCH); + spdk_json_write_named_string_fmt(w, "suffix", "%s", SPDK_VERSION_SUFFIX); +#ifdef SPDK_GIT_COMMIT + spdk_json_write_named_string_fmt(w, "commit", "%s", SPDK_GIT_COMMIT_STRING); +#endif + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("spdk_get_version", rpc_spdk_get_version, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(spdk_get_version, get_spdk_version) diff --git a/src/spdk/lib/rpc/spdk_rpc.map b/src/spdk/lib/rpc/spdk_rpc.map new file mode 100644 index 000000000..e15ff8b53 --- /dev/null +++ b/src/spdk/lib/rpc/spdk_rpc.map @@ -0,0 +1,16 @@ +{ + global: + + # public functions + spdk_rpc_verify_methods; + spdk_rpc_listen; + spdk_rpc_accept; + spdk_rpc_close; + spdk_rpc_register_method; + spdk_rpc_register_alias_deprecated; + spdk_rpc_is_method_allowed; + spdk_rpc_set_state; + spdk_rpc_get_state; + + local: *; +}; diff --git a/src/spdk/lib/rte_vhost/Makefile b/src/spdk/lib/rte_vhost/Makefile new file mode 100644 index 000000000..aa073c6ca --- /dev/null +++ b/src/spdk/lib/rte_vhost/Makefile @@ -0,0 +1,50 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += -I. +CFLAGS += $(ENV_CFLAGS) +CFLAGS += -include rte_config.h +CFLAGS += -Wno-address-of-packed-member + +# These are the DPDK vhost files copied (for now) into SPDK +C_SRCS += fd_man.c socket.c vhost_user.c vhost.c + +LIBNAME = rte_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/rte_vhost/fd_man.c b/src/spdk/lib/rte_vhost/fd_man.c new file mode 100644 index 000000000..2ceacc9ab --- /dev/null +++ b/src/spdk/lib/rte_vhost/fd_man.c @@ -0,0 +1,300 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> +#include <string.h> + +#include <rte_common.h> +#include <rte_log.h> + +#include "fd_man.h" + +#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL) + +static int +get_last_valid_idx(struct fdset *pfdset, int last_valid_idx) +{ + int i; + + for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--) + ; + + return i; +} + +static void +fdset_move(struct fdset *pfdset, int dst, int src) +{ + pfdset->fd[dst] = pfdset->fd[src]; + pfdset->rwfds[dst] = pfdset->rwfds[src]; +} + +static void +fdset_shrink_nolock(struct fdset *pfdset) +{ + int i; + int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1); + + for (i = 0; i < last_valid_idx; i++) { + if (pfdset->fd[i].fd != -1) + continue; + + fdset_move(pfdset, i, last_valid_idx); + last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1); + } + pfdset->num = last_valid_idx + 1; +} + +/* + * Find deleted fd entries and remove them + */ +static void +fdset_shrink(struct fdset *pfdset) +{ + pthread_mutex_lock(&pfdset->fd_mutex); + fdset_shrink_nolock(pfdset); + pthread_mutex_unlock(&pfdset->fd_mutex); +} + +/** + * Returns the index in the fdset for a given fd. + * @return + * index for the fd, or -1 if fd isn't in the fdset. + */ +static int +fdset_find_fd(struct fdset *pfdset, int fd) +{ + int i; + + for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++) + ; + + return i == pfdset->num ? -1 : i; +} + +static void +fdset_add_fd(struct fdset *pfdset, int idx, int fd, + fd_cb rcb, fd_cb wcb, void *dat) +{ + struct fdentry *pfdentry = &pfdset->fd[idx]; + struct pollfd *pfd = &pfdset->rwfds[idx]; + + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; + + pfd->fd = fd; + pfd->events = rcb ? POLLIN : 0; + pfd->events |= wcb ? POLLOUT : 0; + pfd->revents = 0; +} + +void +fdset_init(struct fdset *pfdset) +{ + int i; + + if (pfdset == NULL) + return; + + for (i = 0; i < MAX_FDS; i++) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].dat = NULL; + } + pfdset->num = 0; +} + +/** + * Register the fd in the fdset with read/write handler and context. + */ +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -1; + + pthread_mutex_lock(&pfdset->fd_mutex); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + fdset_shrink_nolock(pfdset); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } + } + + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); + pthread_mutex_unlock(&pfdset->fd_mutex); + + return 0; +} + +/** + * Unregister the fd from the fdset. + * Returns context of a given fd or NULL. + */ +void * +fdset_del(struct fdset *pfdset, int fd) +{ + int i; + void *dat = NULL; + + if (pfdset == NULL || fd == -1) + return NULL; + + do { + pthread_mutex_lock(&pfdset->fd_mutex); + + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy == 0) { + /* busy indicates r/wcb is executing! */ + dat = pfdset->fd[i].dat; + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + i = -1; + } + pthread_mutex_unlock(&pfdset->fd_mutex); + } while (i != -1); + + return dat; +} + + +/** + * This functions runs in infinite blocking loop until there is no fd in + * pfdset. It calls corresponding r/w handler if there is event on the fd. 
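+ * Each iteration polls with a one second timeout, so fds registered while poll() is blocked are picked up on the next pass.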
+ * + * Before the callback is called, we set the flag to busy status; If other + * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it + * will wait until the flag is reset to zero(which indicates the callback is + * finished), then it could free the context after fdset_del. + */ +void * +fdset_event_dispatch(void *arg) +{ + int i; + struct pollfd *pfd; + struct fdentry *pfdentry; + fd_cb rcb, wcb; + void *dat; + int fd, numfds; + int remove1, remove2; + int need_shrink; + struct fdset *pfdset = arg; + + if (pfdset == NULL) + return NULL; + + while (1) { + + /* + * When poll is blocked, other threads might unregister + * listenfds from and register new listenfds into fdset. + * When poll returns, the entries for listenfds in the fdset + * might have been updated. It is ok if there is unwanted call + * for new listenfds. + */ + pthread_mutex_lock(&pfdset->fd_mutex); + numfds = pfdset->num; + pthread_mutex_unlock(&pfdset->fd_mutex); + + poll(pfdset->rwfds, numfds, 1000 /* millisecs */); + + need_shrink = 0; + for (i = 0; i < numfds; i++) { + pthread_mutex_lock(&pfdset->fd_mutex); + + pfdentry = &pfdset->fd[i]; + fd = pfdentry->fd; + pfd = &pfdset->rwfds[i]; + + if (fd < 0) { + need_shrink = 1; + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + if (!pfd->revents) { + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + remove1 = remove2 = 0; + + rcb = pfdentry->rcb; + wcb = pfdentry->wcb; + dat = pfdentry->dat; + pfdentry->busy = 1; + + pthread_mutex_unlock(&pfdset->fd_mutex); + + if (rcb && pfd->revents & (POLLIN | FDPOLLERR)) + rcb(fd, dat, &remove1); + if (wcb && pfd->revents & (POLLOUT | FDPOLLERR)) + wcb(fd, dat, &remove2); + pfdentry->busy = 0; + /* + * fdset_del needs to check busy flag. + * We don't allow fdset_del to be called in callback + * directly. + */ + /* + * When we are to clean up the fd from fdset, + * because the fd is closed in the cb, + * the old fd val could be reused by when creates new + * listen fd in another thread, we couldn't call + * fd_set_del. + */ + if (remove1 || remove2) { + pfdentry->fd = -1; + need_shrink = 1; + } + } + + if (need_shrink) + fdset_shrink(pfdset); + } + + return NULL; +} diff --git a/src/spdk/lib/rte_vhost/fd_man.h b/src/spdk/lib/rte_vhost/fd_man.h new file mode 100644 index 000000000..3a9d269b3 --- /dev/null +++ b/src/spdk/lib/rte_vhost/fd_man.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _FD_MAN_H_ +#define _FD_MAN_H_ +#include <stdint.h> +#include <pthread.h> +#include <poll.h> + +#define MAX_FDS 1024 + +typedef void (*fd_cb)(int fd, void *dat, int *remove); + +struct fdentry { + int fd; /* -1 indicates this entry is empty */ + fd_cb rcb; /* callback when this fd is readable. */ + fd_cb wcb; /* callback when this fd is writeable. */ + void *dat; /* fd context */ + int busy; /* whether this entry is being used in cb. */ +}; + +struct fdset { + struct pollfd rwfds[MAX_FDS]; + struct fdentry fd[MAX_FDS]; + pthread_mutex_t fd_mutex; + int num; /* current fd number of this fdset */ +}; + + +void fdset_init(struct fdset *pfdset); + +int fdset_add(struct fdset *pfdset, int fd, + fd_cb rcb, fd_cb wcb, void *dat); + +void *fdset_del(struct fdset *pfdset, int fd); + +void *fdset_event_dispatch(void *arg); + +#endif diff --git a/src/spdk/lib/rte_vhost/rte_vhost.h b/src/spdk/lib/rte_vhost/rte_vhost.h new file mode 100644 index 000000000..b1b7f2cd8 --- /dev/null +++ b/src/spdk/lib/rte_vhost/rte_vhost.h @@ -0,0 +1,635 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _RTE_VHOST_H_ +#define _RTE_VHOST_H_ + +/** + * @file + * Interface to vhost-user + */ + +#include <stdint.h> +#include <linux/vhost.h> +#include <linux/virtio_ring.h> +#include <sys/eventfd.h> + +#include <rte_config.h> +#include <rte_memory.h> +#include <rte_mempool.h> + +#define RTE_VHOST_USER_CLIENT (1ULL << 0) +#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) +#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct rte_vhost_mem_region { + uint64_t guest_phys_addr; + uint64_t guest_user_addr; + uint64_t host_user_addr; + uint64_t size; + void *mmap_addr; + uint64_t mmap_size; + int fd; +}; + +/** + * Memory structure includes region and mapping information. + */ +struct rte_vhost_memory { + uint32_t nregions; + struct rte_vhost_mem_region regions[0]; +}; + +struct rte_vhost_inflight_desc_split { + uint8_t inflight; + uint8_t padding[5]; + uint16_t next; + uint64_t counter; +}; + +struct rte_vhost_inflight_info_split { + uint64_t features; + uint16_t version; + uint16_t desc_num; + uint16_t last_inflight_io; + uint16_t used_idx; + struct rte_vhost_inflight_desc_split desc[0]; +}; + +struct rte_vhost_resubmit_desc { + uint16_t index; + uint64_t counter; +}; + +struct rte_vhost_resubmit_info { + struct rte_vhost_resubmit_desc *resubmit_list; + uint16_t resubmit_num; +}; + +struct rte_vhost_ring_inflight { + struct rte_vhost_inflight_info_split *inflight_split; + struct rte_vhost_resubmit_info *resubmit_inflight; +}; + +struct rte_vhost_vring { + union { + struct vring_desc *desc; + struct vring_packed_desc *desc_packed; + }; + union { + struct vring_avail *avail; + struct vring_packed_desc_event *driver_event; + }; + union { + struct vring_used *used; + struct vring_packed_desc_event *device_event; + }; + uint64_t log_guest_addr; + + int callfd; + int kickfd; + uint16_t size; +}; + +/** + * Device and vring operations. + */ +struct vhost_device_ops { + int (*new_device)(int vid); /**< Add device. */ + void (*destroy_device)(int vid); /**< Remove device. */ + + int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ + + /** + * Features could be changed after the feature negotiation. + * For example, VHOST_F_LOG_ALL will be set/cleared at the + * start/end of live migration, respectively. This callback + * is used to inform the application on such change. 
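+ *
+ * A minimal handler might look like this (application-side sketch;
+ * the handler name is hypothetical):
+ *
+ *	static int
+ *	app_features_changed(int vid, uint64_t features)
+ *	{
+ *		// e.g. start or stop dirty page logging when
+ *		// VHOST_F_LOG_ALL toggles
+ *		return 0;
+ *	}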
+ */ + int (*features_changed)(int vid, uint64_t features); + int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf); + int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd); + int (*vhost_nvme_set_bar_mr)(int vid, void *bar_addr, uint64_t bar_size); + int (*vhost_nvme_get_cap)(int vid, uint64_t *cap); + + int (*new_connection)(int vid); + void (*destroy_connection)(int vid); + + int (*get_config)(int vid, uint8_t *config, uint32_t config_len); + int (*set_config)(int vid, uint8_t *config, uint32_t offset, + uint32_t len, uint32_t flags); + + void *reserved[2]; /**< Reserved for future extension */ +}; + +/** + * Convert guest physical address to host virtual address + * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @return + * the host virtual address on success, 0 on failure + */ +static inline uint64_t __attribute__((always_inline)) +rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + if (gpa >= reg->guest_phys_addr && + gpa < reg->guest_phys_addr + reg->size) { + return gpa - reg->guest_phys_addr + + reg->host_user_addr; + } + } + + return 0; +} + +/** + * Convert guest physical address to host virtual address safely + * + * This variant of rte_vhost_gpa_to_vva() takes care all the + * requested length is mapped and contiguous in process address + * space. + * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @param len + * the size of the requested area to map, + * updated with actual size mapped + * @return + * the host virtual address on success, 0 on failure */ +static inline uint64_t +rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, + uint64_t gpa, uint64_t *len) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + r = &mem->regions[i]; + if (gpa >= r->guest_phys_addr && + gpa < r->guest_phys_addr + r->size) { + + if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) + *len = r->guest_phys_addr + r->size - gpa; + + return gpa - r->guest_phys_addr + + r->host_user_addr; + } + } + *len = 0; + + return 0; +} + +#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL)) + +/** + * Log the memory write start with given address. + * + * This function only need be invoked when the live migration starts. + * Therefore, we won't need call it at all in the most of time. For + * making the performance impact be minimum, it's suggested to do a + * check before calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_write(vid, addr, len); + * + * @param vid + * vhost device ID + * @param addr + * the starting address for write + * @param len + * the length to write + */ +void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len); + +/** + * Log the used ring update start at given offset. 
+ * + * Same as rte_vhost_log_write, it's suggested to do a check before + * calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_used_vring(vid, vring_idx, offset, len); + * + * @param vid + * vhost device ID + * @param vring_idx + * the vring index + * @param offset + * the offset inside the used ring + * @param len + * the length to write + */ +void rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len); + +int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); + +/** + * Register vhost driver. path could be different for multiple + * instance support. + */ +int rte_vhost_driver_register(const char *path, uint64_t flags); + +/* Unregister vhost driver. This is only meaningful to vhost user. */ +int rte_vhost_driver_unregister(const char *path); + +/** + * Set the feature bits the vhost-user driver supports. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_set_features(const char *path, uint64_t features); + +/** + * Enable vhost-user driver features. + * + * Note that + * - the param @features should be a subset of the feature bits provided + * by rte_vhost_driver_set_features(). + * - it must be invoked before vhost-user negotiation starts. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to enable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_enable_features(const char *path, uint64_t features); + +/** + * Disable vhost-user driver features. + * + * The two notes at rte_vhost_driver_enable_features() also apply here. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to disable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_disable_features(const char *path, uint64_t features); + +/** + * Get the feature bits before feature negotiation. + * + * @param path + * The vhost-user socket file path + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_get_features(const char *path, uint64_t *features); + +/** + * Get the feature bits after negotiation + * + * @param vid + * Vhost device ID + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_negotiated_features(int vid, uint64_t *features); + +/* Register callbacks. */ +int rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops); + +/** + * + * Start the vhost-user driver. + * + * This function triggers the vhost-user negotiation. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_start(const char *path); + +/** + * Get the MTU value of the device if set in QEMU. + * + * @param vid + * virtio-net device ID + * @param mtu + * The variable to store the MTU value + * + * @return + * 0: success + * -EAGAIN: device not yet started + * -ENOTSUP: device does not support MTU feature + */ +int rte_vhost_get_mtu(int vid, uint16_t *mtu); + +/** + * Get the numa node from which the virtio net device's memory + * is allocated. + * + * @param vid + * vhost device ID + * + * @return + * The numa node, -1 on failure + */ +int rte_vhost_get_numa_node(int vid); + +/** + * Get the virtio net device's ifname, which is the vhost-user socket + * file path. 
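+ *
+ * Example (sketch only; the buffer size is arbitrary):
+ *
+ *	char ifname[256];
+ *	if (rte_vhost_get_ifname(vid, ifname, sizeof(ifname)) == 0)
+ *		printf("vhost-user socket: %s\n", ifname);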
+ * + * @param vid + * vhost device ID + * @param buf + * The buffer to stored the queried ifname + * @param len + * The length of buf + * + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_ifname(int vid, char *buf, size_t len); + +/** + * Get how many avail entries are left in the queue + * + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index + * + * @return + * num of avail entires left + */ +uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); + +struct rte_mbuf; +struct rte_mempool; +/** + * This function adds buffers to the virtio devices RX virtqueue. Buffers can + * be received from the physical port or from another virtual device. A packet + * count is returned to indicate the number of packets that were succesfully + * added to the RX queue. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param pkts + * array to contain packets to be enqueued + * @param count + * packets num to be enqueued + * @return + * num of packets enqueued + */ +uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count); + +/** + * This function gets guest buffers from the virtio device TX virtqueue, + * construct host mbufs, copies guest buffer content to host mbufs and + * store them in pkts to be processed. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param mbuf_pool + * mbuf_pool where host mbuf is allocated. + * @param pkts + * array to contain packets to be dequeued + * @param count + * packets num to be dequeued + * @return + * num of packets dequeued + */ +uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); + +/** + * Get guest mem table: a list of memory regions. + * + * An rte_vhost_vhost_memory object will be allocated internaly, to hold the + * guest memory regions. Application should free it at destroy_device() + * callback. + * + * @param vid + * vhost device ID + * @param mem + * To store the returned mem regions + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); + +/** + * Get guest vring info, including the vring address, vring size, etc. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param vring + * the structure to hold the requested vring info + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring); + +/** + * Set id of the last descriptors in avail and used guest vrings. + * + * In case user application operates directly on buffers, it should use this + * function on device destruction to retrieve the same values later on in device + * creation via rte_vhost_get_vhost_vring(int, uint16_t, struct rte_vhost_vring *) + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param last_avail_idx + * id of the last descriptor in avail ring to be set + * @param last_used_idx + * id of the last descriptor in used ring to be set + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_set_vring_base(int vid, uint16_t queue_id, + uint16_t last_avail_idx, uint16_t last_used_idx); + +int rte_vhost_get_vring_base(int vid, uint16_t queue_id, + uint16_t *last_avail_idx, uint16_t *last_used_idx); + +/** + * Notify the guest that used descriptors have been added to the vring. 
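+ *
+ * Typical call site once the backend has updated the used ring
+ * (illustrative sketch; "features", "offset" and "len" come from the
+ * surrounding backend code):
+ *
+ *	if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ *		rte_vhost_log_used_vring(vid, vring_idx, offset, len);
+ *	rte_vhost_vring_call(vid, vring_idx);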
+ * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_vring_call(int vid, uint16_t vring_idx); + +/** + * Get guest inflight vring info, including inflight ring and resubmit list. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param vring + * the structure to hold the requested inflight vring info + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx, + struct rte_vhost_ring_inflight *vring); + +/** + * Set split inflight descriptor. + * + * This function save descriptors that has been comsumed in available + * ring + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t idx); + +/** + * Save the head of list that the last batch of used descriptors. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * descriptor entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_set_last_inflight_io_split(int vid, + uint16_t vring_idx, uint16_t idx); + +/** + * Clear the split inflight status. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param last_used_idx + * last used idx of used ring + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t last_used_idx, uint16_t idx); + +/** + * Save the head of list that the last batch of used descriptors. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * descriptor entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_set_last_inflight_io_split(int vid, + uint16_t vring_idx, uint16_t idx); + +/** + * Clear the split inflight status. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param last_used_idx + * last used idx of used ring + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t last_used_idx, uint16_t idx); +#endif /* _RTE_VHOST_H_ */ diff --git a/src/spdk/lib/rte_vhost/socket.c b/src/spdk/lib/rte_vhost/socket.c new file mode 100644 index 000000000..ec923518b --- /dev/null +++ b/src/spdk/lib/rte_vhost/socket.c @@ -0,0 +1,841 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdbool.h> +#include <limits.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/queue.h> +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> + +#include <rte_log.h> + +#include "fd_man.h" +#include "vhost.h" +#include "vhost_user.h" + + +TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); + +/* + * Every time rte_vhost_driver_register() is invoked, an associated + * vhost_user_socket struct will be created. + */ +struct vhost_user_socket { + struct vhost_user_connection_list conn_list; + pthread_mutex_t conn_mutex; + char *path; + int socket_fd; + struct sockaddr_un un; + bool is_server; + bool reconnect; + bool dequeue_zero_copy; + + /* + * The "supported_features" indicates the feature bits the + * vhost driver supports. The "features" indicates the feature + * bits after the rte_vhost_driver_features_disable/enable(). + * It is also the final feature bits used for vhost-user + * features negotiation. + */ + uint64_t supported_features; + uint64_t features; + + struct vhost_device_ops const *notify_ops; +}; + +struct vhost_user_connection { + struct vhost_user_socket *vsocket; + int connfd; + int vid; + + TAILQ_ENTRY(vhost_user_connection) next; +}; + +#define MAX_VHOST_SOCKET 1024 +struct vhost_user { + struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; + struct fdset fdset; + int vsocket_cnt; + pthread_mutex_t mutex; +}; + +#define MAX_VIRTIO_BACKLOG 128 + +static void vhost_user_server_new_connection(int fd, void *data, int *remove); +static void vhost_user_read_cb(int fd, void *dat, int *remove); +static int create_unix_socket(struct vhost_user_socket *vsocket); +static int vhost_user_start_client(struct vhost_user_socket *vsocket); + +static struct vhost_user vhost_user = { + .fdset = { + .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, + .fd_mutex = PTHREAD_MUTEX_INITIALIZER, + .num = 0 + }, + .vsocket_cnt = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +/* return bytes# of read on success or negative val on failure. 
*/ +int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + if (ret) + RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed, %s\n", strerror(errno)); + else + RTE_LOG(INFO, VHOST_CONFIG, "peer closed\n"); + return ret; + } + + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(fds, CMSG_DATA(cmsg), fdsize); + break; + } + } + + return ret; +} + +int +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + if (cmsg == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n"); + errno = EINVAL; + return -1; + } + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + ret = sendmsg(sockfd, &msgh, 0); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); + return ret; + } + + return ret; +} + +static void +vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) +{ + int vid; + size_t size; + struct vhost_user_connection *conn; + int ret; + + conn = malloc(sizeof(*conn)); + if (conn == NULL) { + close(fd); + return; + } + + vid = vhost_new_device(vsocket->features, vsocket->notify_ops); + if (vid == -1) { + goto err; + } + + size = strnlen(vsocket->path, PATH_MAX); + vhost_set_ifname(vid, vsocket->path, size); + + if (vsocket->dequeue_zero_copy) + vhost_enable_dequeue_zero_copy(vid); + + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); + + if (vsocket->notify_ops->new_connection) { + ret = vsocket->notify_ops->new_connection(vid); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add vhost user connection with fd %d\n", + fd); + goto err; + } + } + + conn->connfd = fd; + conn->vsocket = vsocket; + conn->vid = vid; + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, + NULL, conn); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add fd %d into vhost server fdset\n", + fd); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + goto err; + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + return; + +err: + free(conn); + close(fd); +} + +/* call back when there is new vhost-user connection from client */ +static void +vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) +{ + 
struct vhost_user_socket *vsocket = dat; + + fd = accept(fd, NULL, NULL); + if (fd < 0) + return; + + RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); + vhost_user_add_connection(fd, vsocket); +} + +static void +vhost_user_read_cb(int connfd, void *dat, int *remove) +{ + struct vhost_user_connection *conn = dat; + struct vhost_user_socket *vsocket = conn->vsocket; + int ret; + + ret = vhost_user_msg_handler(conn->vid, connfd); + if (ret < 0) { + *remove = 1; + vhost_destroy_device(conn->vid); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + if (conn->connfd != -1) { + close(conn->connfd); + conn->connfd = -1; + } + pthread_mutex_unlock(&vsocket->conn_mutex); + + free(conn); + + if (vsocket->reconnect) { + create_unix_socket(vsocket); + vhost_user_start_client(vsocket); + } + } +} + +static int +create_unix_socket(struct vhost_user_socket *vsocket) +{ + int fd; + struct sockaddr_un *un = &vsocket->un; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", + vsocket->is_server ? "server" : "client", fd); + + if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "vhost-user: can't set nonblocking mode for socket, fd: " + "%d (%s)\n", fd, strerror(errno)); + close(fd); + return -1; + } + + memset(un, 0, sizeof(*un)); + un->sun_family = AF_UNIX; + strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path)); + un->sun_path[sizeof(un->sun_path) - 1] = '\0'; + + vsocket->socket_fd = fd; + return 0; +} + +static int +vhost_user_start_server(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + + ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to bind to %s: %s; remove it and try again\n", + path, strerror(errno)); + goto err; + } + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); + + ret = listen(fd, MAX_VIRTIO_BACKLOG); + if (ret < 0) + goto err; + + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, + NULL, vsocket); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add listen fd %d to vhost server fdset\n", + fd); + goto err; + } + + return 0; + +err: + close(fd); + return -1; +} + +struct vhost_user_reconnect { + struct sockaddr_un un; + int fd; + struct vhost_user_socket *vsocket; + + TAILQ_ENTRY(vhost_user_reconnect) next; +}; + +TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); +struct vhost_user_reconnect_list { + struct vhost_user_reconnect_tailq_list head; + pthread_mutex_t mutex; +}; + +static struct vhost_user_reconnect_list reconn_list; +static pthread_t reconn_tid; + +static int +vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) +{ + int ret, flags; + + ret = connect(fd, un, sz); + if (ret < 0 && errno != EISCONN) + return -1; + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't get flags for connfd %d\n", fd); + return -2; + } + if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't disable nonblocking on fd %d\n", fd); + return -2; + } + return 0; +} + +static void * +vhost_user_client_reconnect(void *arg __rte_unused) +{ + int ret; + struct vhost_user_reconnect *reconn, *next; + + while (1) { + 
pthread_mutex_lock(&reconn_list.mutex); + + /* + * An equal implementation of TAILQ_FOREACH_SAFE, + * which does not exist on all platforms. + */ + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + ret = vhost_user_connect_nonblock(reconn->fd, + (struct sockaddr *)&reconn->un, + sizeof(reconn->un)); + if (ret == -2) { + close(reconn->fd); + RTE_LOG(ERR, VHOST_CONFIG, + "reconnection for fd %d failed\n", + reconn->fd); + goto remove_fd; + } + if (ret == -1) + continue; + + RTE_LOG(INFO, VHOST_CONFIG, + "%s: connected\n", reconn->vsocket->path); + vhost_user_add_connection(reconn->fd, reconn->vsocket); +remove_fd: + TAILQ_REMOVE(&reconn_list.head, reconn, next); + free(reconn); + } + + pthread_mutex_unlock(&reconn_list.mutex); + sleep(1); + } + + return NULL; +} + +static int +vhost_user_reconnect_init(void) +{ + int ret; + + pthread_mutex_init(&reconn_list.mutex, NULL); + TAILQ_INIT(&reconn_list.head); + + ret = pthread_create(&reconn_tid, NULL, + vhost_user_client_reconnect, NULL); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); + + return ret; +} + +static int +vhost_user_start_client(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + struct vhost_user_reconnect *reconn; + + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un, + sizeof(vsocket->un)); + if (ret == 0) { + vhost_user_add_connection(fd, vsocket); + return 0; + } + + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to connect to %s: %s\n", + path, strerror(errno)); + + if (ret == -2 || !vsocket->reconnect) { + close(fd); + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path); + reconn = malloc(sizeof(*reconn)); + if (reconn == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for reconnect\n"); + close(fd); + return -1; + } + reconn->un = vsocket->un; + reconn->fd = fd; + reconn->vsocket = vsocket; + pthread_mutex_lock(&reconn_list.mutex); + TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); + pthread_mutex_unlock(&reconn_list.mutex); + + return 0; +} + +static struct vhost_user_socket * +find_vhost_user_socket(const char *path) +{ + int i; + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) + return vsocket; + } + + return NULL; +} + +int +rte_vhost_driver_disable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->features &= ~features; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_enable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + if ((vsocket->supported_features & features) != features) { + /* + * trying to enable features the driver doesn't + * support. + */ + pthread_mutex_unlock(&vhost_user.mutex); + return -1; + } + vsocket->features |= features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
0 : -1; +} + +int +rte_vhost_driver_set_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + vsocket->supported_features = features; + vsocket->features = features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_get_features(const char *path, uint64_t *features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + *features = vsocket->features; + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + return -1; + } else { + return 0; + } +} + +/* + * Register a new vhost-user socket; here we could act as server + * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag + * is set. + */ +int +rte_vhost_driver_register(const char *path, uint64_t flags) +{ + int ret = -1; + struct vhost_user_socket *vsocket; + + if (!path) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + + if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: the number of vhost sockets reaches maximum\n"); + goto out; + } + + vsocket = malloc(sizeof(struct vhost_user_socket)); + if (!vsocket) + goto out; + memset(vsocket, 0, sizeof(struct vhost_user_socket)); + vsocket->path = strdup(path); + if (!vsocket->path) { + free(vsocket); + goto out; + } + TAILQ_INIT(&vsocket->conn_list); + vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + + /* + * Set the supported features correctly for the builtin vhost-user + * net driver. + * + * Applications know nothing about features the builtin virtio net + * driver (virtio_net.c) supports, thus it's not possible for them + * to invoke rte_vhost_driver_set_features(). To workaround it, here + * we set it unconditionally. If the application want to implement + * another vhost-user driver (say SCSI), it should call the + * rte_vhost_driver_set_features(), which will overwrite following + * two values. 
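+	 *
+	 * For example, a non-net backend would typically do the following
+	 * right after registering the socket (application-side sketch;
+	 * "my_features" is a placeholder for the backend's feature mask):
+	 *
+	 *	rte_vhost_driver_register(path, 0);
+	 *	rte_vhost_driver_set_features(path, my_features);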
+ */ + vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; + vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { + vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); + if (vsocket->reconnect && reconn_tid == 0) { + if (vhost_user_reconnect_init() < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + } + } else { + vsocket->is_server = true; + } + ret = create_unix_socket(vsocket); + if (ret < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + + pthread_mutex_init(&vsocket->conn_mutex, NULL); + vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; + +out: + pthread_mutex_unlock(&vhost_user.mutex); + + return ret; +} + +static bool +vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) +{ + int found = false; + struct vhost_user_reconnect *reconn, *next; + + pthread_mutex_lock(&reconn_list.mutex); + + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + if (reconn->vsocket == vsocket) { + TAILQ_REMOVE(&reconn_list.head, reconn, next); + close(reconn->fd); + free(reconn); + found = true; + break; + } + } + pthread_mutex_unlock(&reconn_list.mutex); + return found; +} + +/** + * Unregister the specified vhost socket + */ +int +rte_vhost_driver_unregister(const char *path) +{ + int i; + int count; + struct vhost_user_connection *conn; + + pthread_mutex_lock(&vhost_user.mutex); + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) { + if (vsocket->is_server) { + fdset_del(&vhost_user.fdset, vsocket->socket_fd); + close(vsocket->socket_fd); + unlink(path); + } else if (vsocket->reconnect) { + vhost_user_remove_reconnect(vsocket); + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_FOREACH(conn, &vsocket->conn_list, next) { + close(conn->connfd); + conn->connfd = -1; + } + pthread_mutex_unlock(&vsocket->conn_mutex); + + do { + pthread_mutex_lock(&vsocket->conn_mutex); + conn = TAILQ_FIRST(&vsocket->conn_list); + pthread_mutex_unlock(&vsocket->conn_mutex); + } while (conn != NULL); + + free(vsocket->path); + free(vsocket); + + count = --vhost_user.vsocket_cnt; + vhost_user.vsockets[i] = vhost_user.vsockets[count]; + vhost_user.vsockets[count] = NULL; + pthread_mutex_unlock(&vhost_user.mutex); + + return 0; + } + } + pthread_mutex_unlock(&vhost_user.mutex); + + return -1; +} + +/* + * Register ops so that we can add/remove device to data core. + */ +int +rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->notify_ops = ops; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +struct vhost_device_ops const * +vhost_driver_callback_get(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
vsocket->notify_ops : NULL; +} + +int +rte_vhost_driver_start(const char *path) +{ + struct vhost_user_socket *vsocket; + static pthread_t fdset_tid; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) + return -1; + + if (fdset_tid == 0) { + rte_cpuset_t orig_cpuset; + rte_cpuset_t tmp_cpuset; + long num_cores, i; + int ret; + + CPU_ZERO(&tmp_cpuset); + num_cores = sysconf(_SC_NPROCESSORS_CONF); + /* Create a mask containing all CPUs */ + for (i = 0; i < num_cores; i++) { + CPU_SET(i, &tmp_cpuset); + } + + rte_thread_get_affinity(&orig_cpuset); + rte_thread_set_affinity(&tmp_cpuset); + ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch, + &vhost_user.fdset); + rte_thread_set_affinity(&orig_cpuset); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "failed to create fdset handling thread"); + } + + if (vsocket->is_server) + return vhost_user_start_server(vsocket); + else + return vhost_user_start_client(vsocket); +} diff --git a/src/spdk/lib/rte_vhost/vhost.c b/src/spdk/lib/rte_vhost/vhost.c new file mode 100644 index 000000000..8e875c585 --- /dev/null +++ b/src/spdk/lib/rte_vhost/vhost.c @@ -0,0 +1,565 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#ifdef RTE_LIBRTE_VHOST_NUMA +#include <numaif.h> +#endif + +#include <rte_ethdev.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_memory.h> +#include <rte_malloc.h> +#include <rte_vhost.h> + +#include "vhost.h" + +struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +struct virtio_net * +get_device(int vid) +{ + struct virtio_net *dev = vhost_devices[vid]; + + if (unlikely(!dev)) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) device not found.\n", vid); + } + + return dev; +} + +static void +cleanup_vq(struct vhost_virtqueue *vq, int destroy) +{ + if ((vq->callfd >= 0) && (destroy != 0)) + close(vq->callfd); + if (vq->kickfd >= 0) + close(vq->kickfd); +} + +/* + * Unmap any memory, close any file descriptors and + * free any memory owned by a device. + */ +void +cleanup_device(struct virtio_net *dev, int destroy) +{ + uint32_t i; + + vhost_backend_cleanup(dev); + + for (i = 0; i < dev->nr_vring; i++) + cleanup_vq(dev->virtqueue[i], destroy); +} + +/* + * Release virtqueues and device memory. + */ +static void +free_device(struct virtio_net *dev) +{ + uint32_t i; + struct vhost_virtqueue *vq; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + rte_free(vq->shadow_used_ring); + + rte_free(vq); + } + + rte_free(dev); +} + +static void +init_vring_queue(struct vhost_virtqueue *vq) +{ + memset(vq, 0, sizeof(struct vhost_virtqueue)); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + /* Backends are set to -1 indicating an inactive device. */ + vq->backend = -1; + + /* + * always set the vq to enabled; this is to keep compatibility + * with the old QEMU, whereas there is no SET_VRING_ENABLE message. + */ + vq->enabled = 1; + + TAILQ_INIT(&vq->zmbuf_list); +} + +static void +reset_vring_queue(struct vhost_virtqueue *vq) +{ + int callfd; + + callfd = vq->callfd; + init_vring_queue(vq); + vq->callfd = callfd; +} + +int +alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) +{ + struct vhost_virtqueue *vq; + + vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0); + if (vq == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for vring:%u.\n", vring_idx); + return -1; + } + + dev->virtqueue[vring_idx] = vq; + init_vring_queue(vq); + + dev->nr_vring += 1; + + return 0; +} + +/* + * Reset some variables in device structure, while keeping few + * others untouched, such as vid, ifname, nr_vring: they + * should be same unless the device is removed. + */ +void +reset_device(struct virtio_net *dev) +{ + uint32_t i; + + dev->negotiated_features = 0; + dev->protocol_features = 0; + dev->flags = 0; + + for (i = 0; i < dev->nr_vring; i++) + reset_vring_queue(dev->virtqueue[i]); +} + +/* + * Invoked when there is a new vhost-user connection established (when + * there is a new virtio device being attached). 
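+ *
+ * Returns the new device ID (vid) on success, or -1 if allocation fails
+ * or no free slot is left in vhost_devices[].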
+ */ +int +vhost_new_device(uint64_t features, struct vhost_device_ops const *ops) +{ + struct virtio_net *dev; + int i; + + dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); + if (dev == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for new dev.\n"); + return -1; + } + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + if (vhost_devices[i] == NULL) + break; + } + if (i == MAX_VHOST_DEVICE) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find a free slot for new device.\n"); + rte_free(dev); + return -1; + } + + vhost_devices[i] = dev; + dev->vid = i; + dev->features = features; + dev->notify_ops = ops; + + return i; +} + +/* + * Invoked when there is the vhost-user connection is broken (when + * the virtio device is being detached). + */ +void +vhost_destroy_device(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(vid); + } + + cleanup_device(dev, 1); + free_device(dev); + + vhost_devices[vid] = NULL; +} + +void +vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) +{ + struct virtio_net *dev; + unsigned int len; + + dev = get_device(vid); + if (dev == NULL) + return; + + len = if_len > sizeof(dev->ifname) ? + sizeof(dev->ifname) : if_len; + + strncpy(dev->ifname, if_name, len); + dev->ifname[sizeof(dev->ifname) - 1] = '\0'; +} + +void +vhost_enable_dequeue_zero_copy(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + dev->dequeue_zero_copy = 1; +} + +int +rte_vhost_get_mtu(int vid, uint16_t *mtu) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -ENODEV; + + if (!(dev->flags & VIRTIO_DEV_READY)) + return -EAGAIN; + + if (!(dev->negotiated_features & VIRTIO_NET_F_MTU)) + return -ENOTSUP; + + *mtu = dev->mtu; + + return 0; +} + +int +rte_vhost_get_numa_node(int vid) +{ +#ifdef RTE_LIBRTE_VHOST_NUMA + struct virtio_net *dev = get_device(vid); + int numa_node; + int ret; + + if (dev == NULL) + return -1; + + ret = get_mempolicy(&numa_node, NULL, 0, dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to query numa node: %d\n", vid, ret); + return -1; + } + + return numa_node; +#else + RTE_SET_USED(vid); + return -1; +#endif +} + +int +rte_vhost_get_ifname(int vid, char *buf, size_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + len = RTE_MIN(len, sizeof(dev->ifname)); + + strncpy(buf, dev->ifname, len); + buf[len - 1] = '\0'; + + return 0; +} + +int +rte_vhost_get_negotiated_features(int vid, uint64_t *features) +{ + struct virtio_net *dev; + + dev = get_device(vid); + if (!dev) + return -1; + + *features = dev->negotiated_features; + return 0; +} + +int +rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + struct virtio_net *dev; + struct rte_vhost_memory *m; + size_t size; + + dev = get_device(vid); + if (!dev) + return -1; + + size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region); + m = malloc(sizeof(struct rte_vhost_memory) + size); + if (!m) + return -1; + + m->nregions = dev->mem->nregions; + memcpy(m->regions, dev->mem->regions, size); + *mem = m; + + return 0; +} + +int +rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = 
dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vring->desc = vq->desc; + vring->avail = vq->avail; + vring->used = vq->used; + vring->log_guest_addr = vq->log_guest_addr; + + vring->callfd = vq->callfd; + vring->kickfd = vq->kickfd; + vring->size = vq->size; + + return 0; +} + +uint16_t +rte_vhost_avail_entries(int vid, uint16_t queue_id) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + vq = dev->virtqueue[queue_id]; + if (!vq->enabled) + return 0; + + return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; +} + +int +rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + if (enable) { + RTE_LOG(ERR, VHOST_CONFIG, + "guest notification isn't supported.\n"); + return -1; + } + + dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY; + return 0; +} + +void +rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + vhost_log_write(dev, addr, len); +} + +void +rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (dev == NULL) + return; + + if (vring_idx >= VHOST_MAX_VRING) + return; + vq = dev->virtqueue[vring_idx]; + if (!vq) + return; + + vhost_log_used_vring(dev, vq, offset, len); +} + +int +rte_vhost_set_vring_base(int vid, uint16_t vring_idx, + uint16_t last_avail_idx, uint16_t last_used_idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vq->last_avail_idx = last_avail_idx; + vq->last_used_idx = last_used_idx; + + return 0; +} + +int +rte_vhost_get_vring_base(int vid, uint16_t vring_idx, + uint16_t *last_avail_idx, uint16_t *last_used_idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + *last_avail_idx = vq->last_avail_idx; + *last_used_idx = vq->last_used_idx; + + return 0; +} + +int +rte_vhost_vring_call(int vid, uint16_t vring_idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if(!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + /* Ensure all our used ring changes are visible to the guest at the time + * of interrupt. + * TODO: this is currently an sfence on x86. For other architectures we + * will most likely need an smp_mb(), but smp_mb() is an overkill for x86. 
+ */ + rte_wmb(); + + if (vq->callfd != -1) { + eventfd_write(vq->callfd, (eventfd_t)1); + return 0; + } + + return -1; +} + +int +rte_vhost_set_last_inflight_io_split(int vid, uint16_t vring_idx, + uint16_t idx) +{ + return 0; +} + +int +rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t last_used_idx, uint16_t idx) +{ + return 0; +} + +int +rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t idx) +{ + return 0; +} + +int +rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx, + struct rte_vhost_ring_inflight *vring) +{ + return 0; +} diff --git a/src/spdk/lib/rte_vhost/vhost.h b/src/spdk/lib/rte_vhost/vhost.h new file mode 100644 index 000000000..d738dba7f --- /dev/null +++ b/src/spdk/lib/rte_vhost/vhost.h @@ -0,0 +1,330 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_CDEV_H_ +#define _VHOST_NET_CDEV_H_ +#include <stdint.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <unistd.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <sys/socket.h> +#include <linux/if.h> + +#include <rte_log.h> +#include <rte_ether.h> + +#include "rte_vhost.h" +#include "vhost_user.h" + +/* Used to indicate that the device is running on a data core */ +#define VIRTIO_DEV_RUNNING 1 +/* Used to indicate that the device is ready to operate */ +#define VIRTIO_DEV_READY 2 + +/* Backend value set by guest. */ +#define VIRTIO_DEV_STOPPED -1 + +#define BUF_VECTOR_MAX 256 + +/** + * Structure contains buffer address, length and descriptor index + * from vring to do scatter RX. + */ +struct buf_vector { + uint64_t buf_addr; + uint32_t buf_len; + uint32_t desc_idx; +}; + +/* + * A structure to hold some fields needed in zero copy code path, + * mainly for associating an mbuf with the right desc_idx. 
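+ *
+ * Entries live in the per-virtqueue zmbufs array and are linked on
+ * zmbuf_list (see struct vhost_virtqueue below) while the corresponding
+ * descriptor is still in flight.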
+ */ +struct zcopy_mbuf { + struct rte_mbuf *mbuf; + uint32_t desc_idx; + uint16_t in_use; + + TAILQ_ENTRY(zcopy_mbuf) next; +}; +TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf); + +/** + * Structure contains variables relevant to RX/TX virtqueues. + */ +struct vhost_virtqueue { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint32_t size; + + uint16_t last_avail_idx; + uint16_t last_used_idx; +#define VIRTIO_INVALID_EVENTFD (-1) +#define VIRTIO_UNINITIALIZED_EVENTFD (-2) + + /* Backend value to determine if device should started/stopped */ + int backend; + /* Used to notify the guest (trigger interrupt) */ + int callfd; + /* Currently unused as polling mode is enabled */ + int kickfd; + int enabled; + + /* Physical address of used ring, for logging */ + uint64_t log_guest_addr; + + uint16_t nr_zmbuf; + uint16_t zmbuf_size; + uint16_t last_zmbuf_idx; + struct zcopy_mbuf *zmbufs; + struct zcopy_mbuf_list zmbuf_list; + + struct vring_used_elem *shadow_used_ring; + uint16_t shadow_used_idx; +} __rte_cache_aligned; + +/* Old kernels have no such macros defined */ +#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE + #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 +#endif + +#ifndef VIRTIO_NET_F_MQ + #define VIRTIO_NET_F_MQ 22 +#endif + +#define VHOST_MAX_VRING 0x100 +#define VHOST_MAX_QUEUE_PAIRS 0x80 + +#ifndef VIRTIO_NET_F_MTU + #define VIRTIO_NET_F_MTU 3 +#endif + +/* + * Define virtio 1.0 for older kernels + */ +#ifndef VIRTIO_F_VERSION_1 + #define VIRTIO_F_VERSION_1 32 +#endif + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +/* Features supported by this builtin vhost-user net driver. */ +#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (1ULL << VIRTIO_NET_F_MQ) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_NET_F_MTU)) + + +struct guest_page { + uint64_t guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; +}; + +/* struct ether_addr was renamed to struct rte_ether_addr at one point */ +#ifdef RTE_ETHER_ADDR_LEN +struct ether_addr { + uint8_t addr_bytes[RTE_ETHER_ADDR_LEN]; +} __attribute__((__packed__)); +#endif + +/** + * Device structure contains all configuration information relating + * to the device. + */ +struct virtio_net { + /* Frontend (QEMU) memory and memory region information */ + struct rte_vhost_memory *mem; + uint64_t features; + uint64_t negotiated_features; + uint64_t protocol_features; + int vid; + uint32_t is_nvme; + uint32_t flags; + uint16_t vhost_hlen; + /* to tell if we need broadcast rarp packet */ + rte_atomic16_t broadcast_rarp; + uint32_t nr_vring; + int dequeue_zero_copy; + struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; +#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? 
PATH_MAX : IFNAMSIZ) + char ifname[IF_NAME_SZ]; + uint64_t log_size; + uint64_t log_base; + uint64_t log_addr; + struct ether_addr mac; + uint16_t mtu; + + struct vhost_device_ops const *notify_ops; + + uint32_t nr_guest_pages; + uint32_t max_guest_pages; + struct guest_page *guest_pages; + int has_new_mem_table; + void *bar_addr; + uint64_t bar_size; + struct VhostUserMemory mem_table; + int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS]; +} __rte_cache_aligned; + + +#define VHOST_LOG_PAGE 4096 + +static inline void __attribute__((always_inline)) +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + log_base[page / 8] |= 1 << (page % 8); +} + +static inline void __attribute__((always_inline)) +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->negotiated_features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} + +static inline void __attribute__((always_inline)) +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VHOST_MAX_PRINT_BUFF 6072 +#define VHOST_LOG_LEVEL RTE_LOG_DEBUG +#define VHOST_LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#define PRINT_PACKET(device, addr, size, header) do { \ + char *pkt_addr = (char *)(addr); \ + unsigned int index; \ + char packet[VHOST_MAX_PRINT_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ + else \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ + \ + VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \ +} while (0) +#else +#define VHOST_LOG_LEVEL RTE_LOG_INFO +#define VHOST_LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) +#define PRINT_PACKET(device, addr, size, header) do {} while (0) +#endif + +extern uint64_t VHOST_FEATURES; +#define MAX_VHOST_DEVICE 1024 +extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +/* Convert guest physical address to host physical address */ +static inline phys_addr_t __attribute__((always_inline)) +gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + if (gpa >= page->guest_phys_addr && + gpa + size < page->guest_phys_addr + page->size) { + return gpa - page->guest_phys_addr + + page->host_phys_addr; + } + } + + return 0; +} + +struct virtio_net *get_device(int vid); + +int vhost_new_device(uint64_t features, struct vhost_device_ops const *ops); +void cleanup_device(struct virtio_net *dev, int destroy); +void reset_device(struct virtio_net *dev); +void vhost_destroy_device(int); + +int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); + +void vhost_set_ifname(int, const char *if_name, unsigned int if_len); +void vhost_enable_dequeue_zero_copy(int vid); + +struct vhost_device_ops const *vhost_driver_callback_get(const char *path); + +/* + * Backend-specific cleanup. + * + * TODO: fix it; we have one backend now + */ +void vhost_backend_cleanup(struct virtio_net *dev); + +#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/src/spdk/lib/rte_vhost/vhost_user.c b/src/spdk/lib/rte_vhost/vhost_user.c new file mode 100644 index 000000000..a07483fcf --- /dev/null +++ b/src/spdk/lib/rte_vhost/vhost_user.c @@ -0,0 +1,1426 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <unistd.h> +#include <sys/mman.h> +#include <asm/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <assert.h> +#ifdef RTE_LIBRTE_VHOST_NUMA +#include <numaif.h> +#endif + +#include <rte_common.h> +#include <rte_malloc.h> +#include <rte_log.h> + +#include "vhost.h" +#include "vhost_user.h" + +#define VIRTIO_MIN_MTU 68 +#define VIRTIO_MAX_MTU 65535 + +static const char *vhost_message_str[VHOST_USER_MAX] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", + [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG", + [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG", + [VHOST_USER_NVME_ADMIN] = "VHOST_USER_NVME_ADMIN", + [VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL", + [VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP", + [VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP", + [VHOST_USER_NVME_SET_BAR_MR] = "VHOST_USER_NVME_SET_BAR_MR" +}; + +static uint64_t +get_blk_size(int fd) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; +} + +static void +free_mem_region(struct virtio_net *dev) +{ + uint32_t i; + struct rte_vhost_mem_region *reg; + + if (!dev || !dev->mem) + return; + + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + if (reg->host_user_addr) { + munmap(reg->mmap_addr, reg->mmap_size); + close(reg->fd); + } + } +} + +void +vhost_backend_cleanup(struct virtio_net *dev) +{ + uint32_t i; + + if (dev->has_new_mem_table) { + for (i = 0; i < dev->mem_table.nregions; i++) { + close(dev->mem_table_fds[i]); + } + dev->has_new_mem_table = 0; + } + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + free(dev->guest_pages); + dev->guest_pages = NULL; + + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + dev->log_addr = 0; + } + if (dev->bar_addr) { + munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size); + dev->bar_addr = NULL; + dev->bar_size = 0; + } +} + +/* + * This function just returns success at the moment unless + * the device hasn't been initialised. 
+ */ +static int +vhost_user_set_owner(void) +{ + return 0; +} + +static int +vhost_user_reset_owner(struct virtio_net *dev) +{ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + cleanup_device(dev, 0); + reset_device(dev); + return 0; +} + +/* + * The features that we support are requested. + */ +static uint64_t +vhost_user_get_features(struct virtio_net *dev) +{ + return dev->features; +} + +/* + * We receive the negotiated features supported by us and the virtio device. + */ +static int +vhost_user_set_features(struct virtio_net *dev, uint64_t features) +{ + uint64_t vhost_features = 0; + + vhost_features = vhost_user_get_features(dev); + if (features & ~vhost_features) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) received invalid negotiated features.\n", + dev->vid); + return -1; + } + + if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->negotiated_features != features) { + if (dev->notify_ops->features_changed) { + dev->notify_ops->features_changed(dev->vid, features); + } else { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + } + + dev->negotiated_features = features; + if (dev->negotiated_features & + ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { + dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + dev->vhost_hlen = sizeof(struct virtio_net_hdr); + } + VHOST_LOG_DEBUG(VHOST_CONFIG, + "(%d) mergeable RX buffers %s, virtio 1 %s\n", + dev->vid, + (dev->negotiated_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", + (dev->negotiated_features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); + + return 0; +} + +/* + * The virtio device sends us the size of the descriptor ring. + */ +static int +vhost_user_set_vring_num(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + vq->size = msg->payload.state.num; + + if (dev->dequeue_zero_copy) { + vq->nr_zmbuf = 0; + vq->last_zmbuf_idx = 0; + vq->zmbuf_size = vq->size; + vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0); + if (vq->zmbufs == NULL) { + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to allocate mem for zero copy; " + "zero copy is force disabled\n"); + dev->dequeue_zero_copy = 0; + } + } + + vq->shadow_used_ring = rte_malloc(NULL, + vq->size * sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_ring) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + return 0; +} + +/* + * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the + * same numa node as the memory of vring descriptor. 
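+ *
+ * Illustrative sketch of the check done below (assuming the build enables
+ * RTE_LIBRTE_VHOST_NUMA):
+ *
+ *   get_mempolicy(&newnode, NULL, 0, vq->desc, MPOL_F_NODE | MPOL_F_ADDR);
+ *   get_mempolicy(&oldnode, NULL, 0, vq,       MPOL_F_NODE | MPOL_F_ADDR);
+ *   if (oldnode != newnode)
+ *           vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);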
+ */ +#ifdef RTE_LIBRTE_VHOST_NUMA +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index) +{ + int oldnode, newnode; + struct virtio_net *old_dev; + struct vhost_virtqueue *old_vq, *vq; + int ret; + + old_dev = dev; + vq = old_vq = dev->virtqueue[index]; + + ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, + MPOL_F_NODE | MPOL_F_ADDR); + + /* check if we need to reallocate vq */ + ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get vq numa information.\n"); + return dev; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate vq from %d to %d node\n", oldnode, newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); + if (!vq) + return dev; + + memcpy(vq, old_vq, sizeof(*vq)); + rte_free(old_vq); + } + + /* check if we need to reallocate dev */ + ret = get_mempolicy(&oldnode, NULL, 0, old_dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get dev numa information.\n"); + goto out; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate dev from %d to %d node\n", + oldnode, newnode); + dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); + if (!dev) { + dev = old_dev; + goto out; + } + + memcpy(dev, old_dev, sizeof(*dev)); + rte_free(old_dev); + } + +out: + dev->virtqueue[index] = vq; + vhost_devices[dev->vid] = dev; + + return dev; +} +#else +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index __rte_unused) +{ + return dev; +} +#endif + +/* + * Converts QEMU virtual address to Vhost virtual address. This function is + * used to convert the ring addresses to our address space. + */ +static uint64_t +qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + /* Find the region where the address lives. */ + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + + if (qva >= reg->guest_user_addr && + qva < reg->guest_user_addr + reg->size) { + + if (unlikely(*len > reg->guest_user_addr + reg->size - qva)) + *len = reg->guest_user_addr + reg->size - qva; + + return qva - reg->guest_user_addr + + reg->host_user_addr; + } + } + + return 0; +} + +static int vhost_setup_mem_table(struct virtio_net *dev); + +/* + * The virtio device sends us the desc, used and avail ring addresses. + * This function then converts these to our address space. + */ +static int +vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq; + uint64_t len; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + if (dev->has_new_mem_table) { + vhost_setup_mem_table(dev); + dev->has_new_mem_table = 0; + } + + if (dev->mem == NULL) + return -1; + + /* addr->index refers to the queue index. The txq 1, rxq is 0. */ + vq = dev->virtqueue[msg->payload.addr.index]; + + /* The addresses are converted from QEMU virtual to Vhost virtual. 
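+ * In other words, for a ring address qva that falls inside region reg the
+ * usable pointer is roughly
+ *
+ *   vva = reg->host_user_addr + (qva - reg->guest_user_addr);
+ *
+ * which is what qva_to_vva() above computes, clamping *len to the bytes
+ * left in the region.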
*/ + len = sizeof(struct vring_desc) * vq->size; + vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.desc_user_addr, &len); + if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to map desc ring.\n", + dev->vid); + return -1; + } + + dev = numa_realloc(dev, msg->payload.addr.index); + vq = dev->virtqueue[msg->payload.addr.index]; + + len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; + vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.avail_user_addr, &len); + if (vq->avail == 0 || + len != sizeof(struct vring_avail) + + sizeof(uint16_t) * vq->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find avail ring address.\n", + dev->vid); + return -1; + } + + len = sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size; + vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.used_user_addr, &len); + if (vq->used == 0 || len != sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size) { + + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find used ring address.\n", + dev->vid); + return -1; + } + + if (vq->last_used_idx != vq->used->idx) { + RTE_LOG(WARNING, VHOST_CONFIG, + "last_used_idx (%u) and vq->used->idx (%u) mismatches; " + "some packets maybe resent for Tx and dropped for Rx\n", + vq->last_used_idx, vq->used->idx); + vq->last_used_idx = vq->used->idx; + vq->last_avail_idx = vq->used->idx; + } + + vq->log_guest_addr = msg->payload.addr.log_guest_addr; + + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + dev->vid, vq->desc); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + dev->vid, vq->avail); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + dev->vid, vq->used); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + dev->vid, vq->log_guest_addr); + + return 0; +} + +/* + * The virtio device sends us the available ring last used index. + */ +static int +vhost_user_set_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + /* Remove from the data plane. 
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num; + dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num; + + return 0; +} + +static void +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, + uint64_t host_phys_addr, uint64_t size) +{ + struct guest_page *page, *last_page; + + if (dev->nr_guest_pages == dev->max_guest_pages) { + dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2); + dev->guest_pages = realloc(dev->guest_pages, + dev->max_guest_pages * sizeof(*page)); + } + + if (dev->nr_guest_pages > 0) { + last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; + /* merge if the two pages are continuous */ + if (host_phys_addr == last_page->host_phys_addr + + last_page->size) { + last_page->size += size; + return; + } + } + + page = &dev->guest_pages[dev->nr_guest_pages++]; + page->guest_phys_addr = guest_phys_addr; + page->host_phys_addr = host_phys_addr; + page->size = size; +} + +static void +add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, + uint64_t page_size) +{ + uint64_t reg_size = reg->size; + uint64_t host_user_addr = reg->host_user_addr; + uint64_t guest_phys_addr = reg->guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; + + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); + size = page_size - (guest_phys_addr & (page_size - 1)); + size = RTE_MIN(size, reg_size); + + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + + while (reg_size > 0) { + size = RTE_MIN(reg_size, page_size); + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) + host_user_addr); + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + } +} + +#ifdef RTE_LIBRTE_VHOST_DEBUG +/* TODO: enable it only in debug mode? */ +static void +dump_guest_pages(struct virtio_net *dev) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + RTE_LOG(INFO, VHOST_CONFIG, + "guest physical page region %u\n" + "\t guest_phys_addr: %" PRIx64 "\n" + "\t host_phys_addr : %" PRIx64 "\n" + "\t size : %" PRIx64 "\n", + i, + page->guest_phys_addr, + page->host_phys_addr, + page->size); + } +} +#else +#define dump_guest_pages(dev) +#endif + +static int +vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + uint32_t i; + + if (dev->has_new_mem_table) { + /* + * The previous mem table was not consumed, so close the + * file descriptors from that mem table before copying + * the new one. + */ + for (i = 0; i < dev->mem_table.nregions; i++) { + close(dev->mem_table_fds[i]); + } + } + + memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table)); + memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds)); + dev->has_new_mem_table = 1; + /* vhost-user-nvme will not send + * set vring addr message, enable + * memory address table now. 
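+ *
+ * For the non-NVMe path the table is only latched here and applied lazily
+ * on the next vring address message; sketch of that flow:
+ *
+ *   VHOST_USER_SET_MEM_TABLE  -> dev->has_new_mem_table = 1
+ *   VHOST_USER_SET_VRING_ADDR -> vhost_setup_mem_table(dev);
+ *                                dev->has_new_mem_table = 0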
+ */ + if (dev->has_new_mem_table && dev->is_nvme) { + vhost_setup_mem_table(dev); + dev->has_new_mem_table = 0; + } + + return 0; +} + + static int +vhost_setup_mem_table(struct virtio_net *dev) +{ + struct VhostUserMemory memory = dev->mem_table; + struct rte_vhost_mem_region *reg; + struct vhost_virtqueue *vq; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + uint32_t i; + int fd; + + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + /* Those addresses won't be valid anymore in host address space + * after setting new mem table. Initiator need to resend these + * addresses. + */ + vq->desc = NULL; + vq->avail = NULL; + vq->used = NULL; + } + + dev->nr_guest_pages = 0; + if (!dev->guest_pages) { + dev->max_guest_pages = 8; + dev->guest_pages = malloc(dev->max_guest_pages * + sizeof(struct guest_page)); + } + + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + + sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); + if (dev->mem == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); + return -1; + } + dev->mem->nregions = memory.nregions; + + for (i = 0; i < memory.nregions; i++) { + fd = dev->mem_table_fds[i]; + reg = &dev->mem->regions[i]; + + reg->guest_phys_addr = memory.regions[i].guest_phys_addr; + reg->guest_user_addr = memory.regions[i].userspace_addr; + reg->size = memory.regions[i].memory_size; + reg->fd = fd; + + mmap_offset = memory.regions[i].mmap_offset; + mmap_size = reg->size + mmap_offset; + + /* mmap() without flag of MAP_ANONYMOUS, should be called + * with length argument aligned with hugepagesz at older + * longterm version Linux, like 2.6.32 and 3.2.72, or + * mmap() will fail with EINVAL. + * + * to avoid failure, make sure in caller to keep length + * aligned. 
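+ *
+ * Example (sizes depend on the hugepage mount backing the fd): with 2 MB
+ * hugepages get_blk_size() reports 0x200000, so
+ *
+ *   mmap_size = RTE_ALIGN_CEIL(reg->size + mmap_offset, 0x200000);
+ *
+ * rounds the mapping length up to the next hugepage boundary before mmap().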
+ */ + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + goto err_mmap; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (mmap_addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap region %u failed.\n", i); + goto err_mmap; + } + + if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "MADV_DONTDUMP advice setting failed.\n"); + } + + reg->mmap_addr = mmap_addr; + reg->mmap_size = mmap_size; + reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + if (dev->dequeue_zero_copy) + add_guest_pages(dev, reg, alignment); + + RTE_LOG(INFO, VHOST_CONFIG, + "guest memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + i, reg->size, + reg->guest_phys_addr, + reg->guest_user_addr, + reg->host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + } + + dump_guest_pages(dev); + + return 0; + +err_mmap: + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + return -1; +} + +static int +vq_is_ready(struct vhost_virtqueue *vq) +{ + return vq && vq->desc && + vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->kickfd != VIRTIO_INVALID_EVENTFD && + vq->callfd != VIRTIO_INVALID_EVENTFD; +} + +static int +virtio_is_ready(struct virtio_net *dev) +{ + struct vhost_virtqueue *vq; + uint32_t i; + + if (dev->nr_vring == 0) + return 0; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + if (vq_is_ready(vq)) { + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is now ready for processing.\n"); + return 1; + } + } + + return 0; +} + +static void +vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring call idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = file.fd; +} + +static void +vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + /* Remove from the data plane. 
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring kick idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->kickfd >= 0) + close(vq->kickfd); + vq->kickfd = file.fd; +} + +static void +free_zmbufs(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + rte_pktmbuf_free(zmbuf->mbuf); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + } + + rte_free(vq->zmbufs); +} + +/* + * when virtio is stopped, qemu will send us the GET_VRING_BASE message. + */ +static int +vhost_user_get_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + /* We have to stop the queue (virtio) if it is running. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->flags &= ~VIRTIO_DEV_READY; + + /* Here we are safe to get the last used index */ + msg->payload.state.num = vq->last_used_idx; + + RTE_LOG(INFO, VHOST_CONFIG, + "vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num); + /* + * Based on current qemu vhost-user implementation, this message is + * sent and only sent in vhost_vring_stop. + * TODO: cleanup the vring, it isn't usable since here. + */ + if (vq->kickfd >= 0) + close(vq->kickfd); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (dev->dequeue_zero_copy) + free_zmbufs(vq); + rte_free(vq->shadow_used_ring); + vq->shadow_used_ring = NULL; + + return 0; +} + +/* + * when virtio queues are ready to work, qemu will send us to + * enable the virtio queue pair. + */ +static int +vhost_user_set_vring_enable(struct virtio_net *dev, + VhostUserMsg *msg) +{ + int enable = (int)msg->payload.state.num; + + RTE_LOG(INFO, VHOST_CONFIG, + "set queue enable: %d to qp idx: %d\n", + enable, msg->payload.state.index); + + if (dev->notify_ops->vring_state_changed) + dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable); + + dev->virtqueue[msg->payload.state.index]->enabled = enable; + + return 0; +} + +static void +vhost_user_set_protocol_features(struct virtio_net *dev, + uint64_t protocol_features) +{ + if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) + return; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->protocol_features = protocol_features; +} + +static int +vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + uint64_t size, off; + void *addr; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); + return -1; + } + + if (msg->size != sizeof(VhostUserLog)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid log base msg size: %"PRId32" != %d\n", + msg->size, (int)sizeof(VhostUserLog)); + return -1; + } + + /* Remove from the data plane. 
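+ *
+ * The mapping set up here backs the dirty-page bitmap used for live
+ * migration: once VHOST_F_LOG_ALL is negotiated, vhost_log_write() marks
+ * one bit per VHOST_LOG_PAGE (4 KiB) of guest memory, roughly
+ *
+ *   log_base[(gpa / 4096) / 8] |= 1 << ((gpa / 4096) % 8);
+ *
+ * (illustrative form of vhost_log_page() in vhost.h).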
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + RTE_LOG(INFO, VHOST_CONFIG, + "log mmap size: %"PRId64", offset: %"PRId64"\n", + size, off); + + /* + * mmap from 0 to workaround a hugepage mmap bug: mmap will + * fail when offset is not page size aligned. + */ + addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); + return -1; + } + + /* + * Free previously mapped log memory on occasionally + * multiple VHOST_USER_SET_LOG_BASE. + */ + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + } + dev->log_addr = (uint64_t)(uintptr_t)addr; + dev->log_base = dev->log_addr + off; + dev->log_size = size; + + return 0; +} + +/* + * An rarp packet is constructed and broadcasted to notify switches about + * the new location of the migrated VM, so that packets from outside will + * not be lost after migration. + * + * However, we don't actually "send" a rarp packet here, instead, we set + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. + */ +static int +vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + uint8_t *mac = (uint8_t *)&msg->payload.u64; + + RTE_LOG(DEBUG, VHOST_CONFIG, + ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + memcpy(dev->mac.addr_bytes, mac, 6); + + /* + * Set the flag to inject a RARP broadcast packet at + * rte_vhost_dequeue_burst(). + * + * rte_smp_wmb() is for making sure the mac is copied + * before the flag is set. + */ + rte_smp_wmb(); + rte_atomic16_set(&dev->broadcast_rarp, 1); + + return 0; +} + +static int +vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + if (msg->payload.u64 < VIRTIO_MIN_MTU || + msg->payload.u64 > VIRTIO_MAX_MTU) { + RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", + msg->payload.u64); + + return -1; + } + + dev->mtu = msg->payload.u64; + + return 0; +} + +/* return bytes# of read on success or negative val on failure. 
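+ *
+ * Each vhost-user message is read in two steps: a fixed header of
+ * VHOST_USER_HDR_SIZE bytes (request, flags, size), which may carry
+ * ancillary file descriptors, followed by msg->size bytes of payload.
+ * Rough view of the wire format (see struct VhostUserMsg in vhost_user.h):
+ *
+ *   u32 request | u32 flags | u32 size | payload[size]  (+ SCM_RIGHTS fds)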
*/ +static int +read_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS); + if (ret <= 0) + return ret; + + if (msg && msg->size) { + if (msg->size > sizeof(msg->payload)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid msg size: %d\n", msg->size); + return -1; + } + ret = read(sockfd, &msg->payload, msg->size); + if (ret <= 0) + return ret; + if (ret != (int)msg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "read control message failed\n"); + return -1; + } + } + + return ret; +} + +static int +send_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + if (!msg) + return 0; + + msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags &= ~VHOST_USER_NEED_REPLY; + msg->flags |= VHOST_USER_VERSION; + msg->flags |= VHOST_USER_REPLY_MASK; + + ret = send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); + + return ret; +} + +/* + * Allocate a queue pair if it hasn't been allocated yet + */ +static int +vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) +{ + uint16_t vring_idx; + + switch (msg->request) { + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + break; + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + vring_idx = msg->payload.state.index; + break; + case VHOST_USER_SET_VRING_ADDR: + vring_idx = msg->payload.addr.index; + break; + default: + return 0; + } + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid vring index: %u\n", vring_idx); + return -1; + } + + if (dev->virtqueue[vring_idx]) + return 0; + + return alloc_vring_queue(dev, vring_idx); +} + +static int +vhost_user_nvme_admin_passthrough(struct virtio_net *dev, + void *cmd, void *cqe, void *buf) +{ + if (dev->notify_ops->vhost_nvme_admin_passthrough) { + return dev->notify_ops->vhost_nvme_admin_passthrough(dev->vid, cmd, cqe, buf); + } + + return -1; +} + +static int +vhost_user_nvme_set_cq_call(struct virtio_net *dev, uint16_t qid, int fd) +{ + if (dev->notify_ops->vhost_nvme_set_cq_call) { + return dev->notify_ops->vhost_nvme_set_cq_call(dev->vid, qid, fd); + } + + return -1; +} + +static int +vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap) +{ + if (dev->notify_ops->vhost_nvme_get_cap) { + return dev->notify_ops->vhost_nvme_get_cap(dev->vid, cap); + } + + return -1; +} + +static int +vhost_user_nvme_set_bar_mr(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct VhostUserMemory mem_table; + int fd = pmsg->fds[0]; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + struct rte_vhost_mem_region reg; + int ret = 0; + + memcpy(&mem_table, &pmsg->payload.memory, sizeof(mem_table)); + + reg.guest_phys_addr = mem_table.regions[0].guest_phys_addr; + reg.guest_user_addr = mem_table.regions[0].userspace_addr; + reg.size = mem_table.regions[0].memory_size; + reg.fd = fd; + mmap_offset = mem_table.regions[0].mmap_offset; + mmap_size = reg.size + mmap_offset; + + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + return -1; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (mmap_addr == MAP_FAILED) { + 
RTE_LOG(ERR, VHOST_CONFIG, + "mmap region failed.\n"); + return -1; + } + + if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "MADV_DONTDUMP advice setting failed.\n"); + } + + reg.mmap_addr = mmap_addr; + reg.mmap_size = mmap_size; + reg.host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + RTE_LOG(INFO, VHOST_CONFIG, + "BAR memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + 0, reg.size, + reg.guest_phys_addr, + reg.guest_user_addr, + reg.host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + + if (dev->bar_addr) { + munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size); + } + dev->bar_addr = (void *)(uintptr_t)reg.host_user_addr; + dev->bar_size = reg.mmap_size; + + if (dev->notify_ops->vhost_nvme_set_bar_mr) { + ret = dev->notify_ops->vhost_nvme_set_bar_mr(dev->vid, dev->bar_addr, dev->bar_size); + if (ret) { + munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size); + dev->bar_addr = NULL; + dev->bar_size = 0; + } + } + + return ret; +} + +int +vhost_user_msg_handler(int vid, int fd) +{ + struct virtio_net *dev; + struct VhostUserMsg msg; + struct vhost_vring_file file; + int ret; + uint64_t cap; + uint64_t enable; + uint8_t cqe[16]; + uint8_t cmd[64]; + uint8_t buf[4096]; + + dev = get_device(vid); + if (dev == NULL) + return -1; + + ret = read_vhost_message(fd, &msg); + if (ret <= 0 || msg.request >= VHOST_USER_MAX) { + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read message failed\n"); + else if (ret == 0) + RTE_LOG(INFO, VHOST_CONFIG, + "vhost peer closed\n"); + else + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read incorrect message\n"); + + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: read message %s\n", + dev->ifname, vhost_message_str[msg.request]); + + ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to alloc queue\n"); + return -1; + } + + switch (msg.request) { + case VHOST_USER_GET_CONFIG: + if (dev->notify_ops->get_config(dev->vid, + msg.payload.config.region, + msg.payload.config.size) != 0) { + msg.size = sizeof(uint64_t); + } + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_CONFIG: + if ((dev->notify_ops->set_config(dev->vid, + msg.payload.config.region, + msg.payload.config.offset, + msg.payload.config.size, + msg.payload.config.flags)) != 0) { + ret = 1; + } else { + ret = 0; + } + break; + case VHOST_USER_NVME_ADMIN: + if (!dev->is_nvme) { + dev->is_nvme = 1; + } + memcpy(cmd, msg.payload.nvme.cmd.req, sizeof(cmd)); + ret = vhost_user_nvme_admin_passthrough(dev, cmd, cqe, buf); + memcpy(msg.payload.nvme.cmd.cqe, cqe, sizeof(cqe)); + msg.size = sizeof(cqe); + /* NVMe Identify Command */ + if (cmd[0] == 0x06) { + memcpy(msg.payload.nvme.buf, &buf, 4096); + msg.size += 4096; + } + send_vhost_message(fd, &msg); + break; + case VHOST_USER_NVME_SET_CQ_CALL: + file.index = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; + file.fd = msg.fds[0]; + ret = vhost_user_nvme_set_cq_call(dev, file.index, file.fd); + break; + case VHOST_USER_NVME_GET_CAP: + ret = vhost_user_nvme_get_cap(dev, &cap); + if (!ret) + msg.payload.u64 = cap; + else + msg.payload.u64 = 0; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case 
VHOST_USER_NVME_START_STOP: + enable = msg.payload.u64; + /* device must be started before set cq call */ + if (enable) { + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } else { + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + } + break; + case VHOST_USER_NVME_SET_BAR_MR: + ret = vhost_user_nvme_set_bar_mr(dev, &msg); + break; + case VHOST_USER_GET_FEATURES: + msg.payload.u64 = vhost_user_get_features(dev); + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_FEATURES: + vhost_user_set_features(dev, msg.payload.u64); + break; + + case VHOST_USER_GET_PROTOCOL_FEATURES: + msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + vhost_user_set_protocol_features(dev, msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + vhost_user_set_owner(); + break; + case VHOST_USER_RESET_OWNER: + vhost_user_reset_owner(dev); + break; + + case VHOST_USER_SET_MEM_TABLE: + ret = vhost_user_set_mem_table(dev, &msg); + break; + + case VHOST_USER_SET_LOG_BASE: + vhost_user_set_log_base(dev, &msg); + + /* it needs a reply */ + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_LOG_FD: + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + break; + + case VHOST_USER_SET_VRING_NUM: + vhost_user_set_vring_num(dev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + vhost_user_set_vring_addr(dev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + vhost_user_set_vring_base(dev, &msg); + break; + + case VHOST_USER_GET_VRING_BASE: + vhost_user_get_vring_base(dev, &msg); + msg.size = sizeof(msg.payload.state); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_KICK: + vhost_user_set_vring_kick(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + vhost_user_set_vring_call(dev, &msg); + break; + + case VHOST_USER_SET_VRING_ERR: + if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + break; + + case VHOST_USER_GET_QUEUE_NUM: + msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_ENABLE: + vhost_user_set_vring_enable(dev, &msg); + break; + case VHOST_USER_SEND_RARP: + vhost_user_send_rarp(dev, &msg); + break; + + case VHOST_USER_NET_SET_MTU: + ret = vhost_user_net_set_mtu(dev, &msg); + break; + + default: + ret = -1; + break; + + } + + if (msg.flags & VHOST_USER_NEED_REPLY) { + msg.payload.u64 = !!ret; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + } + + if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { + dev->flags |= VIRTIO_DEV_READY; + + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->dequeue_zero_copy) { + RTE_LOG(INFO, VHOST_CONFIG, + "dequeue zero copy is enabled\n"); + } + + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } + + return 0; +} diff --git a/src/spdk/lib/rte_vhost/vhost_user.h b/src/spdk/lib/rte_vhost/vhost_user.h new file mode 100644 index 000000000..d20574b64 --- /dev/null +++ b/src/spdk/lib/rte_vhost/vhost_user.h @@ -0,0 +1,171 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include <stdint.h> +#include <linux/vhost.h> + +#include "rte_vhost.h" + +/* refer to hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +/* + * Maximum size of virtio device config space + */ +#define VHOST_USER_MAX_CONFIG_SIZE 256 + +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#define VHOST_USER_PROTOCOL_F_RARP 2 +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#define VHOST_USER_PROTOCOL_F_NET_MTU 4 +#define VHOST_USER_PROTOCOL_F_CONFIG 9 + +#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ + (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \ + (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ + (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_NVME_ADMIN = 80, + VHOST_USER_NVME_SET_CQ_CALL = 81, + VHOST_USER_NVME_GET_CAP = 82, + VHOST_USER_NVME_START_STOP = 83, + VHOST_USER_NVME_IO_CMD = 84, + VHOST_USER_NVME_SET_BAR_MR = 85, + VHOST_USER_MAX +} VhostUserRequest; + +typedef enum VhostUserSlaveRequest { + VHOST_USER_SLAVE_NONE = 0, + VHOST_USER_SLAVE_IOTLB_MSG = 1, + 
VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, + VHOST_USER_SLAVE_MAX +} VhostUserSlaveRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +typedef struct VhostUserConfig { + uint32_t offset; + uint32_t size; + uint32_t flags; + uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; +} VhostUserConfig; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + VhostUserConfig config; + struct nvme { + union { + uint8_t req[64]; + uint8_t cqe[16]; + } cmd; + uint8_t buf[4096]; + } nvme; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)) VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + + +/* vhost_user.c */ +int vhost_user_msg_handler(int vid, int fd); + +/* socket.c */ +int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); +int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); + +#endif diff --git a/src/spdk/lib/scsi/Makefile b/src/spdk/lib/scsi/Makefile new file mode 100644 index 000000000..8f8a8c326 --- /dev/null +++ b/src/spdk/lib/scsi/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = dev.c lun.c port.c scsi.c scsi_bdev.c scsi_pr.c scsi_rpc.c task.c +LIBNAME = scsi + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_scsi.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/scsi/dev.c b/src/spdk/lib/scsi/dev.c new file mode 100644 index 000000000..6d3cfdf31 --- /dev/null +++ b/src/spdk/lib/scsi/dev.c @@ -0,0 +1,436 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +static struct spdk_scsi_dev g_devs[SPDK_SCSI_MAX_DEVS]; + +struct spdk_scsi_dev * +scsi_dev_get_list(void) +{ + return g_devs; +} + +static struct spdk_scsi_dev * +allocate_dev(void) +{ + struct spdk_scsi_dev *dev; + int i; + + for (i = 0; i < SPDK_SCSI_MAX_DEVS; i++) { + dev = &g_devs[i]; + if (!dev->is_allocated) { + memset(dev, 0, sizeof(*dev)); + dev->id = i; + dev->is_allocated = 1; + return dev; + } + } + + return NULL; +} + +static void +free_dev(struct spdk_scsi_dev *dev) +{ + assert(dev->is_allocated == 1); + assert(dev->removed == true); + + dev->is_allocated = 0; + + if (dev->remove_cb) { + dev->remove_cb(dev->remove_ctx, 0); + dev->remove_cb = NULL; + } +} + +void +spdk_scsi_dev_destruct(struct spdk_scsi_dev *dev, + spdk_scsi_dev_destruct_cb_t cb_fn, void *cb_arg) +{ + int lun_cnt; + int i; + + if (dev == NULL) { + if (cb_fn) { + cb_fn(cb_arg, -EINVAL); + } + return; + } + + if (dev->removed) { + if (cb_fn) { + cb_fn(cb_arg, -EINVAL); + } + return; + } + + dev->removed = true; + dev->remove_cb = cb_fn; + dev->remove_ctx = cb_arg; + lun_cnt = 0; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + + /* + * LUN will remove itself from this dev when all outstanding IO + * is done. When no more LUNs, dev will be deleted. 
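+ *
+ * A caller tearing the device down might therefore look roughly like this
+ * (the callback name below is a placeholder):
+ *
+ *   static void destruct_done(void *cb_arg, int rc) { ... }
+ *   ...
+ *   spdk_scsi_dev_destruct(dev, destruct_done, ctx);
+ *   // destruct_done() runs from free_dev() once the last LUN is gone.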
+ */ + scsi_lun_destruct(dev->lun[i]); + lun_cnt++; + } + + if (lun_cnt == 0) { + free_dev(dev); + return; + } +} + +static int +scsi_dev_find_lowest_free_lun_id(struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + return i; + } + } + + return -1; +} + +int +spdk_scsi_dev_add_lun(struct spdk_scsi_dev *dev, const char *bdev_name, int lun_id, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx) +{ + struct spdk_bdev *bdev; + struct spdk_scsi_lun *lun; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("device %s: cannot find bdev '%s' (target %d)\n", + dev->name, bdev_name, lun_id); + return -1; + } + + /* Search the lowest free LUN ID if LUN ID is default */ + if (lun_id == -1) { + lun_id = scsi_dev_find_lowest_free_lun_id(dev); + if (lun_id == -1) { + SPDK_ERRLOG("Free LUN ID is not found\n"); + return -1; + } + } + + lun = scsi_lun_construct(bdev, hotremove_cb, hotremove_ctx); + if (lun == NULL) { + return -1; + } + + lun->id = lun_id; + lun->dev = dev; + dev->lun[lun_id] = lun; + return 0; +} + +void +spdk_scsi_dev_delete_lun(struct spdk_scsi_dev *dev, + struct spdk_scsi_lun *lun) +{ + int lun_cnt = 0; + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == lun) { + dev->lun[i] = NULL; + } + + if (dev->lun[i]) { + lun_cnt++; + } + } + + if (dev->removed == true && lun_cnt == 0) { + free_dev(dev); + } +} + +struct spdk_scsi_dev *spdk_scsi_dev_construct(const char *name, const char *bdev_name_list[], + int *lun_id_list, int num_luns, uint8_t protocol_id, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx) +{ + struct spdk_scsi_dev *dev; + size_t name_len; + bool found_lun_0; + int i, rc; + + name_len = strlen(name); + if (name_len > sizeof(dev->name) - 1) { + SPDK_ERRLOG("device %s: name longer than maximum allowed length %zu\n", + name, sizeof(dev->name) - 1); + return NULL; + } + + if (num_luns == 0) { + SPDK_ERRLOG("device %s: no LUNs specified\n", name); + return NULL; + } + + found_lun_0 = false; + for (i = 0; i < num_luns; i++) { + if (lun_id_list[i] == 0) { + found_lun_0 = true; + break; + } + } + + if (!found_lun_0) { + SPDK_ERRLOG("device %s: no LUN 0 specified\n", name); + return NULL; + } + + for (i = 0; i < num_luns; i++) { + if (bdev_name_list[i] == NULL) { + SPDK_ERRLOG("NULL spdk_scsi_lun for LUN %d\n", + lun_id_list[i]); + return NULL; + } + } + + dev = allocate_dev(); + if (dev == NULL) { + return NULL; + } + + memcpy(dev->name, name, name_len + 1); + + dev->num_ports = 0; + dev->protocol_id = protocol_id; + + for (i = 0; i < num_luns; i++) { + rc = spdk_scsi_dev_add_lun(dev, bdev_name_list[i], lun_id_list[i], + hotremove_cb, hotremove_ctx); + if (rc < 0) { + spdk_scsi_dev_destruct(dev, NULL, NULL); + return NULL; + } + } + + return dev; +} + +void +spdk_scsi_dev_queue_mgmt_task(struct spdk_scsi_dev *dev, + struct spdk_scsi_task *task) +{ + assert(task != NULL); + + scsi_lun_execute_mgmt_task(task->lun, task); +} + +void +spdk_scsi_dev_queue_task(struct spdk_scsi_dev *dev, + struct spdk_scsi_task *task) +{ + assert(task != NULL); + + scsi_lun_execute_task(task->lun, task); +} + +static struct spdk_scsi_port * +scsi_dev_find_free_port(struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) { + if (!dev->port[i].is_used) { + return &dev->port[i]; + } + } + + return NULL; +} + +int +spdk_scsi_dev_add_port(struct spdk_scsi_dev *dev, uint64_t id, const char 
*name) +{ + struct spdk_scsi_port *port; + int rc; + + if (dev->num_ports == SPDK_SCSI_DEV_MAX_PORTS) { + SPDK_ERRLOG("device already has %d ports\n", SPDK_SCSI_DEV_MAX_PORTS); + return -1; + } + + port = spdk_scsi_dev_find_port_by_id(dev, id); + if (port != NULL) { + SPDK_ERRLOG("device already has port(%" PRIu64 ")\n", id); + return -1; + } + + port = scsi_dev_find_free_port(dev); + if (port == NULL) { + assert(false); + return -1; + } + + rc = scsi_port_construct(port, id, dev->num_ports, name); + if (rc != 0) { + return rc; + } + + dev->num_ports++; + return 0; +} + +int +spdk_scsi_dev_delete_port(struct spdk_scsi_dev *dev, uint64_t id) +{ + struct spdk_scsi_port *port; + + port = spdk_scsi_dev_find_port_by_id(dev, id); + if (port == NULL) { + SPDK_ERRLOG("device does not have specified port(%" PRIu64 ")\n", id); + return -1; + } + + scsi_port_destruct(port); + + dev->num_ports--; + + return 0; +} + +struct spdk_scsi_port * +spdk_scsi_dev_find_port_by_id(struct spdk_scsi_dev *dev, uint64_t id) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) { + if (!dev->port[i].is_used) { + continue; + } + if (dev->port[i].id == id) { + return &dev->port[i]; + } + } + + /* No matching port found. */ + return NULL; +} + +void +spdk_scsi_dev_free_io_channels(struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + scsi_lun_free_io_channel(dev->lun[i]); + } +} + +int +spdk_scsi_dev_allocate_io_channels(struct spdk_scsi_dev *dev) +{ + int i, rc; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + rc = scsi_lun_allocate_io_channel(dev->lun[i]); + if (rc < 0) { + spdk_scsi_dev_free_io_channels(dev); + return -1; + } + } + + return 0; +} + +const char * +spdk_scsi_dev_get_name(const struct spdk_scsi_dev *dev) +{ + return dev->name; +} + +int +spdk_scsi_dev_get_id(const struct spdk_scsi_dev *dev) +{ + return dev->id; +} + +struct spdk_scsi_lun * +spdk_scsi_dev_get_lun(struct spdk_scsi_dev *dev, int lun_id) +{ + struct spdk_scsi_lun *lun; + + if (lun_id < 0 || lun_id >= SPDK_SCSI_DEV_MAX_LUN) { + return NULL; + } + + lun = dev->lun[lun_id]; + + if (lun != NULL && !spdk_scsi_lun_is_removing(lun)) { + return lun; + } else { + return NULL; + } +} + +bool +spdk_scsi_dev_has_pending_tasks(const struct spdk_scsi_dev *dev, + const struct spdk_scsi_port *initiator_port) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; ++i) { + if (dev->lun[i] && + (scsi_lun_has_pending_tasks(dev->lun[i], initiator_port) || + scsi_lun_has_pending_mgmt_tasks(dev->lun[i], initiator_port))) { + return true; + } + } + + return false; +} diff --git a/src/spdk/lib/scsi/lun.c b/src/spdk/lib/scsi/lun.c new file mode 100644 index 000000000..262137d80 --- /dev/null +++ b/src/spdk/lib/scsi/lun.c @@ -0,0 +1,623 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/util.h" +#include "spdk/likely.h" + +static void scsi_lun_execute_tasks(struct spdk_scsi_lun *lun); +static void _scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun); + +void +scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + if (lun) { + TAILQ_REMOVE(&lun->tasks, task, scsi_link); + spdk_trace_record(TRACE_SCSI_TASK_DONE, lun->dev->id, 0, (uintptr_t)task, 0); + } + task->cpl_fn(task); +} + +static void +scsi_lun_complete_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + TAILQ_REMOVE(&lun->mgmt_tasks, task, scsi_link); + + task->cpl_fn(task); + + /* Try to execute the first pending mgmt task if it exists. */ + _scsi_lun_execute_mgmt_task(lun); +} + +static bool +_scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun) +{ + return !TAILQ_EMPTY(&lun->pending_mgmt_tasks); +} + +static bool +scsi_lun_has_outstanding_mgmt_tasks(const struct spdk_scsi_lun *lun) +{ + return !TAILQ_EMPTY(&lun->mgmt_tasks); +} + +static bool +_scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun) +{ + return !TAILQ_EMPTY(&lun->pending_tasks); +} + +static bool +scsi_lun_has_outstanding_tasks(const struct spdk_scsi_lun *lun) +{ + return !TAILQ_EMPTY(&lun->tasks); +} + +/* Reset task have to wait until all prior outstanding tasks complete. 
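+ * scsi_lun_complete_reset_task() registers the poller below when a reset + * completes while IO is still outstanding; the poller finishes the reset + * management task once lun->tasks has drained.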
*/ +static int +scsi_lun_reset_check_outstanding_tasks(void *arg) +{ + struct spdk_scsi_task *task = (struct spdk_scsi_task *)arg; + struct spdk_scsi_lun *lun = task->lun; + + if (scsi_lun_has_outstanding_tasks(lun)) { + return SPDK_POLLER_BUSY; + } + spdk_poller_unregister(&lun->reset_poller); + + scsi_lun_complete_mgmt_task(lun, task); + return SPDK_POLLER_BUSY; +} + +void +scsi_lun_complete_reset_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + if (task->status == SPDK_SCSI_STATUS_GOOD) { + if (scsi_lun_has_outstanding_tasks(lun)) { + lun->reset_poller = + SPDK_POLLER_REGISTER(scsi_lun_reset_check_outstanding_tasks, + task, 10); + return; + } + } + + scsi_lun_complete_mgmt_task(lun, task); +} + +static void +scsi_lun_append_mgmt_task(struct spdk_scsi_lun *lun, + struct spdk_scsi_task *task) +{ + TAILQ_INSERT_TAIL(&lun->pending_mgmt_tasks, task, scsi_link); +} + +static void +_scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun) +{ + struct spdk_scsi_task *task; + + if (!TAILQ_EMPTY(&lun->mgmt_tasks)) { + return; + } + + task = TAILQ_FIRST(&lun->pending_mgmt_tasks); + if (spdk_likely(task == NULL)) { + /* Try to execute all pending tasks */ + scsi_lun_execute_tasks(lun); + return; + } + TAILQ_REMOVE(&lun->pending_mgmt_tasks, task, scsi_link); + + TAILQ_INSERT_TAIL(&lun->mgmt_tasks, task, scsi_link); + + if (lun->removed) { + task->response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN; + scsi_lun_complete_mgmt_task(lun, task); + return; + } + + switch (task->function) { + case SPDK_SCSI_TASK_FUNC_ABORT_TASK: + task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_ERRLOG("ABORT_TASK failed\n"); + break; + + case SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET: + task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_ERRLOG("ABORT_TASK_SET failed\n"); + break; + + case SPDK_SCSI_TASK_FUNC_LUN_RESET: + bdev_scsi_reset(task); + return; + + default: + SPDK_ERRLOG("Unknown Task Management Function!\n"); + /* + * Task management functions other than those above should never + * reach this point having been filtered by the frontend. Reject + * the task as being unsupported. 
+ */ + task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + } + + scsi_lun_complete_mgmt_task(lun, task); +} + +void +scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun, + struct spdk_scsi_task *task) +{ + scsi_lun_append_mgmt_task(lun, task); + _scsi_lun_execute_mgmt_task(lun); +} + +static void +_scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + int rc; + + task->status = SPDK_SCSI_STATUS_GOOD; + spdk_trace_record(TRACE_SCSI_TASK_START, lun->dev->id, task->length, (uintptr_t)task, 0); + TAILQ_INSERT_TAIL(&lun->tasks, task, scsi_link); + if (!lun->removed) { + /* Check the command is allowed or not when reservation is exist */ + if (spdk_unlikely(lun->reservation.flags & SCSI_SPC2_RESERVE)) { + rc = scsi2_reserve_check(task); + } else { + rc = scsi_pr_check(task); + } + if (spdk_unlikely(rc < 0)) { + /* Reservation Conflict */ + rc = SPDK_SCSI_TASK_COMPLETE; + } else { + rc = bdev_scsi_execute(task); + } + } else { + spdk_scsi_task_process_abort(task); + rc = SPDK_SCSI_TASK_COMPLETE; + } + + switch (rc) { + case SPDK_SCSI_TASK_PENDING: + break; + + case SPDK_SCSI_TASK_COMPLETE: + scsi_lun_complete_task(lun, task); + break; + + default: + abort(); + } +} + +static void +scsi_lun_append_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + TAILQ_INSERT_TAIL(&lun->pending_tasks, task, scsi_link); +} + +static void +scsi_lun_execute_tasks(struct spdk_scsi_lun *lun) +{ + struct spdk_scsi_task *task, *task_tmp; + + TAILQ_FOREACH_SAFE(task, &lun->pending_tasks, scsi_link, task_tmp) { + TAILQ_REMOVE(&lun->pending_tasks, task, scsi_link); + _scsi_lun_execute_task(lun, task); + } +} + +void +scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + if (spdk_unlikely(_scsi_lun_has_pending_mgmt_tasks(lun))) { + /* Add the IO task to pending list and wait for completion of + * existing mgmt tasks. + */ + scsi_lun_append_task(lun, task); + } else if (spdk_unlikely(_scsi_lun_has_pending_tasks(lun))) { + /* If there is any pending IO task, append the IO task to the + * tail of the pending list, and then execute all pending IO tasks + * from the head to submit IO tasks in order. + */ + scsi_lun_append_task(lun, task); + scsi_lun_execute_tasks(lun); + } else { + /* Execute the IO task directly. 
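+ * Nothing is pending for this LUN, so submitting the task immediately + * still preserves submission order.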
*/ + _scsi_lun_execute_task(lun, task); + } +} + +static void +_scsi_lun_remove(void *arg) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; + + spdk_bdev_close(lun->bdev_desc); + spdk_scsi_dev_delete_lun(lun->dev, lun); + free(lun); +} + +static void +scsi_lun_remove(struct spdk_scsi_lun *lun) +{ + struct spdk_scsi_pr_registrant *reg, *tmp; + struct spdk_thread *thread; + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + TAILQ_REMOVE(&lun->reg_head, reg, link); + free(reg); + } + + thread = spdk_get_thread(); + if (thread != lun->thread) { + spdk_thread_send_msg(lun->thread, _scsi_lun_remove, lun); + } else { + _scsi_lun_remove(lun); + } +} + +static int +scsi_lun_check_io_channel(void *arg) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; + + if (lun->io_channel) { + return SPDK_POLLER_BUSY; + } + spdk_poller_unregister(&lun->hotremove_poller); + + scsi_lun_remove(lun); + return SPDK_POLLER_BUSY; +} + +static void +scsi_lun_notify_hot_remove(struct spdk_scsi_lun *lun) +{ + struct spdk_scsi_lun_desc *desc, *tmp; + + if (lun->hotremove_cb) { + lun->hotremove_cb(lun, lun->hotremove_ctx); + } + + TAILQ_FOREACH_SAFE(desc, &lun->open_descs, link, tmp) { + if (desc->hotremove_cb) { + desc->hotremove_cb(lun, desc->hotremove_ctx); + } else { + spdk_scsi_lun_close(desc); + } + } + + if (lun->io_channel) { + lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_io_channel, + lun, 10); + } else { + scsi_lun_remove(lun); + } +} + +static int +scsi_lun_check_outstanding_tasks(void *arg) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; + + if (scsi_lun_has_outstanding_tasks(lun) || + scsi_lun_has_outstanding_mgmt_tasks(lun)) { + return SPDK_POLLER_BUSY; + } + spdk_poller_unregister(&lun->hotremove_poller); + + scsi_lun_notify_hot_remove(lun); + return SPDK_POLLER_BUSY; +} + +static void +_scsi_lun_hot_remove(void *arg1) +{ + struct spdk_scsi_lun *lun = arg1; + + /* If lun->removed is set, no new task can be submitted to the LUN. + * Execute previously queued tasks, which will be immediately aborted. + */ + scsi_lun_execute_tasks(lun); + + /* Then we only need to wait for all outstanding tasks to be completed + * before notifying the upper layer about the removal. + */ + if (scsi_lun_has_outstanding_tasks(lun) || + scsi_lun_has_outstanding_mgmt_tasks(lun)) { + lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_outstanding_tasks, + lun, 10); + } else { + scsi_lun_notify_hot_remove(lun); + } +} + +static void +scsi_lun_hot_remove(void *remove_ctx) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)remove_ctx; + struct spdk_thread *thread; + + if (lun->removed) { + return; + } + + lun->removed = true; + if (lun->io_channel == NULL) { + _scsi_lun_hot_remove(lun); + return; + } + + thread = spdk_io_channel_get_thread(lun->io_channel); + if (thread != spdk_get_thread()) { + spdk_thread_send_msg(thread, _scsi_lun_hot_remove, lun); + } else { + _scsi_lun_hot_remove(lun); + } +} + +/** + * \brief Constructs a new spdk_scsi_lun object based on the provided parameters. 
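+ * The LUN opens the bdev with write access and registers + * scsi_lun_hot_remove() as the bdev's hot-remove callback.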
+ * + * \param bdev bdev associated with this LUN + * + * \return NULL if bdev == NULL + * \return pointer to the new spdk_scsi_lun object otherwise + */ +struct spdk_scsi_lun *scsi_lun_construct(struct spdk_bdev *bdev, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx) +{ + struct spdk_scsi_lun *lun; + int rc; + + if (bdev == NULL) { + SPDK_ERRLOG("bdev must be non-NULL\n"); + return NULL; + } + + lun = calloc(1, sizeof(*lun)); + if (lun == NULL) { + SPDK_ERRLOG("could not allocate lun\n"); + return NULL; + } + + rc = spdk_bdev_open(bdev, true, scsi_lun_hot_remove, lun, &lun->bdev_desc); + + if (rc != 0) { + SPDK_ERRLOG("bdev %s cannot be opened, error=%d\n", spdk_bdev_get_name(bdev), rc); + free(lun); + return NULL; + } + + lun->thread = spdk_get_thread(); + + TAILQ_INIT(&lun->tasks); + TAILQ_INIT(&lun->pending_tasks); + TAILQ_INIT(&lun->mgmt_tasks); + TAILQ_INIT(&lun->pending_mgmt_tasks); + + lun->bdev = bdev; + lun->io_channel = NULL; + lun->hotremove_cb = hotremove_cb; + lun->hotremove_ctx = hotremove_ctx; + TAILQ_INIT(&lun->open_descs); + TAILQ_INIT(&lun->reg_head); + + return lun; +} + +void +scsi_lun_destruct(struct spdk_scsi_lun *lun) +{ + scsi_lun_hot_remove(lun); +} + +int +spdk_scsi_lun_open(struct spdk_scsi_lun *lun, spdk_scsi_lun_remove_cb_t hotremove_cb, + void *hotremove_ctx, struct spdk_scsi_lun_desc **_desc) +{ + struct spdk_scsi_lun_desc *desc; + + desc = calloc(1, sizeof(*desc)); + if (desc == NULL) { + SPDK_ERRLOG("calloc() failed for LUN descriptor.\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&lun->open_descs, desc, link); + + desc->lun = lun; + desc->hotremove_cb = hotremove_cb; + desc->hotremove_ctx = hotremove_ctx; + *_desc = desc; + + return 0; +} + +void +spdk_scsi_lun_close(struct spdk_scsi_lun_desc *desc) +{ + struct spdk_scsi_lun *lun = desc->lun; + + TAILQ_REMOVE(&lun->open_descs, desc, link); + free(desc); + + assert(!TAILQ_EMPTY(&lun->open_descs) || lun->io_channel == NULL); +} + +int +scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun) +{ + if (lun->io_channel != NULL) { + if (spdk_get_thread() == spdk_io_channel_get_thread(lun->io_channel)) { + lun->ref++; + return 0; + } + SPDK_ERRLOG("io_channel already allocated for lun %s\n", + spdk_bdev_get_name(lun->bdev)); + return -1; + } + + lun->io_channel = spdk_bdev_get_io_channel(lun->bdev_desc); + if (lun->io_channel == NULL) { + return -1; + } + lun->ref = 1; + return 0; +} + +void +scsi_lun_free_io_channel(struct spdk_scsi_lun *lun) +{ + if (lun->io_channel == NULL) { + return; + } + + if (spdk_get_thread() != spdk_io_channel_get_thread(lun->io_channel)) { + SPDK_ERRLOG("io_channel was freed by different thread\n"); + return; + } + + lun->ref--; + if (lun->ref == 0) { + spdk_put_io_channel(lun->io_channel); + lun->io_channel = NULL; + } +} + +int +spdk_scsi_lun_allocate_io_channel(struct spdk_scsi_lun_desc *desc) +{ + struct spdk_scsi_lun *lun = desc->lun; + + return scsi_lun_allocate_io_channel(lun); +} + +void +spdk_scsi_lun_free_io_channel(struct spdk_scsi_lun_desc *desc) +{ + struct spdk_scsi_lun *lun = desc->lun; + + scsi_lun_free_io_channel(lun); +} + +int +spdk_scsi_lun_get_id(const struct spdk_scsi_lun *lun) +{ + return lun->id; +} + +const char * +spdk_scsi_lun_get_bdev_name(const struct spdk_scsi_lun *lun) +{ + return spdk_bdev_get_name(lun->bdev); +} + +const struct spdk_scsi_dev * +spdk_scsi_lun_get_dev(const struct spdk_scsi_lun *lun) +{ + return lun->dev; +} + +bool +scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun, + const struct 
spdk_scsi_port *initiator_port) +{ + struct spdk_scsi_task *task; + + if (initiator_port == NULL) { + return _scsi_lun_has_pending_mgmt_tasks(lun) || + scsi_lun_has_outstanding_mgmt_tasks(lun); + } + + TAILQ_FOREACH(task, &lun->pending_mgmt_tasks, scsi_link) { + if (task->initiator_port == initiator_port) { + return true; + } + } + + TAILQ_FOREACH(task, &lun->mgmt_tasks, scsi_link) { + if (task->initiator_port == initiator_port) { + return true; + } + } + + return false; +} +/* This check includes both pending and submitted (outstanding) tasks. */ +bool +scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun, + const struct spdk_scsi_port *initiator_port) +{ + struct spdk_scsi_task *task; + + if (initiator_port == NULL) { + return _scsi_lun_has_pending_tasks(lun) || + scsi_lun_has_outstanding_tasks(lun); + } + + TAILQ_FOREACH(task, &lun->pending_tasks, scsi_link) { + if (task->initiator_port == initiator_port) { + return true; + } + } + + TAILQ_FOREACH(task, &lun->tasks, scsi_link) { + if (task->initiator_port == initiator_port) { + return true; + } + } + + return false; +} + +bool +spdk_scsi_lun_is_removing(const struct spdk_scsi_lun *lun) +{ + return lun->removed; +} + +bool +spdk_scsi_lun_get_dif_ctx(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task, + struct spdk_dif_ctx *dif_ctx) +{ + return bdev_scsi_get_dif_ctx(lun->bdev, task, dif_ctx); +} diff --git a/src/spdk/lib/scsi/port.c b/src/spdk/lib/scsi/port.c new file mode 100644 index 000000000..09311bac2 --- /dev/null +++ b/src/spdk/lib/scsi/port.c @@ -0,0 +1,134 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "scsi_internal.h" + +#include "spdk/endian.h" + +struct spdk_scsi_port * +spdk_scsi_port_create(uint64_t id, uint16_t index, const char *name) +{ + struct spdk_scsi_port *port; + + port = calloc(1, sizeof(struct spdk_scsi_port)); + + if (!port) { + return NULL; + } + + if (scsi_port_construct(port, id, index, name) != 0) { + spdk_scsi_port_free(&port); + return NULL; + } + + return port; +} + +void +spdk_scsi_port_free(struct spdk_scsi_port **pport) +{ + struct spdk_scsi_port *port; + + if (!pport) { + return; + } + + port = *pport; + *pport = NULL; + free(port); +} + +int +scsi_port_construct(struct spdk_scsi_port *port, uint64_t id, uint16_t index, + const char *name) +{ + if (strlen(name) >= sizeof(port->name)) { + SPDK_ERRLOG("port name too long\n"); + return -1; + } + + port->is_used = 1; + port->id = id; + port->index = index; + snprintf(port->name, sizeof(port->name), "%s", name); + return 0; +} + +void +scsi_port_destruct(struct spdk_scsi_port *port) +{ + memset(port, 0, sizeof(struct spdk_scsi_port)); +} + +const char * +spdk_scsi_port_get_name(const struct spdk_scsi_port *port) +{ + return port->name; +} + +/* + * spc3r23 7.5.4.6 iSCSI initiator port TransportID, + * using code format 0x01. + */ +void +spdk_scsi_port_set_iscsi_transport_id(struct spdk_scsi_port *port, char *iscsi_name, + uint64_t isid) +{ + struct spdk_scsi_iscsi_transport_id *data; + uint32_t len; + char *name; + + memset(port->transport_id, 0, sizeof(port->transport_id)); + port->transport_id_len = 0; + + data = (struct spdk_scsi_iscsi_transport_id *)port->transport_id; + + data->protocol_id = (uint8_t)SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI; + data->format = 0x1; + + name = data->name; + len = snprintf(name, SPDK_SCSI_MAX_TRANSPORT_ID_LENGTH - sizeof(*data), + "%s,i,0x%12.12" PRIx64, iscsi_name, isid); + do { + name[len++] = '\0'; + } while (len & 3); + + if (len < 20) { + SPDK_ERRLOG("The length of Transport ID should >= 20 bytes\n"); + return; + } + + to_be16(&data->additional_len, len); + port->transport_id_len = len + sizeof(*data); +} diff --git a/src/spdk/lib/scsi/scsi.c b/src/spdk/lib/scsi/scsi.c new file mode 100644 index 000000000..c18192e37 --- /dev/null +++ b/src/spdk/lib/scsi/scsi.c @@ -0,0 +1,110 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +struct spdk_scsi_globals g_scsi; + +int +spdk_scsi_init(void) +{ + int rc; + + rc = pthread_mutex_init(&g_scsi.mutex, NULL); + if (rc != 0) { + SPDK_ERRLOG("mutex_init() failed\n"); + return -1; + } + + return 0; +} + +void +spdk_scsi_fini(void) +{ + pthread_mutex_destroy(&g_scsi.mutex); +} + +SPDK_TRACE_REGISTER_FN(scsi_trace, "scsi", TRACE_GROUP_SCSI) +{ + spdk_trace_register_owner(OWNER_SCSI_DEV, 'd'); + spdk_trace_register_object(OBJECT_SCSI_TASK, 't'); + spdk_trace_register_description("SCSI_TASK_DONE", TRACE_SCSI_TASK_DONE, + OWNER_SCSI_DEV, OBJECT_SCSI_TASK, 0, 0, ""); + spdk_trace_register_description("SCSI_TASK_START", TRACE_SCSI_TASK_START, + OWNER_SCSI_DEV, OBJECT_SCSI_TASK, 0, 0, ""); +} + +uint64_t +spdk_scsi_lun_id_int_to_fmt(int lun_id) +{ + uint64_t fmt_lun, method; + + if (SPDK_SCSI_DEV_MAX_LUN <= 0x0100) { + /* below 256 */ + method = 0x00U; + fmt_lun = (method & 0x03U) << 62; + fmt_lun |= ((uint64_t)lun_id & 0x00ffU) << 48; + } else if (SPDK_SCSI_DEV_MAX_LUN <= 0x4000) { + /* below 16384 */ + method = 0x01U; + fmt_lun = (method & 0x03U) << 62; + fmt_lun |= ((uint64_t)lun_id & 0x3fffU) << 48; + } else { + /* XXX */ + fmt_lun = 0; + } + + return fmt_lun; +} + +int +spdk_scsi_lun_id_fmt_to_int(uint64_t fmt_lun) +{ + uint64_t method; + int lun_i; + + method = (fmt_lun >> 62) & 0x03U; + fmt_lun = fmt_lun >> 48; + if (method == 0x00U) { + lun_i = (int)(fmt_lun & 0x00ffU); + } else if (method == 0x01U) { + lun_i = (int)(fmt_lun & 0x3fffU); + } else { + lun_i = 0xffffU; + } + return lun_i; +} + +SPDK_LOG_REGISTER_COMPONENT("scsi", SPDK_LOG_SCSI) diff --git a/src/spdk/lib/scsi/scsi_bdev.c b/src/spdk/lib/scsi/scsi_bdev.c new file mode 100644 index 000000000..bf0fb5af7 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_bdev.c @@ -0,0 +1,2067 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +/* + * TODO: move bdev SCSI error code translation tests to bdev unit test + * and remove this include. + */ +#include "spdk/bdev_module.h" + +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#define SPDK_WORK_BLOCK_SIZE (4ULL * 1024ULL * 1024ULL) +#define SPDK_WORK_ATS_BLOCK_SIZE (1ULL * 1024ULL * 1024ULL) +#define MAX_SERIAL_STRING 32 + +#define DEFAULT_DISK_VENDOR "INTEL" +#define DEFAULT_DISK_REVISION "0001" +#define DEFAULT_DISK_ROTATION_RATE 1 /* Non-rotating medium */ +#define DEFAULT_DISK_FORM_FACTOR 0x02 /* 3.5 inch */ +#define DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT 256 + +#define INQUIRY_OFFSET(field) offsetof(struct spdk_scsi_cdb_inquiry_data, field) + \ + sizeof(((struct spdk_scsi_cdb_inquiry_data *)0x0)->field) + +static void bdev_scsi_process_block_resubmit(void *arg); + +static int +hex2bin(char ch) +{ + if ((ch >= '0') && (ch <= '9')) { + return ch - '0'; + } + ch = tolower(ch); + if ((ch >= 'a') && (ch <= 'f')) { + return ch - 'a' + 10; + } + return (int)ch; +} + +static void +bdev_scsi_set_naa_ieee_extended(const char *name, uint8_t *buf) +{ + int i, value, count = 0; + uint64_t local_value; + + for (i = 0; (i < 16) && (name[i] != '\0'); i++) { + value = hex2bin(name[i]); + if (i % 2) { + buf[count++] |= value << 4; + } else { + buf[count] = value; + } + } + + local_value = *(uint64_t *)buf; + /* + * see spc3r23 7.6.3.6.2, + * NAA IEEE Extended identifer format + */ + local_value &= 0x0fff000000ffffffull; + /* NAA 02, and 00 03 47 for IEEE Intel */ + local_value |= 0x2000000347000000ull; + + to_be64((void *)buf, local_value); +} + +static int +bdev_scsi_report_luns(struct spdk_scsi_lun *lun, + int sel, uint8_t *data, int alloc_len) +{ + struct spdk_scsi_dev *dev; + uint64_t fmt_lun; + int hlen, len = 0; + int i; + + if (alloc_len < 8) { + return -1; + } + + if (sel == 0x00) { + /* logical unit with addressing method */ + } else if (sel == 0x01) { + /* well known logical unit */ + } else if (sel == 0x02) { + /* logical unit */ + } else { + return -1; + } + + /* LUN LIST LENGTH */ + memset(data, 0, 4); + + /* Reserved */ + memset(&data[4], 0, 4); + hlen = 8; + + dev = lun->dev; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + + if (alloc_len - (hlen + len) < 8) { + return -1; + } + + fmt_lun = spdk_scsi_lun_id_int_to_fmt(i); + + /* LUN */ + to_be64(&data[hlen + len], fmt_lun); + len += 8; + } + + /* LUN LIST LENGTH */ + to_be32(data, len); + + return hlen + len; +} + +static int +bdev_scsi_pad_scsi_name(char *dst, const char *name) +{ + size_t len; + + len = strlen(name); + memcpy(dst, name, len); + do { + dst[len++] = '\0'; + } while (len & 3); + + return len; +} + +static int +bdev_scsi_inquiry(struct spdk_bdev *bdev, struct spdk_scsi_task *task, + uint8_t *cdb, uint8_t *data, uint16_t alloc_len) +{ + struct spdk_scsi_lun *lun; + struct spdk_scsi_dev 
*dev; + struct spdk_scsi_port *port; + uint32_t blocks, optimal_blocks; + int hlen = 0, plen, plen2; + uint16_t len = 0; + int pc; + int pd; + int evpd; + int i; + struct spdk_scsi_cdb_inquiry *inq = (struct spdk_scsi_cdb_inquiry *)cdb; + + /* standard inquiry command at lease with 36 Bytes */ + if (alloc_len < 0x24) { + goto inq_error; + } + + lun = task->lun; + dev = lun->dev; + port = task->target_port; + + pd = SPDK_SPC_PERIPHERAL_DEVICE_TYPE_DISK; + pc = inq->page_code; + evpd = inq->evpd & 0x1; + + if (!evpd && pc) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + if (evpd) { + struct spdk_scsi_vpd_page *vpage = (struct spdk_scsi_vpd_page *)data; + + /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */ + vpage->peripheral_device_type = pd; + vpage->peripheral_qualifier = SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED; + /* PAGE CODE */ + vpage->page_code = pc; + + /* Vital product data */ + switch (pc) { + case SPDK_SPC_VPD_SUPPORTED_VPD_PAGES: + hlen = 4; + + vpage->params[0] = SPDK_SPC_VPD_SUPPORTED_VPD_PAGES; + vpage->params[1] = SPDK_SPC_VPD_UNIT_SERIAL_NUMBER; + vpage->params[2] = SPDK_SPC_VPD_DEVICE_IDENTIFICATION; + vpage->params[3] = SPDK_SPC_VPD_MANAGEMENT_NETWORK_ADDRESSES; + vpage->params[4] = SPDK_SPC_VPD_EXTENDED_INQUIRY_DATA; + vpage->params[5] = SPDK_SPC_VPD_MODE_PAGE_POLICY; + vpage->params[6] = SPDK_SPC_VPD_SCSI_PORTS; + vpage->params[7] = SPDK_SPC_VPD_BLOCK_LIMITS; + vpage->params[8] = SPDK_SPC_VPD_BLOCK_DEV_CHARS; + len = 9; + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + vpage->params[9] = SPDK_SPC_VPD_BLOCK_THIN_PROVISION; + len++; + } + + /* PAGE LENGTH */ + to_be16(vpage->alloc_len, len); + break; + + case SPDK_SPC_VPD_UNIT_SERIAL_NUMBER: { + const char *name = spdk_bdev_get_name(bdev); + + hlen = 4; + + /* PRODUCT SERIAL NUMBER */ + len = strlen(name) + 1; + if (len > MAX_SERIAL_STRING) { + len = MAX_SERIAL_STRING; + } + + memcpy(vpage->params, name, len - 1); + vpage->params[len - 1] = 0; + + /* PAGE LENGTH */ + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_DEVICE_IDENTIFICATION: { + const char *name = spdk_bdev_get_name(bdev); + const char *product_name = spdk_bdev_get_product_name(bdev); + uint8_t protocol_id = dev->protocol_id; + uint8_t *buf = vpage->params; + struct spdk_scsi_desig_desc *desig; + + hlen = 4; + + /* Check total length by calculated how much space all entries take */ + len = sizeof(struct spdk_scsi_desig_desc) + 8; + len += sizeof(struct spdk_scsi_desig_desc) + 8 + 16 + MAX_SERIAL_STRING; + len += sizeof(struct spdk_scsi_desig_desc) + SPDK_SCSI_DEV_MAX_NAME + 1; + len += sizeof(struct spdk_scsi_desig_desc) + SPDK_SCSI_PORT_MAX_NAME_LENGTH; + len += sizeof(struct spdk_scsi_desig_desc) + 4; + len += sizeof(struct spdk_scsi_desig_desc) + 4; + len += sizeof(struct spdk_scsi_desig_desc) + 4; + if (sizeof(struct spdk_scsi_vpd_page) + len > alloc_len) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + /* Now fill out the designator array */ + + /* NAA designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_NAA; + desig->association = 
SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 8; + bdev_scsi_set_naa_ieee_extended(name, desig->desig); + len = sizeof(struct spdk_scsi_desig_desc) + 8; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* T10 Vendor ID designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_ASCII; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_T10_VENDOR_ID; + desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 8 + 16 + MAX_SERIAL_STRING; + spdk_strcpy_pad(desig->desig, DEFAULT_DISK_VENDOR, 8, ' '); + spdk_strcpy_pad(&desig->desig[8], product_name, 16, ' '); + spdk_strcpy_pad(&desig->desig[24], name, MAX_SERIAL_STRING, ' '); + len += sizeof(struct spdk_scsi_desig_desc) + 8 + 16 + MAX_SERIAL_STRING; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* SCSI Device Name designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_UTF8; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_DEVICE; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = bdev_scsi_pad_scsi_name(desig->desig, dev->name); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* SCSI Port Name designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_UTF8; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = snprintf(desig->desig, SPDK_SCSI_PORT_MAX_NAME_LENGTH, "%s", port->name); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* Relative Target Port designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_RELATIVE_TARGET_PORT; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 4; + memset(desig->desig, 0, 2); /* Reserved */ + to_be16(&desig->desig[2], port->index); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* Target port group designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_TARGET_PORT_GROUP; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 4; + memset(desig->desig, 0, 4); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* Logical unit group designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_LOGICAL_UNIT_GROUP; + desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT; + desig->reserved0 = 0; + 
desig->piv = 1; + desig->reserved1 = 0; + desig->len = 4; + memset(desig->desig, 0, 2); /* Reserved */ + to_be16(&desig->desig[2], dev->id); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + to_be16(vpage->alloc_len, len); + + break; + } + + case SPDK_SPC_VPD_EXTENDED_INQUIRY_DATA: { + struct spdk_scsi_vpd_ext_inquiry *vext = (struct spdk_scsi_vpd_ext_inquiry *)vpage; + + hlen = 4; + memset((uint8_t *)vext + hlen, 0, sizeof(*vext) - hlen); + + /* RTO(3) GRD_CHK(2) APP_CHK(1) REF_CHK(0) */ + + /* GROUP_SUP(4) PRIOR_SUP(3) HEADSUP(2) ORDSUP(1) SIMPSUP(0) */ + vext->sup = SPDK_SCSI_VEXT_HEADSUP | SPDK_SCSI_VEXT_SIMPSUP; + + /* NV_SUP(1) V_SUP(0) */ + + /* Reserved[7-63] */ + + len = 64 - hlen; + + /* PAGE LENGTH */ + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_MANAGEMENT_NETWORK_ADDRESSES: + /* PAGE LENGTH */ + hlen = 4; + + to_be16(vpage->alloc_len, len); + break; + + case SPDK_SPC_VPD_MODE_PAGE_POLICY: { + struct spdk_scsi_mpage_policy_desc *pdesc = + (struct spdk_scsi_mpage_policy_desc *)vpage->params; + + hlen = 4; + + /* Mode page policy descriptor 1 */ + + /* POLICY PAGE CODE(5-0) */ + /* all page code */ + pdesc->page_code = 0x3f; + + /* POLICY SUBPAGE CODE */ + /* all sub page */ + pdesc->sub_page_code = 0xff; + + /* MLUS(7) MODE PAGE POLICY(1-0) */ + /* MLUS own copy */ + /* Shared MODE PAGE policy */ + pdesc->policy = 0; + /* Reserved */ + pdesc->reserved = 0; + + len += 4; + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_SCSI_PORTS: { + /* PAGE LENGTH */ + hlen = 4; + + /* Identification descriptor list */ + for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) { + struct spdk_scsi_port_desc *sdesc; + struct spdk_scsi_tgt_port_desc *pdesc; + + if (!dev->port[i].is_used) { + continue; + } + + /* Identification descriptor N */ + sdesc = (struct spdk_scsi_port_desc *)&vpage->params[len]; + + /* Reserved */ + sdesc->reserved = 0; + + /* RELATIVE PORT IDENTIFIER */ + to_be16(&sdesc->rel_port_id, dev->port[i].index); + + /* Reserved */ + sdesc->reserved2 = 0; + + /* INITIATOR PORT TRANSPORTID LENGTH */ + sdesc->init_port_len = 0; + + /* Reserved */ + sdesc->init_port_id = 0; + + /* TARGET PORT DESCRIPTORS LENGTH */ + sdesc->tgt_desc_len = 0; + + len += 12; + + plen2 = 0; + /* Target port descriptor 1 */ + pdesc = (struct spdk_scsi_tgt_port_desc *)sdesc->tgt_desc; + + /* PROTOCOL IDENTIFIER(7-4) CODE SET(3-0) */ + pdesc->code_set = + SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI << 4 | + SPDK_SPC_VPD_CODE_SET_UTF8; + + /* PIV(7) ASSOCIATION(5-4) IDENTIFIER TYPE(3-0) */ + pdesc->desig_type = SPDK_SPC_VPD_DESIG_PIV | + SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT << 4 | + SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME; + + /* Reserved */ + pdesc->reserved = 0; + + /* IDENTIFIER */ + plen = snprintf((char *)pdesc->designator, + SPDK_SCSI_PORT_MAX_NAME_LENGTH, "%s", + dev->port[i].name); + pdesc->len = plen; + + plen2 += 4 + plen; + + /* TARGET PORT DESCRIPTORS LENGTH */ + to_be16(&sdesc->tgt_desc_len, plen2); + + len += plen2; + } + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_BLOCK_LIMITS: { + uint32_t block_size = spdk_bdev_get_data_block_size(bdev); + + /* PAGE LENGTH */ + memset(&data[4], 0, 60); + + hlen = 4; + + /* WSNZ(0) */ + /* support zero length in WRITE SAME */ + + /* MAXIMUM COMPARE AND WRITE LENGTH */ + blocks = SPDK_WORK_ATS_BLOCK_SIZE / block_size; + + if (blocks > 0xff) { + blocks = 0xff; + } + + data[5] = (uint8_t)blocks; + + /* force align to 4KB */ + if (block_size < 4096) { + optimal_blocks = 4096 / block_size; + } else { + 
optimal_blocks = 1; + } + + /* OPTIMAL TRANSFER LENGTH GRANULARITY */ + to_be16(&data[6], optimal_blocks); + + blocks = SPDK_WORK_BLOCK_SIZE / block_size; + + /* MAXIMUM TRANSFER LENGTH */ + to_be32(&data[8], blocks); + /* OPTIMAL TRANSFER LENGTH */ + to_be32(&data[12], blocks); + + /* MAXIMUM PREFETCH XDREAD XDWRITE TRANSFER LENGTH */ + + len = 20 - hlen; + + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + /* + * MAXIMUM UNMAP LBA COUNT: indicates the + * maximum number of LBAs that may be + * unmapped by an UNMAP command. + */ + /* For now, choose 4MB as the maximum. */ + to_be32(&data[20], 4194304); + + /* + * MAXIMUM UNMAP BLOCK DESCRIPTOR COUNT: + * indicates the maximum number of UNMAP + * block descriptors that shall be contained + * in the parameter data transferred to the + * device server for an UNMAP command. + * The bdev layer automatically splits unmap + * requests, so pick an arbitrary high number here. + */ + to_be32(&data[24], DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT); + + /* + * The UGAVALID bit is left as 0 which means neither the + * OPTIMAL UNMAP GRANULARITY nor the UNMAP GRANULARITY + * ALIGNMENT fields are valid. + */ + + /* + * MAXIMUM WRITE SAME LENGTH: indicates the + * maximum number of contiguous logical blocks + * that the device server allows to be unmapped + * or written in a single WRITE SAME command. + */ + to_be64(&data[36], 512); + + /* Reserved */ + /* not specified */ + len = 64 - hlen; + } + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_BLOCK_DEV_CHARS: { + /* PAGE LENGTH */ + hlen = 4; + len = 64 - hlen; + + to_be16(&data[4], DEFAULT_DISK_ROTATION_RATE); + + /* Reserved */ + data[6] = 0; + /* NOMINAL FORM FACTOR(3-0) */ + data[7] = DEFAULT_DISK_FORM_FACTOR << 4; + /* Reserved */ + memset(&data[8], 0, 64 - 8); + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_BLOCK_THIN_PROVISION: { + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + goto inq_error; + } + + hlen = 4; + len = 7; + + /* + * PAGE LENGTH : if the DP bit is set to one, then the + * page length shall be set 0004h. + */ + to_be16(&data[2], 0x0004); + + /* + * THRESHOLD EXPONENT : it indicates the threshold set + * size in LBAs as a power of 2( i.e., the threshold + * set size = 2 ^ (threshold exponent). + */ + data[4] = 0; + + /* + * Set the LBPU bit to indicate the support for UNMAP + * command. + */ + data[5] |= SPDK_SCSI_UNMAP_LBPU; + + /* + * Set the provisioning type to thin provision. 
+ */ + data[6] = SPDK_SCSI_UNMAP_THIN_PROVISIONING; + + to_be16(vpage->alloc_len, len); + break; + } + + default: + if (pc >= 0xc0 && pc <= 0xff) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "Vendor specific INQUIRY VPD page 0x%x\n", pc); + } else { + SPDK_ERRLOG("unsupported INQUIRY VPD page 0x%x\n", pc); + } + goto inq_error; + } + } else { + struct spdk_scsi_cdb_inquiry_data *inqdata = + (struct spdk_scsi_cdb_inquiry_data *)data; + + /* Standard INQUIRY data */ + /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */ + inqdata->peripheral_device_type = pd; + inqdata->peripheral_qualifier = SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED; + + /* RMB(7) */ + inqdata->rmb = 0; + + /* VERSION */ + /* See SPC3/SBC2/MMC4/SAM2 for more details */ + inqdata->version = SPDK_SPC_VERSION_SPC3; + + /* NORMACA(5) HISUP(4) RESPONSE DATA FORMAT(3-0) */ + /* format 2 */ /* hierarchical support */ + inqdata->response = 2 | 1 << 4; + + hlen = 5; + + /* SCCS(7) ACC(6) TPGS(5-4) 3PC(3) PROTECT(0) */ + /* Not support TPGS */ + inqdata->flags = 0; + + /* MULTIP */ + inqdata->flags2 = 0x10; + + /* WBUS16(5) SYNC(4) LINKED(3) CMDQUE(1) VS(0) */ + /* CMDQUE */ + inqdata->flags3 = 0x2; + + /* T10 VENDOR IDENTIFICATION */ + spdk_strcpy_pad(inqdata->t10_vendor_id, DEFAULT_DISK_VENDOR, 8, ' '); + + /* PRODUCT IDENTIFICATION */ + spdk_strcpy_pad(inqdata->product_id, spdk_bdev_get_product_name(bdev), 16, ' '); + + /* PRODUCT REVISION LEVEL */ + spdk_strcpy_pad(inqdata->product_rev, DEFAULT_DISK_REVISION, 4, ' '); + + /* + * Standard inquiry data ends here. Only populate remaining fields if alloc_len + * indicates enough space to hold it. + */ + len = INQUIRY_OFFSET(product_rev) - 5; + + if (alloc_len >= INQUIRY_OFFSET(vendor)) { + /* Vendor specific */ + memset(inqdata->vendor, 0x20, 20); + len += sizeof(inqdata->vendor); + } + + if (alloc_len >= INQUIRY_OFFSET(ius)) { + /* CLOCKING(3-2) QAS(1) IUS(0) */ + inqdata->ius = 0; + len += sizeof(inqdata->ius); + } + + if (alloc_len >= INQUIRY_OFFSET(reserved)) { + /* Reserved */ + inqdata->reserved = 0; + len += sizeof(inqdata->reserved); + } + + /* VERSION DESCRIPTOR 1-8 */ + if (alloc_len >= INQUIRY_OFFSET(reserved) + 2) { + to_be16(&inqdata->desc[0], 0x0960); + len += 2; + } + + if (alloc_len >= INQUIRY_OFFSET(reserved) + 4) { + to_be16(&inqdata->desc[2], 0x0300); /* SPC-3 (no version claimed) */ + len += 2; + } + + if (alloc_len >= INQUIRY_OFFSET(reserved) + 6) { + to_be16(&inqdata->desc[4], 0x320); /* SBC-2 (no version claimed) */ + len += 2; + } + + if (alloc_len >= INQUIRY_OFFSET(reserved) + 8) { + to_be16(&inqdata->desc[6], 0x0040); /* SAM-2 (no version claimed) */ + len += 2; + } + + /* + * We only fill out 4 descriptors, but if the allocation length goes past + * that, zero the remaining bytes. This fixes some SCSI compliance tests + * which expect a full 96 bytes to be returned, including the unpopulated + * version descriptors 5-8 (4 * 2 = 8 bytes) plus the 22 bytes of reserved + * space (bytes 74-95) - for a total of 30 bytes. 
+ */ + if (alloc_len > INQUIRY_OFFSET(reserved) + 8) { + i = alloc_len - (INQUIRY_OFFSET(reserved) + 8); + if (i > 30) { + i = 30; + } + memset(&inqdata->desc[8], 0, i); + len += i; + } + + /* ADDITIONAL LENGTH */ + inqdata->add_len = len; + } + + return hlen + len; + +inq_error: + task->data_transferred = 0; + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} + +static void +mode_sense_page_init(uint8_t *buf, int len, int page, int subpage) +{ + if (!buf) { + return; + } + + memset(buf, 0, len); + if (subpage != 0) { + buf[0] = page | 0x40; /* PAGE + SPF=1 */ + buf[1] = subpage; + to_be16(&buf[2], len - 4); + } else { + buf[0] = page; + buf[1] = len - 2; + } +} + +static int +bdev_scsi_mode_sense_page(struct spdk_bdev *bdev, + uint8_t *cdb, int pc, int page, int subpage, + uint8_t *data, struct spdk_scsi_task *task) +{ + uint8_t *cp = data; + int len = 0; + int plen; + int i; + + if (pc == 0x00) { + /* Current values */ + } else if (pc == 0x01) { + /* Changeable values */ + /* As we currently do not support changeable values, + all parameters are reported as zero. */ + } else if (pc == 0x02) { + /* Default values */ + } else { + /* Saved values not supported */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_SAVING_PARAMETERS_NOT_SUPPORTED, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + switch (page) { + case 0x00: + /* Vendor specific */ + break; + case 0x01: + /* Read-Write Error Recovery */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Read-Write Error Recovery\n"); + if (subpage != 0x00) { + break; + } + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x02: + /* Disconnect-Reconnect */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Disconnect-Reconnect\n"); + if (subpage != 0x00) { + break; + } + plen = 0x0e + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x03: + /* Obsolete (Format Device) */ + break; + case 0x04: + /* Obsolete (Rigid Disk Geometry) */ + break; + case 0x05: + /* Obsolete (Rigid Disk Geometry) */ + break; + case 0x06: + /* Reserved */ + break; + case 0x07: + /* Verify Error Recovery */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Verify Error Recovery\n"); + + if (subpage != 0x00) { + break; + } + + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x08: { + /* Caching */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SENSE Caching\n"); + if (subpage != 0x00) { + break; + } + + plen = 0x12 + 2; + mode_sense_page_init(cp, plen, page, subpage); + + if (cp && spdk_bdev_has_write_cache(bdev) && pc != 0x01) { + cp[2] |= 0x4; /* WCE */ + } + + /* Read Cache Disable (RCD) = 1 */ + if (cp && pc != 0x01) { + cp[2] |= 0x1; + } + + len += plen; + break; + } + case 0x09: + /* Obsolete */ + break; + case 0x0a: + switch (subpage) { + case 0x00: + /* Control */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Control\n"); + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x01: + /* Control Extension */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Control Extension\n"); + plen = 0x1c + 4; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0xff: + /* All subpages */ + len += bdev_scsi_mode_sense_page(bdev, + cdb, pc, page, + 0x00, + cp ? 
&cp[len] : NULL, task); + len += bdev_scsi_mode_sense_page(bdev, + cdb, pc, page, + 0x01, + cp ? &cp[len] : NULL, task); + break; + default: + /* 0x02-0x3e: Reserved */ + break; + } + break; + case 0x0b: + /* Obsolete (Medium Types Supported) */ + break; + case 0x0c: + /* Obsolete (Notch And Partitio) */ + break; + case 0x0d: + /* Obsolete */ + break; + case 0x0e: + case 0x0f: + /* Reserved */ + break; + case 0x10: + /* XOR Control */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SENSE XOR Control\n"); + if (subpage != 0x00) { + break; + } + plen = 0x16 + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x11: + case 0x12: + case 0x13: + /* Reserved */ + break; + case 0x14: + /* Enclosure Services Management */ + break; + case 0x15: + case 0x16: + case 0x17: + /* Reserved */ + break; + case 0x18: + /* Protocol-Specific LUN */ + break; + case 0x19: + /* Protocol-Specific Port */ + break; + case 0x1a: + /* Power Condition */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Power Condition\n"); + if (subpage != 0x00) { + break; + } + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x1b: + /* Reserved */ + break; + case 0x1c: + /* Informational Exceptions Control */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Informational Exceptions Control\n"); + if (subpage != 0x00) { + break; + } + + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x1d: + case 0x1e: + case 0x1f: + /* Reserved */ + break; + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2a: + case 0x2b: + case 0x2c: + case 0x2d: + case 0x2e: + case 0x2f: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3a: + case 0x3b: + case 0x3c: + case 0x3d: + case 0x3e: + /* Vendor-specific */ + break; + case 0x3f: + switch (subpage) { + case 0x00: + /* All mode pages */ + for (i = 0x00; i < 0x3e; i ++) { + len += bdev_scsi_mode_sense_page( + bdev, cdb, pc, i, 0x00, + cp ? &cp[len] : NULL, task); + } + break; + case 0xff: + /* All mode pages and subpages */ + for (i = 0x00; i < 0x3e; i ++) { + len += bdev_scsi_mode_sense_page( + bdev, cdb, pc, i, 0x00, + cp ? &cp[len] : NULL, task); + } + for (i = 0x00; i < 0x3e; i ++) { + len += bdev_scsi_mode_sense_page( + bdev, cdb, pc, i, 0xff, + cp ? &cp[len] : NULL, task); + } + break; + default: + /* 0x01-0x3e: Reserved */ + break; + } + } + + return len; +} + +static int +bdev_scsi_mode_sense(struct spdk_bdev *bdev, int md, + uint8_t *cdb, int dbd, int llbaa, int pc, + int page, int subpage, uint8_t *data, struct spdk_scsi_task *task) +{ + uint64_t num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_data_block_size(bdev); + uint8_t *hdr, *bdesc, *pages; + int hlen; + int blen; + int plen, total; + + assert(md == 6 || md == 10); + + if (md == 6) { + hlen = 4; + blen = 8; /* For MODE SENSE 6 only short LBA */ + } else { + hlen = 8; + blen = llbaa ? 16 : 8; + } + + if (dbd) { + blen = 0; + } + + pages = data ? 
&data[hlen + blen] : NULL; + plen = bdev_scsi_mode_sense_page(bdev, cdb, pc, page, + subpage, + pages, task); + if (plen < 0) { + return -1; + } + + total = hlen + blen + plen; + if (data == NULL) { + return total; + } + + hdr = &data[0]; + if (hlen == 4) { + hdr[0] = total - 1; /* Mode Data Length */ + hdr[1] = 0; /* Medium Type */ + hdr[2] = 0; /* Device-Specific Parameter */ + hdr[3] = blen; /* Block Descripter Length */ + } else { + to_be16(&hdr[0], total - 2); /* Mode Data Length */ + hdr[2] = 0; /* Medium Type */ + hdr[3] = 0; /* Device-Specific Parameter */ + hdr[4] = llbaa ? 0x1 : 0; /* Long/short LBA */ + hdr[5] = 0; /* Reserved */ + to_be16(&hdr[6], blen); /* Block Descripter Length */ + } + + bdesc = &data[hlen]; + if (blen == 16) { + /* Number of Blocks */ + to_be64(&bdesc[0], num_blocks); + /* Reserved */ + memset(&bdesc[8], 0, 4); + /* Block Length */ + to_be32(&bdesc[12], block_size); + } else if (blen == 8) { + /* Number of Blocks */ + if (num_blocks > 0xffffffffULL) { + memset(&bdesc[0], 0xff, 4); + } else { + to_be32(&bdesc[0], num_blocks); + } + + /* Block Length */ + to_be32(&bdesc[4], block_size); + } + + return total; +} + +static void +bdev_scsi_task_complete_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_scsi_task *task = cb_arg; + int sc, sk, asc, ascq; + + spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq); + + spdk_bdev_free_io(bdev_io); + + spdk_scsi_task_set_status(task, sc, sk, asc, ascq); + scsi_lun_complete_task(task->lun, task); +} + +static void +bdev_scsi_read_task_complete_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_scsi_task *task = cb_arg; + int sc, sk, asc, ascq; + + task->bdev_io = bdev_io; + + spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq); + + spdk_scsi_task_set_status(task, sc, sk, asc, ascq); + scsi_lun_complete_task(task->lun, task); +} + +static void +bdev_scsi_task_complete_reset(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_scsi_task *task = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (success) { + task->response = SPDK_SCSI_TASK_MGMT_RESP_SUCCESS; + } + + scsi_lun_complete_reset_task(task->lun, task); +} + +static void +bdev_scsi_queue_io(struct spdk_scsi_task *task, spdk_bdev_io_wait_cb cb_fn, void *cb_arg) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_bdev *bdev = lun->bdev; + struct spdk_io_channel *ch = lun->io_channel; + int rc; + + task->bdev_io_wait.bdev = bdev; + task->bdev_io_wait.cb_fn = cb_fn; + task->bdev_io_wait.cb_arg = cb_arg; + + rc = spdk_bdev_queue_io_wait(bdev, ch, &task->bdev_io_wait); + if (rc != 0) { + assert(false); + } +} + +static int +bdev_scsi_sync(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + uint64_t lba, uint32_t num_blocks) +{ + uint64_t bdev_num_blocks; + int rc; + + if (num_blocks == 0) { + return SPDK_SCSI_TASK_COMPLETE; + } + + bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + + if (lba >= bdev_num_blocks || num_blocks > bdev_num_blocks || + lba > (bdev_num_blocks - num_blocks)) { + SPDK_ERRLOG("end of media\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + rc = spdk_bdev_flush_blocks(bdev_desc, bdev_ch, lba, num_blocks, + bdev_scsi_task_complete_cmd, task); + + if (rc) { + if (rc == -ENOMEM) { + bdev_scsi_queue_io(task, 
bdev_scsi_process_block_resubmit, task); + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("spdk_bdev_flush_blocks() failed\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + task->data_transferred = 0; + return SPDK_SCSI_TASK_PENDING; +} + +static uint64_t +_bytes_to_blocks(uint32_t block_size, uint64_t offset_bytes, uint64_t *offset_blocks, + uint64_t num_bytes, uint64_t *num_blocks) +{ + uint8_t shift_cnt; + + /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ + if (spdk_likely(spdk_u32_is_pow2(block_size))) { + shift_cnt = spdk_u32log2(block_size); + *offset_blocks = offset_bytes >> shift_cnt; + *num_blocks = num_bytes >> shift_cnt; + return (offset_bytes - (*offset_blocks << shift_cnt)) | + (num_bytes - (*num_blocks << shift_cnt)); + } else { + *offset_blocks = offset_bytes / block_size; + *num_blocks = num_bytes / block_size; + return (offset_bytes % block_size) | (num_bytes % block_size); + } +} + +static int +bdev_scsi_readwrite(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + uint64_t lba, uint32_t xfer_len, bool is_read) +{ + uint64_t bdev_num_blocks, offset_blocks, num_blocks; + uint32_t max_xfer_len, block_size; + int sk = SPDK_SCSI_SENSE_NO_SENSE, asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + int rc; + + task->data_transferred = 0; + + if (spdk_unlikely(task->dxfer_dir != SPDK_SCSI_DIR_NONE && + task->dxfer_dir != (is_read ? SPDK_SCSI_DIR_FROM_DEV : SPDK_SCSI_DIR_TO_DEV))) { + SPDK_ERRLOG("Incorrect data direction\n"); + goto check_condition; + } + + bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + if (spdk_unlikely(bdev_num_blocks <= lba || bdev_num_blocks - lba < xfer_len)) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "end of media\n"); + sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE; + goto check_condition; + } + + if (spdk_unlikely(xfer_len == 0)) { + task->status = SPDK_SCSI_STATUS_GOOD; + return SPDK_SCSI_TASK_COMPLETE; + } + + block_size = spdk_bdev_get_data_block_size(bdev); + + /* Transfer Length is limited to the Block Limits VPD page Maximum Transfer Length */ + max_xfer_len = SPDK_WORK_BLOCK_SIZE / block_size; + if (spdk_unlikely(xfer_len > max_xfer_len)) { + SPDK_ERRLOG("xfer_len %" PRIu32 " > maximum transfer length %" PRIu32 "\n", + xfer_len, max_xfer_len); + sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + goto check_condition; + } + + if (!is_read) { + /* Additional check for Transfer Length */ + if (xfer_len * block_size > task->transfer_len) { + SPDK_ERRLOG("xfer_len %" PRIu32 " * block_size %" PRIu32 " > transfer_len %u\n", + xfer_len, block_size, task->transfer_len); + goto check_condition; + } + } + + if (_bytes_to_blocks(block_size, task->offset, &offset_blocks, task->length, &num_blocks) != 0) { + SPDK_ERRLOG("task's offset %" PRIu64 " or length %" PRIu32 " is not block multiple\n", + task->offset, task->length); + goto check_condition; + } + + offset_blocks += lba; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "%s: lba=%"PRIu64", len=%"PRIu64"\n", + is_read ? 
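/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * _bytes_to_blocks() above leans on the usual power-of-two identities to avoid
 * 64-bit division on the hot path: when block_size == 1 << k,
 * bytes / block_size == bytes >> k and bytes % block_size == bytes & (block_size - 1).
 * The two remainders are OR-ed together so one non-zero check tells the caller
 * that either the offset or the length is not block aligned.  The same trick
 * in isolation (example_split_bytes is a hypothetical helper; block_size is
 * assumed non-zero):
 */
static inline uint64_t
example_split_bytes(uint32_t block_size, uint64_t bytes, uint64_t *blocks)
{
        if ((block_size & (block_size - 1)) == 0) {
                /* Power of two: shift/mask instead of divide/modulo. */
                uint32_t shift = __builtin_ctz(block_size);     /* log2(block_size) */

                *blocks = bytes >> shift;
                return bytes & (block_size - 1);                /* remainder */
        }
        *blocks = bytes / block_size;
        return bytes % block_size;
}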
"Read" : "Write", offset_blocks, num_blocks); + + if (is_read) { + rc = spdk_bdev_readv_blocks(bdev_desc, bdev_ch, task->iovs, task->iovcnt, + offset_blocks, num_blocks, + bdev_scsi_read_task_complete_cmd, task); + } else { + rc = spdk_bdev_writev_blocks(bdev_desc, bdev_ch, task->iovs, task->iovcnt, + offset_blocks, num_blocks, + bdev_scsi_task_complete_cmd, task); + } + + if (rc) { + if (rc == -ENOMEM) { + bdev_scsi_queue_io(task, bdev_scsi_process_block_resubmit, task); + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("spdk_bdev_%s_blocks() failed\n", is_read ? "readv" : "writev"); + goto check_condition; + } + + task->data_transferred = task->length; + return SPDK_SCSI_TASK_PENDING; + +check_condition: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, sk, asc, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; +} + +struct spdk_bdev_scsi_unmap_ctx { + struct spdk_scsi_task *task; + struct spdk_scsi_unmap_bdesc desc[DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT]; + uint32_t count; +}; + +static int bdev_scsi_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + struct spdk_bdev_scsi_unmap_ctx *ctx); + +static void +bdev_scsi_task_complete_unmap_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_bdev_scsi_unmap_ctx *ctx = cb_arg; + struct spdk_scsi_task *task = ctx->task; + int sc, sk, asc, ascq; + + ctx->count--; + + task->bdev_io = bdev_io; + + if (task->status == SPDK_SCSI_STATUS_GOOD) { + spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq); + spdk_scsi_task_set_status(task, sc, sk, asc, ascq); + } + + if (ctx->count == 0) { + scsi_lun_complete_task(task->lun, task); + free(ctx); + } +} + +static int +__copy_desc(struct spdk_bdev_scsi_unmap_ctx *ctx, uint8_t *data, size_t data_len) +{ + uint16_t desc_data_len; + uint16_t desc_count; + + if (!data) { + return -EINVAL; + } + + if (data_len < 8) { + /* We can't even get the reported length, so fail. 
*/ + return -EINVAL; + } + + desc_data_len = from_be16(&data[2]); + desc_count = desc_data_len / 16; + + if (desc_data_len > (data_len - 8)) { + SPDK_ERRLOG("Error - desc_data_len (%u) > data_len (%lu) - 8\n", + desc_data_len, data_len); + return -EINVAL; + } + + if (desc_count > DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT) { + SPDK_ERRLOG("desc_count (%u) greater than max allowed (%u)\n", + desc_count, DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT); + return -EINVAL; + } + + memcpy(ctx->desc, &data[8], desc_data_len); + return desc_count; +} + +static void +bdev_scsi_unmap_resubmit(void *arg) +{ + struct spdk_bdev_scsi_unmap_ctx *ctx = arg; + struct spdk_scsi_task *task = ctx->task; + struct spdk_scsi_lun *lun = task->lun; + + bdev_scsi_unmap(lun->bdev, lun->bdev_desc, lun->io_channel, task, ctx); +} + +static int +bdev_scsi_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + struct spdk_bdev_scsi_unmap_ctx *ctx) +{ + uint8_t *data; + int i, desc_count = -1; + int data_len; + int rc; + + assert(task->status == SPDK_SCSI_STATUS_GOOD); + + if (ctx == NULL) { + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + ctx->task = task; + ctx->count = 0; + } + + + if (task->iovcnt == 1) { + data = (uint8_t *)task->iovs[0].iov_base; + data_len = task->iovs[0].iov_len; + desc_count = __copy_desc(ctx, data, data_len); + } else { + data = spdk_scsi_task_gather_data(task, &data_len); + if (data) { + desc_count = __copy_desc(ctx, data, data_len); + free(data); + } + } + + if (desc_count < 0) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + free(ctx); + return SPDK_SCSI_TASK_COMPLETE; + } + + for (i = ctx->count; i < desc_count; i++) { + struct spdk_scsi_unmap_bdesc *desc; + uint64_t offset_blocks; + uint64_t num_blocks; + + desc = &ctx->desc[i]; + + offset_blocks = from_be64(&desc->lba); + num_blocks = from_be32(&desc->block_count); + + if (num_blocks == 0) { + continue; + } + + ctx->count++; + rc = spdk_bdev_unmap_blocks(bdev_desc, bdev_ch, offset_blocks, num_blocks, + bdev_scsi_task_complete_unmap_cmd, ctx); + + if (rc) { + if (rc == -ENOMEM) { + bdev_scsi_queue_io(task, bdev_scsi_unmap_resubmit, ctx); + /* Unmap was not yet submitted to bdev */ + ctx->count--; + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("SCSI Unmapping failed\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + ctx->count--; + /* We can't complete here - we may have to wait for previously + * submitted unmaps to complete */ + break; + } + } + + if (ctx->count == 0) { + free(ctx); + return SPDK_SCSI_TASK_COMPLETE; + } + + return SPDK_SCSI_TASK_PENDING; +} + +static int +bdev_scsi_process_block(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_bdev *bdev = lun->bdev; + uint64_t lba; + uint32_t xfer_len; + uint32_t len = 0; + uint8_t *cdb = task->cdb; + + /* XXX: We need to support FUA bit for writes! 
*/ + switch (cdb[0]) { + case SPDK_SBC_READ_6: + case SPDK_SBC_WRITE_6: + lba = (uint64_t)cdb[1] << 16; + lba |= (uint64_t)cdb[2] << 8; + lba |= (uint64_t)cdb[3]; + xfer_len = cdb[4]; + if (xfer_len == 0) { + xfer_len = 256; + } + return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel, + task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_6); + + case SPDK_SBC_READ_10: + case SPDK_SBC_WRITE_10: + lba = from_be32(&cdb[2]); + xfer_len = from_be16(&cdb[7]); + return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel, + task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_10); + + case SPDK_SBC_READ_12: + case SPDK_SBC_WRITE_12: + lba = from_be32(&cdb[2]); + xfer_len = from_be32(&cdb[6]); + return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel, + task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_12); + case SPDK_SBC_READ_16: + case SPDK_SBC_WRITE_16: + lba = from_be64(&cdb[2]); + xfer_len = from_be32(&cdb[10]); + return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel, + task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_16); + + case SPDK_SBC_READ_CAPACITY_10: { + uint64_t num_blocks = spdk_bdev_get_num_blocks(bdev); + uint8_t buffer[8]; + + if (num_blocks - 1 > 0xffffffffULL) { + memset(buffer, 0xff, 4); + } else { + to_be32(buffer, num_blocks - 1); + } + to_be32(&buffer[4], spdk_bdev_get_data_block_size(bdev)); + + len = spdk_min(task->length, sizeof(buffer)); + if (spdk_scsi_task_scatter_data(task, buffer, len) < 0) { + break; + } + + task->data_transferred = len; + task->status = SPDK_SCSI_STATUS_GOOD; + break; + } + + case SPDK_SPC_SERVICE_ACTION_IN_16: + switch (cdb[1] & 0x1f) { /* SERVICE ACTION */ + case SPDK_SBC_SAI_READ_CAPACITY_16: { + uint8_t buffer[32] = {0}; + + to_be64(&buffer[0], spdk_bdev_get_num_blocks(bdev) - 1); + to_be32(&buffer[8], spdk_bdev_get_data_block_size(bdev)); + /* + * Set the TPE bit to 1 to indicate thin provisioning. + * The position of TPE bit is the 7th bit in 14th byte + * in READ CAPACITY (16) parameter data. 
+ */ + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + buffer[14] |= 1 << 7; + } + + len = spdk_min(from_be32(&cdb[10]), sizeof(buffer)); + if (spdk_scsi_task_scatter_data(task, buffer, len) < 0) { + break; + } + + task->data_transferred = len; + task->status = SPDK_SCSI_STATUS_GOOD; + break; + } + + default: + return SPDK_SCSI_TASK_UNKNOWN; + } + break; + + case SPDK_SBC_SYNCHRONIZE_CACHE_10: + case SPDK_SBC_SYNCHRONIZE_CACHE_16: + if (cdb[0] == SPDK_SBC_SYNCHRONIZE_CACHE_10) { + lba = from_be32(&cdb[2]); + len = from_be16(&cdb[7]); + } else { + lba = from_be64(&cdb[2]); + len = from_be32(&cdb[10]); + } + + if (len == 0) { + len = spdk_bdev_get_num_blocks(bdev) - lba; + } + + return bdev_scsi_sync(bdev, lun->bdev_desc, lun->io_channel, task, lba, len); + break; + + case SPDK_SBC_UNMAP: + return bdev_scsi_unmap(bdev, lun->bdev_desc, lun->io_channel, task, NULL); + + default: + return SPDK_SCSI_TASK_UNKNOWN; + } + + return SPDK_SCSI_TASK_COMPLETE; +} + +static void +bdev_scsi_process_block_resubmit(void *arg) +{ + struct spdk_scsi_task *task = arg; + + bdev_scsi_process_block(task); +} + +static int +bdev_scsi_check_len(struct spdk_scsi_task *task, int len, int min_len) +{ + if (len >= min_len) { + return 0; + } + + /* INVALID FIELD IN CDB */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} + +static int +bdev_scsi_process_primary(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_bdev *bdev = lun->bdev; + int alloc_len = -1; + int data_len = -1; + uint8_t *cdb = task->cdb; + uint8_t *data = NULL; + int rc = 0; + int pllen, md = 0; + int llba; + int dbd, pc, page, subpage; + int cmd_parsed = 0; + + switch (cdb[0]) { + case SPDK_SPC_INQUIRY: + alloc_len = from_be16(&cdb[3]); + data_len = spdk_max(4096, alloc_len); + data = calloc(1, data_len); + assert(data != NULL); + rc = bdev_scsi_inquiry(bdev, task, cdb, data, data_len); + data_len = spdk_min(rc, data_len); + if (rc < 0) { + break; + } + + SPDK_LOGDUMP(SPDK_LOG_SCSI, "INQUIRY", data, data_len); + break; + + case SPDK_SPC_REPORT_LUNS: { + int sel; + + sel = cdb[2]; + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "sel=%x\n", sel); + + alloc_len = from_be32(&cdb[6]); + rc = bdev_scsi_check_len(task, alloc_len, 16); + if (rc < 0) { + break; + } + + data_len = spdk_max(4096, alloc_len); + data = calloc(1, data_len); + assert(data != NULL); + rc = bdev_scsi_report_luns(task->lun, sel, data, data_len); + data_len = rc; + if (rc < 0) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + SPDK_LOGDUMP(SPDK_LOG_SCSI, "REPORT LUNS", data, data_len); + break; + } + + case SPDK_SPC_MODE_SELECT_6: + case SPDK_SPC_MODE_SELECT_10: + if (cdb[0] == SPDK_SPC_MODE_SELECT_6) { + /* MODE_SELECT(6) must have at least a 4 byte header. */ + md = 4; + pllen = cdb[4]; + } else { + /* MODE_SELECT(10) must have at least an 8 byte header. 
*/ + md = 8; + pllen = from_be16(&cdb[7]); + } + + if (pllen == 0) { + break; + } + + rc = bdev_scsi_check_len(task, pllen, md); + if (rc < 0) { + break; + } + + data = spdk_scsi_task_gather_data(task, &rc); + if (rc < 0) { + break; + } + data_len = rc; + + rc = bdev_scsi_check_len(task, data_len, spdk_max(pllen, md)); + if (rc < 0) { + break; + } + + rc = pllen; + data_len = 0; + break; + + case SPDK_SPC_MODE_SENSE_6: + alloc_len = cdb[4]; + md = 6; + /* FALLTHROUGH */ + case SPDK_SPC_MODE_SENSE_10: + llba = 0; + + if (md == 0) { + alloc_len = from_be16(&cdb[7]); + llba = !!(cdb[1] & 0x10); + md = 10; + } + + dbd = !!(cdb[1] & 0x8); + pc = (cdb[2] & 0xc0) >> 6; + page = cdb[2] & 0x3f; + subpage = cdb[3]; + + /* First call with no buffer to discover needed buffer size */ + rc = bdev_scsi_mode_sense(bdev, md, + cdb, dbd, llba, pc, + page, subpage, + NULL, task); + if (rc < 0) { + break; + } + + data_len = rc; + data = calloc(1, data_len); + assert(data != NULL); + + /* First call with no buffer to discover needed buffer size */ + rc = bdev_scsi_mode_sense(bdev, md, + cdb, dbd, llba, pc, + page, subpage, + data, task); + if (rc < 0) { + /* INVALID FIELD IN CDB */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + break; + + case SPDK_SPC_REQUEST_SENSE: { + int desc; + int sk, asc, ascq; + + desc = cdb[1] & 0x1; + if (desc != 0) { + /* INVALID FIELD IN CDB */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + alloc_len = cdb[4]; + + /* NO ADDITIONAL SENSE INFORMATION */ + sk = SPDK_SCSI_SENSE_NO_SENSE; + asc = 0x00; + ascq = 0x00; + + spdk_scsi_task_build_sense_data(task, sk, asc, ascq); + + data_len = task->sense_data_len; + data = calloc(1, data_len); + assert(data != NULL); + memcpy(data, task->sense_data, data_len); + break; + } + + case SPDK_SPC_LOG_SELECT: + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "LOG_SELECT\n"); + cmd_parsed = 1; + /* FALLTHROUGH */ + case SPDK_SPC_LOG_SENSE: + if (!cmd_parsed) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "LOG_SENSE\n"); + } + + /* INVALID COMMAND OPERATION CODE */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + rc = -1; + break; + + case SPDK_SPC_TEST_UNIT_READY: + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "TEST_UNIT_READY\n"); + cmd_parsed = 1; + /* FALLTHROUGH */ + case SPDK_SBC_START_STOP_UNIT: + if (!cmd_parsed) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "START_STOP_UNIT\n"); + } + + rc = 0; + break; + + case SPDK_SPC_PERSISTENT_RESERVE_OUT: + pllen = from_be32(&cdb[5]); + rc = bdev_scsi_check_len(task, pllen, 24); + if (rc < 0) { + break; + } + + data = spdk_scsi_task_gather_data(task, &rc); + if (rc < 0) { + break; + } + data_len = rc; + if (data_len < 24) { + rc = -1; + break; + } + + rc = scsi_pr_out(task, cdb, data, data_len); + if (rc < 0) { + break; + } + rc = pllen; + data_len = 0; + break; + + case SPDK_SPC_PERSISTENT_RESERVE_IN: + alloc_len = from_be16(&cdb[7]); + data_len = alloc_len; + data = calloc(1, data_len); + assert(data != NULL); + rc = scsi_pr_in(task, cdb, data, data_len); + break; + + case SPDK_SPC2_RESERVE_6: + case SPDK_SPC2_RESERVE_10: + rc = scsi2_reserve(task, cdb); + if (rc == 0) { + if (cdb[0] == SPDK_SPC2_RESERVE_10) { + rc = 
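/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * The MODE SENSE handling above is a two-pass pattern: bdev_scsi_mode_sense()
 * is first called with data == NULL only to learn the full size of the mode
 * parameter list, the buffer is allocated, and the call is repeated to fill it
 * (the second call fills the buffer even though its comment is a copy of the
 * first).  The result is finally truncated to the ALLOCATION LENGTH from the
 * CDB, which is how the "report"-style SPC commands in this function behave in
 * general.  The shape of that pattern, condensed (all "example_" names are
 * hypothetical):
 */
typedef int (*example_fill_fn)(uint8_t *buf, void *ctx);       /* NULL buf => size only */

static int
example_size_then_fill(example_fill_fn fill, void *ctx, int alloc_len,
                       struct spdk_scsi_task *task)
{
        uint8_t *buf;
        int len;

        len = fill(NULL, ctx);                  /* pass 1: compute length */
        if (len <= 0) {
                return len;
        }
        buf = calloc(1, len);
        if (buf == NULL) {
                return -ENOMEM;
        }
        fill(buf, ctx);                         /* pass 2: build the data */
        /* Never return more than the initiator asked for. */
        len = spdk_min(len, alloc_len);
        spdk_scsi_task_scatter_data(task, buf, len);
        free(buf);
        return len;
}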
from_be16(&cdb[7]); + } + data_len = 0; + } + break; + + case SPDK_SPC2_RELEASE_6: + case SPDK_SPC2_RELEASE_10: + rc = scsi2_release(task); + break; + + default: + return SPDK_SCSI_TASK_UNKNOWN; + } + + if (rc >= 0 && data_len > 0) { + assert(alloc_len >= 0); + spdk_scsi_task_scatter_data(task, data, spdk_min(alloc_len, data_len)); + rc = spdk_min(data_len, alloc_len); + } + + if (rc >= 0) { + task->data_transferred = rc; + task->status = SPDK_SCSI_STATUS_GOOD; + } + + if (data) { + free(data); + } + + return SPDK_SCSI_TASK_COMPLETE; +} + +int +bdev_scsi_execute(struct spdk_scsi_task *task) +{ + int rc; + + if ((rc = bdev_scsi_process_block(task)) == SPDK_SCSI_TASK_UNKNOWN) { + if ((rc = bdev_scsi_process_primary(task)) == SPDK_SCSI_TASK_UNKNOWN) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "unsupported SCSI OP=0x%x\n", task->cdb[0]); + /* INVALID COMMAND OPERATION CODE */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + } + + return rc; +} + +static void +bdev_scsi_reset_resubmit(void *arg) +{ + struct spdk_scsi_task *task = arg; + + bdev_scsi_reset(task); +} + +void +bdev_scsi_reset(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + int rc; + + rc = spdk_bdev_reset(lun->bdev_desc, lun->io_channel, bdev_scsi_task_complete_reset, + task); + if (rc == -ENOMEM) { + bdev_scsi_queue_io(task, bdev_scsi_reset_resubmit, task); + } +} + +bool +bdev_scsi_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_scsi_task *task, + struct spdk_dif_ctx *dif_ctx) +{ + uint32_t ref_tag = 0, dif_check_flags = 0, data_offset; + uint8_t *cdb; + int rc; + + if (spdk_likely(spdk_bdev_get_md_size(bdev) == 0)) { + return false; + } + + cdb = task->cdb; + data_offset = task->offset; + + /* We use lower 32 bits of LBA as Reference. Tag */ + switch (cdb[0]) { + case SPDK_SBC_READ_6: + case SPDK_SBC_WRITE_6: + ref_tag = (uint32_t)cdb[1] << 16; + ref_tag |= (uint32_t)cdb[2] << 8; + ref_tag |= (uint32_t)cdb[3]; + break; + case SPDK_SBC_READ_10: + case SPDK_SBC_WRITE_10: + case SPDK_SBC_READ_12: + case SPDK_SBC_WRITE_12: + ref_tag = from_be32(&cdb[2]); + break; + case SPDK_SBC_READ_16: + case SPDK_SBC_WRITE_16: + ref_tag = (uint32_t)from_be64(&cdb[2]); + break; + default: + return false; + } + + if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) { + dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK; + } + + if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) { + dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK; + } + + rc = spdk_dif_ctx_init(dif_ctx, + spdk_bdev_get_block_size(bdev), + spdk_bdev_get_md_size(bdev), + spdk_bdev_is_md_interleaved(bdev), + spdk_bdev_is_dif_head_of_md(bdev), + spdk_bdev_get_dif_type(bdev), + dif_check_flags, + ref_tag, 0, 0, data_offset, 0); + + return (rc == 0) ? true : false; +} diff --git a/src/spdk/lib/scsi/scsi_internal.h b/src/spdk/lib/scsi/scsi_internal.h new file mode 100644 index 000000000..2da3a99a8 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_internal.h @@ -0,0 +1,214 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_SCSI_INTERNAL_H +#define SPDK_SCSI_INTERNAL_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/scsi.h" +#include "spdk/scsi_spec.h" +#include "spdk/trace.h" +#include "spdk/dif.h" + +#include "spdk_internal/log.h" + +enum { + SPDK_SCSI_TASK_UNKNOWN = -1, + SPDK_SCSI_TASK_COMPLETE, + SPDK_SCSI_TASK_PENDING, +}; + +struct spdk_scsi_port { + uint8_t is_used; + uint64_t id; + uint16_t index; + uint16_t transport_id_len; + char transport_id[SPDK_SCSI_MAX_TRANSPORT_ID_LENGTH]; + char name[SPDK_SCSI_PORT_MAX_NAME_LENGTH]; +}; + +/* Registrant with I_T nextus */ +struct spdk_scsi_pr_registrant { + uint64_t rkey; + uint16_t relative_target_port_id; + uint16_t transport_id_len; + char transport_id[SPDK_SCSI_MAX_TRANSPORT_ID_LENGTH]; + char initiator_port_name[SPDK_SCSI_PORT_MAX_NAME_LENGTH]; + char target_port_name[SPDK_SCSI_PORT_MAX_NAME_LENGTH]; + struct spdk_scsi_port *initiator_port; + struct spdk_scsi_port *target_port; + TAILQ_ENTRY(spdk_scsi_pr_registrant) link; +}; + +#define SCSI_SPC2_RESERVE 0x00000001U + +/* Reservation with LU_SCOPE */ +struct spdk_scsi_pr_reservation { + uint32_t flags; + struct spdk_scsi_pr_registrant *holder; + enum spdk_scsi_pr_type_code rtype; + uint64_t crkey; +}; + +struct spdk_scsi_dev { + int id; + int is_allocated; + bool removed; + spdk_scsi_dev_destruct_cb_t remove_cb; + void *remove_ctx; + + char name[SPDK_SCSI_DEV_MAX_NAME + 1]; + + struct spdk_scsi_lun *lun[SPDK_SCSI_DEV_MAX_LUN]; + + int num_ports; + struct spdk_scsi_port port[SPDK_SCSI_DEV_MAX_PORTS]; + + uint8_t protocol_id; +}; + +struct spdk_scsi_lun_desc { + struct spdk_scsi_lun *lun; + spdk_scsi_lun_remove_cb_t hotremove_cb; + void *hotremove_ctx; + TAILQ_ENTRY(spdk_scsi_lun_desc) link; +}; + +struct spdk_scsi_lun { + /** LUN id for this logical unit. */ + int id; + + /** Pointer to the SCSI device containing this LUN. */ + struct spdk_scsi_dev *dev; + + /** The bdev associated with this LUN. */ + struct spdk_bdev *bdev; + + /** Descriptor for opened block device. 
*/ + struct spdk_bdev_desc *bdev_desc; + + /** The thread which opens this LUN. */ + struct spdk_thread *thread; + + /** I/O channel for the bdev associated with this LUN. */ + struct spdk_io_channel *io_channel; + + /** The reference number for this LUN, thus we can correctly free the io_channel */ + uint32_t ref; + + /** Poller to release the resource of the lun when it is hot removed */ + struct spdk_poller *hotremove_poller; + + /** The LUN is removed */ + bool removed; + + /** Callback to be fired when LUN removal is first triggered. */ + void (*hotremove_cb)(const struct spdk_scsi_lun *lun, void *arg); + + /** Argument for hotremove_cb */ + void *hotremove_ctx; + + /** Registrant head for I_T nexus */ + TAILQ_HEAD(, spdk_scsi_pr_registrant) reg_head; + /** Persistent Reservation Generation */ + uint32_t pr_generation; + /** Reservation for the LUN */ + struct spdk_scsi_pr_reservation reservation; + /** Reservation holder for SPC2 RESERVE(6) and RESERVE(10) */ + struct spdk_scsi_pr_registrant scsi2_holder; + + /** List of open descriptors for this LUN. */ + TAILQ_HEAD(, spdk_scsi_lun_desc) open_descs; + + /** submitted tasks */ + TAILQ_HEAD(tasks, spdk_scsi_task) tasks; + + /** pending tasks */ + TAILQ_HEAD(pending_tasks, spdk_scsi_task) pending_tasks; + + /** submitted management tasks */ + TAILQ_HEAD(mgmt_tasks, spdk_scsi_task) mgmt_tasks; + + /** pending management tasks */ + TAILQ_HEAD(pending_mgmt_tasks, spdk_scsi_task) pending_mgmt_tasks; + + /** poller to check completion of tasks prior to reset */ + struct spdk_poller *reset_poller; +}; + +struct spdk_scsi_lun *scsi_lun_construct(struct spdk_bdev *bdev, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx); +void scsi_lun_destruct(struct spdk_scsi_lun *lun); + +void scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +void scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +bool scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun, + const struct spdk_scsi_port *initiator_port); +void scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +void scsi_lun_complete_reset_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +bool scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun, + const struct spdk_scsi_port *initiator_port); +int scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun); +void scsi_lun_free_io_channel(struct spdk_scsi_lun *lun); + +struct spdk_scsi_dev *scsi_dev_get_list(void); + +int scsi_port_construct(struct spdk_scsi_port *port, uint64_t id, + uint16_t index, const char *name); +void scsi_port_destruct(struct spdk_scsi_port *port); + +int bdev_scsi_execute(struct spdk_scsi_task *task); +void bdev_scsi_reset(struct spdk_scsi_task *task); + +bool bdev_scsi_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_scsi_task *task, + struct spdk_dif_ctx *dif_ctx); + +int scsi_pr_out(struct spdk_scsi_task *task, uint8_t *cdb, uint8_t *data, uint16_t data_len); +int scsi_pr_in(struct spdk_scsi_task *task, uint8_t *cdb, uint8_t *data, uint16_t data_len); +int scsi_pr_check(struct spdk_scsi_task *task); + +int scsi2_reserve(struct spdk_scsi_task *task, uint8_t *cdb); +int scsi2_release(struct spdk_scsi_task *task); +int scsi2_reserve_check(struct spdk_scsi_task *task); + +struct spdk_scsi_globals { + pthread_mutex_t mutex; +}; + +extern struct spdk_scsi_globals g_scsi; + +#endif /* SPDK_SCSI_INTERNAL_H */ diff --git a/src/spdk/lib/scsi/scsi_pr.c b/src/spdk/lib/scsi/scsi_pr.c new file 
mode 100644 index 000000000..4e17cc2c6 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_pr.c @@ -0,0 +1,1067 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +#include "spdk/endian.h" + +/* Get registrant by I_T nexus */ +static struct spdk_scsi_pr_registrant * +scsi_pr_get_registrant(struct spdk_scsi_lun *lun, + struct spdk_scsi_port *initiator_port, + struct spdk_scsi_port *target_port) +{ + struct spdk_scsi_pr_registrant *reg, *tmp; + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + if (initiator_port == reg->initiator_port && + target_port == reg->target_port) { + return reg; + } + } + + return NULL; +} + +static bool +scsi2_it_nexus_is_holder(struct spdk_scsi_lun *lun, + struct spdk_scsi_port *initiator_port, + struct spdk_scsi_port *target_port) +{ + struct spdk_scsi_pr_registrant *reg = lun->reservation.holder; + + assert(reg != NULL); + + if ((reg->initiator_port == initiator_port) && + (reg->target_port == target_port)) { + return true; + } + + return false; +} + +/* Reservation type is all registrants or not */ +static inline bool +scsi_pr_is_all_registrants_type(struct spdk_scsi_lun *lun) +{ + return (lun->reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS || + lun->reservation.rtype == SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS); +} + +/* Registrant is reservation holder or not */ +static inline bool +scsi_pr_registrant_is_holder(struct spdk_scsi_lun *lun, + struct spdk_scsi_pr_registrant *reg) +{ + if (scsi_pr_is_all_registrants_type(lun)) { + return true; + } + + return (lun->reservation.holder == reg); +} + +/* LUN holds a reservation or not */ +static inline bool +scsi_pr_has_reservation(struct spdk_scsi_lun *lun) +{ + return !(lun->reservation.holder == NULL); +} + +static int +scsi_pr_register_registrant(struct spdk_scsi_lun *lun, + struct spdk_scsi_port *initiator_port, + struct spdk_scsi_port *target_port, + uint64_t sa_rkey) +{ + struct spdk_scsi_pr_registrant *reg; + + /* Register sa_rkey with 
the I_T nexus */ + reg = calloc(1, sizeof(*reg)); + if (!reg) { + return -ENOMEM; + } + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: new registrant registered " + "with key 0x%"PRIx64"\n", sa_rkey); + + /* New I_T nexus */ + reg->initiator_port = initiator_port; + if (initiator_port) { + snprintf(reg->initiator_port_name, sizeof(reg->initiator_port_name), "%s", + initiator_port->name); + reg->transport_id_len = initiator_port->transport_id_len; + memcpy(reg->transport_id, initiator_port->transport_id, reg->transport_id_len); + } + reg->target_port = target_port; + if (target_port) { + snprintf(reg->target_port_name, sizeof(reg->target_port_name), "%s", + target_port->name); + reg->relative_target_port_id = target_port->index; + } + reg->rkey = sa_rkey; + TAILQ_INSERT_TAIL(&lun->reg_head, reg, link); + lun->pr_generation++; + + return 0; +} + +static void +scsi_pr_release_reservation(struct spdk_scsi_lun *lun, struct spdk_scsi_pr_registrant *reg) +{ + bool all_regs = false; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: release reservation " + "with type %u\n", lun->reservation.rtype); + + /* TODO: Unit Attention */ + all_regs = scsi_pr_is_all_registrants_type(lun); + if (all_regs && !TAILQ_EMPTY(&lun->reg_head)) { + lun->reservation.holder = TAILQ_FIRST(&lun->reg_head); + return; + } + + memset(&lun->reservation, 0, sizeof(struct spdk_scsi_pr_reservation)); +} + +static void +scsi_pr_reserve_reservation(struct spdk_scsi_lun *lun, + enum spdk_scsi_pr_type_code type, + uint64_t rkey, + struct spdk_scsi_pr_registrant *holder) +{ + lun->reservation.rtype = type; + lun->reservation.crkey = rkey; + lun->reservation.holder = holder; +} + +static void +scsi_pr_unregister_registrant(struct spdk_scsi_lun *lun, + struct spdk_scsi_pr_registrant *reg) +{ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: unregister registrant\n"); + + TAILQ_REMOVE(&lun->reg_head, reg, link); + if (scsi_pr_registrant_is_holder(lun, reg)) { + scsi_pr_release_reservation(lun, reg); + } + + free(reg); + lun->pr_generation++; +} + +static void +scsi_pr_replace_registrant_key(struct spdk_scsi_lun *lun, + struct spdk_scsi_pr_registrant *reg, + uint64_t sa_rkey) +{ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: replace with new " + "reservation key 0x%"PRIx64"\n", sa_rkey); + reg->rkey = sa_rkey; + lun->pr_generation++; +} + +static int +scsi_pr_out_reserve(struct spdk_scsi_task *task, + enum spdk_scsi_pr_type_code rtype, uint64_t rkey, + uint8_t spec_i_pt, uint8_t all_tg_pt, uint8_t aptpl) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT RESERVE: rkey 0x%"PRIx64", requested " + "reservation type %u, type %u\n", rkey, rtype, lun->reservation.rtype); + + /* TODO: don't support now */ + if (spec_i_pt || all_tg_pt || aptpl) { + SPDK_ERRLOG("Unspported spec_i_pt/all_tg_pt fields " + "or invalid aptpl field\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; + } + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + /* No registration for the I_T nexus */ + if (!reg) { + SPDK_ERRLOG("No registration\n"); + goto conflict; + } + + /* invalid reservation key */ + if (reg->rkey != rkey) { + SPDK_ERRLOG("Reservation key 0x%"PRIx64" don't match 0x%"PRIx64"\n", + rkey, reg->rkey); + goto conflict; + } + + /* reservation holder already exists */ + if (scsi_pr_has_reservation(lun)) { + if (rtype != 
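/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * The per-LUN persistent reservation state reduces to a registrant list (one
 * entry per I_T nexus, keyed by initiator and target port) plus at most one
 * reservation.  For the two "all registrants" types every registrant counts as
 * a holder; otherwise only reservation.holder does.  The RESERVE service
 * action handled above therefore boils down to this decision (hypothetical
 * helper, mirroring the checks in scsi_pr_out_reserve()):
 */
static inline bool
example_reserve_allowed(const struct spdk_scsi_pr_reservation *res,
                        const struct spdk_scsi_pr_registrant *reg,
                        enum spdk_scsi_pr_type_code rtype, uint64_t rkey)
{
        bool all_regs;

        if (reg == NULL || reg->rkey != rkey) {
                return false;           /* unregistered nexus or wrong key: conflict */
        }
        if (res->holder == NULL) {
                return true;            /* no reservation yet: this nexus may take it */
        }
        all_regs = (res->rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS ||
                    res->rtype == SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS);
        /* Re-reserving is only allowed for the current holder, with the same type. */
        return (all_regs || res->holder == reg) && res->rtype == rtype;
}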
lun->reservation.rtype) { + SPDK_ERRLOG("Reservation type doesn't match\n"); + goto conflict; + } + + if (!scsi_pr_registrant_is_holder(lun, reg)) { + SPDK_ERRLOG("Only 1 holder is allowed for type %u\n", rtype); + goto conflict; + } + } else { + /* current I_T nexus is the first reservation holder */ + scsi_pr_reserve_reservation(lun, rtype, rkey, reg); + } + + return 0; + +conflict: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +static int +scsi_pr_out_register(struct spdk_scsi_task *task, + enum spdk_scsi_pr_out_service_action_code action, + uint64_t rkey, uint64_t sa_rkey, + uint8_t spec_i_pt, uint8_t all_tg_pt, uint8_t aptpl) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + int sc, sk, asc; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT REGISTER: rkey 0x%"PRIx64", " + "sa_key 0x%"PRIx64", reservation type %u\n", rkey, sa_rkey, lun->reservation.rtype); + + /* TODO: don't support now */ + if (spec_i_pt || all_tg_pt || aptpl) { + SPDK_ERRLOG("Unsupported spec_i_pt/all_tg_pt/aptpl field\n"); + sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + goto error_exit; + } + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + /* an unregistered I_T nexus session */ + if (!reg) { + if (rkey && (action == SPDK_SCSI_PR_OUT_REGISTER)) { + SPDK_ERRLOG("Reservation key field is not empty\n"); + sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT; + sk = SPDK_SCSI_SENSE_NO_SENSE; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto error_exit; + } + + if (!sa_rkey) { + /* Do nothing except return GOOD status */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: service action " + "reservation key is zero, do noting\n"); + return 0; + } + /* Add a new registrant for the I_T nexus */ + return scsi_pr_register_registrant(lun, task->initiator_port, + task->target_port, sa_rkey); + } else { + /* a registered I_T nexus */ + if (rkey != reg->rkey && action == SPDK_SCSI_PR_OUT_REGISTER) { + SPDK_ERRLOG("Reservation key 0x%"PRIx64" don't match " + "registrant's key 0x%"PRIx64"\n", rkey, reg->rkey); + sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT; + sk = SPDK_SCSI_SENSE_NO_SENSE; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto error_exit; + } + + if (!sa_rkey) { + /* unregister */ + scsi_pr_unregister_registrant(lun, reg); + } else { + /* replace */ + scsi_pr_replace_registrant_key(lun, reg, sa_rkey); + } + } + + return 0; + +error_exit: + spdk_scsi_task_set_status(task, sc, sk, asc, SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE); + return -EINVAL; +} + +static int +scsi_pr_out_release(struct spdk_scsi_task *task, + enum spdk_scsi_pr_type_code rtype, uint64_t rkey) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + int sk, asc; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT RELEASE: rkey 0x%"PRIx64", " + "reservation type %u\n", rkey, rtype); + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (!reg) { + SPDK_ERRLOG("No registration\n"); + sk = SPDK_SCSI_SENSE_NOT_READY; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto check_condition; + } + + /* no reservation holder */ + if (!scsi_pr_has_reservation(lun)) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "RELEASE: no reservation holder\n"); + return 0; + } + + if (lun->reservation.rtype != rtype || rkey != lun->reservation.crkey) { + sk = 
SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + goto check_condition; + } + + /* I_T nexus is not a persistent reservation holder */ + if (!scsi_pr_registrant_is_holder(lun, reg)) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "RELEASE: current I_T nexus is not holder\n"); + return 0; + } + + scsi_pr_release_reservation(lun, reg); + + return 0; + +check_condition: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, sk, asc, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +static int +scsi_pr_out_clear(struct spdk_scsi_task *task, uint64_t rkey) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg, *tmp; + int sc, sk, asc; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT CLEAR: rkey 0x%"PRIx64"\n", rkey); + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (!reg) { + SPDK_ERRLOG("No registration\n"); + sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + sk = SPDK_SCSI_SENSE_NOT_READY; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto error_exit; + } + + if (rkey != reg->rkey) { + SPDK_ERRLOG("Reservation key 0x%"PRIx64" doesn't match " + "registrant's key 0x%"PRIx64"\n", rkey, reg->rkey); + sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT; + sk = SPDK_SCSI_SENSE_NO_SENSE; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto error_exit; + } + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + scsi_pr_unregister_registrant(lun, reg); + } + + return 0; + +error_exit: + spdk_scsi_task_set_status(task, sc, sk, asc, SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +static void +scsi_pr_remove_all_regs_by_key(struct spdk_scsi_lun *lun, uint64_t sa_rkey) +{ + struct spdk_scsi_pr_registrant *reg, *tmp; + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + if (reg->rkey == sa_rkey) { + scsi_pr_unregister_registrant(lun, reg); + } + } +} + +static void +scsi_pr_remove_all_other_regs(struct spdk_scsi_lun *lun, struct spdk_scsi_pr_registrant *reg) +{ + struct spdk_scsi_pr_registrant *reg_tmp, *reg_tmp2; + + TAILQ_FOREACH_SAFE(reg_tmp, &lun->reg_head, link, reg_tmp2) { + if (reg_tmp != reg) { + scsi_pr_unregister_registrant(lun, reg_tmp); + } + } +} + +static int +scsi_pr_out_preempt(struct spdk_scsi_task *task, + enum spdk_scsi_pr_out_service_action_code action, + enum spdk_scsi_pr_type_code rtype, + uint64_t rkey, uint64_t sa_rkey) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + bool all_regs = false; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT PREEMPT: rkey 0x%"PRIx64", sa_rkey 0x%"PRIx64" " + "action %u, type %u, reservation type %u\n", + rkey, sa_rkey, action, rtype, lun->reservation.rtype); + + /* I_T nexus is not registered */ + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (!reg) { + SPDK_ERRLOG("No registration\n"); + goto conflict; + } + if (rkey != reg->rkey) { + SPDK_ERRLOG("Reservation key 0x%"PRIx64" doesn't match " + "registrant's key 0x%"PRIx64"\n", rkey, reg->rkey); + goto conflict; + } + + /* no persistent reservation */ + if (!scsi_pr_has_reservation(lun)) { + scsi_pr_remove_all_regs_by_key(lun, sa_rkey); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: no persistent reservation\n"); + goto exit; + } + + all_regs = scsi_pr_is_all_registrants_type(lun); + + if (all_regs) { + if (sa_rkey != 0) { + scsi_pr_remove_all_regs_by_key(lun, sa_rkey); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: All registrants type with sa_rkey\n"); + } else { + /* remove all other registrants and release persistent reservation if any */ + 
scsi_pr_remove_all_other_regs(lun, reg); + /* create persistent reservation using new type and scope */ + scsi_pr_reserve_reservation(lun, rtype, 0, reg); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: All registrants type with sa_rkey zeroed\n"); + } + goto exit; + } + + assert(lun->reservation.crkey != 0); + + if (sa_rkey != lun->reservation.crkey) { + if (!sa_rkey) { + SPDK_ERRLOG("Zeroed sa_rkey\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; + } + scsi_pr_remove_all_regs_by_key(lun, sa_rkey); + goto exit; + } + + if (scsi_pr_registrant_is_holder(lun, reg)) { + scsi_pr_reserve_reservation(lun, rtype, rkey, reg); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: preempt itself with type %u\n", rtype); + goto exit; + } + + /* unregister registrants if any */ + scsi_pr_remove_all_regs_by_key(lun, sa_rkey); + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (!reg) { + SPDK_ERRLOG("Current I_T nexus registrant was removed\n"); + goto conflict; + } + + /* preempt the holder */ + scsi_pr_reserve_reservation(lun, rtype, rkey, reg); + +exit: + lun->pr_generation++; + return 0; + +conflict: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +int +scsi_pr_out(struct spdk_scsi_task *task, uint8_t *cdb, + uint8_t *data, uint16_t data_len) +{ + int rc = -1; + uint64_t rkey, sa_rkey; + uint8_t spec_i_pt, all_tg_pt, aptpl; + enum spdk_scsi_pr_out_service_action_code action; + enum spdk_scsi_pr_scope_code scope; + enum spdk_scsi_pr_type_code rtype; + struct spdk_scsi_pr_out_param_list *param = (struct spdk_scsi_pr_out_param_list *)data; + + action = cdb[1] & 0x0f; + scope = (cdb[2] >> 4) & 0x0f; + rtype = cdb[2] & 0x0f; + + rkey = from_be64(¶m->rkey); + sa_rkey = from_be64(¶m->sa_rkey); + aptpl = param->aptpl; + spec_i_pt = param->spec_i_pt; + all_tg_pt = param->all_tg_pt; + + switch (action) { + case SPDK_SCSI_PR_OUT_REGISTER: + case SPDK_SCSI_PR_OUT_REG_AND_IGNORE_KEY: + rc = scsi_pr_out_register(task, action, rkey, sa_rkey, + spec_i_pt, all_tg_pt, aptpl); + break; + case SPDK_SCSI_PR_OUT_RESERVE: + if (scope != SPDK_SCSI_PR_LU_SCOPE) { + goto invalid; + } + rc = scsi_pr_out_reserve(task, rtype, rkey, + spec_i_pt, all_tg_pt, aptpl); + break; + case SPDK_SCSI_PR_OUT_RELEASE: + if (scope != SPDK_SCSI_PR_LU_SCOPE) { + goto invalid; + } + rc = scsi_pr_out_release(task, rtype, rkey); + break; + case SPDK_SCSI_PR_OUT_CLEAR: + rc = scsi_pr_out_clear(task, rkey); + break; + case SPDK_SCSI_PR_OUT_PREEMPT: + if (scope != SPDK_SCSI_PR_LU_SCOPE) { + goto invalid; + } + rc = scsi_pr_out_preempt(task, action, rtype, rkey, sa_rkey); + break; + default: + SPDK_ERRLOG("Invalid service action code %u\n", action); + goto invalid; + } + + return rc; + +invalid: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +static int +scsi_pr_in_read_keys(struct spdk_scsi_task *task, uint8_t *data, + uint16_t data_len) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_in_read_keys_data *keys; + struct spdk_scsi_pr_registrant *reg, *tmp; + uint16_t count = 0; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN READ KEYS\n"); + keys = (struct 
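/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * scsi_pr_out() below relies on struct spdk_scsi_pr_out_param_list to mirror
 * the SPC "PERSISTENT RESERVE OUT parameter list".  Decoded by hand from the
 * raw 24-byte buffer, the same fields sit at the positions sketched here
 * (byte/bit offsets per SPC-4; the authoritative layout is the struct in
 * scsi_spec.h, and "example_" names are hypothetical):
 */
struct example_pr_out_params {
        uint64_t rkey;          /* bytes 0-7:   RESERVATION KEY */
        uint64_t sa_rkey;       /* bytes 8-15:  SERVICE ACTION RESERVATION KEY */
        bool     spec_i_pt;     /* byte 20, bit 3 */
        bool     all_tg_pt;     /* byte 20, bit 2 */
        bool     aptpl;         /* byte 20, bit 0 */
};

static void
example_parse_pr_out(const uint8_t *data /* at least 24 bytes */,
                     struct example_pr_out_params *p)
{
        p->rkey      = from_be64(&data[0]);
        p->sa_rkey   = from_be64(&data[8]);
        p->spec_i_pt = (data[20] >> 3) & 0x1;
        p->all_tg_pt = (data[20] >> 2) & 0x1;
        p->aptpl     = data[20] & 0x1;
}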
spdk_scsi_pr_in_read_keys_data *)data; + + to_be32(&keys->header.pr_generation, lun->pr_generation); + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + if (((count + 1) * 8 + sizeof(keys->header)) > data_len) { + break; + } + to_be64(&keys->rkeys[count], reg->rkey); + count++; + } + to_be32(&keys->header.additional_len, count * 8); + + return (sizeof(keys->header) + count * 8); +} + +static int +scsi_pr_in_read_reservations(struct spdk_scsi_task *task, + uint8_t *data, uint16_t data_len) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_in_read_reservations_data *param; + bool all_regs = false; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN READ RESERVATIONS\n"); + param = (struct spdk_scsi_pr_in_read_reservations_data *)(data); + + to_be32(¶m->header.pr_generation, lun->pr_generation); + if (scsi_pr_has_reservation(lun)) { + all_regs = scsi_pr_is_all_registrants_type(lun); + if (all_regs) { + to_be64(¶m->rkey, 0); + } else { + to_be64(¶m->rkey, lun->reservation.crkey); + } + to_be32(¶m->header.additional_len, 16); + param->scope = SPDK_SCSI_PR_LU_SCOPE; + param->type = lun->reservation.rtype; + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "READ RESERVATIONS with valid reservation\n"); + return sizeof(*param); + } + + /* no reservation */ + to_be32(¶m->header.additional_len, 0); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "READ RESERVATIONS no reservation\n"); + return sizeof(param->header); +} + +static int +scsi_pr_in_report_capabilities(struct spdk_scsi_task *task, + uint8_t *data, uint16_t data_len) +{ + struct spdk_scsi_pr_in_report_capabilities_data *param; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN REPORT CAPABILITIES\n"); + param = (struct spdk_scsi_pr_in_report_capabilities_data *)data; + + memset(param, 0, sizeof(*param)); + to_be16(¶m->length, sizeof(*param)); + /* Compatible reservation handling to support RESERVE/RELEASE defined in SPC-2 */ + param->crh = 1; + param->tmv = 1; + param->wr_ex = 1; + param->ex_ac = 1; + param->wr_ex_ro = 1; + param->ex_ac_ro = 1; + param->wr_ex_ar = 1; + param->ex_ac_ar = 1; + + return sizeof(*param); +} + +static int +scsi_pr_in_read_full_status(struct spdk_scsi_task *task, + uint8_t *data, uint16_t data_len) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_in_full_status_data *param; + struct spdk_scsi_pr_in_full_status_desc *desc; + struct spdk_scsi_pr_registrant *reg, *tmp; + bool all_regs = false; + uint32_t add_len = 0; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN READ FULL STATUS\n"); + + all_regs = scsi_pr_is_all_registrants_type(lun); + param = (struct spdk_scsi_pr_in_full_status_data *)data; + to_be32(¶m->header.pr_generation, lun->pr_generation); + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + desc = (struct spdk_scsi_pr_in_full_status_desc *) + ((uint8_t *)param->desc_list + add_len); + if (add_len + sizeof(*desc) + sizeof(param->header) > data_len) { + break; + } + add_len += sizeof(*desc); + desc->rkey = reg->rkey; + if (all_regs || lun->reservation.holder == reg) { + desc->r_holder = true; + desc->type = lun->reservation.rtype; + } else { + desc->r_holder = false; + desc->type = 0; + } + desc->all_tg_pt = 0; + desc->scope = SPDK_SCSI_PR_LU_SCOPE; + desc->relative_target_port_id = reg->relative_target_port_id; + if (add_len + reg->transport_id_len + sizeof(param->header) > data_len) { + break; + } + add_len += reg->transport_id_len; + memcpy(&desc->transport_id, reg->transport_id, reg->transport_id_len); + to_be32(&desc->desc_len, reg->transport_id_len); + } + to_be32(¶m->header.additional_len, add_len); + + return 
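/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * Every PERSISTENT RESERVE IN response built in these functions starts with
 * the same 8-byte header: a 4-byte PRGENERATION counter followed by a 4-byte
 * ADDITIONAL LENGTH that counts only the payload after the header.  For READ
 * KEYS the payload is simply an array of 8-byte reservation keys, so an
 * initiator-side decoder could look like this (hypothetical helper, shown for
 * orientation only):
 */
static int
example_decode_read_keys(const uint8_t *resp, size_t resp_len,
                         uint64_t *keys, size_t max_keys)
{
        uint32_t add_len, nkeys, i;

        if (resp_len < 8) {
                return -EINVAL;                 /* header missing */
        }
        add_len = from_be32(&resp[4]);          /* bytes following the 8-byte header */
        nkeys = add_len / 8;
        for (i = 0; i < nkeys && i < max_keys; i++) {
                keys[i] = from_be64(&resp[8 + i * 8]);
        }
        return (int)i;                          /* number of keys decoded */
}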
(sizeof(param->header) + add_len); +} + +int +scsi_pr_in(struct spdk_scsi_task *task, uint8_t *cdb, + uint8_t *data, uint16_t data_len) +{ + enum spdk_scsi_pr_in_action_code action; + int rc = 0; + + action = cdb[1] & 0x1f; + if (data_len < sizeof(struct spdk_scsi_pr_in_read_header)) { + goto invalid; + } + + switch (action) { + case SPDK_SCSI_PR_IN_READ_KEYS: + rc = scsi_pr_in_read_keys(task, data, data_len); + break; + case SPDK_SCSI_PR_IN_READ_RESERVATION: + if (data_len < sizeof(struct spdk_scsi_pr_in_read_reservations_data)) { + goto invalid; + } + rc = scsi_pr_in_read_reservations(task, data, data_len); + break; + case SPDK_SCSI_PR_IN_REPORT_CAPABILITIES: + rc = scsi_pr_in_report_capabilities(task, data, data_len); + break; + case SPDK_SCSI_PR_IN_READ_FULL_STATUS: + rc = scsi_pr_in_read_full_status(task, data, data_len); + break; + default: + goto invalid; + } + + return rc; + +invalid: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +int +scsi_pr_check(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + uint8_t *cdb = task->cdb; + enum spdk_scsi_pr_type_code rtype; + enum spdk_scsi_pr_out_service_action_code action; + struct spdk_scsi_pr_registrant *reg; + bool dma_to_device = false; + + /* no reservation holders */ + if (!scsi_pr_has_reservation(lun)) { + return 0; + } + + rtype = lun->reservation.rtype; + assert(rtype != 0); + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + /* current I_T nexus hold the reservation */ + if (scsi_pr_registrant_is_holder(lun, reg)) { + return 0; + } + + /* reservation is held by other I_T nexus */ + switch (cdb[0]) { + case SPDK_SPC_INQUIRY: + case SPDK_SPC_REPORT_LUNS: + case SPDK_SPC_REQUEST_SENSE: + case SPDK_SPC_LOG_SENSE: + case SPDK_SPC_TEST_UNIT_READY: + case SPDK_SBC_START_STOP_UNIT: + case SPDK_SBC_READ_CAPACITY_10: + case SPDK_SPC_PERSISTENT_RESERVE_IN: + case SPDK_SPC_SERVICE_ACTION_IN_16: + /* CRH enabled, processed by scsi2_reserve() */ + case SPDK_SPC2_RESERVE_6: + case SPDK_SPC2_RESERVE_10: + /* CRH enabled, processed by scsi2_release() */ + case SPDK_SPC2_RELEASE_6: + case SPDK_SPC2_RELEASE_10: + return 0; + case SPDK_SPC_MODE_SELECT_6: + case SPDK_SPC_MODE_SELECT_10: + case SPDK_SPC_MODE_SENSE_6: + case SPDK_SPC_MODE_SENSE_10: + case SPDK_SPC_LOG_SELECT: + /* I_T nexus is registrant but not holder */ + if (!reg) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "CHECK: current I_T nexus " + "is not registered, cdb 0x%x\n", cdb[0]); + goto conflict; + } + return 0; + case SPDK_SPC_PERSISTENT_RESERVE_OUT: + action = cdb[1] & 0x1f; + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "CHECK: PR OUT action %u\n", action); + switch (action) { + case SPDK_SCSI_PR_OUT_RELEASE: + case SPDK_SCSI_PR_OUT_CLEAR: + case SPDK_SCSI_PR_OUT_PREEMPT: + case SPDK_SCSI_PR_OUT_PREEMPT_AND_ABORT: + if (!reg) { + SPDK_ERRLOG("CHECK: PR OUT action %u\n", action); + goto conflict; + } + return 0; + case SPDK_SCSI_PR_OUT_REGISTER: + case SPDK_SCSI_PR_OUT_REG_AND_IGNORE_KEY: + return 0; + case SPDK_SCSI_PR_OUT_REG_AND_MOVE: + SPDK_ERRLOG("CHECK: PR OUT action %u\n", action); + goto conflict; + default: + SPDK_ERRLOG("CHECK: PR OUT invalid action %u\n", action); + goto conflict; + } + + /* For most SBC R/W commands */ + default: + break; + } + + switch (cdb[0]) { + case SPDK_SBC_READ_6: + case SPDK_SBC_READ_10: + case SPDK_SBC_READ_12: + case SPDK_SBC_READ_16: + break; + case 
SPDK_SBC_WRITE_6: + case SPDK_SBC_WRITE_10: + case SPDK_SBC_WRITE_12: + case SPDK_SBC_WRITE_16: + case SPDK_SBC_UNMAP: + case SPDK_SBC_SYNCHRONIZE_CACHE_10: + case SPDK_SBC_SYNCHRONIZE_CACHE_16: + dma_to_device = true; + break; + default: + SPDK_ERRLOG("CHECK: unsupported SCSI command cdb 0x%x\n", cdb[0]); + goto conflict; + } + + switch (rtype) { + case SPDK_SCSI_PR_WRITE_EXCLUSIVE: + if (dma_to_device) { + SPDK_ERRLOG("CHECK: Write Exclusive reservation type " + "rejects command 0x%x\n", cdb[0]); + goto conflict; + } + break; + case SPDK_SCSI_PR_EXCLUSIVE_ACCESS: + SPDK_ERRLOG("CHECK: Exclusive Access reservation type " + "rejects command 0x%x\n", cdb[0]); + goto conflict; + case SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY: + case SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS: + if (!reg && dma_to_device) { + SPDK_ERRLOG("CHECK: Registrants only reservation " + "type reject command 0x%x\n", cdb[0]); + goto conflict; + } + break; + case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_REGS_ONLY: + case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS: + if (!reg) { + SPDK_ERRLOG("CHECK: All Registrants reservation " + "type reject command 0x%x\n", cdb[0]); + goto conflict; + } + break; + default: + break; + } + + return 0; + +conflict: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} + +static int +scsi2_check_reservation_conflict(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + bool conflict = false; + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (reg) { + /* + * From spc4r31 5.9.3 Exceptions to SPC-2 RESERVE and RELEASE + * behavior + * + * A RESERVE(6) or RESERVE(10) command shall complete with GOOD + * status, but no reservation shall be established and the + * persistent reservation shall not be changed, if the command + * is received from a) and b) below. + * + * A RELEASE(6) or RELEASE(10) command shall complete with GOOD + * status, but the persistent reservation shall not be released, + * if the command is received from a) and b) + * + * a) An I_T nexus that is a persistent reservation holder; or + * b) An I_T nexus that is registered if a registrants only or + * all registrants type persistent reservation is present. + * + * In all other cases, a RESERVE(6) command, RESERVE(10) command, + * RELEASE(6) command, or RELEASE(10) command shall be processed + * as defined in SPC-2. + */ + if (scsi_pr_registrant_is_holder(lun, reg)) { + return 1; + } + + if (lun->reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY || + lun->reservation.rtype == SPDK_SCSI_PR_EXCLUSIVE_ACCESS_REGS_ONLY) { + return 1; + } + + conflict = true; + } else { + /* + * From spc2r20 5.5.1 Reservations overview: + * + * If a logical unit has executed a PERSISTENT RESERVE OUT + * command with the REGISTER or the REGISTER AND IGNORE + * EXISTING KEY service action and is still registered by any + * initiator, all RESERVE commands and all RELEASE commands + * regardless of initiator shall conflict and shall terminate + * with a RESERVATION CONFLICT status. + */ + conflict = TAILQ_EMPTY(&lun->reg_head) ? 
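/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * The reservation-type switch below implements the SPC conflict rules for
 * commands arriving from an I_T nexus that is NOT the reservation holder:
 * WRITE EXCLUSIVE lets reads through and rejects media-modifying commands,
 * EXCLUSIVE ACCESS rejects everything, and the REGISTRANTS ONLY / ALL
 * REGISTRANTS variants treat registered nexuses like holders.  Condensed into
 * one predicate (true means RESERVATION CONFLICT; hypothetical helper):
 */
static inline bool
example_pr_conflict(enum spdk_scsi_pr_type_code rtype, bool is_registrant,
                    bool modifies_medium)
{
        switch (rtype) {
        case SPDK_SCSI_PR_WRITE_EXCLUSIVE:
                return modifies_medium;
        case SPDK_SCSI_PR_EXCLUSIVE_ACCESS:
                return true;
        case SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY:
        case SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS:
                return !is_registrant && modifies_medium;
        case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_REGS_ONLY:
        case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS:
                return !is_registrant;
        default:
                return false;
        }
}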
false : true; + } + + if (conflict) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + return 0; +} + +int +scsi2_reserve(struct spdk_scsi_task *task, uint8_t *cdb) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg = &lun->scsi2_holder; + int ret; + + /* Obsolete Bits and LongID set, returning ILLEGAL_REQUEST */ + if (cdb[1] & 0x3) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + ret = scsi2_check_reservation_conflict(task); + /* PERSISTENT RESERVE is enabled */ + if (ret == 1) { + return 0; + } else if (ret < 0) { + return ret; + } + + /* SPC2 RESERVE */ + reg->initiator_port = task->initiator_port; + if (task->initiator_port) { + snprintf(reg->initiator_port_name, sizeof(reg->initiator_port_name), "%s", + task->initiator_port->name); + reg->transport_id_len = task->initiator_port->transport_id_len; + memcpy(reg->transport_id, task->initiator_port->transport_id, + reg->transport_id_len); + } + reg->target_port = task->target_port; + if (task->target_port) { + snprintf(reg->target_port_name, sizeof(reg->target_port_name), "%s", + task->target_port->name); + } + + lun->reservation.flags = SCSI_SPC2_RESERVE; + lun->reservation.holder = &lun->scsi2_holder; + + return 0; +} + +int +scsi2_release(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + int ret; + + ret = scsi2_check_reservation_conflict(task); + /* PERSISTENT RESERVE is enabled */ + if (ret == 1) { + return 0; + } else if (ret < 0) { + return ret; + } + + assert(lun->reservation.flags & SCSI_SPC2_RESERVE); + + memset(&lun->reservation, 0, sizeof(struct spdk_scsi_pr_reservation)); + memset(&lun->scsi2_holder, 0, sizeof(struct spdk_scsi_pr_registrant)); + + return 0; +} + +int scsi2_reserve_check(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + uint8_t *cdb = task->cdb; + + switch (cdb[0]) { + case SPDK_SPC_INQUIRY: + case SPDK_SPC2_RELEASE_6: + case SPDK_SPC2_RELEASE_10: + return 0; + + default: + break; + } + + /* no reservation holders */ + if (!scsi_pr_has_reservation(lun)) { + return 0; + } + + if (scsi2_it_nexus_is_holder(lun, task->initiator_port, task->target_port)) { + return 0; + } + + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} diff --git a/src/spdk/lib/scsi/scsi_rpc.c b/src/spdk/lib/scsi/scsi_rpc.c new file mode 100644 index 000000000..1938ddac7 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_rpc.c @@ -0,0 +1,77 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" + +static void +rpc_scsi_get_devices(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_scsi_dev *devs = scsi_dev_get_list(); + int i; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "scsi_get_devices requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + for (i = 0; i < SPDK_SCSI_MAX_DEVS; i++) { + struct spdk_scsi_dev *dev = &devs[i]; + + if (!dev->is_allocated) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "id", dev->id); + + spdk_json_write_named_string(w, "device_name", dev->name); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("scsi_get_devices", rpc_scsi_get_devices, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(scsi_get_devices, get_scsi_devices) diff --git a/src/spdk/lib/scsi/spdk_scsi.map b/src/spdk/lib/scsi/spdk_scsi.map new file mode 100644 index 000000000..643372699 --- /dev/null +++ b/src/spdk/lib/scsi/spdk_scsi.map @@ -0,0 +1,49 @@ +{ + global: + + # Public functions + spdk_scsi_init; + spdk_scsi_fini; + spdk_scsi_lun_get_id; + spdk_scsi_lun_get_bdev_name; + spdk_scsi_lun_get_dev; + spdk_scsi_lun_is_removing; + spdk_scsi_dev_get_name; + spdk_scsi_dev_get_id; + spdk_scsi_dev_get_lun; + spdk_scsi_dev_has_pending_tasks; + spdk_scsi_dev_destruct; + spdk_scsi_dev_queue_mgmt_task; + spdk_scsi_dev_queue_task; + spdk_scsi_dev_add_port; + spdk_scsi_dev_delete_port; + spdk_scsi_dev_find_port_by_id; + spdk_scsi_dev_allocate_io_channels; + spdk_scsi_dev_free_io_channels; + spdk_scsi_dev_construct; + spdk_scsi_dev_delete_lun; + spdk_scsi_dev_add_lun; + spdk_scsi_port_create; + spdk_scsi_port_free; + spdk_scsi_port_get_name; + spdk_scsi_task_construct; + spdk_scsi_task_put; + spdk_scsi_task_set_data; + spdk_scsi_task_scatter_data; + spdk_scsi_task_gather_data; + spdk_scsi_task_build_sense_data; + spdk_scsi_task_set_status; + spdk_scsi_task_copy_status; + spdk_scsi_task_process_null_lun; + spdk_scsi_task_process_abort; + spdk_scsi_lun_open; + spdk_scsi_lun_close; + spdk_scsi_lun_allocate_io_channel; + spdk_scsi_lun_free_io_channel; + spdk_scsi_lun_get_dif_ctx; + spdk_scsi_port_set_iscsi_transport_id; + spdk_scsi_lun_id_int_to_fmt; + spdk_scsi_lun_id_fmt_to_int; + + local: *; +}; diff --git 
a/src/spdk/lib/scsi/task.c b/src/spdk/lib/scsi/task.c new file mode 100644 index 000000000..7fd8305ec --- /dev/null +++ b/src/spdk/lib/scsi/task.c @@ -0,0 +1,300 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "scsi_internal.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/util.h" + +static void +scsi_task_free_data(struct spdk_scsi_task *task) +{ + if (task->alloc_len != 0) { + spdk_dma_free(task->iov.iov_base); + task->alloc_len = 0; + } + + task->iov.iov_base = NULL; + task->iov.iov_len = 0; +} + +void +spdk_scsi_task_put(struct spdk_scsi_task *task) +{ + if (!task) { + return; + } + + assert(task->ref > 0); + task->ref--; + + if (task->ref == 0) { + struct spdk_bdev_io *bdev_io = task->bdev_io; + + if (bdev_io) { + spdk_bdev_free_io(bdev_io); + } + + scsi_task_free_data(task); + + task->free_fn(task); + } +} + +void +spdk_scsi_task_construct(struct spdk_scsi_task *task, + spdk_scsi_task_cpl cpl_fn, + spdk_scsi_task_free free_fn) +{ + assert(task != NULL); + assert(cpl_fn != NULL); + assert(free_fn != NULL); + + task->cpl_fn = cpl_fn; + task->free_fn = free_fn; + + task->ref++; + + /* + * Pre-fill the iov_buffers to point to the embedded iov + */ + assert(task->iov.iov_base == NULL); + task->iovs = &task->iov; + task->iovcnt = 1; +} + +static void * +scsi_task_alloc_data(struct spdk_scsi_task *task, uint32_t alloc_len) +{ + assert(task->alloc_len == 0); + + task->iov.iov_base = spdk_dma_zmalloc(alloc_len, 0, NULL); + task->iov.iov_len = alloc_len; + task->alloc_len = alloc_len; + + return task->iov.iov_base; +} + +int +spdk_scsi_task_scatter_data(struct spdk_scsi_task *task, const void *src, size_t buf_len) +{ + size_t len = 0; + size_t buf_left = buf_len; + int i; + struct iovec *iovs = task->iovs; + const uint8_t *pos; + + if (buf_len == 0) { + return 0; + } + + if (task->iovcnt == 1 && iovs[0].iov_base == NULL) { + scsi_task_alloc_data(task, buf_len); + iovs[0] = task->iov; + } + + for (i = 0; i < task->iovcnt; i++) { + assert(iovs[i].iov_base != NULL); + len += iovs[i].iov_len; + } + + if (len < buf_len) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + pos = src; + + for (i = 0; i < task->iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_left); + buf_left -= len; + memcpy(iovs[i].iov_base, pos, len); + pos += len; + } + + return buf_len; +} + +void * +spdk_scsi_task_gather_data(struct spdk_scsi_task *task, int *len) +{ + int i; + struct iovec *iovs = task->iovs; + size_t buf_len = 0; + uint8_t *buf, *pos; + + for (i = 0; i < task->iovcnt; i++) { + assert(iovs[i].iov_base != NULL); + buf_len += iovs[i].iov_len; + } + + if (buf_len == 0) { + *len = 0; + return NULL; + } + + buf = calloc(1, buf_len); + if (buf == NULL) { + *len = -1; + return NULL; + } + + pos = buf; + for (i = 0; i < task->iovcnt; i++) { + memcpy(pos, iovs[i].iov_base, iovs[i].iov_len); + pos += iovs[i].iov_len; + } + + *len = buf_len; + return buf; +} + +void +spdk_scsi_task_set_data(struct spdk_scsi_task *task, void *data, uint32_t len) +{ + assert(task->iovcnt == 1); + assert(task->alloc_len == 0); + + task->iovs[0].iov_base = data; + task->iovs[0].iov_len = len; +} + +void +spdk_scsi_task_build_sense_data(struct spdk_scsi_task *task, int sk, int asc, int ascq) +{ + uint8_t *cp; + int resp_code; + + resp_code = 0x70; /* Current + Fixed format */ + + /* Sense Data */ + cp = task->sense_data; + + /* VALID(7) RESPONSE CODE(6-0) */ + cp[0] = 0x80 | resp_code; + /* Obsolete */ + cp[1] = 0; + /* FILEMARK(7) EOM(6) ILI(5) SENSE KEY(3-0) */ + cp[2] = sk & 0xf; + /* INFORMATION */ + memset(&cp[3], 0, 4); + + /* ADDITIONAL SENSE LENGTH */ + 
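 /* Fixed-format sense data is 18 bytes in total here; the ADDITIONAL SENSE
  * LENGTH field counts the bytes that follow byte 7, hence 18 - 8 = 10. */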
cp[7] = 10; + + /* COMMAND-SPECIFIC INFORMATION */ + memset(&cp[8], 0, 4); + /* ADDITIONAL SENSE CODE */ + cp[12] = asc; + /* ADDITIONAL SENSE CODE QUALIFIER */ + cp[13] = ascq; + /* FIELD REPLACEABLE UNIT CODE */ + cp[14] = 0; + + /* SKSV(7) SENSE KEY SPECIFIC(6-0,7-0,7-0) */ + cp[15] = 0; + cp[16] = 0; + cp[17] = 0; + + /* SenseLength */ + task->sense_data_len = 18; +} + +void +spdk_scsi_task_set_status(struct spdk_scsi_task *task, int sc, int sk, + int asc, int ascq) +{ + if (sc == SPDK_SCSI_STATUS_CHECK_CONDITION) { + spdk_scsi_task_build_sense_data(task, sk, asc, ascq); + } + task->status = sc; +} + +void +spdk_scsi_task_copy_status(struct spdk_scsi_task *dst, + struct spdk_scsi_task *src) +{ + memcpy(dst->sense_data, src->sense_data, src->sense_data_len); + dst->sense_data_len = src->sense_data_len; + dst->status = src->status; +} + +void +spdk_scsi_task_process_null_lun(struct spdk_scsi_task *task) +{ + uint8_t buffer[36]; + uint32_t allocation_len; + uint32_t data_len; + + task->length = task->transfer_len; + if (task->cdb[0] == SPDK_SPC_INQUIRY) { + /* + * SPC-4 states that INQUIRY commands to an unsupported LUN + * must be served with PERIPHERAL QUALIFIER = 0x3 and + * PERIPHERAL DEVICE TYPE = 0x1F. + */ + data_len = sizeof(buffer); + + memset(buffer, 0, data_len); + /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */ + buffer[0] = 0x03 << 5 | 0x1f; + /* ADDITIONAL LENGTH */ + buffer[4] = data_len - 5; + + allocation_len = from_be16(&task->cdb[3]); + if (spdk_scsi_task_scatter_data(task, buffer, spdk_min(allocation_len, data_len)) >= 0) { + task->data_transferred = data_len; + task->status = SPDK_SCSI_STATUS_GOOD; + } + } else { + /* LOGICAL UNIT NOT SUPPORTED */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_SUPPORTED, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + task->data_transferred = 0; + } +} + +void +spdk_scsi_task_process_abort(struct spdk_scsi_task *task) +{ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ABORTED_COMMAND, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); +} diff --git a/src/spdk/lib/sock/Makefile b/src/spdk/lib/sock/Makefile new file mode 100644 index 000000000..82fe41e90 --- /dev/null +++ b/src/spdk/lib/sock/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
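
Editor's note: a hedged sketch of the task helper API defined in task.c above, showing the intended lifecycle of a small device-to-initiator payload. The callbacks and the 6-byte buffer are illustrative; only the spdk_scsi_task_* calls come from the file itself.

    #include "spdk/stdinc.h"
    #include "spdk/scsi.h"
    #include "spdk/scsi_spec.h"

    static void
    example_task_cpl(struct spdk_scsi_task *task)
    {
            /* Completion hook: status (and sense data, if any) are already set. */
    }

    static void
    example_task_free(struct spdk_scsi_task *task)
    {
            /* Invoked by spdk_scsi_task_put() once the reference count hits zero. */
            free(task);
    }

    static void
    example_task_lifecycle(void)
    {
            struct spdk_scsi_task *task = calloc(1, sizeof(*task));
            uint8_t response[6] = { 0 };

            if (task == NULL) {
                    return;
            }

            /* Takes the initial reference and points task->iovs at the embedded iov. */
            spdk_scsi_task_construct(task, example_task_cpl, example_task_free);

            /* No iov was supplied, so a DMA-able buffer is allocated and filled. */
            spdk_scsi_task_scatter_data(task, response, sizeof(response));

            spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_GOOD, 0, 0, 0);

            /* Drops the reference: frees the scattered buffer, then calls example_task_free(). */
            spdk_scsi_task_put(task);
    }
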
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 1 + +C_SRCS = sock.c net_framework.c sock_rpc.c + +LIBNAME = sock + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_sock.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/sock/net_framework.c b/src/spdk/lib/sock/net_framework.c new file mode 100644 index 000000000..45d52d162 --- /dev/null +++ b/src/spdk/lib/sock/net_framework.c @@ -0,0 +1,107 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/log.h" +#include "spdk/net.h" +#include "spdk/queue.h" + +static STAILQ_HEAD(, spdk_net_framework) g_net_frameworks = + STAILQ_HEAD_INITIALIZER(g_net_frameworks); + +static spdk_net_init_cb g_init_cb_fn = NULL; +static void *g_init_cb_arg = NULL; + +static spdk_net_fini_cb g_fini_cb_fn = NULL; +static void *g_fini_cb_arg = NULL; + +struct spdk_net_framework *g_next_net_framework = NULL; + +static inline struct spdk_net_framework * +get_next_net_framework(struct spdk_net_framework *net) +{ + return net ? 
STAILQ_NEXT(net, link) : STAILQ_FIRST(&g_net_frameworks); +} + +void +spdk_net_framework_init_next(int rc) +{ + if (rc) { + SPDK_ERRLOG("Net framework %s failed to initalize with error %d\n", g_next_net_framework->name, rc); + g_init_cb_fn(g_init_cb_arg, rc); + return; + } + + g_next_net_framework = get_next_net_framework(g_next_net_framework); + if (g_next_net_framework == NULL) { + g_init_cb_fn(g_init_cb_arg, 0); + return; + } + + g_next_net_framework->init(); +} + +void +spdk_net_framework_start(spdk_net_init_cb cb_fn, void *cb_arg) +{ + g_init_cb_fn = cb_fn; + g_init_cb_arg = cb_arg; + + spdk_net_framework_init_next(0); +} + +void +spdk_net_framework_fini_next(void) +{ + g_next_net_framework = get_next_net_framework(g_next_net_framework); + if (g_next_net_framework == NULL) { + g_fini_cb_fn(g_fini_cb_arg); + return; + } + + g_next_net_framework->fini(); +} + +void +spdk_net_framework_fini(spdk_net_fini_cb cb_fn, void *cb_arg) +{ + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + spdk_net_framework_fini_next(); +} + +void +spdk_net_framework_register(struct spdk_net_framework *frame) +{ + STAILQ_INSERT_TAIL(&g_net_frameworks, frame, link); +} diff --git a/src/spdk/lib/sock/sock.c b/src/spdk/lib/sock/sock.c new file mode 100644 index 000000000..5ea90385c --- /dev/null +++ b/src/spdk/lib/sock/sock.c @@ -0,0 +1,809 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
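
Editor's note: net_framework.c above walks the registered frameworks one at a time, expecting each init/fini hook to call spdk_net_framework_init_next()/spdk_net_framework_fini_next() when it is done. A hedged sketch of a registration follows; the "example" framework and its callbacks are made up.

    #include "spdk/net.h"

    static void
    example_net_init(void)
    {
            /* ... set up framework-wide resources ... */

            /* Hand control back so the next registered framework can initialize. */
            spdk_net_framework_init_next(0);
    }

    static void
    example_net_fini(void)
    {
            /* ... tear down resources ... */
            spdk_net_framework_fini_next();
    }

    static struct spdk_net_framework g_example_net_framework = {
            .name = "example",
            .init = example_net_init,
            .fini = example_net_fini,
    };

    /* Typically called from a constructor or an explicit setup path. */
    static void
    example_register_net_framework(void)
    {
            spdk_net_framework_register(&g_example_net_framework);
    }
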
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/log.h" +#include "spdk/sock.h" +#include "spdk_internal/sock.h" +#include "spdk/queue.h" + +#define SPDK_SOCK_DEFAULT_PRIORITY 0 +#define SPDK_SOCK_OPTS_FIELD_OK(opts, field) (offsetof(struct spdk_sock_opts, field) + sizeof(opts->field) <= (opts->opts_size)) + +static STAILQ_HEAD(, spdk_net_impl) g_net_impls = STAILQ_HEAD_INITIALIZER(g_net_impls); + +struct spdk_sock_placement_id_entry { + int placement_id; + uint32_t ref; + struct spdk_sock_group *group; + STAILQ_ENTRY(spdk_sock_placement_id_entry) link; +}; + +static STAILQ_HEAD(, spdk_sock_placement_id_entry) g_placement_id_map = STAILQ_HEAD_INITIALIZER( + g_placement_id_map); +static pthread_mutex_t g_map_table_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* Insert a group into the placement map. + * If the group is already in the map, take a reference. + */ +static int +sock_map_insert(int placement_id, struct spdk_sock_group *group) +{ + struct spdk_sock_placement_id_entry *entry; + + pthread_mutex_lock(&g_map_table_mutex); + STAILQ_FOREACH(entry, &g_placement_id_map, link) { + if (placement_id == entry->placement_id) { + /* The mapping already exists, it means that different sockets have + * the same placement_ids. + */ + entry->ref++; + pthread_mutex_unlock(&g_map_table_mutex); + return 0; + } + } + + entry = calloc(1, sizeof(*entry)); + if (!entry) { + SPDK_ERRLOG("Cannot allocate an entry for placement_id=%u\n", placement_id); + pthread_mutex_unlock(&g_map_table_mutex); + return -ENOMEM; + } + + entry->placement_id = placement_id; + entry->group = group; + entry->ref++; + + STAILQ_INSERT_TAIL(&g_placement_id_map, entry, link); + pthread_mutex_unlock(&g_map_table_mutex); + + return 0; +} + +/* Release a reference to the group for a given placement_id. + * If the reference count is 0, remove the group. + */ +static void +sock_map_release(int placement_id) +{ + struct spdk_sock_placement_id_entry *entry; + + pthread_mutex_lock(&g_map_table_mutex); + STAILQ_FOREACH(entry, &g_placement_id_map, link) { + if (placement_id == entry->placement_id) { + assert(entry->ref > 0); + entry->ref--; + break; + } + } + + pthread_mutex_unlock(&g_map_table_mutex); +} + +/* Look up the group for a placement_id. 
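 Writes NULL through *group when no entry exists; spdk_sock_get_optimal_sock_group() relies on this lookup to steer sockets that report the same placement_id onto the same poll group.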
*/ +static void +sock_map_lookup(int placement_id, struct spdk_sock_group **group) +{ + struct spdk_sock_placement_id_entry *entry; + + *group = NULL; + pthread_mutex_lock(&g_map_table_mutex); + STAILQ_FOREACH(entry, &g_placement_id_map, link) { + if (placement_id == entry->placement_id) { + assert(entry->group != NULL); + *group = entry->group; + break; + } + } + pthread_mutex_unlock(&g_map_table_mutex); +} + +/* Remove the socket group from the map table */ +static void +sock_remove_sock_group_from_map_table(struct spdk_sock_group *group) +{ + struct spdk_sock_placement_id_entry *entry, *tmp; + + pthread_mutex_lock(&g_map_table_mutex); + STAILQ_FOREACH_SAFE(entry, &g_placement_id_map, link, tmp) { + if (entry->group == group) { + STAILQ_REMOVE(&g_placement_id_map, entry, spdk_sock_placement_id_entry, link); + free(entry); + } + } + pthread_mutex_unlock(&g_map_table_mutex); + +} + +int +spdk_sock_get_optimal_sock_group(struct spdk_sock *sock, struct spdk_sock_group **group) +{ + int placement_id = 0, rc; + + rc = sock->net_impl->get_placement_id(sock, &placement_id); + if (!rc && (placement_id != 0)) { + sock_map_lookup(placement_id, group); + return 0; + } else { + return -1; + } +} + +int +spdk_sock_getaddr(struct spdk_sock *sock, char *saddr, int slen, uint16_t *sport, + char *caddr, int clen, uint16_t *cport) +{ + return sock->net_impl->getaddr(sock, saddr, slen, sport, caddr, clen, cport); +} + +void +spdk_sock_get_default_opts(struct spdk_sock_opts *opts) +{ + assert(opts); + + if (SPDK_SOCK_OPTS_FIELD_OK(opts, priority)) { + opts->priority = SPDK_SOCK_DEFAULT_PRIORITY; + } +} + +/* + * opts The opts allocated in the current library. + * opts_user The opts passed by the caller. + * */ +static void +sock_init_opts(struct spdk_sock_opts *opts, struct spdk_sock_opts *opts_user) +{ + assert(opts); + assert(opts_user); + + opts->opts_size = sizeof(*opts); + spdk_sock_get_default_opts(opts); + + /* reset the size according to the user */ + opts->opts_size = opts_user->opts_size; + if (SPDK_SOCK_OPTS_FIELD_OK(opts, priority)) { + opts->priority = opts_user->priority; + } +} + +struct spdk_sock * +spdk_sock_connect(const char *ip, int port, char *impl_name) +{ + struct spdk_sock_opts opts; + + opts.opts_size = sizeof(opts); + spdk_sock_get_default_opts(&opts); + return spdk_sock_connect_ext(ip, port, impl_name, &opts); +} + +struct spdk_sock * +spdk_sock_connect_ext(const char *ip, int port, char *impl_name, struct spdk_sock_opts *opts) +{ + struct spdk_net_impl *impl = NULL; + struct spdk_sock *sock; + struct spdk_sock_opts opts_local; + + if (opts == NULL) { + SPDK_ERRLOG("the opts should not be NULL pointer\n"); + return NULL; + } + + STAILQ_FOREACH_FROM(impl, &g_net_impls, link) { + if (impl_name && strncmp(impl_name, impl->name, strlen(impl->name) + 1)) { + continue; + } + + sock_init_opts(&opts_local, opts); + sock = impl->connect(ip, port, &opts_local); + if (sock != NULL) { + /* Copy the contents, both the two structures are the same ABI version */ + memcpy(&sock->opts, &opts_local, sizeof(sock->opts)); + sock->net_impl = impl; + TAILQ_INIT(&sock->queued_reqs); + TAILQ_INIT(&sock->pending_reqs); + return sock; + } + } + + return NULL; +} + +struct spdk_sock * +spdk_sock_listen(const char *ip, int port, char *impl_name) +{ + struct spdk_sock_opts opts; + + opts.opts_size = sizeof(opts); + spdk_sock_get_default_opts(&opts); + return spdk_sock_listen_ext(ip, port, impl_name, &opts); +} + +struct spdk_sock * +spdk_sock_listen_ext(const char *ip, int port, char *impl_name, struct 
spdk_sock_opts *opts) +{ + struct spdk_net_impl *impl = NULL; + struct spdk_sock *sock; + struct spdk_sock_opts opts_local; + + if (opts == NULL) { + SPDK_ERRLOG("the opts should not be NULL pointer\n"); + return NULL; + } + + STAILQ_FOREACH_FROM(impl, &g_net_impls, link) { + if (impl_name && strncmp(impl_name, impl->name, strlen(impl->name) + 1)) { + continue; + } + + sock_init_opts(&opts_local, opts); + sock = impl->listen(ip, port, &opts_local); + if (sock != NULL) { + /* Copy the contents, both the two structures are the same ABI version */ + memcpy(&sock->opts, &opts_local, sizeof(sock->opts)); + sock->net_impl = impl; + /* Don't need to initialize the request queues for listen + * sockets. */ + return sock; + } + } + + return NULL; +} + +struct spdk_sock * +spdk_sock_accept(struct spdk_sock *sock) +{ + struct spdk_sock *new_sock; + + new_sock = sock->net_impl->accept(sock); + if (new_sock != NULL) { + /* Inherit the opts from the "accept sock" */ + new_sock->opts = sock->opts; + memcpy(&new_sock->opts, &sock->opts, sizeof(new_sock->opts)); + new_sock->net_impl = sock->net_impl; + TAILQ_INIT(&new_sock->queued_reqs); + TAILQ_INIT(&new_sock->pending_reqs); + } + + return new_sock; +} + +int +spdk_sock_close(struct spdk_sock **_sock) +{ + struct spdk_sock *sock = *_sock; + int rc; + + if (sock == NULL) { + errno = EBADF; + return -1; + } + + if (sock->cb_fn != NULL) { + /* This sock is still part of a sock_group. */ + errno = EBUSY; + return -1; + } + + sock->flags.closed = true; + + if (sock->cb_cnt > 0) { + /* Let the callback unwind before destroying the socket */ + return 0; + } + + spdk_sock_abort_requests(sock); + + rc = sock->net_impl->close(sock); + if (rc == 0) { + *_sock = NULL; + } + + return rc; +} + +ssize_t +spdk_sock_recv(struct spdk_sock *sock, void *buf, size_t len) +{ + if (sock == NULL) { + errno = EBADF; + return -1; + } + + if (sock->flags.closed) { + errno = EBADF; + return -1; + } + + return sock->net_impl->recv(sock, buf, len); +} + +ssize_t +spdk_sock_readv(struct spdk_sock *sock, struct iovec *iov, int iovcnt) +{ + if (sock == NULL) { + errno = EBADF; + return -1; + } + + if (sock->flags.closed) { + errno = EBADF; + return -1; + } + + return sock->net_impl->readv(sock, iov, iovcnt); +} + +ssize_t +spdk_sock_writev(struct spdk_sock *sock, struct iovec *iov, int iovcnt) +{ + if (sock == NULL) { + errno = EBADF; + return -1; + } + + if (sock->flags.closed) { + errno = EBADF; + return -1; + } + + return sock->net_impl->writev(sock, iov, iovcnt); +} + +void +spdk_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req) +{ + assert(req->cb_fn != NULL); + + if (sock == NULL) { + req->cb_fn(req->cb_arg, -EBADF); + return; + } + + if (sock->flags.closed) { + req->cb_fn(req->cb_arg, -EBADF); + return; + } + + sock->net_impl->writev_async(sock, req); +} + +int +spdk_sock_flush(struct spdk_sock *sock) +{ + if (sock == NULL) { + return -EBADF; + } + + if (sock->flags.closed) { + return -EBADF; + } + + return sock->net_impl->flush(sock); +} + +int +spdk_sock_set_recvlowat(struct spdk_sock *sock, int nbytes) +{ + return sock->net_impl->set_recvlowat(sock, nbytes); +} + +int +spdk_sock_set_recvbuf(struct spdk_sock *sock, int sz) +{ + return sock->net_impl->set_recvbuf(sock, sz); +} + +int +spdk_sock_set_sendbuf(struct spdk_sock *sock, int sz) +{ + return sock->net_impl->set_sendbuf(sock, sz); +} + +bool +spdk_sock_is_ipv6(struct spdk_sock *sock) +{ + return sock->net_impl->is_ipv6(sock); +} + +bool +spdk_sock_is_ipv4(struct spdk_sock *sock) +{ + return 
sock->net_impl->is_ipv4(sock); +} + +bool +spdk_sock_is_connected(struct spdk_sock *sock) +{ + return sock->net_impl->is_connected(sock); +} + +struct spdk_sock_group * +spdk_sock_group_create(void *ctx) +{ + struct spdk_net_impl *impl = NULL; + struct spdk_sock_group *group; + struct spdk_sock_group_impl *group_impl; + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + return NULL; + } + + STAILQ_INIT(&group->group_impls); + + STAILQ_FOREACH_FROM(impl, &g_net_impls, link) { + group_impl = impl->group_impl_create(); + if (group_impl != NULL) { + STAILQ_INSERT_TAIL(&group->group_impls, group_impl, link); + TAILQ_INIT(&group_impl->socks); + group_impl->num_removed_socks = 0; + group_impl->net_impl = impl; + } + } + + group->ctx = ctx; + return group; +} + +void * +spdk_sock_group_get_ctx(struct spdk_sock_group *group) +{ + if (group == NULL) { + return NULL; + } + + return group->ctx; +} + +int +spdk_sock_group_add_sock(struct spdk_sock_group *group, struct spdk_sock *sock, + spdk_sock_cb cb_fn, void *cb_arg) +{ + struct spdk_sock_group_impl *group_impl = NULL; + int rc, placement_id = 0; + + if (cb_fn == NULL) { + errno = EINVAL; + return -1; + } + + if (sock->group_impl != NULL) { + /* + * This sock is already part of a sock_group. Currently we don't + * support this. + */ + errno = EBUSY; + return -1; + } + + rc = sock->net_impl->get_placement_id(sock, &placement_id); + if (!rc && (placement_id != 0)) { + rc = sock_map_insert(placement_id, group); + if (rc < 0) { + return -1; + } + } + + STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) { + if (sock->net_impl == group_impl->net_impl) { + break; + } + } + + if (group_impl == NULL) { + errno = EINVAL; + return -1; + } + + rc = group_impl->net_impl->group_impl_add_sock(group_impl, sock); + if (rc == 0) { + TAILQ_INSERT_TAIL(&group_impl->socks, sock, link); + sock->group_impl = group_impl; + sock->cb_fn = cb_fn; + sock->cb_arg = cb_arg; + } + + return rc; +} + +int +spdk_sock_group_remove_sock(struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_sock_group_impl *group_impl = NULL; + int rc, placement_id = 0; + + STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) { + if (sock->net_impl == group_impl->net_impl) { + break; + } + } + + if (group_impl == NULL) { + errno = EINVAL; + return -1; + } + + assert(group_impl == sock->group_impl); + + rc = sock->net_impl->get_placement_id(sock, &placement_id); + if (!rc && (placement_id != 0)) { + sock_map_release(placement_id); + } + + rc = group_impl->net_impl->group_impl_remove_sock(group_impl, sock); + if (rc == 0) { + TAILQ_REMOVE(&group_impl->socks, sock, link); + assert(group_impl->num_removed_socks < MAX_EVENTS_PER_POLL); + group_impl->removed_socks[group_impl->num_removed_socks] = (uintptr_t)sock; + group_impl->num_removed_socks++; + sock->group_impl = NULL; + sock->cb_fn = NULL; + sock->cb_arg = NULL; + } + + return rc; +} + +int +spdk_sock_group_poll(struct spdk_sock_group *group) +{ + return spdk_sock_group_poll_count(group, MAX_EVENTS_PER_POLL); +} + +static int +sock_group_impl_poll_count(struct spdk_sock_group_impl *group_impl, + struct spdk_sock_group *group, + int max_events) +{ + struct spdk_sock *socks[MAX_EVENTS_PER_POLL]; + int num_events, i; + + if (TAILQ_EMPTY(&group_impl->socks)) { + return 0; + } + + /* The number of removed sockets should be reset for each call to poll. 
*/ + group_impl->num_removed_socks = 0; + + num_events = group_impl->net_impl->group_impl_poll(group_impl, max_events, socks); + if (num_events == -1) { + return -1; + } + + for (i = 0; i < num_events; i++) { + struct spdk_sock *sock = socks[i]; + int j; + bool valid = true; + for (j = 0; j < group_impl->num_removed_socks; j++) { + if ((uintptr_t)sock == group_impl->removed_socks[j]) { + valid = false; + break; + } + } + + if (valid) { + assert(sock->cb_fn != NULL); + sock->cb_fn(sock->cb_arg, group, sock); + } + } + + return num_events; +} + +int +spdk_sock_group_poll_count(struct spdk_sock_group *group, int max_events) +{ + struct spdk_sock_group_impl *group_impl = NULL; + int rc, num_events = 0; + + if (max_events < 1) { + errno = -EINVAL; + return -1; + } + + /* + * Only poll for up to 32 events at a time - if more events are pending, + * the next call to this function will reap them. + */ + if (max_events > MAX_EVENTS_PER_POLL) { + max_events = MAX_EVENTS_PER_POLL; + } + + STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) { + rc = sock_group_impl_poll_count(group_impl, group, max_events); + if (rc < 0) { + num_events = -1; + SPDK_ERRLOG("group_impl_poll_count for net(%s) failed\n", + group_impl->net_impl->name); + } else if (num_events >= 0) { + num_events += rc; + } + } + + return num_events; +} + +int +spdk_sock_group_close(struct spdk_sock_group **group) +{ + struct spdk_sock_group_impl *group_impl = NULL, *tmp; + int rc; + + if (*group == NULL) { + errno = EBADF; + return -1; + } + + STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) { + if (!TAILQ_EMPTY(&group_impl->socks)) { + errno = EBUSY; + return -1; + } + } + + STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) { + rc = group_impl->net_impl->group_impl_close(group_impl); + if (rc != 0) { + SPDK_ERRLOG("group_impl_close for net(%s) failed\n", + group_impl->net_impl->name); + } + } + + sock_remove_sock_group_from_map_table(*group); + free(*group); + *group = NULL; + + return 0; +} + +static inline struct spdk_net_impl * +sock_get_impl_by_name(const char *impl_name) +{ + struct spdk_net_impl *impl; + + assert(impl_name != NULL); + STAILQ_FOREACH(impl, &g_net_impls, link) { + if (0 == strcmp(impl_name, impl->name)) { + return impl; + } + } + + return NULL; +} + +int +spdk_sock_impl_get_opts(const char *impl_name, struct spdk_sock_impl_opts *opts, size_t *len) +{ + struct spdk_net_impl *impl; + + if (!impl_name || !opts || !len) { + errno = EINVAL; + return -1; + } + + impl = sock_get_impl_by_name(impl_name); + if (!impl) { + errno = EINVAL; + return -1; + } + + if (!impl->get_opts) { + errno = ENOTSUP; + return -1; + } + + return impl->get_opts(opts, len); +} + +int +spdk_sock_impl_set_opts(const char *impl_name, const struct spdk_sock_impl_opts *opts, size_t len) +{ + struct spdk_net_impl *impl; + + if (!impl_name || !opts) { + errno = EINVAL; + return -1; + } + + impl = sock_get_impl_by_name(impl_name); + if (!impl) { + errno = EINVAL; + return -1; + } + + if (!impl->set_opts) { + errno = ENOTSUP; + return -1; + } + + return impl->set_opts(opts, len); +} + +void +spdk_sock_write_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_net_impl *impl; + struct spdk_sock_impl_opts opts; + size_t len; + + assert(w != NULL); + + spdk_json_write_array_begin(w); + + STAILQ_FOREACH(impl, &g_net_impls, link) { + if (!impl->get_opts) { + continue; + } + + len = sizeof(opts); + if (impl->get_opts(&opts, &len) == 0) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, 
"method", "sock_impl_set_options"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "impl_name", impl->name); + spdk_json_write_named_uint32(w, "recv_buf_size", opts.recv_buf_size); + spdk_json_write_named_uint32(w, "send_buf_size", opts.send_buf_size); + spdk_json_write_named_bool(w, "enable_recv_pipe", opts.enable_recv_pipe); + spdk_json_write_named_bool(w, "enable_zerocopy_send", opts.enable_zerocopy_send); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + } else { + SPDK_ERRLOG("Failed to get socket options for socket implementation %s\n", impl->name); + } + } + + spdk_json_write_array_end(w); +} + +void +spdk_net_impl_register(struct spdk_net_impl *impl, int priority) +{ + struct spdk_net_impl *cur, *prev; + + impl->priority = priority; + prev = NULL; + STAILQ_FOREACH(cur, &g_net_impls, link) { + if (impl->priority > cur->priority) { + break; + } + prev = cur; + } + + if (prev) { + STAILQ_INSERT_AFTER(&g_net_impls, prev, impl, link); + } else { + STAILQ_INSERT_HEAD(&g_net_impls, impl, link); + } +} diff --git a/src/spdk/lib/sock/sock_rpc.c b/src/spdk/lib/sock/sock_rpc.c new file mode 100644 index 000000000..c8686a068 --- /dev/null +++ b/src/spdk/lib/sock/sock_rpc.c @@ -0,0 +1,161 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/sock.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + + +static const struct spdk_json_object_decoder rpc_sock_impl_get_opts_decoders[] = { + { "impl_name", 0, spdk_json_decode_string, false }, +}; + +static void +rpc_sock_impl_get_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + char *impl_name = NULL; + struct spdk_sock_impl_opts sock_opts = {}; + struct spdk_json_write_ctx *w; + size_t len; + int rc; + + if (spdk_json_decode_object(params, rpc_sock_impl_get_opts_decoders, + SPDK_COUNTOF(rpc_sock_impl_get_opts_decoders), &impl_name)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + len = sizeof(sock_opts); + rc = spdk_sock_impl_get_opts(impl_name, &sock_opts, &len); + if (rc) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + spdk_json_write_named_uint32(w, "recv_buf_size", sock_opts.recv_buf_size); + spdk_json_write_named_uint32(w, "send_buf_size", sock_opts.send_buf_size); + spdk_json_write_named_bool(w, "enable_recv_pipe", sock_opts.enable_recv_pipe); + spdk_json_write_named_bool(w, "enable_zerocopy_send", sock_opts.enable_zerocopy_send); + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); + free(impl_name); +} +SPDK_RPC_REGISTER("sock_impl_get_options", rpc_sock_impl_get_options, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) + +struct spdk_rpc_sock_impl_set_opts { + char *impl_name; + struct spdk_sock_impl_opts sock_opts; +}; + +static const struct spdk_json_object_decoder rpc_sock_impl_set_opts_decoders[] = { + { + "impl_name", offsetof(struct spdk_rpc_sock_impl_set_opts, impl_name), + spdk_json_decode_string, false + }, + { + "recv_buf_size", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.recv_buf_size), + spdk_json_decode_uint32, true + }, + { + "send_buf_size", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.send_buf_size), + spdk_json_decode_uint32, true + }, + { + "enable_recv_pipe", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_recv_pipe), + spdk_json_decode_bool, true + }, + { + "enable_zerocopy_send", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_zerocopy_send), + spdk_json_decode_bool, true + }, +}; + +static void +rpc_sock_impl_set_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_rpc_sock_impl_set_opts opts = {}; + struct spdk_json_write_ctx *w; + size_t len; + int rc; + + /* Get type */ + if (spdk_json_decode_object(params, rpc_sock_impl_set_opts_decoders, + SPDK_COUNTOF(rpc_sock_impl_set_opts_decoders), &opts)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + /* Retrieve default opts for requested socket implementation */ + len = sizeof(opts.sock_opts); + rc = spdk_sock_impl_get_opts(opts.impl_name, &opts.sock_opts, &len); + if (rc) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + /* Decode opts */ + if (spdk_json_decode_object(params, rpc_sock_impl_set_opts_decoders, + SPDK_COUNTOF(rpc_sock_impl_set_opts_decoders), &opts)) { + 
SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + rc = spdk_sock_impl_set_opts(opts.impl_name, &opts.sock_opts, sizeof(opts.sock_opts)); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + free(opts.impl_name); +} +SPDK_RPC_REGISTER("sock_impl_set_options", rpc_sock_impl_set_options, SPDK_RPC_STARTUP) diff --git a/src/spdk/lib/sock/spdk_sock.map b/src/spdk/lib/sock/spdk_sock.map new file mode 100644 index 000000000..e3fb44281 --- /dev/null +++ b/src/spdk/lib/sock/spdk_sock.map @@ -0,0 +1,47 @@ +{ + global: + + # public functions in spdk/sock.h + spdk_sock_get_default_opts; + spdk_sock_getaddr; + spdk_sock_connect; + spdk_sock_connect_ext; + spdk_sock_listen; + spdk_sock_listen_ext; + spdk_sock_accept; + spdk_sock_close; + spdk_sock_flush; + spdk_sock_recv; + spdk_sock_writev; + spdk_sock_writev_async; + spdk_sock_readv; + spdk_sock_set_recvlowat; + spdk_sock_set_recvbuf; + spdk_sock_set_sendbuf; + spdk_sock_is_ipv6; + spdk_sock_is_ipv4; + spdk_sock_is_connected; + spdk_sock_group_create; + spdk_sock_group_get_ctx; + spdk_sock_group_add_sock; + spdk_sock_group_remove_sock; + spdk_sock_group_poll; + spdk_sock_group_poll_count; + spdk_sock_group_close; + spdk_sock_get_optimal_sock_group; + spdk_sock_impl_get_opts; + spdk_sock_impl_set_opts; + spdk_sock_write_config_json; + + # public functions in spdk/net.h + spdk_net_framework_register; + spdk_net_framework_start; + spdk_net_framework_fini; + spdk_net_framework_init_next; + spdk_net_framework_fini_next; + + # internal function in spdk_internal/sock.h + spdk_net_impl_register; + + local: *; +}; diff --git a/src/spdk/lib/thread/Makefile b/src/spdk/lib/thread/Makefile new file mode 100644 index 000000000..ceb7a394e --- /dev/null +++ b/src/spdk/lib/thread/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = thread.c +LIBNAME = thread + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_thread.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/thread/spdk_thread.map b/src/spdk/lib/thread/spdk_thread.map new file mode 100644 index 000000000..b71fa06eb --- /dev/null +++ b/src/spdk/lib/thread/spdk_thread.map @@ -0,0 +1,55 @@ +{ + global: + + # public functions in spdk/thread.h + spdk_thread_lib_init; + spdk_thread_lib_init_ext; + spdk_thread_lib_fini; + spdk_thread_create; + spdk_set_thread; + spdk_thread_exit; + spdk_thread_is_exited; + spdk_thread_destroy; + spdk_thread_get_ctx; + spdk_thread_get_cpumask; + spdk_thread_set_cpumask; + spdk_thread_get_from_ctx; + spdk_thread_poll; + spdk_thread_next_poller_expiration; + spdk_thread_has_active_pollers; + spdk_thread_has_pollers; + spdk_thread_is_idle; + spdk_thread_get_count; + spdk_get_thread; + spdk_thread_get_name; + spdk_thread_get_id; + spdk_thread_get_by_id; + spdk_thread_get_stats; + spdk_thread_get_last_tsc; + spdk_thread_send_msg; + spdk_thread_send_critical_msg; + spdk_for_each_thread; + spdk_poller_register; + spdk_poller_register_named; + spdk_poller_unregister; + spdk_poller_pause; + spdk_poller_resume; + spdk_io_device_register; + spdk_io_device_unregister; + spdk_get_io_channel; + spdk_put_io_channel; + spdk_io_channel_get_ctx; + spdk_io_channel_from_ctx; + spdk_io_channel_get_thread; + spdk_for_each_channel; + spdk_io_channel_iter_get_io_device; + spdk_io_channel_iter_get_channel; + spdk_io_channel_iter_get_ctx; + spdk_for_each_channel_continue; + + # internal functions in spdk_internal/thread.h + spdk_poller_state_str; + spdk_io_device_get_name; + + local: *; +}; diff --git a/src/spdk/lib/thread/thread.c b/src/spdk/lib/thread/thread.c new file mode 100644 index 000000000..65d91ce35 --- /dev/null +++ b/src/spdk/lib/thread/thread.c @@ -0,0 +1,1636 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
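
Editor's note: a hedged sketch of the public thread API listed in spdk_thread.map: initialize the library, create a thread, queue a message and drive the thread by polling it. It assumes the SPDK environment has already been set up (the message pool comes from spdk_mempool_create()), and the exit handling relies on spdk_thread_poll() advancing an EXITING thread, which happens further down in thread.c.

    #include "spdk/stdinc.h"
    #include "spdk/env.h"
    #include "spdk/thread.h"

    static void
    example_msg(void *ctx)
    {
            /* Runs on the target thread the next time it is polled. */
    }

    static void
    example_thread_usage(void)
    {
            struct spdk_thread *thread;

            /* No framework hook, no per-thread context. */
            spdk_thread_lib_init(NULL, 0);

            thread = spdk_thread_create("example", NULL);
            if (thread == NULL) {
                    return;
            }

            spdk_thread_send_msg(thread, example_msg, NULL);

            /* The caller owns scheduling: make the thread current, then poll it. */
            spdk_set_thread(thread);
            spdk_thread_poll(thread, 0, spdk_get_ticks());

            /* Ask the thread to exit and keep polling until it reaches EXITED. */
            spdk_thread_exit(thread);
            while (!spdk_thread_is_exited(thread)) {
                    spdk_thread_poll(thread, 0, spdk_get_ticks());
            }
            spdk_thread_destroy(thread);

            spdk_thread_lib_fini();
    }
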
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/thread.h" + +#define SPDK_MSG_BATCH_SIZE 8 +#define SPDK_MAX_DEVICE_NAME_LEN 256 +#define SPDK_THREAD_EXIT_TIMEOUT_SEC 5 + +static pthread_mutex_t g_devlist_mutex = PTHREAD_MUTEX_INITIALIZER; + +static spdk_new_thread_fn g_new_thread_fn = NULL; +static spdk_thread_op_fn g_thread_op_fn = NULL; +static spdk_thread_op_supported_fn g_thread_op_supported_fn; +static size_t g_ctx_sz = 0; +/* Monotonic increasing ID is set to each created thread beginning at 1. Once the + * ID exceeds UINT64_MAX, further thread creation is not allowed and restarting + * SPDK application is required. + */ +static uint64_t g_thread_id = 1; + +struct io_device { + void *io_device; + char name[SPDK_MAX_DEVICE_NAME_LEN + 1]; + spdk_io_channel_create_cb create_cb; + spdk_io_channel_destroy_cb destroy_cb; + spdk_io_device_unregister_cb unregister_cb; + struct spdk_thread *unregister_thread; + uint32_t ctx_size; + uint32_t for_each_count; + TAILQ_ENTRY(io_device) tailq; + + uint32_t refcnt; + + bool unregistered; +}; + +static TAILQ_HEAD(, io_device) g_io_devices = TAILQ_HEAD_INITIALIZER(g_io_devices); + +struct spdk_msg { + spdk_msg_fn fn; + void *arg; + + SLIST_ENTRY(spdk_msg) link; +}; + +#define SPDK_MSG_MEMPOOL_CACHE_SIZE 1024 +static struct spdk_mempool *g_spdk_msg_mempool = NULL; + +static TAILQ_HEAD(, spdk_thread) g_threads = TAILQ_HEAD_INITIALIZER(g_threads); +static uint32_t g_thread_count = 0; + +static __thread struct spdk_thread *tls_thread = NULL; + +static inline struct spdk_thread * +_get_thread(void) +{ + return tls_thread; +} + +static int +_thread_lib_init(size_t ctx_sz) +{ + char mempool_name[SPDK_MAX_MEMZONE_NAME_LEN]; + + g_ctx_sz = ctx_sz; + + snprintf(mempool_name, sizeof(mempool_name), "msgpool_%d", getpid()); + g_spdk_msg_mempool = spdk_mempool_create(mempool_name, + 262144 - 1, /* Power of 2 minus 1 is optimal for memory consumption */ + sizeof(struct spdk_msg), + 0, /* No cache. We do our own. 
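 Each thread instead keeps a private SLIST cache of up to SPDK_MSG_MEMPOOL_CACHE_SIZE messages, pre-filled in spdk_thread_create() and replenished as messages are consumed.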
*/ + SPDK_ENV_SOCKET_ID_ANY); + + if (!g_spdk_msg_mempool) { + return -1; + } + + return 0; +} + +int +spdk_thread_lib_init(spdk_new_thread_fn new_thread_fn, size_t ctx_sz) +{ + assert(g_new_thread_fn == NULL); + assert(g_thread_op_fn == NULL); + + if (new_thread_fn == NULL) { + SPDK_INFOLOG(SPDK_LOG_THREAD, "new_thread_fn was not specified at spdk_thread_lib_init\n"); + } else { + g_new_thread_fn = new_thread_fn; + } + + return _thread_lib_init(ctx_sz); +} + +int +spdk_thread_lib_init_ext(spdk_thread_op_fn thread_op_fn, + spdk_thread_op_supported_fn thread_op_supported_fn, + size_t ctx_sz) +{ + assert(g_new_thread_fn == NULL); + assert(g_thread_op_fn == NULL); + assert(g_thread_op_supported_fn == NULL); + + if ((thread_op_fn != NULL) != (thread_op_supported_fn != NULL)) { + SPDK_ERRLOG("Both must be defined or undefined together.\n"); + return -EINVAL; + } + + if (thread_op_fn == NULL && thread_op_supported_fn == NULL) { + SPDK_INFOLOG(SPDK_LOG_THREAD, "thread_op_fn and thread_op_supported_fn were not specified\n"); + } else { + g_thread_op_fn = thread_op_fn; + g_thread_op_supported_fn = thread_op_supported_fn; + } + + return _thread_lib_init(ctx_sz); +} + +void +spdk_thread_lib_fini(void) +{ + struct io_device *dev; + + TAILQ_FOREACH(dev, &g_io_devices, tailq) { + SPDK_ERRLOG("io_device %s not unregistered\n", dev->name); + } + + if (g_spdk_msg_mempool) { + spdk_mempool_free(g_spdk_msg_mempool); + g_spdk_msg_mempool = NULL; + } + + g_new_thread_fn = NULL; + g_thread_op_fn = NULL; + g_thread_op_supported_fn = NULL; + g_ctx_sz = 0; +} + +static void +_free_thread(struct spdk_thread *thread) +{ + struct spdk_io_channel *ch; + struct spdk_msg *msg; + struct spdk_poller *poller, *ptmp; + + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + SPDK_ERRLOG("thread %s still has channel for io_device %s\n", + thread->name, ch->dev->name); + } + + TAILQ_FOREACH_SAFE(poller, &thread->active_pollers, tailq, ptmp) { + if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) { + SPDK_WARNLOG("poller %s still registered at thread exit\n", + poller->name); + } + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + free(poller); + } + + TAILQ_FOREACH_SAFE(poller, &thread->timed_pollers, tailq, ptmp) { + if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) { + SPDK_WARNLOG("poller %s still registered at thread exit\n", + poller->name); + } + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + free(poller); + } + + TAILQ_FOREACH_SAFE(poller, &thread->paused_pollers, tailq, ptmp) { + SPDK_WARNLOG("poller %s still registered at thread exit\n", poller->name); + TAILQ_REMOVE(&thread->paused_pollers, poller, tailq); + free(poller); + } + + pthread_mutex_lock(&g_devlist_mutex); + assert(g_thread_count > 0); + g_thread_count--; + TAILQ_REMOVE(&g_threads, thread, tailq); + pthread_mutex_unlock(&g_devlist_mutex); + + msg = SLIST_FIRST(&thread->msg_cache); + while (msg != NULL) { + SLIST_REMOVE_HEAD(&thread->msg_cache, link); + + assert(thread->msg_cache_count > 0); + thread->msg_cache_count--; + spdk_mempool_put(g_spdk_msg_mempool, msg); + + msg = SLIST_FIRST(&thread->msg_cache); + } + + assert(thread->msg_cache_count == 0); + + spdk_ring_free(thread->messages); + free(thread); +} + +struct spdk_thread * +spdk_thread_create(const char *name, struct spdk_cpuset *cpumask) +{ + struct spdk_thread *thread; + struct spdk_msg *msgs[SPDK_MSG_MEMPOOL_CACHE_SIZE]; + int rc = 0, i; + + thread = calloc(1, sizeof(*thread) + g_ctx_sz); + if (!thread) { + SPDK_ERRLOG("Unable to allocate memory for thread\n"); + return NULL; + } + 
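 /* With no cpumask supplied, negating the zero-initialized set below selects
  * every CPU, i.e. the new thread may be scheduled anywhere. */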
+ if (cpumask) { + spdk_cpuset_copy(&thread->cpumask, cpumask); + } else { + spdk_cpuset_negate(&thread->cpumask); + } + + TAILQ_INIT(&thread->io_channels); + TAILQ_INIT(&thread->active_pollers); + TAILQ_INIT(&thread->timed_pollers); + TAILQ_INIT(&thread->paused_pollers); + SLIST_INIT(&thread->msg_cache); + thread->msg_cache_count = 0; + + thread->tsc_last = spdk_get_ticks(); + + thread->messages = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY); + if (!thread->messages) { + SPDK_ERRLOG("Unable to allocate memory for message ring\n"); + free(thread); + return NULL; + } + + /* Fill the local message pool cache. */ + rc = spdk_mempool_get_bulk(g_spdk_msg_mempool, (void **)msgs, SPDK_MSG_MEMPOOL_CACHE_SIZE); + if (rc == 0) { + /* If we can't populate the cache it's ok. The cache will get filled + * up organically as messages are passed to the thread. */ + for (i = 0; i < SPDK_MSG_MEMPOOL_CACHE_SIZE; i++) { + SLIST_INSERT_HEAD(&thread->msg_cache, msgs[i], link); + thread->msg_cache_count++; + } + } + + if (name) { + snprintf(thread->name, sizeof(thread->name), "%s", name); + } else { + snprintf(thread->name, sizeof(thread->name), "%p", thread); + } + + pthread_mutex_lock(&g_devlist_mutex); + if (g_thread_id == 0) { + SPDK_ERRLOG("Thread ID rolled over. Further thread creation is not allowed.\n"); + pthread_mutex_unlock(&g_devlist_mutex); + _free_thread(thread); + return NULL; + } + thread->id = g_thread_id++; + TAILQ_INSERT_TAIL(&g_threads, thread, tailq); + g_thread_count++; + pthread_mutex_unlock(&g_devlist_mutex); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Allocating new thread (%" PRIu64 ", %s)\n", + thread->id, thread->name); + + if (g_new_thread_fn) { + rc = g_new_thread_fn(thread); + } else if (g_thread_op_supported_fn && g_thread_op_supported_fn(SPDK_THREAD_OP_NEW)) { + rc = g_thread_op_fn(thread, SPDK_THREAD_OP_NEW); + } + + if (rc != 0) { + _free_thread(thread); + return NULL; + } + + thread->state = SPDK_THREAD_STATE_RUNNING; + + return thread; +} + +void +spdk_set_thread(struct spdk_thread *thread) +{ + tls_thread = thread; +} + +static void +thread_exit(struct spdk_thread *thread, uint64_t now) +{ + struct spdk_poller *poller; + struct spdk_io_channel *ch; + + if (now >= thread->exit_timeout_tsc) { + SPDK_ERRLOG("thread %s got timeout, and move it to the exited state forcefully\n", + thread->name); + goto exited; + } + + TAILQ_FOREACH(poller, &thread->active_pollers, tailq) { + if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s still has active poller %s\n", + thread->name, poller->name); + return; + } + } + + TAILQ_FOREACH(poller, &thread->timed_pollers, tailq) { + if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s still has active timed poller %s\n", + thread->name, poller->name); + return; + } + } + + TAILQ_FOREACH(poller, &thread->paused_pollers, tailq) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s still has paused poller %s\n", + thread->name, poller->name); + return; + } + + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s still has channel for io_device %s\n", + thread->name, ch->dev->name); + return; + } + +exited: + thread->state = SPDK_THREAD_STATE_EXITED; +} + +int +spdk_thread_exit(struct spdk_thread *thread) +{ + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Exit thread %s\n", thread->name); + + assert(tls_thread == thread); + + if (thread->state >= SPDK_THREAD_STATE_EXITING) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s is 
already exiting\n", + thread->name); + return 0; + } + + thread->exit_timeout_tsc = spdk_get_ticks() + (spdk_get_ticks_hz() * + SPDK_THREAD_EXIT_TIMEOUT_SEC); + thread->state = SPDK_THREAD_STATE_EXITING; + return 0; +} + +bool +spdk_thread_is_exited(struct spdk_thread *thread) +{ + return thread->state == SPDK_THREAD_STATE_EXITED; +} + +void +spdk_thread_destroy(struct spdk_thread *thread) +{ + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Destroy thread %s\n", thread->name); + + assert(thread->state == SPDK_THREAD_STATE_EXITED); + + if (tls_thread == thread) { + tls_thread = NULL; + } + + _free_thread(thread); +} + +void * +spdk_thread_get_ctx(struct spdk_thread *thread) +{ + if (g_ctx_sz > 0) { + return thread->ctx; + } + + return NULL; +} + +struct spdk_cpuset * +spdk_thread_get_cpumask(struct spdk_thread *thread) +{ + return &thread->cpumask; +} + +int +spdk_thread_set_cpumask(struct spdk_cpuset *cpumask) +{ + struct spdk_thread *thread; + + if (!g_thread_op_supported_fn || !g_thread_op_supported_fn(SPDK_THREAD_OP_RESCHED)) { + SPDK_ERRLOG("Framework does not support reschedule operation.\n"); + assert(false); + return -ENOTSUP; + } + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("Called from non-SPDK thread\n"); + assert(false); + return -EINVAL; + } + + spdk_cpuset_copy(&thread->cpumask, cpumask); + + /* Invoke framework's reschedule operation. If this function is called multiple times + * in a single spdk_thread_poll() context, the last cpumask will be used in the + * reschedule operation. + */ + g_thread_op_fn(thread, SPDK_THREAD_OP_RESCHED); + + return 0; +} + +struct spdk_thread * +spdk_thread_get_from_ctx(void *ctx) +{ + if (ctx == NULL) { + assert(false); + return NULL; + } + + assert(g_ctx_sz > 0); + + return SPDK_CONTAINEROF(ctx, struct spdk_thread, ctx); +} + +static inline uint32_t +msg_queue_run_batch(struct spdk_thread *thread, uint32_t max_msgs) +{ + unsigned count, i; + void *messages[SPDK_MSG_BATCH_SIZE]; + +#ifdef DEBUG + /* + * spdk_ring_dequeue() fills messages and returns how many entries it wrote, + * so we will never actually read uninitialized data from events, but just to be sure + * (and to silence a static analyzer false positive), initialize the array to NULL pointers. + */ + memset(messages, 0, sizeof(messages)); +#endif + + if (max_msgs > 0) { + max_msgs = spdk_min(max_msgs, SPDK_MSG_BATCH_SIZE); + } else { + max_msgs = SPDK_MSG_BATCH_SIZE; + } + + count = spdk_ring_dequeue(thread->messages, messages, max_msgs); + if (count == 0) { + return 0; + } + + for (i = 0; i < count; i++) { + struct spdk_msg *msg = messages[i]; + + assert(msg != NULL); + msg->fn(msg->arg); + + if (thread->msg_cache_count < SPDK_MSG_MEMPOOL_CACHE_SIZE) { + /* Insert the messages at the head. We want to re-use the hot + * ones. */ + SLIST_INSERT_HEAD(&thread->msg_cache, msg, link); + thread->msg_cache_count++; + } else { + spdk_mempool_put(g_spdk_msg_mempool, msg); + } + } + + return count; +} + +static void +poller_insert_timer(struct spdk_thread *thread, struct spdk_poller *poller, uint64_t now) +{ + struct spdk_poller *iter; + + poller->next_run_tick = now + poller->period_ticks; + + /* + * Insert poller in the thread's timed_pollers list in sorted order by next scheduled + * run time. 
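+ * The list is kept sorted ascending with the earliest expiration at the head,
+ * so thread_poll() can stop scanning timed pollers at the first entry whose
+ * next_run_tick is still in the future. Iterating in reverse finds the
+ * insertion point quickly in the common case where a freshly rescheduled
+ * poller expires later than most pollers already in the list.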
+ */ + TAILQ_FOREACH_REVERSE(iter, &thread->timed_pollers, timed_pollers_head, tailq) { + if (iter->next_run_tick <= poller->next_run_tick) { + TAILQ_INSERT_AFTER(&thread->timed_pollers, iter, poller, tailq); + return; + } + } + + /* No earlier pollers were found, so this poller must be the new head */ + TAILQ_INSERT_HEAD(&thread->timed_pollers, poller, tailq); +} + +static void +thread_insert_poller(struct spdk_thread *thread, struct spdk_poller *poller) +{ + if (poller->period_ticks) { + poller_insert_timer(thread, poller, spdk_get_ticks()); + } else { + TAILQ_INSERT_TAIL(&thread->active_pollers, poller, tailq); + } +} + +static inline void +thread_update_stats(struct spdk_thread *thread, uint64_t end, + uint64_t start, int rc) +{ + if (rc == 0) { + /* Poller status idle */ + thread->stats.idle_tsc += end - start; + } else if (rc > 0) { + /* Poller status busy */ + thread->stats.busy_tsc += end - start; + } + /* Store end time to use it as start time of the next spdk_thread_poll(). */ + thread->tsc_last = end; +} + +static int +thread_poll(struct spdk_thread *thread, uint32_t max_msgs, uint64_t now) +{ + uint32_t msg_count; + struct spdk_poller *poller, *tmp; + spdk_msg_fn critical_msg; + int rc = 0; + + critical_msg = thread->critical_msg; + if (spdk_unlikely(critical_msg != NULL)) { + critical_msg(NULL); + thread->critical_msg = NULL; + } + + msg_count = msg_queue_run_batch(thread, max_msgs); + if (msg_count) { + rc = 1; + } + + TAILQ_FOREACH_REVERSE_SAFE(poller, &thread->active_pollers, + active_pollers_head, tailq, tmp) { + int poller_rc; + + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + free(poller); + continue; + } else if (poller->state == SPDK_POLLER_STATE_PAUSING) { + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + TAILQ_INSERT_TAIL(&thread->paused_pollers, poller, tailq); + poller->state = SPDK_POLLER_STATE_PAUSED; + continue; + } + + poller->state = SPDK_POLLER_STATE_RUNNING; + poller_rc = poller->fn(poller->arg); + + poller->run_count++; + if (poller_rc > 0) { + poller->busy_count++; + } + +#ifdef DEBUG + if (poller_rc == -1) { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Poller %s returned -1\n", poller->name); + } +#endif + + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + free(poller); + } else if (poller->state != SPDK_POLLER_STATE_PAUSED) { + poller->state = SPDK_POLLER_STATE_WAITING; + } + + if (poller_rc > rc) { + rc = poller_rc; + } + } + + TAILQ_FOREACH_SAFE(poller, &thread->timed_pollers, tailq, tmp) { + int timer_rc = 0; + + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + free(poller); + continue; + } else if (poller->state == SPDK_POLLER_STATE_PAUSING) { + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + TAILQ_INSERT_TAIL(&thread->paused_pollers, poller, tailq); + poller->state = SPDK_POLLER_STATE_PAUSED; + continue; + } + + if (now < poller->next_run_tick) { + break; + } + + poller->state = SPDK_POLLER_STATE_RUNNING; + timer_rc = poller->fn(poller->arg); + + poller->run_count++; + if (timer_rc > 0) { + poller->busy_count++; + } + +#ifdef DEBUG + if (timer_rc == -1) { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Timed poller %s returned -1\n", poller->name); + } +#endif + + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + free(poller); + } else if (poller->state != SPDK_POLLER_STATE_PAUSED) { + poller->state = 
SPDK_POLLER_STATE_WAITING; + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + poller_insert_timer(thread, poller, now); + } + + if (timer_rc > rc) { + rc = timer_rc; + } + } + + return rc; +} + +int +spdk_thread_poll(struct spdk_thread *thread, uint32_t max_msgs, uint64_t now) +{ + struct spdk_thread *orig_thread; + int rc; + + orig_thread = _get_thread(); + tls_thread = thread; + + if (now == 0) { + now = spdk_get_ticks(); + } + + rc = thread_poll(thread, max_msgs, now); + + if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITING)) { + thread_exit(thread, now); + } + + thread_update_stats(thread, spdk_get_ticks(), now, rc); + + tls_thread = orig_thread; + + return rc; +} + +uint64_t +spdk_thread_next_poller_expiration(struct spdk_thread *thread) +{ + struct spdk_poller *poller; + + poller = TAILQ_FIRST(&thread->timed_pollers); + if (poller) { + return poller->next_run_tick; + } + + return 0; +} + +int +spdk_thread_has_active_pollers(struct spdk_thread *thread) +{ + return !TAILQ_EMPTY(&thread->active_pollers); +} + +static bool +thread_has_unpaused_pollers(struct spdk_thread *thread) +{ + if (TAILQ_EMPTY(&thread->active_pollers) && + TAILQ_EMPTY(&thread->timed_pollers)) { + return false; + } + + return true; +} + +bool +spdk_thread_has_pollers(struct spdk_thread *thread) +{ + if (!thread_has_unpaused_pollers(thread) && + TAILQ_EMPTY(&thread->paused_pollers)) { + return false; + } + + return true; +} + +bool +spdk_thread_is_idle(struct spdk_thread *thread) +{ + if (spdk_ring_count(thread->messages) || + thread_has_unpaused_pollers(thread) || + thread->critical_msg != NULL) { + return false; + } + + return true; +} + +uint32_t +spdk_thread_get_count(void) +{ + /* + * Return cached value of the current thread count. We could acquire the + * lock and iterate through the TAILQ of threads to count them, but that + * count could still be invalidated after we release the lock. 
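+ * The returned value is therefore only a point-in-time snapshot suitable for
+ * reporting; it should not be relied on for synchronization decisions.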
+ */ + return g_thread_count; +} + +struct spdk_thread * +spdk_get_thread(void) +{ + return _get_thread(); +} + +const char * +spdk_thread_get_name(const struct spdk_thread *thread) +{ + return thread->name; +} + +uint64_t +spdk_thread_get_id(const struct spdk_thread *thread) +{ + return thread->id; +} + +struct spdk_thread * +spdk_thread_get_by_id(uint64_t id) +{ + struct spdk_thread *thread; + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(thread, &g_threads, tailq) { + if (thread->id == id) { + pthread_mutex_unlock(&g_devlist_mutex); + + return thread; + } + } + pthread_mutex_unlock(&g_devlist_mutex); + + return NULL; +} + +int +spdk_thread_get_stats(struct spdk_thread_stats *stats) +{ + struct spdk_thread *thread; + + thread = _get_thread(); + if (!thread) { + SPDK_ERRLOG("No thread allocated\n"); + return -EINVAL; + } + + if (stats == NULL) { + return -EINVAL; + } + + *stats = thread->stats; + + return 0; +} + +uint64_t +spdk_thread_get_last_tsc(struct spdk_thread *thread) +{ + return thread->tsc_last; +} + +int +spdk_thread_send_msg(const struct spdk_thread *thread, spdk_msg_fn fn, void *ctx) +{ + struct spdk_thread *local_thread; + struct spdk_msg *msg; + int rc; + + assert(thread != NULL); + + if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITED)) { + SPDK_ERRLOG("Thread %s is marked as exited.\n", thread->name); + return -EIO; + } + + local_thread = _get_thread(); + + msg = NULL; + if (local_thread != NULL) { + if (local_thread->msg_cache_count > 0) { + msg = SLIST_FIRST(&local_thread->msg_cache); + assert(msg != NULL); + SLIST_REMOVE_HEAD(&local_thread->msg_cache, link); + local_thread->msg_cache_count--; + } + } + + if (msg == NULL) { + msg = spdk_mempool_get(g_spdk_msg_mempool); + if (!msg) { + SPDK_ERRLOG("msg could not be allocated\n"); + return -ENOMEM; + } + } + + msg->fn = fn; + msg->arg = ctx; + + rc = spdk_ring_enqueue(thread->messages, (void **)&msg, 1, NULL); + if (rc != 1) { + SPDK_ERRLOG("msg could not be enqueued\n"); + spdk_mempool_put(g_spdk_msg_mempool, msg); + return -EIO; + } + + return 0; +} + +int +spdk_thread_send_critical_msg(struct spdk_thread *thread, spdk_msg_fn fn) +{ + spdk_msg_fn expected = NULL; + + if (__atomic_compare_exchange_n(&thread->critical_msg, &expected, fn, false, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST)) { + return 0; + } + + return -EIO; +} + +static struct spdk_poller * +poller_register(spdk_poller_fn fn, + void *arg, + uint64_t period_microseconds, + const char *name) +{ + struct spdk_thread *thread; + struct spdk_poller *poller; + uint64_t quotient, remainder, ticks; + + thread = spdk_get_thread(); + if (!thread) { + assert(false); + return NULL; + } + + if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITED)) { + SPDK_ERRLOG("thread %s is marked as exited\n", thread->name); + return NULL; + } + + poller = calloc(1, sizeof(*poller)); + if (poller == NULL) { + SPDK_ERRLOG("Poller memory allocation failed\n"); + return NULL; + } + + if (name) { + snprintf(poller->name, sizeof(poller->name), "%s", name); + } else { + snprintf(poller->name, sizeof(poller->name), "%p", fn); + } + + poller->state = SPDK_POLLER_STATE_WAITING; + poller->fn = fn; + poller->arg = arg; + poller->thread = thread; + + if (period_microseconds) { + quotient = period_microseconds / SPDK_SEC_TO_USEC; + remainder = period_microseconds % SPDK_SEC_TO_USEC; + ticks = spdk_get_ticks_hz(); + + poller->period_ticks = ticks * quotient + (ticks * remainder) / SPDK_SEC_TO_USEC; + } else { + poller->period_ticks = 0; + } + + thread_insert_poller(thread, 
poller); + + return poller; +} + +struct spdk_poller * +spdk_poller_register(spdk_poller_fn fn, + void *arg, + uint64_t period_microseconds) +{ + return poller_register(fn, arg, period_microseconds, NULL); +} + +struct spdk_poller * +spdk_poller_register_named(spdk_poller_fn fn, + void *arg, + uint64_t period_microseconds, + const char *name) +{ + return poller_register(fn, arg, period_microseconds, name); +} + +void +spdk_poller_unregister(struct spdk_poller **ppoller) +{ + struct spdk_thread *thread; + struct spdk_poller *poller; + + poller = *ppoller; + if (poller == NULL) { + return; + } + + *ppoller = NULL; + + thread = spdk_get_thread(); + if (!thread) { + assert(false); + return; + } + + if (poller->thread != thread) { + SPDK_ERRLOG("different from the thread that called spdk_poller_register()\n"); + assert(false); + return; + } + + /* If the poller was paused, put it on the active_pollers list so that + * its unregistration can be processed by spdk_thread_poll(). + */ + if (poller->state == SPDK_POLLER_STATE_PAUSED) { + TAILQ_REMOVE(&thread->paused_pollers, poller, tailq); + TAILQ_INSERT_TAIL(&thread->active_pollers, poller, tailq); + poller->period_ticks = 0; + } + + /* Simply set the state to unregistered. The poller will get cleaned up + * in a subsequent call to spdk_thread_poll(). + */ + poller->state = SPDK_POLLER_STATE_UNREGISTERED; +} + +void +spdk_poller_pause(struct spdk_poller *poller) +{ + struct spdk_thread *thread; + + if (poller->state == SPDK_POLLER_STATE_PAUSED || + poller->state == SPDK_POLLER_STATE_PAUSING) { + return; + } + + thread = spdk_get_thread(); + if (!thread) { + assert(false); + return; + } + + /* If a poller is paused from within itself, we can immediately move it + * on the paused_pollers list. Otherwise we just set its state to + * SPDK_POLLER_STATE_PAUSING and let spdk_thread_poll() move it. It + * allows a poller to be paused from another one's context without + * breaking the TAILQ_FOREACH_REVERSE_SAFE iteration. + */ + if (poller->state != SPDK_POLLER_STATE_RUNNING) { + poller->state = SPDK_POLLER_STATE_PAUSING; + } else { + if (poller->period_ticks > 0) { + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + } else { + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + } + + TAILQ_INSERT_TAIL(&thread->paused_pollers, poller, tailq); + poller->state = SPDK_POLLER_STATE_PAUSED; + } +} + +void +spdk_poller_resume(struct spdk_poller *poller) +{ + struct spdk_thread *thread; + + if (poller->state != SPDK_POLLER_STATE_PAUSED && + poller->state != SPDK_POLLER_STATE_PAUSING) { + return; + } + + thread = spdk_get_thread(); + if (!thread) { + assert(false); + return; + } + + /* If a poller is paused it has to be removed from the paused pollers + * list and put on the active / timer list depending on its + * period_ticks. If a poller is still in the process of being paused, + * we just need to flip its state back to waiting, as it's already on + * the appropriate list. 
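+ * Typical usage (illustrative sketch only): a module that created a poller
+ * with spdk_poller_register() may call spdk_poller_pause(poller) around a
+ * temporarily disruptive operation and spdk_poller_resume(poller) once it
+ * has completed. Both calls are expected to be made from the poller's
+ * owning SPDK thread.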
+ */ + if (poller->state == SPDK_POLLER_STATE_PAUSED) { + TAILQ_REMOVE(&thread->paused_pollers, poller, tailq); + thread_insert_poller(thread, poller); + } + + poller->state = SPDK_POLLER_STATE_WAITING; +} + +const char * +spdk_poller_state_str(enum spdk_poller_state state) +{ + switch (state) { + case SPDK_POLLER_STATE_WAITING: + return "waiting"; + case SPDK_POLLER_STATE_RUNNING: + return "running"; + case SPDK_POLLER_STATE_UNREGISTERED: + return "unregistered"; + case SPDK_POLLER_STATE_PAUSING: + return "pausing"; + case SPDK_POLLER_STATE_PAUSED: + return "paused"; + default: + return NULL; + } +} + +struct call_thread { + struct spdk_thread *cur_thread; + spdk_msg_fn fn; + void *ctx; + + struct spdk_thread *orig_thread; + spdk_msg_fn cpl; +}; + +static void +_on_thread(void *ctx) +{ + struct call_thread *ct = ctx; + int rc __attribute__((unused)); + + ct->fn(ct->ctx); + + pthread_mutex_lock(&g_devlist_mutex); + ct->cur_thread = TAILQ_NEXT(ct->cur_thread, tailq); + pthread_mutex_unlock(&g_devlist_mutex); + + if (!ct->cur_thread) { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Completed thread iteration\n"); + + rc = spdk_thread_send_msg(ct->orig_thread, ct->cpl, ct->ctx); + free(ctx); + } else { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Continuing thread iteration to %s\n", + ct->cur_thread->name); + + rc = spdk_thread_send_msg(ct->cur_thread, _on_thread, ctx); + } + assert(rc == 0); +} + +void +spdk_for_each_thread(spdk_msg_fn fn, void *ctx, spdk_msg_fn cpl) +{ + struct call_thread *ct; + struct spdk_thread *thread; + int rc __attribute__((unused)); + + ct = calloc(1, sizeof(*ct)); + if (!ct) { + SPDK_ERRLOG("Unable to perform thread iteration\n"); + cpl(ctx); + return; + } + + ct->fn = fn; + ct->ctx = ctx; + ct->cpl = cpl; + + thread = _get_thread(); + if (!thread) { + SPDK_ERRLOG("No thread allocated\n"); + free(ct); + cpl(ctx); + return; + } + ct->orig_thread = thread; + + pthread_mutex_lock(&g_devlist_mutex); + ct->cur_thread = TAILQ_FIRST(&g_threads); + pthread_mutex_unlock(&g_devlist_mutex); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Starting thread iteration from %s\n", + ct->orig_thread->name); + + rc = spdk_thread_send_msg(ct->cur_thread, _on_thread, ct); + assert(rc == 0); +} + +void +spdk_io_device_register(void *io_device, spdk_io_channel_create_cb create_cb, + spdk_io_channel_destroy_cb destroy_cb, uint32_t ctx_size, + const char *name) +{ + struct io_device *dev, *tmp; + struct spdk_thread *thread; + + assert(io_device != NULL); + assert(create_cb != NULL); + assert(destroy_cb != NULL); + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("called from non-SPDK thread\n"); + assert(false); + return; + } + + dev = calloc(1, sizeof(struct io_device)); + if (dev == NULL) { + SPDK_ERRLOG("could not allocate io_device\n"); + return; + } + + dev->io_device = io_device; + if (name) { + snprintf(dev->name, sizeof(dev->name), "%s", name); + } else { + snprintf(dev->name, sizeof(dev->name), "%p", dev); + } + dev->create_cb = create_cb; + dev->destroy_cb = destroy_cb; + dev->unregister_cb = NULL; + dev->ctx_size = ctx_size; + dev->for_each_count = 0; + dev->unregistered = false; + dev->refcnt = 0; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Registering io_device %s (%p) on thread %s\n", + dev->name, dev->io_device, thread->name); + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(tmp, &g_io_devices, tailq) { + if (tmp->io_device == io_device) { + SPDK_ERRLOG("io_device %p already registered (old:%s new:%s)\n", + io_device, tmp->name, dev->name); + free(dev); + pthread_mutex_unlock(&g_devlist_mutex); + 
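+ /* Duplicate registration is only logged: the dev allocated above has
+  * already been freed and the original registration stays in effect, so
+  * the caller gets no error indication from this void function.
+  */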
return; + } + } + TAILQ_INSERT_TAIL(&g_io_devices, dev, tailq); + pthread_mutex_unlock(&g_devlist_mutex); +} + +static void +_finish_unregister(void *arg) +{ + struct io_device *dev = arg; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Finishing unregistration of io_device %s (%p) on thread %s\n", + dev->name, dev->io_device, dev->unregister_thread->name); + + dev->unregister_cb(dev->io_device); + free(dev); +} + +static void +io_device_free(struct io_device *dev) +{ + int rc __attribute__((unused)); + + if (dev->unregister_cb == NULL) { + free(dev); + } else { + assert(dev->unregister_thread != NULL); + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "io_device %s (%p) needs to unregister from thread %s\n", + dev->name, dev->io_device, dev->unregister_thread->name); + rc = spdk_thread_send_msg(dev->unregister_thread, _finish_unregister, dev); + assert(rc == 0); + } +} + +void +spdk_io_device_unregister(void *io_device, spdk_io_device_unregister_cb unregister_cb) +{ + struct io_device *dev; + uint32_t refcnt; + struct spdk_thread *thread; + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("called from non-SPDK thread\n"); + assert(false); + return; + } + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(dev, &g_io_devices, tailq) { + if (dev->io_device == io_device) { + break; + } + } + + if (!dev) { + SPDK_ERRLOG("io_device %p not found\n", io_device); + assert(false); + pthread_mutex_unlock(&g_devlist_mutex); + return; + } + + if (dev->for_each_count > 0) { + SPDK_ERRLOG("io_device %s (%p) has %u for_each calls outstanding\n", + dev->name, io_device, dev->for_each_count); + pthread_mutex_unlock(&g_devlist_mutex); + return; + } + + dev->unregister_cb = unregister_cb; + dev->unregistered = true; + TAILQ_REMOVE(&g_io_devices, dev, tailq); + refcnt = dev->refcnt; + dev->unregister_thread = thread; + pthread_mutex_unlock(&g_devlist_mutex); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Unregistering io_device %s (%p) from thread %s\n", + dev->name, dev->io_device, thread->name); + + if (refcnt > 0) { + /* defer deletion */ + return; + } + + io_device_free(dev); +} + +const char * +spdk_io_device_get_name(struct io_device *dev) +{ + return dev->name; +} + +struct spdk_io_channel * +spdk_get_io_channel(void *io_device) +{ + struct spdk_io_channel *ch; + struct spdk_thread *thread; + struct io_device *dev; + int rc; + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(dev, &g_io_devices, tailq) { + if (dev->io_device == io_device) { + break; + } + } + if (dev == NULL) { + SPDK_ERRLOG("could not find io_device %p\n", io_device); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + thread = _get_thread(); + if (!thread) { + SPDK_ERRLOG("No thread allocated\n"); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITED)) { + SPDK_ERRLOG("Thread %s is marked as exited\n", thread->name); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + if (ch->dev == dev) { + ch->ref++; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Get io_channel %p for io_device %s (%p) on thread %s refcnt %u\n", + ch, dev->name, dev->io_device, thread->name, ch->ref); + + /* + * An I/O channel already exists for this device on this + * thread, so return it. 
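+ * Every successful spdk_get_io_channel() call on a given thread bumps
+ * ch->ref, so each call has to be balanced by an spdk_put_io_channel() on
+ * the same thread before the channel (and, once unregistered, the
+ * io_device) can be torn down. Illustrative pattern (sketch only):
+ *
+ *     ch = spdk_get_io_channel(io_device);
+ *     ... use spdk_io_channel_get_ctx(ch) to submit work ...
+ *     spdk_put_io_channel(ch);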
+ */ + pthread_mutex_unlock(&g_devlist_mutex); + return ch; + } + } + + ch = calloc(1, sizeof(*ch) + dev->ctx_size); + if (ch == NULL) { + SPDK_ERRLOG("could not calloc spdk_io_channel\n"); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + ch->dev = dev; + ch->destroy_cb = dev->destroy_cb; + ch->thread = thread; + ch->ref = 1; + ch->destroy_ref = 0; + TAILQ_INSERT_TAIL(&thread->io_channels, ch, tailq); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Get io_channel %p for io_device %s (%p) on thread %s refcnt %u\n", + ch, dev->name, dev->io_device, thread->name, ch->ref); + + dev->refcnt++; + + pthread_mutex_unlock(&g_devlist_mutex); + + rc = dev->create_cb(io_device, (uint8_t *)ch + sizeof(*ch)); + if (rc != 0) { + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_REMOVE(&ch->thread->io_channels, ch, tailq); + dev->refcnt--; + free(ch); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + return ch; +} + +static void +put_io_channel(void *arg) +{ + struct spdk_io_channel *ch = arg; + bool do_remove_dev = true; + struct spdk_thread *thread; + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("called from non-SPDK thread\n"); + assert(false); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, + "Releasing io_channel %p for io_device %s (%p) on thread %s\n", + ch, ch->dev->name, ch->dev->io_device, thread->name); + + assert(ch->thread == thread); + + ch->destroy_ref--; + + if (ch->ref > 0 || ch->destroy_ref > 0) { + /* + * Another reference to the associated io_device was requested + * after this message was sent but before it had a chance to + * execute. + */ + return; + } + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_REMOVE(&ch->thread->io_channels, ch, tailq); + pthread_mutex_unlock(&g_devlist_mutex); + + /* Don't hold the devlist mutex while the destroy_cb is called. 
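+ * The callback may re-enter this library (for example by acquiring new
+ * channels or unregistering io_devices) and take g_devlist_mutex itself,
+ * which would deadlock if the mutex were still held here.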
*/ + ch->destroy_cb(ch->dev->io_device, spdk_io_channel_get_ctx(ch)); + + pthread_mutex_lock(&g_devlist_mutex); + ch->dev->refcnt--; + + if (!ch->dev->unregistered) { + do_remove_dev = false; + } + + if (ch->dev->refcnt > 0) { + do_remove_dev = false; + } + + pthread_mutex_unlock(&g_devlist_mutex); + + if (do_remove_dev) { + io_device_free(ch->dev); + } + free(ch); +} + +void +spdk_put_io_channel(struct spdk_io_channel *ch) +{ + struct spdk_thread *thread; + int rc __attribute__((unused)); + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("called from non-SPDK thread\n"); + assert(false); + return; + } + + if (ch->thread != thread) { + SPDK_ERRLOG("different from the thread that called get_io_channel()\n"); + assert(false); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, + "Putting io_channel %p for io_device %s (%p) on thread %s refcnt %u\n", + ch, ch->dev->name, ch->dev->io_device, thread->name, ch->ref); + + ch->ref--; + + if (ch->ref == 0) { + ch->destroy_ref++; + rc = spdk_thread_send_msg(thread, put_io_channel, ch); + assert(rc == 0); + } +} + +struct spdk_io_channel * +spdk_io_channel_from_ctx(void *ctx) +{ + return (struct spdk_io_channel *)((uint8_t *)ctx - sizeof(struct spdk_io_channel)); +} + +struct spdk_thread * +spdk_io_channel_get_thread(struct spdk_io_channel *ch) +{ + return ch->thread; +} + +struct spdk_io_channel_iter { + void *io_device; + struct io_device *dev; + spdk_channel_msg fn; + int status; + void *ctx; + struct spdk_io_channel *ch; + + struct spdk_thread *cur_thread; + + struct spdk_thread *orig_thread; + spdk_channel_for_each_cpl cpl; +}; + +void * +spdk_io_channel_iter_get_io_device(struct spdk_io_channel_iter *i) +{ + return i->io_device; +} + +struct spdk_io_channel * +spdk_io_channel_iter_get_channel(struct spdk_io_channel_iter *i) +{ + return i->ch; +} + +void * +spdk_io_channel_iter_get_ctx(struct spdk_io_channel_iter *i) +{ + return i->ctx; +} + +static void +_call_completion(void *ctx) +{ + struct spdk_io_channel_iter *i = ctx; + + if (i->cpl != NULL) { + i->cpl(i, i->status); + } + free(i); +} + +static void +_call_channel(void *ctx) +{ + struct spdk_io_channel_iter *i = ctx; + struct spdk_io_channel *ch; + + /* + * It is possible that the channel was deleted before this + * message had a chance to execute. If so, skip calling + * the fn() on this thread. 
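+ * The iteration is not aborted in that case: spdk_for_each_channel_continue()
+ * is still called below, so the walk simply moves on to the next thread.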
+ */ + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(ch, &i->cur_thread->io_channels, tailq) { + if (ch->dev->io_device == i->io_device) { + break; + } + } + pthread_mutex_unlock(&g_devlist_mutex); + + if (ch) { + i->fn(i); + } else { + spdk_for_each_channel_continue(i, 0); + } +} + +void +spdk_for_each_channel(void *io_device, spdk_channel_msg fn, void *ctx, + spdk_channel_for_each_cpl cpl) +{ + struct spdk_thread *thread; + struct spdk_io_channel *ch; + struct spdk_io_channel_iter *i; + int rc __attribute__((unused)); + + i = calloc(1, sizeof(*i)); + if (!i) { + SPDK_ERRLOG("Unable to allocate iterator\n"); + return; + } + + i->io_device = io_device; + i->fn = fn; + i->ctx = ctx; + i->cpl = cpl; + + pthread_mutex_lock(&g_devlist_mutex); + i->orig_thread = _get_thread(); + + TAILQ_FOREACH(thread, &g_threads, tailq) { + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + if (ch->dev->io_device == io_device) { + ch->dev->for_each_count++; + i->dev = ch->dev; + i->cur_thread = thread; + i->ch = ch; + pthread_mutex_unlock(&g_devlist_mutex); + rc = spdk_thread_send_msg(thread, _call_channel, i); + assert(rc == 0); + return; + } + } + } + + pthread_mutex_unlock(&g_devlist_mutex); + + rc = spdk_thread_send_msg(i->orig_thread, _call_completion, i); + assert(rc == 0); +} + +void +spdk_for_each_channel_continue(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_thread *thread; + struct spdk_io_channel *ch; + int rc __attribute__((unused)); + + assert(i->cur_thread == spdk_get_thread()); + + i->status = status; + + pthread_mutex_lock(&g_devlist_mutex); + if (status) { + goto end; + } + thread = TAILQ_NEXT(i->cur_thread, tailq); + while (thread) { + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + if (ch->dev->io_device == i->io_device) { + i->cur_thread = thread; + i->ch = ch; + pthread_mutex_unlock(&g_devlist_mutex); + rc = spdk_thread_send_msg(thread, _call_channel, i); + assert(rc == 0); + return; + } + } + thread = TAILQ_NEXT(thread, tailq); + } + +end: + i->dev->for_each_count--; + i->ch = NULL; + pthread_mutex_unlock(&g_devlist_mutex); + + rc = spdk_thread_send_msg(i->orig_thread, _call_completion, i); + assert(rc == 0); +} + + +SPDK_LOG_REGISTER_COMPONENT("thread", SPDK_LOG_THREAD) diff --git a/src/spdk/lib/trace/Makefile b/src/spdk/lib/trace/Makefile new file mode 100644 index 000000000..9102c320a --- /dev/null +++ b/src/spdk/lib/trace/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = trace.c trace_flags.c trace_rpc.c +LIBNAME = trace + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_trace.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/trace/spdk_trace.map b/src/spdk/lib/trace/spdk_trace.map new file mode 100644 index 000000000..14a03b337 --- /dev/null +++ b/src/spdk/lib/trace/spdk_trace.map @@ -0,0 +1,29 @@ +{ + global: + + # public functions + _spdk_trace_record; + spdk_trace_get_tpoint_mask; + spdk_trace_set_tpoints; + spdk_trace_clear_tpoints; + spdk_trace_get_tpoint_group_mask; + spdk_trace_set_tpoint_group_mask; + spdk_trace_clear_tpoint_group_mask; + spdk_trace_init; + spdk_trace_cleanup; + spdk_trace_flags_init; + spdk_trace_register_owner; + spdk_trace_register_object; + spdk_trace_register_description; + spdk_trace_get_first_register_fn; + spdk_trace_get_next_register_fn; + spdk_trace_enable_tpoint_group; + spdk_trace_disable_tpoint_group; + spdk_trace_mask_usage; + spdk_trace_add_register_fn; + + # public variables + g_trace_histories; + + local: *; +}; diff --git a/src/spdk/lib/trace/trace.c b/src/spdk/lib/trace/trace.c new file mode 100644 index 000000000..621c52aae --- /dev/null +++ b/src/spdk/lib/trace/trace.c @@ -0,0 +1,201 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" +#include "spdk/barrier.h" +#include "spdk/log.h" + +static int g_trace_fd = -1; +static char g_shm_name[64]; + +struct spdk_trace_histories *g_trace_histories; + +void +_spdk_trace_record(uint64_t tsc, uint16_t tpoint_id, uint16_t poller_id, uint32_t size, + uint64_t object_id, uint64_t arg1) +{ + struct spdk_trace_history *lcore_history; + struct spdk_trace_entry *next_entry; + unsigned lcore; + uint64_t next_circular_entry; + + lcore = spdk_env_get_current_core(); + if (lcore >= SPDK_TRACE_MAX_LCORE) { + return; + } + + lcore_history = spdk_get_per_lcore_history(g_trace_histories, lcore); + if (tsc == 0) { + tsc = spdk_get_ticks(); + } + + lcore_history->tpoint_count[tpoint_id]++; + + /* Get next entry index in the circular buffer */ + next_circular_entry = lcore_history->next_entry & (lcore_history->num_entries - 1); + next_entry = &lcore_history->entries[next_circular_entry]; + next_entry->tsc = tsc; + next_entry->tpoint_id = tpoint_id; + next_entry->poller_id = poller_id; + next_entry->size = size; + next_entry->object_id = object_id; + next_entry->arg1 = arg1; + + /* Ensure all elements of the trace entry are visible to outside trace tools */ + spdk_smp_wmb(); + lcore_history->next_entry++; +} + +int +spdk_trace_init(const char *shm_name, uint64_t num_entries) +{ + int i = 0; + int histories_size; + uint64_t lcore_offsets[SPDK_TRACE_MAX_LCORE + 1]; + + /* 0 entries requested - skip trace initialization */ + if (num_entries == 0) { + return 0; + } + + lcore_offsets[0] = sizeof(struct spdk_trace_flags); + for (i = 1; i < (int)SPDK_COUNTOF(lcore_offsets); i++) { + lcore_offsets[i] = spdk_get_trace_history_size(num_entries) + lcore_offsets[i - 1]; + } + histories_size = lcore_offsets[SPDK_TRACE_MAX_LCORE]; + + snprintf(g_shm_name, sizeof(g_shm_name), "%s", shm_name); + + g_trace_fd = shm_open(shm_name, O_RDWR | O_CREAT, 0600); + if (g_trace_fd == -1) { + SPDK_ERRLOG("could not shm_open spdk_trace\n"); + SPDK_ERRLOG("errno=%d %s\n", errno, spdk_strerror(errno)); + return 1; + } + + if (ftruncate(g_trace_fd, histories_size) != 0) { + SPDK_ERRLOG("could not truncate shm\n"); + goto trace_init_err; + } + + g_trace_histories = mmap(NULL, histories_size, PROT_READ | PROT_WRITE, + MAP_SHARED, g_trace_fd, 0); + if (g_trace_histories == MAP_FAILED) { + SPDK_ERRLOG("could not mmap shm\n"); + goto trace_init_err; + } + + /* TODO: On FreeBSD, mlock on shm_open'd memory doesn't seem to work. Docs say that kern.ipc.shm_use_phys=1 + * should allow it, but forcing that doesn't seem to work either. So for now just skip mlock on FreeBSD + * altogether. 
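+ * The mlock keeps the trace history resident so that recording an entry from
+ * a hot code path is not slowed down by page faults.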
+ */ +#if defined(__linux__) + if (mlock(g_trace_histories, histories_size) != 0) { + SPDK_ERRLOG("Could not mlock shm for tracing - %s.\n", spdk_strerror(errno)); + if (errno == ENOMEM) { + SPDK_ERRLOG("Check /dev/shm for old tracing files that can be deleted.\n"); + } + goto trace_init_err; + } +#endif + + memset(g_trace_histories, 0, histories_size); + + g_trace_flags = &g_trace_histories->flags; + + g_trace_flags->tsc_rate = spdk_get_ticks_hz(); + + for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) { + struct spdk_trace_history *lcore_history; + + g_trace_flags->lcore_history_offsets[i] = lcore_offsets[i]; + lcore_history = spdk_get_per_lcore_history(g_trace_histories, i); + lcore_history->lcore = i; + lcore_history->num_entries = num_entries; + } + g_trace_flags->lcore_history_offsets[SPDK_TRACE_MAX_LCORE] = lcore_offsets[SPDK_TRACE_MAX_LCORE]; + + spdk_trace_flags_init(); + + return 0; + +trace_init_err: + if (g_trace_histories != MAP_FAILED) { + munmap(g_trace_histories, histories_size); + } + close(g_trace_fd); + g_trace_fd = -1; + shm_unlink(shm_name); + g_trace_histories = NULL; + + return 1; + +} + +void +spdk_trace_cleanup(void) +{ + bool unlink; + int i; + struct spdk_trace_history *lcore_history; + + if (g_trace_histories == NULL) { + return; + } + + /* + * Only unlink the shm if there were no trace_entry recorded. This ensures the file + * can be used after this process exits/crashes for debugging. + * Note that we have to calculate this value before g_trace_histories gets unmapped. + */ + for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) { + lcore_history = spdk_get_per_lcore_history(g_trace_histories, i); + unlink = lcore_history->entries[0].tsc == 0; + if (!unlink) { + break; + } + } + + munmap(g_trace_histories, sizeof(struct spdk_trace_histories)); + g_trace_histories = NULL; + close(g_trace_fd); + + if (unlink) { + shm_unlink(g_shm_name); + } +} diff --git a/src/spdk/lib/trace/trace_flags.c b/src/spdk/lib/trace/trace_flags.c new file mode 100644 index 000000000..615afe355 --- /dev/null +++ b/src/spdk/lib/trace/trace_flags.c @@ -0,0 +1,323 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/trace.h" +#include "spdk/log.h" +#include "spdk_internal/log.h" + +struct spdk_trace_flags *g_trace_flags = NULL; +static struct spdk_trace_register_fn *g_reg_fn_head = NULL; + +SPDK_LOG_REGISTER_COMPONENT("trace", SPDK_LOG_TRACE) + +uint64_t +spdk_trace_get_tpoint_mask(uint32_t group_id) +{ + if (group_id >= SPDK_TRACE_MAX_GROUP_ID) { + SPDK_ERRLOG("invalid group ID %d\n", group_id); + return 0ULL; + } + + return g_trace_flags->tpoint_mask[group_id]; +} + +void +spdk_trace_set_tpoints(uint32_t group_id, uint64_t tpoint_mask) +{ + if (group_id >= SPDK_TRACE_MAX_GROUP_ID) { + SPDK_ERRLOG("invalid group ID %d\n", group_id); + return; + } + + g_trace_flags->tpoint_mask[group_id] |= tpoint_mask; +} + +void +spdk_trace_clear_tpoints(uint32_t group_id, uint64_t tpoint_mask) +{ + if (group_id >= SPDK_TRACE_MAX_GROUP_ID) { + SPDK_ERRLOG("invalid group ID %d\n", group_id); + return; + } + + g_trace_flags->tpoint_mask[group_id] &= ~tpoint_mask; +} + +uint64_t +spdk_trace_get_tpoint_group_mask(void) +{ + uint64_t mask = 0x0; + int i; + + for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) { + if (spdk_trace_get_tpoint_mask(i) != 0) { + mask |= (1ULL << i); + } + } + + return mask; +} + +void +spdk_trace_set_tpoint_group_mask(uint64_t tpoint_group_mask) +{ + int i; + + for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) { + if (tpoint_group_mask & (1ULL << i)) { + spdk_trace_set_tpoints(i, -1ULL); + } + } +} + +void +spdk_trace_clear_tpoint_group_mask(uint64_t tpoint_group_mask) +{ + int i; + + for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) { + if (tpoint_group_mask & (1ULL << i)) { + spdk_trace_clear_tpoints(i, -1ULL); + } + } +} + +struct spdk_trace_register_fn * +spdk_trace_get_first_register_fn(void) +{ + return g_reg_fn_head; +} + +struct spdk_trace_register_fn * +spdk_trace_get_next_register_fn(struct spdk_trace_register_fn *register_fn) +{ + return register_fn->next; +} + +static uint64_t +trace_create_tpoint_group_mask(const char *group_name) +{ + uint64_t tpoint_group_mask = 0; + struct spdk_trace_register_fn *register_fn; + + register_fn = spdk_trace_get_first_register_fn(); + if (strcmp(group_name, "all") == 0) { + while (register_fn) { + tpoint_group_mask |= (1UL << register_fn->tgroup_id); + + register_fn = spdk_trace_get_next_register_fn(register_fn); + } + } else { + while (register_fn) { + if (strcmp(group_name, register_fn->name) == 0) { + break; + } + + register_fn = spdk_trace_get_next_register_fn(register_fn); + } + + if (register_fn != NULL) { + tpoint_group_mask |= (1UL << register_fn->tgroup_id); + } + } + + return tpoint_group_mask; +} + +int +spdk_trace_enable_tpoint_group(const char *group_name) +{ + uint64_t tpoint_group_mask = 0; + + tpoint_group_mask = trace_create_tpoint_group_mask(group_name); + if (tpoint_group_mask == 0) { + return -1; + } + + spdk_trace_set_tpoint_group_mask(tpoint_group_mask); + return 0; +} + +int +spdk_trace_disable_tpoint_group(const char *group_name) +{ + 
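+ /* Mirrors spdk_trace_enable_tpoint_group(): resolve the group name (or "all")
+  * into a tpoint group mask and clear, rather than set, the matching tpoints.
+  */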
uint64_t tpoint_group_mask = 0; + + tpoint_group_mask = trace_create_tpoint_group_mask(group_name); + if (tpoint_group_mask == 0) { + return -1; + } + + spdk_trace_clear_tpoint_group_mask(tpoint_group_mask); + return 0; +} + +void +spdk_trace_mask_usage(FILE *f, const char *tmask_arg) +{ + struct spdk_trace_register_fn *register_fn; + + fprintf(f, " %s, --tpoint-group-mask <mask>\n", tmask_arg); + fprintf(f, " tracepoint group mask for spdk trace buffers (default 0x0"); + + register_fn = g_reg_fn_head; + while (register_fn) { + fprintf(f, ", %s 0x%x", register_fn->name, 1 << register_fn->tgroup_id); + register_fn = register_fn->next; + } + + fprintf(f, ", all 0xffff)\n"); +} + +void +spdk_trace_register_owner(uint8_t type, char id_prefix) +{ + struct spdk_trace_owner *owner; + + assert(type != OWNER_NONE); + + /* 'owner' has 256 entries and since 'type' is a uint8_t, it + * can't overrun the array. + */ + owner = &g_trace_flags->owner[type]; + assert(owner->type == 0); + + owner->type = type; + owner->id_prefix = id_prefix; +} + +void +spdk_trace_register_object(uint8_t type, char id_prefix) +{ + struct spdk_trace_object *object; + + assert(type != OBJECT_NONE); + + /* 'object' has 256 entries and since 'type' is a uint8_t, it + * can't overrun the array. + */ + object = &g_trace_flags->object[type]; + assert(object->type == 0); + + object->type = type; + object->id_prefix = id_prefix; +} + +void +spdk_trace_register_description(const char *name, uint16_t tpoint_id, uint8_t owner_type, + uint8_t object_type, uint8_t new_object, + uint8_t arg1_type, const char *arg1_name) +{ + struct spdk_trace_tpoint *tpoint; + + assert(tpoint_id != 0); + assert(tpoint_id < SPDK_TRACE_MAX_TPOINT_ID); + + if (strnlen(name, sizeof(tpoint->name)) == sizeof(tpoint->name)) { + SPDK_ERRLOG("name (%s) too long\n", name); + } + + tpoint = &g_trace_flags->tpoint[tpoint_id]; + assert(tpoint->tpoint_id == 0); + + snprintf(tpoint->name, sizeof(tpoint->name), "%s", name); + tpoint->tpoint_id = tpoint_id; + tpoint->object_type = object_type; + tpoint->owner_type = owner_type; + tpoint->new_object = new_object; + tpoint->arg1_type = arg1_type; + snprintf(tpoint->arg1_name, sizeof(tpoint->arg1_name), "%s", arg1_name); +} + +void +spdk_trace_add_register_fn(struct spdk_trace_register_fn *reg_fn) +{ + struct spdk_trace_register_fn *_reg_fn; + + if (reg_fn->name == NULL) { + SPDK_ERRLOG("missing name for registering spdk trace tpoint group\n"); + assert(false); + return; + } + + if (strcmp(reg_fn->name, "all") == 0) { + SPDK_ERRLOG("illegal name (%s) for tpoint group\n", reg_fn->name); + assert(false); + return; + } + + /* Ensure that no trace point group IDs and names are ever duplicated */ + for (_reg_fn = g_reg_fn_head; _reg_fn; _reg_fn = _reg_fn->next) { + if (reg_fn->tgroup_id == _reg_fn->tgroup_id) { + SPDK_ERRLOG("duplicate tgroup_id (%d) with %s\n", _reg_fn->tgroup_id, _reg_fn->name); + assert(false); + return; + } + + if (strcmp(reg_fn->name, _reg_fn->name) == 0) { + SPDK_ERRLOG("duplicate name with %s\n", _reg_fn->name); + assert(false); + return; + } + } + + /* Arrange trace registration in order on tgroup_id */ + if (g_reg_fn_head == NULL || reg_fn->tgroup_id < g_reg_fn_head->tgroup_id) { + reg_fn->next = g_reg_fn_head; + g_reg_fn_head = reg_fn; + return; + } + + for (_reg_fn = g_reg_fn_head; _reg_fn; _reg_fn = _reg_fn->next) { + if (_reg_fn->next == NULL || reg_fn->tgroup_id < _reg_fn->next->tgroup_id) { + reg_fn->next = _reg_fn->next; + _reg_fn->next = reg_fn; + return; + } + } +} + +void 
+spdk_trace_flags_init(void) +{ + struct spdk_trace_register_fn *reg_fn; + + reg_fn = g_reg_fn_head; + while (reg_fn) { + reg_fn->reg_fn(); + reg_fn = reg_fn->next; + } +} diff --git a/src/spdk/lib/trace/trace_rpc.c b/src/spdk/lib/trace/trace_rpc.c new file mode 100644 index 000000000..90dbfbc60 --- /dev/null +++ b/src/spdk/lib/trace/trace_rpc.c @@ -0,0 +1,170 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/trace.h" +#include "spdk_internal/log.h" + +struct rpc_tpoint_group { + char *name; +}; + +static void +free_rpc_tpoint_group(struct rpc_tpoint_group *p) +{ + free(p->name); +} + +static const struct spdk_json_object_decoder rpc_tpoint_group_decoders[] = { + {"name", offsetof(struct rpc_tpoint_group, name), spdk_json_decode_string}, +}; + +static void +rpc_trace_enable_tpoint_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tpoint_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_tpoint_group_decoders, + SPDK_COUNTOF(rpc_tpoint_group_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_TRACE, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.name == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_TRACE, "flag was NULL\n"); + goto invalid; + } + + if (spdk_trace_enable_tpoint_group(req.name)) { + goto invalid; + } + + free_rpc_tpoint_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_tpoint_group(&req); +} +SPDK_RPC_REGISTER("trace_enable_tpoint_group", rpc_trace_enable_tpoint_group, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(trace_enable_tpoint_group, enable_tpoint_group) + +static void +rpc_trace_disable_tpoint_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tpoint_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_tpoint_group_decoders, + SPDK_COUNTOF(rpc_tpoint_group_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_TRACE, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.name == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_TRACE, "flag was NULL\n"); + goto invalid; + } + + if (spdk_trace_disable_tpoint_group(req.name)) { + goto invalid; + } + + free_rpc_tpoint_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_tpoint_group(&req); +} +SPDK_RPC_REGISTER("trace_disable_tpoint_group", rpc_trace_disable_tpoint_group, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(trace_disable_tpoint_group, disable_tpoint_group) + +static void +rpc_trace_get_tpoint_group_mask(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + uint64_t tpoint_group_mask; + char mask_str[7]; + bool enabled; + struct spdk_json_write_ctx *w; + struct spdk_trace_register_fn *register_fn; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "trace_get_tpoint_group_mask requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + tpoint_group_mask = spdk_trace_get_tpoint_group_mask(); + + spdk_json_write_object_begin(w); + + snprintf(mask_str, sizeof(mask_str), "0x%lx", tpoint_group_mask); + spdk_json_write_named_string(w, "tpoint_group_mask", mask_str); + + register_fn = spdk_trace_get_first_register_fn(); + while (register_fn) { + enabled = spdk_trace_get_tpoint_mask(register_fn->tgroup_id) != 0; + + spdk_json_write_named_object_begin(w, register_fn->name); + spdk_json_write_named_bool(w, 
"enabled", enabled); + + snprintf(mask_str, sizeof(mask_str), "0x%lx", (1UL << register_fn->tgroup_id)); + spdk_json_write_named_string(w, "mask", mask_str); + spdk_json_write_object_end(w); + + register_fn = spdk_trace_get_next_register_fn(register_fn); + } + + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("trace_get_tpoint_group_mask", rpc_trace_get_tpoint_group_mask, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(trace_get_tpoint_group_mask, get_tpoint_group_mask) diff --git a/src/spdk/lib/ut_mock/Makefile b/src/spdk/lib/ut_mock/Makefile new file mode 100644 index 000000000..f4087807f --- /dev/null +++ b/src/spdk/lib/ut_mock/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = mock.c +LIBNAME = ut_mock + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/ut_mock/mock.c b/src/spdk/lib/ut_mock/mock.c new file mode 100644 index 000000000..cfe51c1d5 --- /dev/null +++ b/src/spdk/lib/ut_mock/mock.c @@ -0,0 +1,71 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk_internal/mock.h" + +DEFINE_WRAPPER(calloc, void *, (size_t nmemb, size_t size), (nmemb, size)) + +DEFINE_WRAPPER(pthread_mutex_init, int, + (pthread_mutex_t *mtx, const pthread_mutexattr_t *attr), + (mtx, attr)) + +DEFINE_WRAPPER(pthread_mutexattr_init, int, + (pthread_mutexattr_t *attr), (attr)) + +DEFINE_WRAPPER(recvmsg, ssize_t, (int sockfd, struct msghdr *msg, int flags), (sockfd, msg, flags)) + +DEFINE_WRAPPER(sendmsg, ssize_t, (int sockfd, const struct msghdr *msg, int flags), (sockfd, msg, + flags)) + +DEFINE_WRAPPER(writev, ssize_t, (int fd, const struct iovec *iov, int iovcnt), (fd, iov, iovcnt)) + +char *g_unlink_path; +void (*g_unlink_callback)(void); + +int +__attribute__((used)) +__wrap_unlink(const char *path) +{ + if (g_unlink_path == NULL) { + return ENOENT; + } + + if (strcmp(g_unlink_path, path) != 0) { + return ENOENT; + } + + if (g_unlink_callback) { + g_unlink_callback(); + } + return 0; +} diff --git a/src/spdk/lib/util/Makefile b/src/spdk/lib/util/Makefile new file mode 100644 index 000000000..23f8db6d0 --- /dev/null +++ b/src/spdk/lib/util/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = base64.c bit_array.c cpuset.c crc16.c crc32.c crc32c.c crc32_ieee.c \ + dif.c fd.c file.c iov.c math.c pipe.c strerror_tls.c string.c uuid.c +LIBNAME = util +LOCAL_SYS_LIBS = -luuid + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_util.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/util/base64.c b/src/spdk/lib/util/base64.c new file mode 100644 index 000000000..adc5e15da --- /dev/null +++ b/src/spdk/lib/util/base64.c @@ -0,0 +1,262 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/endian.h" +#include "spdk/base64.h" + +#ifdef __aarch64__ +#include "base64_neon.c" +#endif + +#define BASE64_ENC_BITMASK 0x3FUL +#define BASE64_PADDING_CHAR '=' + +static const char base64_enc_table[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +static const char base64_urfsafe_enc_table[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789-_"; + +static const uint8_t +base64_dec_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, + 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +}; + +static const uint8_t +base64_urlsafe_dec_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 63, + 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +}; + +static int +base64_encode(char *dst, const char *enc_table, const void *src, size_t src_len) +{ + uint32_t raw_u32; + + if (!dst || !src || src_len <= 0) { + return -EINVAL; + } + +#ifdef __aarch64__ + base64_encode_neon64(&dst, enc_table, &src, &src_len); +#endif + + while (src_len >= 4) { + raw_u32 = from_be32(src); + + *dst++ = enc_table[(raw_u32 >> 26) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 20) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 14) & 
BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 8) & BASE64_ENC_BITMASK]; + + src_len -= 3; + src += 3; + } + + if (src_len == 0) { + goto out; + } + + raw_u32 = 0; + memcpy(&raw_u32, src, src_len); + raw_u32 = from_be32(&raw_u32); + + *dst++ = enc_table[(raw_u32 >> 26) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 20) & BASE64_ENC_BITMASK]; + *dst++ = (src_len >= 2) ? enc_table[(raw_u32 >> 14) & BASE64_ENC_BITMASK] : BASE64_PADDING_CHAR; + *dst++ = (src_len == 3) ? enc_table[(raw_u32 >> 8) & BASE64_ENC_BITMASK] : BASE64_PADDING_CHAR; + +out: + *dst = '\0'; + + return 0; +} + +int +spdk_base64_encode(char *dst, const void *src, size_t src_len) +{ + return base64_encode(dst, base64_enc_table, src, src_len); +} + +int +spdk_base64_urlsafe_encode(char *dst, const void *src, size_t src_len) +{ + return base64_encode(dst, base64_urfsafe_enc_table, src, src_len); +} + +#ifdef __aarch64__ +static int +base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table, + const uint8_t *dec_table_opt, const char *src) +#else +static int +base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table, const char *src) +#endif +{ + size_t src_strlen; + size_t tail_len = 0; + const uint8_t *src_in; + uint32_t tmp[4]; + int i; + + if (!src) { + return -EINVAL; + } + + src_strlen = strlen(src); + + /* strlen of src should be 4n */ + if (src_strlen == 0 || src_strlen % 4 != 0) { + return -EINVAL; + } + + /* Consider Base64 padding, it at most has 2 padding characters. */ + for (i = 0; i < 2; i++) { + if (src[src_strlen - 1] != BASE64_PADDING_CHAR) { + break; + } + src_strlen--; + } + + /* strlen of src without padding shouldn't be 4n+1 */ + if (src_strlen == 0 || src_strlen % 4 == 1) { + return -EINVAL; + } + + if (_dst_len) { + *_dst_len = spdk_base64_get_decoded_len(src_strlen); + } + + /* If dst is NULL, the client is only concerned w/ _dst_len, return */ + if (!dst) { + return 0; + } + + src_in = (const uint8_t *) src; + +#ifdef __aarch64__ + base64_decode_neon64(&dst, dec_table_opt, &src_in, &src_strlen); + + if (src_strlen == 0) { + return 0; + } +#endif + + /* space of dst can be used by to_be32 */ + while (src_strlen > 4) { + tmp[0] = dec_table[*src_in++]; + tmp[1] = dec_table[*src_in++]; + tmp[2] = dec_table[*src_in++]; + tmp[3] = dec_table[*src_in++]; + + if (tmp[0] == 255 || tmp[1] == 255 || tmp[2] == 255 || tmp[3] == 255) { + return -EINVAL; + } + + to_be32(dst, tmp[3] << 8 | tmp[2] << 14 | tmp[1] << 20 | tmp[0] << 26); + + dst += 3; + src_strlen -= 4; + } + + /* space of dst is not enough to be used by to_be32 */ + tmp[0] = dec_table[src_in[0]]; + tmp[1] = dec_table[src_in[1]]; + tmp[2] = (src_strlen >= 3) ? dec_table[src_in[2]] : 0; + tmp[3] = (src_strlen == 4) ? 
dec_table[src_in[3]] : 0; + tail_len = src_strlen - 1; + + if (tmp[0] == 255 || tmp[1] == 255 || tmp[2] == 255 || tmp[3] == 255) { + return -EINVAL; + } + + to_be32(&tmp[3], tmp[3] << 8 | tmp[2] << 14 | tmp[1] << 20 | tmp[0] << 26); + memcpy(dst, (uint8_t *)&tmp[3], tail_len); + + return 0; +} + +int +spdk_base64_decode(void *dst, size_t *dst_len, const char *src) +{ +#ifdef __aarch64__ + return base64_decode(dst, dst_len, base64_dec_table, base64_dec_table_neon64, src); +#else + return base64_decode(dst, dst_len, base64_dec_table, src); +#endif +} + +int +spdk_base64_urlsafe_decode(void *dst, size_t *dst_len, const char *src) +{ +#ifdef __aarch64__ + return base64_decode(dst, dst_len, base64_urlsafe_dec_table, base64_urlsafe_dec_table_neon64, + src); +#else + return base64_decode(dst, dst_len, base64_urlsafe_dec_table, src); +#endif +} diff --git a/src/spdk/lib/util/base64_neon.c b/src/spdk/lib/util/base64_neon.c new file mode 100644 index 000000000..971cff06c --- /dev/null +++ b/src/spdk/lib/util/base64_neon.c @@ -0,0 +1,225 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2005-2007, Nick Galbreath + * Copyright (c) 2013-2017, Alfred Klomp + * Copyright (c) 2015-2017, Wojciech Mula + * Copyright (c) 2016-2017, Matthieu Darbois + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __aarch64__ +#error Unsupported hardware +#endif + +#include "spdk/stdinc.h" +/* + * Encoding + * Use a 64-byte lookup to do the encoding. + * Reuse existing base64_dec_table and base64_dec_table. + + * Decoding + * The input consists of five valid character sets in the Base64 alphabet, + * which we need to map back to the 6-bit values they represent. + * There are three ranges, two singles, and then there's the rest. 
+ * + * LUT1[0-63] = base64_dec_table_neon64[0-63] + * LUT2[0-63] = base64_dec_table_neon64[64-127] + * # From To LUT Characters + * 1 [0..42] [255] #1 invalid input + * 2 [43] [62] #1 + + * 3 [44..46] [255] #1 invalid input + * 4 [47] [63] #1 / + * 5 [48..57] [52..61] #1 0..9 + * 6 [58..63] [255] #1 invalid input + * 7 [64] [255] #2 invalid input + * 8 [65..90] [0..25] #2 A..Z + * 9 [91..96] [255] #2 invalid input + * 10 [97..122] [26..51] #2 a..z + * 11 [123..126] [255] #2 invalid input + * (12) Everything else => invalid input + */ +static const uint8_t base64_dec_table_neon64[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, + 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255 +}; + +/* + * LUT1[0-63] = base64_urlsafe_dec_table_neon64[0-63] + * LUT2[0-63] = base64_urlsafe_dec_table_neon64[64-127] + * # From To LUT Characters + * 1 [0..44] [255] #1 invalid input + * 2 [45] [62] #1 - + * 3 [46..47] [255] #1 invalid input + * 5 [48..57] [52..61] #1 0..9 + * 6 [58..63] [255] #1 invalid input + * 7 [64] [255] #2 invalid input + * 8 [65..90] [0..25] #2 A..Z + * 9 [91..94] [255] #2 invalid input + * 10 [95] [63] #2 _ + * 11 [96] [255] #2 invalid input + * 12 [97..122] [26..51] #2 a..z + * 13 [123..126] [255] #2 invalid input + * (14) Everything else => invalid input + */ +static const uint8_t base64_urlsafe_dec_table_neon64[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, + 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255 +}; + +#include <arm_neon.h> +#define CMPGT(s,n) vcgtq_u8((s), vdupq_n_u8(n)) + +static inline uint8x16x4_t +load_64byte_table(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} + +static void +base64_encode_neon64(char **dst, const char *enc_table, const void **src, size_t *src_len) +{ + const uint8x16x4_t tbl_enc = load_64byte_table(enc_table); + + while (*src_len >= 48) { + uint8x16x3_t str; + uint8x16x4_t res; + + /* Load 48 bytes and deinterleave */ + str = vld3q_u8((uint8_t *)*src); + + /* Divide bits of three input bytes over four output bytes and clear top two bits */ + res.val[0] = vshrq_n_u8(str.val[0], 2); + res.val[1] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[1], 4), vshlq_n_u8(str.val[0], 4)), + vdupq_n_u8(0x3F)); + res.val[2] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[2], 6), vshlq_n_u8(str.val[1], 2)), + vdupq_n_u8(0x3F)); + res.val[3] = vandq_u8(str.val[2], vdupq_n_u8(0x3F)); + + /* + * The bits have now been shifted to the right locations; + * translate their values 0..63 to the 
Base64 alphabet. + * Use a 64-byte table lookup: + */ + res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]); + res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]); + res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]); + res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]); + + /* Interleave and store result */ + vst4q_u8((uint8_t *)*dst, res); + + *src += 48; /* 3 * 16 bytes of input */ + *dst += 64; /* 4 * 16 bytes of output */ + *src_len -= 48; + } +} + +static void +base64_decode_neon64(void **dst, const uint8_t *dec_table_neon64, const uint8_t **src, + size_t *src_len) +{ + /* + * First LUT tbl_dec1 will use VTBL instruction (out of range indices are set to 0 in destination). + * Second LUT tbl_dec2 will use VTBX instruction (out of range indices will be unchanged in destination). + * Input [64..126] will be mapped to index [1..63] in tb1_dec2. Index 0 means that value comes from tb1_dec1. + */ + const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_table_neon64); + const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_table_neon64 + 64); + const uint8x16_t offset = vdupq_n_u8(63U); + + while (*src_len >= 64) { + + uint8x16x4_t dec1, dec2; + uint8x16x3_t dec; + + /* Load 64 bytes and deinterleave */ + uint8x16x4_t str = vld4q_u8((uint8_t *)*src); + + /* Get indices for 2nd LUT */ + dec2.val[0] = vqsubq_u8(str.val[0], offset); + dec2.val[1] = vqsubq_u8(str.val[1], offset); + dec2.val[2] = vqsubq_u8(str.val[2], offset); + dec2.val[3] = vqsubq_u8(str.val[3], offset); + + /* Get values from 1st LUT */ + dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]); + dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]); + dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]); + dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]); + + /* Get values from 2nd LUT */ + dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]); + dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]); + dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]); + dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]); + + /* Get final values */ + str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]); + str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]); + str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]); + str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]); + + /* Check for invalid input, any value larger than 63 */ + uint8x16_t classified = CMPGT(str.val[0], 63); + classified = vorrq_u8(classified, CMPGT(str.val[1], 63)); + classified = vorrq_u8(classified, CMPGT(str.val[2], 63)); + classified = vorrq_u8(classified, CMPGT(str.val[3], 63)); + + /* check that all bits are zero */ + if (vmaxvq_u8(classified) != 0U) { + break; + } + + /* Compress four bytes into three */ + dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4)); + dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2)); + dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]); + + /* Interleave and store decoded result */ + vst3q_u8((uint8_t *)*dst, dec); + + *src += 64; + *dst += 48; + *src_len -= 64; + } +} diff --git a/src/spdk/lib/util/bit_array.c b/src/spdk/lib/util/bit_array.c new file mode 100644 index 000000000..43c1a4d9b --- /dev/null +++ b/src/spdk/lib/util/bit_array.c @@ -0,0 +1,363 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bit_array.h" +#include "spdk/env.h" + +#include "spdk/likely.h" +#include "spdk/util.h" + +typedef uint64_t spdk_bit_array_word; +#define SPDK_BIT_ARRAY_WORD_TZCNT(x) (__builtin_ctzll(x)) +#define SPDK_BIT_ARRAY_WORD_POPCNT(x) (__builtin_popcountll(x)) +#define SPDK_BIT_ARRAY_WORD_C(x) ((spdk_bit_array_word)(x)) +#define SPDK_BIT_ARRAY_WORD_BYTES sizeof(spdk_bit_array_word) +#define SPDK_BIT_ARRAY_WORD_BITS (SPDK_BIT_ARRAY_WORD_BYTES * 8) +#define SPDK_BIT_ARRAY_WORD_INDEX_SHIFT spdk_u32log2(SPDK_BIT_ARRAY_WORD_BITS) +#define SPDK_BIT_ARRAY_WORD_INDEX_MASK ((1u << SPDK_BIT_ARRAY_WORD_INDEX_SHIFT) - 1) + +struct spdk_bit_array { + uint32_t bit_count; + spdk_bit_array_word words[]; +}; + +struct spdk_bit_array * +spdk_bit_array_create(uint32_t num_bits) +{ + struct spdk_bit_array *ba = NULL; + + spdk_bit_array_resize(&ba, num_bits); + + return ba; +} + +void +spdk_bit_array_free(struct spdk_bit_array **bap) +{ + struct spdk_bit_array *ba; + + if (!bap) { + return; + } + + ba = *bap; + *bap = NULL; + spdk_free(ba); +} + +static inline uint32_t +bit_array_word_count(uint32_t num_bits) +{ + return (num_bits + SPDK_BIT_ARRAY_WORD_BITS - 1) >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT; +} + +static inline spdk_bit_array_word +bit_array_word_mask(uint32_t num_bits) +{ + assert(num_bits < SPDK_BIT_ARRAY_WORD_BITS); + return (SPDK_BIT_ARRAY_WORD_C(1) << num_bits) - 1; +} + +int +spdk_bit_array_resize(struct spdk_bit_array **bap, uint32_t num_bits) +{ + struct spdk_bit_array *new_ba; + uint32_t old_word_count, new_word_count; + size_t new_size; + + /* + * Max number of bits allowed is UINT32_MAX - 1, because we use UINT32_MAX to denote + * when a set or cleared bit cannot be found. 
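+ * For example, spdk_bit_array_find_first_set() below returns UINT32_MAX when
+ * no set bit is found, so a valid bit index can never be confused with it.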
+ */ + if (!bap || num_bits == UINT32_MAX) { + return -EINVAL; + } + + new_word_count = bit_array_word_count(num_bits); + new_size = offsetof(struct spdk_bit_array, words) + new_word_count * SPDK_BIT_ARRAY_WORD_BYTES; + + /* + * Always keep one extra word with a 0 and a 1 past the actual required size so that the + * find_first functions can just keep going until they match. + */ + new_size += SPDK_BIT_ARRAY_WORD_BYTES; + + new_ba = (struct spdk_bit_array *)spdk_realloc(*bap, new_size, 64); + if (!new_ba) { + return -ENOMEM; + } + + /* + * Set up special extra word (see above comment about find_first_clear). + * + * This is set to 0b10 so that find_first_clear will find a 0 at the very first + * bit past the end of the buffer, and find_first_set will find a 1 at the next bit + * past that. + */ + new_ba->words[new_word_count] = 0x2; + + if (*bap == NULL) { + old_word_count = 0; + new_ba->bit_count = 0; + } else { + old_word_count = bit_array_word_count(new_ba->bit_count); + } + + if (new_word_count > old_word_count) { + /* Zero out new entries */ + memset(&new_ba->words[old_word_count], 0, + (new_word_count - old_word_count) * SPDK_BIT_ARRAY_WORD_BYTES); + } else if (new_word_count == old_word_count && num_bits < new_ba->bit_count) { + /* Make sure any existing partial last word is cleared beyond the new num_bits. */ + uint32_t last_word_bits; + spdk_bit_array_word mask; + + last_word_bits = num_bits & SPDK_BIT_ARRAY_WORD_INDEX_MASK; + mask = bit_array_word_mask(last_word_bits); + new_ba->words[old_word_count - 1] &= mask; + } + + new_ba->bit_count = num_bits; + *bap = new_ba; + return 0; +} + +uint32_t +spdk_bit_array_capacity(const struct spdk_bit_array *ba) +{ + return ba->bit_count; +} + +static inline int +bit_array_get_word(const struct spdk_bit_array *ba, uint32_t bit_index, + uint32_t *word_index, uint32_t *word_bit_index) +{ + if (spdk_unlikely(bit_index >= ba->bit_count)) { + return -EINVAL; + } + + *word_index = bit_index >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT; + *word_bit_index = bit_index & SPDK_BIT_ARRAY_WORD_INDEX_MASK; + + return 0; +} + +bool +spdk_bit_array_get(const struct spdk_bit_array *ba, uint32_t bit_index) +{ + uint32_t word_index, word_bit_index; + + if (bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) { + return false; + } + + return (ba->words[word_index] >> word_bit_index) & 1U; +} + +int +spdk_bit_array_set(struct spdk_bit_array *ba, uint32_t bit_index) +{ + uint32_t word_index, word_bit_index; + + if (bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) { + return -EINVAL; + } + + ba->words[word_index] |= (SPDK_BIT_ARRAY_WORD_C(1) << word_bit_index); + return 0; +} + +void +spdk_bit_array_clear(struct spdk_bit_array *ba, uint32_t bit_index) +{ + uint32_t word_index, word_bit_index; + + if (bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) { + /* + * Clearing past the end of the bit array is a no-op, since bit past the end + * are implicitly 0. 
+ */ + return; + } + + ba->words[word_index] &= ~(SPDK_BIT_ARRAY_WORD_C(1) << word_bit_index); +} + +static inline uint32_t +bit_array_find_first(const struct spdk_bit_array *ba, uint32_t start_bit_index, + spdk_bit_array_word xor_mask) +{ + uint32_t word_index, first_word_bit_index; + spdk_bit_array_word word, first_word_mask; + const spdk_bit_array_word *words, *cur_word; + + if (spdk_unlikely(start_bit_index >= ba->bit_count)) { + return ba->bit_count; + } + + word_index = start_bit_index >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT; + words = ba->words; + cur_word = &words[word_index]; + + /* + * Special case for first word: skip start_bit_index % SPDK_BIT_ARRAY_WORD_BITS bits + * within the first word. + */ + first_word_bit_index = start_bit_index & SPDK_BIT_ARRAY_WORD_INDEX_MASK; + first_word_mask = bit_array_word_mask(first_word_bit_index); + + word = (*cur_word ^ xor_mask) & ~first_word_mask; + + /* + * spdk_bit_array_resize() guarantees that an extra word with a 1 and a 0 will always be + * at the end of the words[] array, so just keep going until a word matches. + */ + while (word == 0) { + word = *++cur_word ^ xor_mask; + } + + return ((uintptr_t)cur_word - (uintptr_t)words) * 8 + SPDK_BIT_ARRAY_WORD_TZCNT(word); +} + + +uint32_t +spdk_bit_array_find_first_set(const struct spdk_bit_array *ba, uint32_t start_bit_index) +{ + uint32_t bit_index; + + bit_index = bit_array_find_first(ba, start_bit_index, 0); + + /* + * If we ran off the end of the array and found the 1 bit in the extra word, + * return UINT32_MAX to indicate no actual 1 bits were found. + */ + if (bit_index >= ba->bit_count) { + bit_index = UINT32_MAX; + } + + return bit_index; +} + +uint32_t +spdk_bit_array_find_first_clear(const struct spdk_bit_array *ba, uint32_t start_bit_index) +{ + uint32_t bit_index; + + bit_index = bit_array_find_first(ba, start_bit_index, SPDK_BIT_ARRAY_WORD_C(-1)); + + /* + * If we ran off the end of the array and found the 0 bit in the extra word, + * return UINT32_MAX to indicate no actual 0 bits were found. + */ + if (bit_index >= ba->bit_count) { + bit_index = UINT32_MAX; + } + + return bit_index; +} + +uint32_t +spdk_bit_array_count_set(const struct spdk_bit_array *ba) +{ + const spdk_bit_array_word *cur_word = ba->words; + uint32_t word_count = bit_array_word_count(ba->bit_count); + uint32_t set_count = 0; + + while (word_count--) { + /* + * No special treatment is needed for the last (potentially partial) word, since + * spdk_bit_array_resize() makes sure the bits past bit_count are cleared. 
+ */ + set_count += SPDK_BIT_ARRAY_WORD_POPCNT(*cur_word++); + } + + return set_count; +} + +uint32_t +spdk_bit_array_count_clear(const struct spdk_bit_array *ba) +{ + return ba->bit_count - spdk_bit_array_count_set(ba); +} + +void +spdk_bit_array_store_mask(const struct spdk_bit_array *ba, void *mask) +{ + uint32_t size, i; + uint32_t num_bits = spdk_bit_array_capacity(ba); + + size = num_bits / CHAR_BIT; + memcpy(mask, ba->words, size); + + for (i = 0; i < num_bits % CHAR_BIT; i++) { + if (spdk_bit_array_get(ba, i + size * CHAR_BIT)) { + ((uint8_t *)mask)[size] |= (1U << i); + } else { + ((uint8_t *)mask)[size] &= ~(1U << i); + } + } +} + +void +spdk_bit_array_load_mask(struct spdk_bit_array *ba, const void *mask) +{ + uint32_t size, i; + uint32_t num_bits = spdk_bit_array_capacity(ba); + + size = num_bits / CHAR_BIT; + memcpy(ba->words, mask, size); + + for (i = 0; i < num_bits % CHAR_BIT; i++) { + if (((uint8_t *)mask)[size] & (1U << i)) { + spdk_bit_array_set(ba, i + size * CHAR_BIT); + } else { + spdk_bit_array_clear(ba, i + size * CHAR_BIT); + } + } +} + +void +spdk_bit_array_clear_mask(struct spdk_bit_array *ba) +{ + uint32_t size, i; + uint32_t num_bits = spdk_bit_array_capacity(ba); + + size = num_bits / CHAR_BIT; + memset(ba->words, 0, size); + + for (i = 0; i < num_bits % CHAR_BIT; i++) { + spdk_bit_array_clear(ba, i + size * CHAR_BIT); + } +} diff --git a/src/spdk/lib/util/cpuset.c b/src/spdk/lib/util/cpuset.c new file mode 100644 index 000000000..8d7c8dc89 --- /dev/null +++ b/src/spdk/lib/util/cpuset.c @@ -0,0 +1,336 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/cpuset.h" +#include "spdk/log.h" + +struct spdk_cpuset * +spdk_cpuset_alloc(void) +{ + return (struct spdk_cpuset *)calloc(sizeof(struct spdk_cpuset), 1); +} + +void +spdk_cpuset_free(struct spdk_cpuset *set) +{ + free(set); +} + +bool +spdk_cpuset_equal(const struct spdk_cpuset *set1, const struct spdk_cpuset *set2) +{ + assert(set1 != NULL); + assert(set2 != NULL); + return memcmp(set1->cpus, set2->cpus, sizeof(set2->cpus)) == 0; +} + +void +spdk_cpuset_copy(struct spdk_cpuset *dst, const struct spdk_cpuset *src) +{ + assert(dst != NULL); + assert(src != NULL); + memcpy(&dst->cpus, &src->cpus, sizeof(src->cpus)); +} + +void +spdk_cpuset_negate(struct spdk_cpuset *set) +{ + unsigned int i; + assert(set != NULL); + for (i = 0; i < sizeof(set->cpus); i++) { + set->cpus[i] = ~set->cpus[i]; + } +} + +void +spdk_cpuset_and(struct spdk_cpuset *dst, const struct spdk_cpuset *src) +{ + unsigned int i; + assert(dst != NULL); + assert(src != NULL); + for (i = 0; i < sizeof(src->cpus); i++) { + dst->cpus[i] &= src->cpus[i]; + } +} + +void +spdk_cpuset_or(struct spdk_cpuset *dst, const struct spdk_cpuset *src) +{ + unsigned int i; + assert(dst != NULL); + assert(src != NULL); + for (i = 0; i < sizeof(src->cpus); i++) { + dst->cpus[i] |= src->cpus[i]; + } +} + +void +spdk_cpuset_xor(struct spdk_cpuset *dst, const struct spdk_cpuset *src) +{ + unsigned int i; + assert(dst != NULL); + assert(src != NULL); + for (i = 0; i < sizeof(src->cpus); i++) { + dst->cpus[i] ^= src->cpus[i]; + } +} + +void +spdk_cpuset_zero(struct spdk_cpuset *set) +{ + assert(set != NULL); + memset(set->cpus, 0, sizeof(set->cpus)); +} + +void +spdk_cpuset_set_cpu(struct spdk_cpuset *set, uint32_t cpu, bool state) +{ + assert(set != NULL); + assert(cpu < sizeof(set->cpus) * 8); + if (state) { + set->cpus[cpu / 8] |= (1U << (cpu % 8)); + } else { + set->cpus[cpu / 8] &= ~(1U << (cpu % 8)); + } +} + +bool +spdk_cpuset_get_cpu(const struct spdk_cpuset *set, uint32_t cpu) +{ + assert(set != NULL); + assert(cpu < sizeof(set->cpus) * 8); + return (set->cpus[cpu / 8] >> (cpu % 8)) & 1U; +} + +uint32_t +spdk_cpuset_count(const struct spdk_cpuset *set) +{ + uint32_t count = 0; + uint8_t n; + unsigned int i; + for (i = 0; i < sizeof(set->cpus); i++) { + n = set->cpus[i]; + while (n) { + n &= (n - 1); + count++; + } + } + return count; +} + +const char * +spdk_cpuset_fmt(struct spdk_cpuset *set) +{ + uint32_t lcore, lcore_max = 0; + int val, i, n; + char *ptr; + static const char *hex = "0123456789abcdef"; + + assert(set != NULL); + + for (lcore = 0; lcore < sizeof(set->cpus) * 8; lcore++) { + if (spdk_cpuset_get_cpu(set, lcore)) { + lcore_max = lcore; + } + } + + ptr = set->str; + n = lcore_max / 8; + val = set->cpus[n]; + + /* Store first number only if it is not leading zero */ + if ((val & 0xf0) != 0) { + *(ptr++) = hex[(val & 0xf0) >> 4]; + } + *(ptr++) = hex[val & 0x0f]; + + for (i = n - 1; i >= 0; i--) { + val = set->cpus[i]; + *(ptr++) = hex[(val & 0xf0) >> 4]; + *(ptr++) = hex[val & 0x0f]; + } + *ptr = '\0'; + + return set->str; +} + +static int +hex_value(uint8_t c) +{ +#define V(x, y) [x] = y + 1 + static const int8_t val[256] = { + V('0', 0), V('1', 1), V('2', 2), V('3', 3), V('4', 4), + V('5', 5), V('6', 6), V('7', 7), V('8', 8), V('9', 9), + V('A', 0xA), V('B', 0xB), V('C', 0xC), V('D', 0xD), V('E', 0xE), V('F', 0xF), + V('a', 0xA), V('b', 0xB), V('c', 0xC), V('d', 0xD), V('e', 0xE), V('f', 0xF), + }; +#undef V + + return val[c] - 1; +} + +static int +parse_list(const char *mask, struct spdk_cpuset *set) 
+{ + char *end; + const char *ptr = mask; + uint32_t lcore; + uint32_t lcore_min, lcore_max; + + spdk_cpuset_zero(set); + lcore_min = UINT32_MAX; + + ptr++; + end = (char *)ptr; + do { + while (isblank(*ptr)) { + ptr++; + } + if (*ptr == '\0' || *ptr == ']' || *ptr == '-' || *ptr == ',') { + goto invalid_character; + } + + errno = 0; + lcore = strtoul(ptr, &end, 10); + if (errno) { + SPDK_ERRLOG("Conversion of core mask in '%s' failed\n", mask); + return -1; + } + + if (lcore >= sizeof(set->cpus) * 8) { + SPDK_ERRLOG("Core number %" PRIu32 " is out of range in '%s'\n", lcore, mask); + return -1; + } + + while (isblank(*end)) { + end++; + } + + if (*end == '-') { + lcore_min = lcore; + } else if (*end == ',' || *end == ']') { + lcore_max = lcore; + if (lcore_min == UINT32_MAX) { + lcore_min = lcore; + } + if (lcore_min > lcore_max) { + SPDK_ERRLOG("Invalid range of CPUs (%" PRIu32 " > %" PRIu32 ")\n", + lcore_min, lcore_max); + return -1; + } + for (lcore = lcore_min; lcore <= lcore_max; lcore++) { + spdk_cpuset_set_cpu(set, lcore, true); + } + lcore_min = UINT32_MAX; + } else { + goto invalid_character; + } + + ptr = end + 1; + + } while (*end != ']'); + + return 0; + +invalid_character: + if (*end == '\0') { + SPDK_ERRLOG("Unexpected end of core list '%s'\n", mask); + } else { + SPDK_ERRLOG("Parsing of core list '%s' failed on character '%c'\n", mask, *end); + } + return -1; +} + +static int +parse_mask(const char *mask, struct spdk_cpuset *set, size_t len) +{ + int i, j; + char c; + int val; + uint32_t lcore = 0; + + if (mask[0] == '0' && (mask[1] == 'x' || mask[1] == 'X')) { + mask += 2; + len -= 2; + } + + spdk_cpuset_zero(set); + for (i = len - 1; i >= 0; i--) { + c = mask[i]; + val = hex_value(c); + if (val < 0) { + /* Invalid character */ + SPDK_ERRLOG("Invalid character in core mask '%s' (%c)\n", mask, c); + return -1; + } + for (j = 0; j < 4 && lcore < sizeof(set->cpus); j++, lcore++) { + if ((1 << j) & val) { + spdk_cpuset_set_cpu(set, lcore, true); + } + } + } + + return 0; +} + +int +spdk_cpuset_parse(struct spdk_cpuset *set, const char *mask) +{ + int ret; + size_t len; + + if (mask == NULL || set == NULL) { + return -1; + } + + while (isblank(*mask)) { + mask++; + } + + len = strlen(mask); + while (len > 0 && isblank(mask[len - 1])) { + len--; + } + + if (len == 0) { + return -1; + } + + if (mask[0] == '[') { + ret = parse_list(mask, set); + } else { + ret = parse_mask(mask, set, len); + } + + return ret; +} diff --git a/src/spdk/lib/util/crc16.c b/src/spdk/lib/util/crc16.c new file mode 100644 index 000000000..2ba168c4b --- /dev/null +++ b/src/spdk/lib/util/crc16.c @@ -0,0 +1,668 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/crc16.h" +#include "spdk/config.h" + +/* + * Use Intelligent Storage Acceleration Library for line speed CRC + */ + +#ifdef SPDK_CONFIG_ISAL +#include "isa-l/include/crc.h" + +uint16_t +spdk_crc16_t10dif(uint16_t init_crc, const void *buf, size_t len) +{ + return (crc16_t10dif(init_crc, buf, len)); +} + +uint16_t +spdk_crc16_t10dif_copy(uint16_t init_crc, uint8_t *dst, uint8_t *src, + size_t len) +{ + return (crc16_t10dif_copy(init_crc, dst, src, len)); +} + +#else +/* + * Use table-driven (somewhat faster) CRC + */ + +/* + * Static tables used for the table_driven implementation. + */ + +static const uint16_t crc_table_fast[16][256] = { + { + 0x0000u, 0x8BB7u, 0x9CD9u, 0x176Eu, 0xB205u, 0x39B2u, 0x2EDCu, 0xA56Bu, + 0xEFBDu, 0x640Au, 0x7364u, 0xF8D3u, 0x5DB8u, 0xD60Fu, 0xC161u, 0x4AD6u, + 0x54CDu, 0xDF7Au, 0xC814u, 0x43A3u, 0xE6C8u, 0x6D7Fu, 0x7A11u, 0xF1A6u, + 0xBB70u, 0x30C7u, 0x27A9u, 0xAC1Eu, 0x0975u, 0x82C2u, 0x95ACu, 0x1E1Bu, + 0xA99Au, 0x222Du, 0x3543u, 0xBEF4u, 0x1B9Fu, 0x9028u, 0x8746u, 0x0CF1u, + 0x4627u, 0xCD90u, 0xDAFEu, 0x5149u, 0xF422u, 0x7F95u, 0x68FBu, 0xE34Cu, + 0xFD57u, 0x76E0u, 0x618Eu, 0xEA39u, 0x4F52u, 0xC4E5u, 0xD38Bu, 0x583Cu, + 0x12EAu, 0x995Du, 0x8E33u, 0x0584u, 0xA0EFu, 0x2B58u, 0x3C36u, 0xB781u, + 0xD883u, 0x5334u, 0x445Au, 0xCFEDu, 0x6A86u, 0xE131u, 0xF65Fu, 0x7DE8u, + 0x373Eu, 0xBC89u, 0xABE7u, 0x2050u, 0x853Bu, 0x0E8Cu, 0x19E2u, 0x9255u, + 0x8C4Eu, 0x07F9u, 0x1097u, 0x9B20u, 0x3E4Bu, 0xB5FCu, 0xA292u, 0x2925u, + 0x63F3u, 0xE844u, 0xFF2Au, 0x749Du, 0xD1F6u, 0x5A41u, 0x4D2Fu, 0xC698u, + 0x7119u, 0xFAAEu, 0xEDC0u, 0x6677u, 0xC31Cu, 0x48ABu, 0x5FC5u, 0xD472u, + 0x9EA4u, 0x1513u, 0x027Du, 0x89CAu, 0x2CA1u, 0xA716u, 0xB078u, 0x3BCFu, + 0x25D4u, 0xAE63u, 0xB90Du, 0x32BAu, 0x97D1u, 0x1C66u, 0x0B08u, 0x80BFu, + 0xCA69u, 0x41DEu, 0x56B0u, 0xDD07u, 0x786Cu, 0xF3DBu, 0xE4B5u, 0x6F02u, + 0x3AB1u, 0xB106u, 0xA668u, 0x2DDFu, 0x88B4u, 0x0303u, 0x146Du, 0x9FDAu, + 0xD50Cu, 0x5EBBu, 0x49D5u, 0xC262u, 0x6709u, 0xECBEu, 0xFBD0u, 0x7067u, + 0x6E7Cu, 0xE5CBu, 0xF2A5u, 0x7912u, 0xDC79u, 0x57CEu, 0x40A0u, 0xCB17u, + 0x81C1u, 0x0A76u, 0x1D18u, 0x96AFu, 0x33C4u, 0xB873u, 0xAF1Du, 0x24AAu, + 0x932Bu, 0x189Cu, 0x0FF2u, 0x8445u, 0x212Eu, 0xAA99u, 0xBDF7u, 0x3640u, + 0x7C96u, 0xF721u, 0xE04Fu, 0x6BF8u, 0xCE93u, 0x4524u, 0x524Au, 0xD9FDu, + 0xC7E6u, 0x4C51u, 0x5B3Fu, 0xD088u, 0x75E3u, 0xFE54u, 0xE93Au, 0x628Du, + 0x285Bu, 0xA3ECu, 0xB482u, 0x3F35u, 0x9A5Eu, 0x11E9u, 0x0687u, 0x8D30u, + 0xE232u, 0x6985u, 0x7EEBu, 0xF55Cu, 0x5037u, 0xDB80u, 0xCCEEu, 0x4759u, + 0x0D8Fu, 0x8638u, 0x9156u, 0x1AE1u, 0xBF8Au, 0x343Du, 0x2353u, 0xA8E4u, + 0xB6FFu, 0x3D48u, 0x2A26u, 0xA191u, 0x04FAu, 0x8F4Du, 0x9823u, 0x1394u, + 0x5942u, 0xD2F5u, 0xC59Bu, 0x4E2Cu, 0xEB47u, 0x60F0u, 0x779Eu, 0xFC29u, + 
0x4BA8u, 0xC01Fu, 0xD771u, 0x5CC6u, 0xF9ADu, 0x721Au, 0x6574u, 0xEEC3u, + 0xA415u, 0x2FA2u, 0x38CCu, 0xB37Bu, 0x1610u, 0x9DA7u, 0x8AC9u, 0x017Eu, + 0x1F65u, 0x94D2u, 0x83BCu, 0x080Bu, 0xAD60u, 0x26D7u, 0x31B9u, 0xBA0Eu, + 0xF0D8u, 0x7B6Fu, 0x6C01u, 0xE7B6u, 0x42DDu, 0xC96Au, 0xDE04u, 0x55B3u + }, + { + 0x0000u, 0x7562u, 0xEAC4u, 0x9FA6u, 0x5E3Fu, 0x2B5Du, 0xB4FBu, 0xC199u, + 0xBC7Eu, 0xC91Cu, 0x56BAu, 0x23D8u, 0xE241u, 0x9723u, 0x0885u, 0x7DE7u, + 0xF34Bu, 0x8629u, 0x198Fu, 0x6CEDu, 0xAD74u, 0xD816u, 0x47B0u, 0x32D2u, + 0x4F35u, 0x3A57u, 0xA5F1u, 0xD093u, 0x110Au, 0x6468u, 0xFBCEu, 0x8EACu, + 0x6D21u, 0x1843u, 0x87E5u, 0xF287u, 0x331Eu, 0x467Cu, 0xD9DAu, 0xACB8u, + 0xD15Fu, 0xA43Du, 0x3B9Bu, 0x4EF9u, 0x8F60u, 0xFA02u, 0x65A4u, 0x10C6u, + 0x9E6Au, 0xEB08u, 0x74AEu, 0x01CCu, 0xC055u, 0xB537u, 0x2A91u, 0x5FF3u, + 0x2214u, 0x5776u, 0xC8D0u, 0xBDB2u, 0x7C2Bu, 0x0949u, 0x96EFu, 0xE38Du, + 0xDA42u, 0xAF20u, 0x3086u, 0x45E4u, 0x847Du, 0xF11Fu, 0x6EB9u, 0x1BDBu, + 0x663Cu, 0x135Eu, 0x8CF8u, 0xF99Au, 0x3803u, 0x4D61u, 0xD2C7u, 0xA7A5u, + 0x2909u, 0x5C6Bu, 0xC3CDu, 0xB6AFu, 0x7736u, 0x0254u, 0x9DF2u, 0xE890u, + 0x9577u, 0xE015u, 0x7FB3u, 0x0AD1u, 0xCB48u, 0xBE2Au, 0x218Cu, 0x54EEu, + 0xB763u, 0xC201u, 0x5DA7u, 0x28C5u, 0xE95Cu, 0x9C3Eu, 0x0398u, 0x76FAu, + 0x0B1Du, 0x7E7Fu, 0xE1D9u, 0x94BBu, 0x5522u, 0x2040u, 0xBFE6u, 0xCA84u, + 0x4428u, 0x314Au, 0xAEECu, 0xDB8Eu, 0x1A17u, 0x6F75u, 0xF0D3u, 0x85B1u, + 0xF856u, 0x8D34u, 0x1292u, 0x67F0u, 0xA669u, 0xD30Bu, 0x4CADu, 0x39CFu, + 0x3F33u, 0x4A51u, 0xD5F7u, 0xA095u, 0x610Cu, 0x146Eu, 0x8BC8u, 0xFEAAu, + 0x834Du, 0xF62Fu, 0x6989u, 0x1CEBu, 0xDD72u, 0xA810u, 0x37B6u, 0x42D4u, + 0xCC78u, 0xB91Au, 0x26BCu, 0x53DEu, 0x9247u, 0xE725u, 0x7883u, 0x0DE1u, + 0x7006u, 0x0564u, 0x9AC2u, 0xEFA0u, 0x2E39u, 0x5B5Bu, 0xC4FDu, 0xB19Fu, + 0x5212u, 0x2770u, 0xB8D6u, 0xCDB4u, 0x0C2Du, 0x794Fu, 0xE6E9u, 0x938Bu, + 0xEE6Cu, 0x9B0Eu, 0x04A8u, 0x71CAu, 0xB053u, 0xC531u, 0x5A97u, 0x2FF5u, + 0xA159u, 0xD43Bu, 0x4B9Du, 0x3EFFu, 0xFF66u, 0x8A04u, 0x15A2u, 0x60C0u, + 0x1D27u, 0x6845u, 0xF7E3u, 0x8281u, 0x4318u, 0x367Au, 0xA9DCu, 0xDCBEu, + 0xE571u, 0x9013u, 0x0FB5u, 0x7AD7u, 0xBB4Eu, 0xCE2Cu, 0x518Au, 0x24E8u, + 0x590Fu, 0x2C6Du, 0xB3CBu, 0xC6A9u, 0x0730u, 0x7252u, 0xEDF4u, 0x9896u, + 0x163Au, 0x6358u, 0xFCFEu, 0x899Cu, 0x4805u, 0x3D67u, 0xA2C1u, 0xD7A3u, + 0xAA44u, 0xDF26u, 0x4080u, 0x35E2u, 0xF47Bu, 0x8119u, 0x1EBFu, 0x6BDDu, + 0x8850u, 0xFD32u, 0x6294u, 0x17F6u, 0xD66Fu, 0xA30Du, 0x3CABu, 0x49C9u, + 0x342Eu, 0x414Cu, 0xDEEAu, 0xAB88u, 0x6A11u, 0x1F73u, 0x80D5u, 0xF5B7u, + 0x7B1Bu, 0x0E79u, 0x91DFu, 0xE4BDu, 0x2524u, 0x5046u, 0xCFE0u, 0xBA82u, + 0xC765u, 0xB207u, 0x2DA1u, 0x58C3u, 0x995Au, 0xEC38u, 0x739Eu, 0x06FCu + }, + { + 0x0000u, 0x7E66u, 0xFCCCu, 0x82AAu, 0x722Fu, 0x0C49u, 0x8EE3u, 0xF085u, + 0xE45Eu, 0x9A38u, 0x1892u, 0x66F4u, 0x9671u, 0xE817u, 0x6ABDu, 0x14DBu, + 0x430Bu, 0x3D6Du, 0xBFC7u, 0xC1A1u, 0x3124u, 0x4F42u, 0xCDE8u, 0xB38Eu, + 0xA755u, 0xD933u, 0x5B99u, 0x25FFu, 0xD57Au, 0xAB1Cu, 0x29B6u, 0x57D0u, + 0x8616u, 0xF870u, 0x7ADAu, 0x04BCu, 0xF439u, 0x8A5Fu, 0x08F5u, 0x7693u, + 0x6248u, 0x1C2Eu, 0x9E84u, 0xE0E2u, 0x1067u, 0x6E01u, 0xECABu, 0x92CDu, + 0xC51Du, 0xBB7Bu, 0x39D1u, 0x47B7u, 0xB732u, 0xC954u, 0x4BFEu, 0x3598u, + 0x2143u, 0x5F25u, 0xDD8Fu, 0xA3E9u, 0x536Cu, 0x2D0Au, 0xAFA0u, 0xD1C6u, + 0x879Bu, 0xF9FDu, 0x7B57u, 0x0531u, 0xF5B4u, 0x8BD2u, 0x0978u, 0x771Eu, + 0x63C5u, 0x1DA3u, 0x9F09u, 0xE16Fu, 0x11EAu, 0x6F8Cu, 0xED26u, 0x9340u, + 0xC490u, 0xBAF6u, 0x385Cu, 0x463Au, 0xB6BFu, 0xC8D9u, 0x4A73u, 0x3415u, + 0x20CEu, 0x5EA8u, 0xDC02u, 0xA264u, 0x52E1u, 0x2C87u, 
0xAE2Du, 0xD04Bu, + 0x018Du, 0x7FEBu, 0xFD41u, 0x8327u, 0x73A2u, 0x0DC4u, 0x8F6Eu, 0xF108u, + 0xE5D3u, 0x9BB5u, 0x191Fu, 0x6779u, 0x97FCu, 0xE99Au, 0x6B30u, 0x1556u, + 0x4286u, 0x3CE0u, 0xBE4Au, 0xC02Cu, 0x30A9u, 0x4ECFu, 0xCC65u, 0xB203u, + 0xA6D8u, 0xD8BEu, 0x5A14u, 0x2472u, 0xD4F7u, 0xAA91u, 0x283Bu, 0x565Du, + 0x8481u, 0xFAE7u, 0x784Du, 0x062Bu, 0xF6AEu, 0x88C8u, 0x0A62u, 0x7404u, + 0x60DFu, 0x1EB9u, 0x9C13u, 0xE275u, 0x12F0u, 0x6C96u, 0xEE3Cu, 0x905Au, + 0xC78Au, 0xB9ECu, 0x3B46u, 0x4520u, 0xB5A5u, 0xCBC3u, 0x4969u, 0x370Fu, + 0x23D4u, 0x5DB2u, 0xDF18u, 0xA17Eu, 0x51FBu, 0x2F9Du, 0xAD37u, 0xD351u, + 0x0297u, 0x7CF1u, 0xFE5Bu, 0x803Du, 0x70B8u, 0x0EDEu, 0x8C74u, 0xF212u, + 0xE6C9u, 0x98AFu, 0x1A05u, 0x6463u, 0x94E6u, 0xEA80u, 0x682Au, 0x164Cu, + 0x419Cu, 0x3FFAu, 0xBD50u, 0xC336u, 0x33B3u, 0x4DD5u, 0xCF7Fu, 0xB119u, + 0xA5C2u, 0xDBA4u, 0x590Eu, 0x2768u, 0xD7EDu, 0xA98Bu, 0x2B21u, 0x5547u, + 0x031Au, 0x7D7Cu, 0xFFD6u, 0x81B0u, 0x7135u, 0x0F53u, 0x8DF9u, 0xF39Fu, + 0xE744u, 0x9922u, 0x1B88u, 0x65EEu, 0x956Bu, 0xEB0Du, 0x69A7u, 0x17C1u, + 0x4011u, 0x3E77u, 0xBCDDu, 0xC2BBu, 0x323Eu, 0x4C58u, 0xCEF2u, 0xB094u, + 0xA44Fu, 0xDA29u, 0x5883u, 0x26E5u, 0xD660u, 0xA806u, 0x2AACu, 0x54CAu, + 0x850Cu, 0xFB6Au, 0x79C0u, 0x07A6u, 0xF723u, 0x8945u, 0x0BEFu, 0x7589u, + 0x6152u, 0x1F34u, 0x9D9Eu, 0xE3F8u, 0x137Du, 0x6D1Bu, 0xEFB1u, 0x91D7u, + 0xC607u, 0xB861u, 0x3ACBu, 0x44ADu, 0xB428u, 0xCA4Eu, 0x48E4u, 0x3682u, + 0x2259u, 0x5C3Fu, 0xDE95u, 0xA0F3u, 0x5076u, 0x2E10u, 0xACBAu, 0xD2DCu + }, + { + 0x0000u, 0x82B5u, 0x8EDDu, 0x0C68u, 0x960Du, 0x14B8u, 0x18D0u, 0x9A65u, + 0xA7ADu, 0x2518u, 0x2970u, 0xABC5u, 0x31A0u, 0xB315u, 0xBF7Du, 0x3DC8u, + 0xC4EDu, 0x4658u, 0x4A30u, 0xC885u, 0x52E0u, 0xD055u, 0xDC3Du, 0x5E88u, + 0x6340u, 0xE1F5u, 0xED9Du, 0x6F28u, 0xF54Du, 0x77F8u, 0x7B90u, 0xF925u, + 0x026Du, 0x80D8u, 0x8CB0u, 0x0E05u, 0x9460u, 0x16D5u, 0x1ABDu, 0x9808u, + 0xA5C0u, 0x2775u, 0x2B1Du, 0xA9A8u, 0x33CDu, 0xB178u, 0xBD10u, 0x3FA5u, + 0xC680u, 0x4435u, 0x485Du, 0xCAE8u, 0x508Du, 0xD238u, 0xDE50u, 0x5CE5u, + 0x612Du, 0xE398u, 0xEFF0u, 0x6D45u, 0xF720u, 0x7595u, 0x79FDu, 0xFB48u, + 0x04DAu, 0x866Fu, 0x8A07u, 0x08B2u, 0x92D7u, 0x1062u, 0x1C0Au, 0x9EBFu, + 0xA377u, 0x21C2u, 0x2DAAu, 0xAF1Fu, 0x357Au, 0xB7CFu, 0xBBA7u, 0x3912u, + 0xC037u, 0x4282u, 0x4EEAu, 0xCC5Fu, 0x563Au, 0xD48Fu, 0xD8E7u, 0x5A52u, + 0x679Au, 0xE52Fu, 0xE947u, 0x6BF2u, 0xF197u, 0x7322u, 0x7F4Au, 0xFDFFu, + 0x06B7u, 0x8402u, 0x886Au, 0x0ADFu, 0x90BAu, 0x120Fu, 0x1E67u, 0x9CD2u, + 0xA11Au, 0x23AFu, 0x2FC7u, 0xAD72u, 0x3717u, 0xB5A2u, 0xB9CAu, 0x3B7Fu, + 0xC25Au, 0x40EFu, 0x4C87u, 0xCE32u, 0x5457u, 0xD6E2u, 0xDA8Au, 0x583Fu, + 0x65F7u, 0xE742u, 0xEB2Au, 0x699Fu, 0xF3FAu, 0x714Fu, 0x7D27u, 0xFF92u, + 0x09B4u, 0x8B01u, 0x8769u, 0x05DCu, 0x9FB9u, 0x1D0Cu, 0x1164u, 0x93D1u, + 0xAE19u, 0x2CACu, 0x20C4u, 0xA271u, 0x3814u, 0xBAA1u, 0xB6C9u, 0x347Cu, + 0xCD59u, 0x4FECu, 0x4384u, 0xC131u, 0x5B54u, 0xD9E1u, 0xD589u, 0x573Cu, + 0x6AF4u, 0xE841u, 0xE429u, 0x669Cu, 0xFCF9u, 0x7E4Cu, 0x7224u, 0xF091u, + 0x0BD9u, 0x896Cu, 0x8504u, 0x07B1u, 0x9DD4u, 0x1F61u, 0x1309u, 0x91BCu, + 0xAC74u, 0x2EC1u, 0x22A9u, 0xA01Cu, 0x3A79u, 0xB8CCu, 0xB4A4u, 0x3611u, + 0xCF34u, 0x4D81u, 0x41E9u, 0xC35Cu, 0x5939u, 0xDB8Cu, 0xD7E4u, 0x5551u, + 0x6899u, 0xEA2Cu, 0xE644u, 0x64F1u, 0xFE94u, 0x7C21u, 0x7049u, 0xF2FCu, + 0x0D6Eu, 0x8FDBu, 0x83B3u, 0x0106u, 0x9B63u, 0x19D6u, 0x15BEu, 0x970Bu, + 0xAAC3u, 0x2876u, 0x241Eu, 0xA6ABu, 0x3CCEu, 0xBE7Bu, 0xB213u, 0x30A6u, + 0xC983u, 0x4B36u, 0x475Eu, 0xC5EBu, 0x5F8Eu, 0xDD3Bu, 0xD153u, 0x53E6u, + 0x6E2Eu, 0xEC9Bu, 0xE0F3u, 0x6246u, 0xF823u, 
0x7A96u, 0x76FEu, 0xF44Bu, + 0x0F03u, 0x8DB6u, 0x81DEu, 0x036Bu, 0x990Eu, 0x1BBBu, 0x17D3u, 0x9566u, + 0xA8AEu, 0x2A1Bu, 0x2673u, 0xA4C6u, 0x3EA3u, 0xBC16u, 0xB07Eu, 0x32CBu, + 0xCBEEu, 0x495Bu, 0x4533u, 0xC786u, 0x5DE3u, 0xDF56u, 0xD33Eu, 0x518Bu, + 0x6C43u, 0xEEF6u, 0xE29Eu, 0x602Bu, 0xFA4Eu, 0x78FBu, 0x7493u, 0xF626u + }, + { + 0x0000u, 0x1368u, 0x26D0u, 0x35B8u, 0x4DA0u, 0x5EC8u, 0x6B70u, 0x7818u, + 0x9B40u, 0x8828u, 0xBD90u, 0xAEF8u, 0xD6E0u, 0xC588u, 0xF030u, 0xE358u, + 0xBD37u, 0xAE5Fu, 0x9BE7u, 0x888Fu, 0xF097u, 0xE3FFu, 0xD647u, 0xC52Fu, + 0x2677u, 0x351Fu, 0x00A7u, 0x13CFu, 0x6BD7u, 0x78BFu, 0x4D07u, 0x5E6Fu, + 0xF1D9u, 0xE2B1u, 0xD709u, 0xC461u, 0xBC79u, 0xAF11u, 0x9AA9u, 0x89C1u, + 0x6A99u, 0x79F1u, 0x4C49u, 0x5F21u, 0x2739u, 0x3451u, 0x01E9u, 0x1281u, + 0x4CEEu, 0x5F86u, 0x6A3Eu, 0x7956u, 0x014Eu, 0x1226u, 0x279Eu, 0x34F6u, + 0xD7AEu, 0xC4C6u, 0xF17Eu, 0xE216u, 0x9A0Eu, 0x8966u, 0xBCDEu, 0xAFB6u, + 0x6805u, 0x7B6Du, 0x4ED5u, 0x5DBDu, 0x25A5u, 0x36CDu, 0x0375u, 0x101Du, + 0xF345u, 0xE02Du, 0xD595u, 0xC6FDu, 0xBEE5u, 0xAD8Du, 0x9835u, 0x8B5Du, + 0xD532u, 0xC65Au, 0xF3E2u, 0xE08Au, 0x9892u, 0x8BFAu, 0xBE42u, 0xAD2Au, + 0x4E72u, 0x5D1Au, 0x68A2u, 0x7BCAu, 0x03D2u, 0x10BAu, 0x2502u, 0x366Au, + 0x99DCu, 0x8AB4u, 0xBF0Cu, 0xAC64u, 0xD47Cu, 0xC714u, 0xF2ACu, 0xE1C4u, + 0x029Cu, 0x11F4u, 0x244Cu, 0x3724u, 0x4F3Cu, 0x5C54u, 0x69ECu, 0x7A84u, + 0x24EBu, 0x3783u, 0x023Bu, 0x1153u, 0x694Bu, 0x7A23u, 0x4F9Bu, 0x5CF3u, + 0xBFABu, 0xACC3u, 0x997Bu, 0x8A13u, 0xF20Bu, 0xE163u, 0xD4DBu, 0xC7B3u, + 0xD00Au, 0xC362u, 0xF6DAu, 0xE5B2u, 0x9DAAu, 0x8EC2u, 0xBB7Au, 0xA812u, + 0x4B4Au, 0x5822u, 0x6D9Au, 0x7EF2u, 0x06EAu, 0x1582u, 0x203Au, 0x3352u, + 0x6D3Du, 0x7E55u, 0x4BEDu, 0x5885u, 0x209Du, 0x33F5u, 0x064Du, 0x1525u, + 0xF67Du, 0xE515u, 0xD0ADu, 0xC3C5u, 0xBBDDu, 0xA8B5u, 0x9D0Du, 0x8E65u, + 0x21D3u, 0x32BBu, 0x0703u, 0x146Bu, 0x6C73u, 0x7F1Bu, 0x4AA3u, 0x59CBu, + 0xBA93u, 0xA9FBu, 0x9C43u, 0x8F2Bu, 0xF733u, 0xE45Bu, 0xD1E3u, 0xC28Bu, + 0x9CE4u, 0x8F8Cu, 0xBA34u, 0xA95Cu, 0xD144u, 0xC22Cu, 0xF794u, 0xE4FCu, + 0x07A4u, 0x14CCu, 0x2174u, 0x321Cu, 0x4A04u, 0x596Cu, 0x6CD4u, 0x7FBCu, + 0xB80Fu, 0xAB67u, 0x9EDFu, 0x8DB7u, 0xF5AFu, 0xE6C7u, 0xD37Fu, 0xC017u, + 0x234Fu, 0x3027u, 0x059Fu, 0x16F7u, 0x6EEFu, 0x7D87u, 0x483Fu, 0x5B57u, + 0x0538u, 0x1650u, 0x23E8u, 0x3080u, 0x4898u, 0x5BF0u, 0x6E48u, 0x7D20u, + 0x9E78u, 0x8D10u, 0xB8A8u, 0xABC0u, 0xD3D8u, 0xC0B0u, 0xF508u, 0xE660u, + 0x49D6u, 0x5ABEu, 0x6F06u, 0x7C6Eu, 0x0476u, 0x171Eu, 0x22A6u, 0x31CEu, + 0xD296u, 0xC1FEu, 0xF446u, 0xE72Eu, 0x9F36u, 0x8C5Eu, 0xB9E6u, 0xAA8Eu, + 0xF4E1u, 0xE789u, 0xD231u, 0xC159u, 0xB941u, 0xAA29u, 0x9F91u, 0x8CF9u, + 0x6FA1u, 0x7CC9u, 0x4971u, 0x5A19u, 0x2201u, 0x3169u, 0x04D1u, 0x17B9u + }, + { + 0x0000u, 0x2BA3u, 0x5746u, 0x7CE5u, 0xAE8Cu, 0x852Fu, 0xF9CAu, 0xD269u, + 0xD6AFu, 0xFD0Cu, 0x81E9u, 0xAA4Au, 0x7823u, 0x5380u, 0x2F65u, 0x04C6u, + 0x26E9u, 0x0D4Au, 0x71AFu, 0x5A0Cu, 0x8865u, 0xA3C6u, 0xDF23u, 0xF480u, + 0xF046u, 0xDBE5u, 0xA700u, 0x8CA3u, 0x5ECAu, 0x7569u, 0x098Cu, 0x222Fu, + 0x4DD2u, 0x6671u, 0x1A94u, 0x3137u, 0xE35Eu, 0xC8FDu, 0xB418u, 0x9FBBu, + 0x9B7Du, 0xB0DEu, 0xCC3Bu, 0xE798u, 0x35F1u, 0x1E52u, 0x62B7u, 0x4914u, + 0x6B3Bu, 0x4098u, 0x3C7Du, 0x17DEu, 0xC5B7u, 0xEE14u, 0x92F1u, 0xB952u, + 0xBD94u, 0x9637u, 0xEAD2u, 0xC171u, 0x1318u, 0x38BBu, 0x445Eu, 0x6FFDu, + 0x9BA4u, 0xB007u, 0xCCE2u, 0xE741u, 0x3528u, 0x1E8Bu, 0x626Eu, 0x49CDu, + 0x4D0Bu, 0x66A8u, 0x1A4Du, 0x31EEu, 0xE387u, 0xC824u, 0xB4C1u, 0x9F62u, + 0xBD4Du, 0x96EEu, 0xEA0Bu, 0xC1A8u, 0x13C1u, 0x3862u, 0x4487u, 0x6F24u, + 0x6BE2u, 0x4041u, 0x3CA4u, 
0x1707u, 0xC56Eu, 0xEECDu, 0x9228u, 0xB98Bu, + 0xD676u, 0xFDD5u, 0x8130u, 0xAA93u, 0x78FAu, 0x5359u, 0x2FBCu, 0x041Fu, + 0x00D9u, 0x2B7Au, 0x579Fu, 0x7C3Cu, 0xAE55u, 0x85F6u, 0xF913u, 0xD2B0u, + 0xF09Fu, 0xDB3Cu, 0xA7D9u, 0x8C7Au, 0x5E13u, 0x75B0u, 0x0955u, 0x22F6u, + 0x2630u, 0x0D93u, 0x7176u, 0x5AD5u, 0x88BCu, 0xA31Fu, 0xDFFAu, 0xF459u, + 0xBCFFu, 0x975Cu, 0xEBB9u, 0xC01Au, 0x1273u, 0x39D0u, 0x4535u, 0x6E96u, + 0x6A50u, 0x41F3u, 0x3D16u, 0x16B5u, 0xC4DCu, 0xEF7Fu, 0x939Au, 0xB839u, + 0x9A16u, 0xB1B5u, 0xCD50u, 0xE6F3u, 0x349Au, 0x1F39u, 0x63DCu, 0x487Fu, + 0x4CB9u, 0x671Au, 0x1BFFu, 0x305Cu, 0xE235u, 0xC996u, 0xB573u, 0x9ED0u, + 0xF12Du, 0xDA8Eu, 0xA66Bu, 0x8DC8u, 0x5FA1u, 0x7402u, 0x08E7u, 0x2344u, + 0x2782u, 0x0C21u, 0x70C4u, 0x5B67u, 0x890Eu, 0xA2ADu, 0xDE48u, 0xF5EBu, + 0xD7C4u, 0xFC67u, 0x8082u, 0xAB21u, 0x7948u, 0x52EBu, 0x2E0Eu, 0x05ADu, + 0x016Bu, 0x2AC8u, 0x562Du, 0x7D8Eu, 0xAFE7u, 0x8444u, 0xF8A1u, 0xD302u, + 0x275Bu, 0x0CF8u, 0x701Du, 0x5BBEu, 0x89D7u, 0xA274u, 0xDE91u, 0xF532u, + 0xF1F4u, 0xDA57u, 0xA6B2u, 0x8D11u, 0x5F78u, 0x74DBu, 0x083Eu, 0x239Du, + 0x01B2u, 0x2A11u, 0x56F4u, 0x7D57u, 0xAF3Eu, 0x849Du, 0xF878u, 0xD3DBu, + 0xD71Du, 0xFCBEu, 0x805Bu, 0xABF8u, 0x7991u, 0x5232u, 0x2ED7u, 0x0574u, + 0x6A89u, 0x412Au, 0x3DCFu, 0x166Cu, 0xC405u, 0xEFA6u, 0x9343u, 0xB8E0u, + 0xBC26u, 0x9785u, 0xEB60u, 0xC0C3u, 0x12AAu, 0x3909u, 0x45ECu, 0x6E4Fu, + 0x4C60u, 0x67C3u, 0x1B26u, 0x3085u, 0xE2ECu, 0xC94Fu, 0xB5AAu, 0x9E09u, + 0x9ACFu, 0xB16Cu, 0xCD89u, 0xE62Au, 0x3443u, 0x1FE0u, 0x6305u, 0x48A6u + }, + { + 0x0000u, 0xF249u, 0x6F25u, 0x9D6Cu, 0xDE4Au, 0x2C03u, 0xB16Fu, 0x4326u, + 0x3723u, 0xC56Au, 0x5806u, 0xAA4Fu, 0xE969u, 0x1B20u, 0x864Cu, 0x7405u, + 0x6E46u, 0x9C0Fu, 0x0163u, 0xF32Au, 0xB00Cu, 0x4245u, 0xDF29u, 0x2D60u, + 0x5965u, 0xAB2Cu, 0x3640u, 0xC409u, 0x872Fu, 0x7566u, 0xE80Au, 0x1A43u, + 0xDC8Cu, 0x2EC5u, 0xB3A9u, 0x41E0u, 0x02C6u, 0xF08Fu, 0x6DE3u, 0x9FAAu, + 0xEBAFu, 0x19E6u, 0x848Au, 0x76C3u, 0x35E5u, 0xC7ACu, 0x5AC0u, 0xA889u, + 0xB2CAu, 0x4083u, 0xDDEFu, 0x2FA6u, 0x6C80u, 0x9EC9u, 0x03A5u, 0xF1ECu, + 0x85E9u, 0x77A0u, 0xEACCu, 0x1885u, 0x5BA3u, 0xA9EAu, 0x3486u, 0xC6CFu, + 0x32AFu, 0xC0E6u, 0x5D8Au, 0xAFC3u, 0xECE5u, 0x1EACu, 0x83C0u, 0x7189u, + 0x058Cu, 0xF7C5u, 0x6AA9u, 0x98E0u, 0xDBC6u, 0x298Fu, 0xB4E3u, 0x46AAu, + 0x5CE9u, 0xAEA0u, 0x33CCu, 0xC185u, 0x82A3u, 0x70EAu, 0xED86u, 0x1FCFu, + 0x6BCAu, 0x9983u, 0x04EFu, 0xF6A6u, 0xB580u, 0x47C9u, 0xDAA5u, 0x28ECu, + 0xEE23u, 0x1C6Au, 0x8106u, 0x734Fu, 0x3069u, 0xC220u, 0x5F4Cu, 0xAD05u, + 0xD900u, 0x2B49u, 0xB625u, 0x446Cu, 0x074Au, 0xF503u, 0x686Fu, 0x9A26u, + 0x8065u, 0x722Cu, 0xEF40u, 0x1D09u, 0x5E2Fu, 0xAC66u, 0x310Au, 0xC343u, + 0xB746u, 0x450Fu, 0xD863u, 0x2A2Au, 0x690Cu, 0x9B45u, 0x0629u, 0xF460u, + 0x655Eu, 0x9717u, 0x0A7Bu, 0xF832u, 0xBB14u, 0x495Du, 0xD431u, 0x2678u, + 0x527Du, 0xA034u, 0x3D58u, 0xCF11u, 0x8C37u, 0x7E7Eu, 0xE312u, 0x115Bu, + 0x0B18u, 0xF951u, 0x643Du, 0x9674u, 0xD552u, 0x271Bu, 0xBA77u, 0x483Eu, + 0x3C3Bu, 0xCE72u, 0x531Eu, 0xA157u, 0xE271u, 0x1038u, 0x8D54u, 0x7F1Du, + 0xB9D2u, 0x4B9Bu, 0xD6F7u, 0x24BEu, 0x6798u, 0x95D1u, 0x08BDu, 0xFAF4u, + 0x8EF1u, 0x7CB8u, 0xE1D4u, 0x139Du, 0x50BBu, 0xA2F2u, 0x3F9Eu, 0xCDD7u, + 0xD794u, 0x25DDu, 0xB8B1u, 0x4AF8u, 0x09DEu, 0xFB97u, 0x66FBu, 0x94B2u, + 0xE0B7u, 0x12FEu, 0x8F92u, 0x7DDBu, 0x3EFDu, 0xCCB4u, 0x51D8u, 0xA391u, + 0x57F1u, 0xA5B8u, 0x38D4u, 0xCA9Du, 0x89BBu, 0x7BF2u, 0xE69Eu, 0x14D7u, + 0x60D2u, 0x929Bu, 0x0FF7u, 0xFDBEu, 0xBE98u, 0x4CD1u, 0xD1BDu, 0x23F4u, + 0x39B7u, 0xCBFEu, 0x5692u, 0xA4DBu, 0xE7FDu, 0x15B4u, 0x88D8u, 0x7A91u, + 0x0E94u, 0xFCDDu, 
0x61B1u, 0x93F8u, 0xD0DEu, 0x2297u, 0xBFFBu, 0x4DB2u, + 0x8B7Du, 0x7934u, 0xE458u, 0x1611u, 0x5537u, 0xA77Eu, 0x3A12u, 0xC85Bu, + 0xBC5Eu, 0x4E17u, 0xD37Bu, 0x2132u, 0x6214u, 0x905Du, 0x0D31u, 0xFF78u, + 0xE53Bu, 0x1772u, 0x8A1Eu, 0x7857u, 0x3B71u, 0xC938u, 0x5454u, 0xA61Du, + 0xD218u, 0x2051u, 0xBD3Du, 0x4F74u, 0x0C52u, 0xFE1Bu, 0x6377u, 0x913Eu + }, + { + 0x0000u, 0xCABCu, 0x1ECFu, 0xD473u, 0x3D9Eu, 0xF722u, 0x2351u, 0xE9EDu, + 0x7B3Cu, 0xB180u, 0x65F3u, 0xAF4Fu, 0x46A2u, 0x8C1Eu, 0x586Du, 0x92D1u, + 0xF678u, 0x3CC4u, 0xE8B7u, 0x220Bu, 0xCBE6u, 0x015Au, 0xD529u, 0x1F95u, + 0x8D44u, 0x47F8u, 0x938Bu, 0x5937u, 0xB0DAu, 0x7A66u, 0xAE15u, 0x64A9u, + 0x6747u, 0xADFBu, 0x7988u, 0xB334u, 0x5AD9u, 0x9065u, 0x4416u, 0x8EAAu, + 0x1C7Bu, 0xD6C7u, 0x02B4u, 0xC808u, 0x21E5u, 0xEB59u, 0x3F2Au, 0xF596u, + 0x913Fu, 0x5B83u, 0x8FF0u, 0x454Cu, 0xACA1u, 0x661Du, 0xB26Eu, 0x78D2u, + 0xEA03u, 0x20BFu, 0xF4CCu, 0x3E70u, 0xD79Du, 0x1D21u, 0xC952u, 0x03EEu, + 0xCE8Eu, 0x0432u, 0xD041u, 0x1AFDu, 0xF310u, 0x39ACu, 0xEDDFu, 0x2763u, + 0xB5B2u, 0x7F0Eu, 0xAB7Du, 0x61C1u, 0x882Cu, 0x4290u, 0x96E3u, 0x5C5Fu, + 0x38F6u, 0xF24Au, 0x2639u, 0xEC85u, 0x0568u, 0xCFD4u, 0x1BA7u, 0xD11Bu, + 0x43CAu, 0x8976u, 0x5D05u, 0x97B9u, 0x7E54u, 0xB4E8u, 0x609Bu, 0xAA27u, + 0xA9C9u, 0x6375u, 0xB706u, 0x7DBAu, 0x9457u, 0x5EEBu, 0x8A98u, 0x4024u, + 0xD2F5u, 0x1849u, 0xCC3Au, 0x0686u, 0xEF6Bu, 0x25D7u, 0xF1A4u, 0x3B18u, + 0x5FB1u, 0x950Du, 0x417Eu, 0x8BC2u, 0x622Fu, 0xA893u, 0x7CE0u, 0xB65Cu, + 0x248Du, 0xEE31u, 0x3A42u, 0xF0FEu, 0x1913u, 0xD3AFu, 0x07DCu, 0xCD60u, + 0x16ABu, 0xDC17u, 0x0864u, 0xC2D8u, 0x2B35u, 0xE189u, 0x35FAu, 0xFF46u, + 0x6D97u, 0xA72Bu, 0x7358u, 0xB9E4u, 0x5009u, 0x9AB5u, 0x4EC6u, 0x847Au, + 0xE0D3u, 0x2A6Fu, 0xFE1Cu, 0x34A0u, 0xDD4Du, 0x17F1u, 0xC382u, 0x093Eu, + 0x9BEFu, 0x5153u, 0x8520u, 0x4F9Cu, 0xA671u, 0x6CCDu, 0xB8BEu, 0x7202u, + 0x71ECu, 0xBB50u, 0x6F23u, 0xA59Fu, 0x4C72u, 0x86CEu, 0x52BDu, 0x9801u, + 0x0AD0u, 0xC06Cu, 0x141Fu, 0xDEA3u, 0x374Eu, 0xFDF2u, 0x2981u, 0xE33Du, + 0x8794u, 0x4D28u, 0x995Bu, 0x53E7u, 0xBA0Au, 0x70B6u, 0xA4C5u, 0x6E79u, + 0xFCA8u, 0x3614u, 0xE267u, 0x28DBu, 0xC136u, 0x0B8Au, 0xDFF9u, 0x1545u, + 0xD825u, 0x1299u, 0xC6EAu, 0x0C56u, 0xE5BBu, 0x2F07u, 0xFB74u, 0x31C8u, + 0xA319u, 0x69A5u, 0xBDD6u, 0x776Au, 0x9E87u, 0x543Bu, 0x8048u, 0x4AF4u, + 0x2E5Du, 0xE4E1u, 0x3092u, 0xFA2Eu, 0x13C3u, 0xD97Fu, 0x0D0Cu, 0xC7B0u, + 0x5561u, 0x9FDDu, 0x4BAEu, 0x8112u, 0x68FFu, 0xA243u, 0x7630u, 0xBC8Cu, + 0xBF62u, 0x75DEu, 0xA1ADu, 0x6B11u, 0x82FCu, 0x4840u, 0x9C33u, 0x568Fu, + 0xC45Eu, 0x0EE2u, 0xDA91u, 0x102Du, 0xF9C0u, 0x337Cu, 0xE70Fu, 0x2DB3u, + 0x491Au, 0x83A6u, 0x57D5u, 0x9D69u, 0x7484u, 0xBE38u, 0x6A4Bu, 0xA0F7u, + 0x3226u, 0xF89Au, 0x2CE9u, 0xE655u, 0x0FB8u, 0xC504u, 0x1177u, 0xDBCBu + }, + { + 0x0000u, 0x2D56u, 0x5AACu, 0x77FAu, 0xB558u, 0x980Eu, 0xEFF4u, 0xC2A2u, + 0xE107u, 0xCC51u, 0xBBABu, 0x96FDu, 0x545Fu, 0x7909u, 0x0EF3u, 0x23A5u, + 0x49B9u, 0x64EFu, 0x1315u, 0x3E43u, 0xFCE1u, 0xD1B7u, 0xA64Du, 0x8B1Bu, + 0xA8BEu, 0x85E8u, 0xF212u, 0xDF44u, 0x1DE6u, 0x30B0u, 0x474Au, 0x6A1Cu, + 0x9372u, 0xBE24u, 0xC9DEu, 0xE488u, 0x262Au, 0x0B7Cu, 0x7C86u, 0x51D0u, + 0x7275u, 0x5F23u, 0x28D9u, 0x058Fu, 0xC72Du, 0xEA7Bu, 0x9D81u, 0xB0D7u, + 0xDACBu, 0xF79Du, 0x8067u, 0xAD31u, 0x6F93u, 0x42C5u, 0x353Fu, 0x1869u, + 0x3BCCu, 0x169Au, 0x6160u, 0x4C36u, 0x8E94u, 0xA3C2u, 0xD438u, 0xF96Eu, + 0xAD53u, 0x8005u, 0xF7FFu, 0xDAA9u, 0x180Bu, 0x355Du, 0x42A7u, 0x6FF1u, + 0x4C54u, 0x6102u, 0x16F8u, 0x3BAEu, 0xF90Cu, 0xD45Au, 0xA3A0u, 0x8EF6u, + 0xE4EAu, 0xC9BCu, 0xBE46u, 0x9310u, 0x51B2u, 0x7CE4u, 0x0B1Eu, 0x2648u, + 
0x05EDu, 0x28BBu, 0x5F41u, 0x7217u, 0xB0B5u, 0x9DE3u, 0xEA19u, 0xC74Fu, + 0x3E21u, 0x1377u, 0x648Du, 0x49DBu, 0x8B79u, 0xA62Fu, 0xD1D5u, 0xFC83u, + 0xDF26u, 0xF270u, 0x858Au, 0xA8DCu, 0x6A7Eu, 0x4728u, 0x30D2u, 0x1D84u, + 0x7798u, 0x5ACEu, 0x2D34u, 0x0062u, 0xC2C0u, 0xEF96u, 0x986Cu, 0xB53Au, + 0x969Fu, 0xBBC9u, 0xCC33u, 0xE165u, 0x23C7u, 0x0E91u, 0x796Bu, 0x543Du, + 0xD111u, 0xFC47u, 0x8BBDu, 0xA6EBu, 0x6449u, 0x491Fu, 0x3EE5u, 0x13B3u, + 0x3016u, 0x1D40u, 0x6ABAu, 0x47ECu, 0x854Eu, 0xA818u, 0xDFE2u, 0xF2B4u, + 0x98A8u, 0xB5FEu, 0xC204u, 0xEF52u, 0x2DF0u, 0x00A6u, 0x775Cu, 0x5A0Au, + 0x79AFu, 0x54F9u, 0x2303u, 0x0E55u, 0xCCF7u, 0xE1A1u, 0x965Bu, 0xBB0Du, + 0x4263u, 0x6F35u, 0x18CFu, 0x3599u, 0xF73Bu, 0xDA6Du, 0xAD97u, 0x80C1u, + 0xA364u, 0x8E32u, 0xF9C8u, 0xD49Eu, 0x163Cu, 0x3B6Au, 0x4C90u, 0x61C6u, + 0x0BDAu, 0x268Cu, 0x5176u, 0x7C20u, 0xBE82u, 0x93D4u, 0xE42Eu, 0xC978u, + 0xEADDu, 0xC78Bu, 0xB071u, 0x9D27u, 0x5F85u, 0x72D3u, 0x0529u, 0x287Fu, + 0x7C42u, 0x5114u, 0x26EEu, 0x0BB8u, 0xC91Au, 0xE44Cu, 0x93B6u, 0xBEE0u, + 0x9D45u, 0xB013u, 0xC7E9u, 0xEABFu, 0x281Du, 0x054Bu, 0x72B1u, 0x5FE7u, + 0x35FBu, 0x18ADu, 0x6F57u, 0x4201u, 0x80A3u, 0xADF5u, 0xDA0Fu, 0xF759u, + 0xD4FCu, 0xF9AAu, 0x8E50u, 0xA306u, 0x61A4u, 0x4CF2u, 0x3B08u, 0x165Eu, + 0xEF30u, 0xC266u, 0xB59Cu, 0x98CAu, 0x5A68u, 0x773Eu, 0x00C4u, 0x2D92u, + 0x0E37u, 0x2361u, 0x549Bu, 0x79CDu, 0xBB6Fu, 0x9639u, 0xE1C3u, 0xCC95u, + 0xA689u, 0x8BDFu, 0xFC25u, 0xD173u, 0x13D1u, 0x3E87u, 0x497Du, 0x642Bu, + 0x478Eu, 0x6AD8u, 0x1D22u, 0x3074u, 0xF2D6u, 0xDF80u, 0xA87Au, 0x852Cu + }, + { + 0x0000u, 0x2995u, 0x532Au, 0x7ABFu, 0xA654u, 0x8FC1u, 0xF57Eu, 0xDCEBu, + 0xC71Fu, 0xEE8Au, 0x9435u, 0xBDA0u, 0x614Bu, 0x48DEu, 0x3261u, 0x1BF4u, + 0x0589u, 0x2C1Cu, 0x56A3u, 0x7F36u, 0xA3DDu, 0x8A48u, 0xF0F7u, 0xD962u, + 0xC296u, 0xEB03u, 0x91BCu, 0xB829u, 0x64C2u, 0x4D57u, 0x37E8u, 0x1E7Du, + 0x0B12u, 0x2287u, 0x5838u, 0x71ADu, 0xAD46u, 0x84D3u, 0xFE6Cu, 0xD7F9u, + 0xCC0Du, 0xE598u, 0x9F27u, 0xB6B2u, 0x6A59u, 0x43CCu, 0x3973u, 0x10E6u, + 0x0E9Bu, 0x270Eu, 0x5DB1u, 0x7424u, 0xA8CFu, 0x815Au, 0xFBE5u, 0xD270u, + 0xC984u, 0xE011u, 0x9AAEu, 0xB33Bu, 0x6FD0u, 0x4645u, 0x3CFAu, 0x156Fu, + 0x1624u, 0x3FB1u, 0x450Eu, 0x6C9Bu, 0xB070u, 0x99E5u, 0xE35Au, 0xCACFu, + 0xD13Bu, 0xF8AEu, 0x8211u, 0xAB84u, 0x776Fu, 0x5EFAu, 0x2445u, 0x0DD0u, + 0x13ADu, 0x3A38u, 0x4087u, 0x6912u, 0xB5F9u, 0x9C6Cu, 0xE6D3u, 0xCF46u, + 0xD4B2u, 0xFD27u, 0x8798u, 0xAE0Du, 0x72E6u, 0x5B73u, 0x21CCu, 0x0859u, + 0x1D36u, 0x34A3u, 0x4E1Cu, 0x6789u, 0xBB62u, 0x92F7u, 0xE848u, 0xC1DDu, + 0xDA29u, 0xF3BCu, 0x8903u, 0xA096u, 0x7C7Du, 0x55E8u, 0x2F57u, 0x06C2u, + 0x18BFu, 0x312Au, 0x4B95u, 0x6200u, 0xBEEBu, 0x977Eu, 0xEDC1u, 0xC454u, + 0xDFA0u, 0xF635u, 0x8C8Au, 0xA51Fu, 0x79F4u, 0x5061u, 0x2ADEu, 0x034Bu, + 0x2C48u, 0x05DDu, 0x7F62u, 0x56F7u, 0x8A1Cu, 0xA389u, 0xD936u, 0xF0A3u, + 0xEB57u, 0xC2C2u, 0xB87Du, 0x91E8u, 0x4D03u, 0x6496u, 0x1E29u, 0x37BCu, + 0x29C1u, 0x0054u, 0x7AEBu, 0x537Eu, 0x8F95u, 0xA600u, 0xDCBFu, 0xF52Au, + 0xEEDEu, 0xC74Bu, 0xBDF4u, 0x9461u, 0x488Au, 0x611Fu, 0x1BA0u, 0x3235u, + 0x275Au, 0x0ECFu, 0x7470u, 0x5DE5u, 0x810Eu, 0xA89Bu, 0xD224u, 0xFBB1u, + 0xE045u, 0xC9D0u, 0xB36Fu, 0x9AFAu, 0x4611u, 0x6F84u, 0x153Bu, 0x3CAEu, + 0x22D3u, 0x0B46u, 0x71F9u, 0x586Cu, 0x8487u, 0xAD12u, 0xD7ADu, 0xFE38u, + 0xE5CCu, 0xCC59u, 0xB6E6u, 0x9F73u, 0x4398u, 0x6A0Du, 0x10B2u, 0x3927u, + 0x3A6Cu, 0x13F9u, 0x6946u, 0x40D3u, 0x9C38u, 0xB5ADu, 0xCF12u, 0xE687u, + 0xFD73u, 0xD4E6u, 0xAE59u, 0x87CCu, 0x5B27u, 0x72B2u, 0x080Du, 0x2198u, + 0x3FE5u, 0x1670u, 0x6CCFu, 0x455Au, 0x99B1u, 0xB024u, 0xCA9Bu, 
0xE30Eu, + 0xF8FAu, 0xD16Fu, 0xABD0u, 0x8245u, 0x5EAEu, 0x773Bu, 0x0D84u, 0x2411u, + 0x317Eu, 0x18EBu, 0x6254u, 0x4BC1u, 0x972Au, 0xBEBFu, 0xC400u, 0xED95u, + 0xF661u, 0xDFF4u, 0xA54Bu, 0x8CDEu, 0x5035u, 0x79A0u, 0x031Fu, 0x2A8Au, + 0x34F7u, 0x1D62u, 0x67DDu, 0x4E48u, 0x92A3u, 0xBB36u, 0xC189u, 0xE81Cu, + 0xF3E8u, 0xDA7Du, 0xA0C2u, 0x8957u, 0x55BCu, 0x7C29u, 0x0696u, 0x2F03u + }, + { + 0x0000u, 0x5890u, 0xB120u, 0xE9B0u, 0xE9F7u, 0xB167u, 0x58D7u, 0x0047u, + 0x5859u, 0x00C9u, 0xE979u, 0xB1E9u, 0xB1AEu, 0xE93Eu, 0x008Eu, 0x581Eu, + 0xB0B2u, 0xE822u, 0x0192u, 0x5902u, 0x5945u, 0x01D5u, 0xE865u, 0xB0F5u, + 0xE8EBu, 0xB07Bu, 0x59CBu, 0x015Bu, 0x011Cu, 0x598Cu, 0xB03Cu, 0xE8ACu, + 0xEAD3u, 0xB243u, 0x5BF3u, 0x0363u, 0x0324u, 0x5BB4u, 0xB204u, 0xEA94u, + 0xB28Au, 0xEA1Au, 0x03AAu, 0x5B3Au, 0x5B7Du, 0x03EDu, 0xEA5Du, 0xB2CDu, + 0x5A61u, 0x02F1u, 0xEB41u, 0xB3D1u, 0xB396u, 0xEB06u, 0x02B6u, 0x5A26u, + 0x0238u, 0x5AA8u, 0xB318u, 0xEB88u, 0xEBCFu, 0xB35Fu, 0x5AEFu, 0x027Fu, + 0x5E11u, 0x0681u, 0xEF31u, 0xB7A1u, 0xB7E6u, 0xEF76u, 0x06C6u, 0x5E56u, + 0x0648u, 0x5ED8u, 0xB768u, 0xEFF8u, 0xEFBFu, 0xB72Fu, 0x5E9Fu, 0x060Fu, + 0xEEA3u, 0xB633u, 0x5F83u, 0x0713u, 0x0754u, 0x5FC4u, 0xB674u, 0xEEE4u, + 0xB6FAu, 0xEE6Au, 0x07DAu, 0x5F4Au, 0x5F0Du, 0x079Du, 0xEE2Du, 0xB6BDu, + 0xB4C2u, 0xEC52u, 0x05E2u, 0x5D72u, 0x5D35u, 0x05A5u, 0xEC15u, 0xB485u, + 0xEC9Bu, 0xB40Bu, 0x5DBBu, 0x052Bu, 0x056Cu, 0x5DFCu, 0xB44Cu, 0xECDCu, + 0x0470u, 0x5CE0u, 0xB550u, 0xEDC0u, 0xED87u, 0xB517u, 0x5CA7u, 0x0437u, + 0x5C29u, 0x04B9u, 0xED09u, 0xB599u, 0xB5DEu, 0xED4Eu, 0x04FEu, 0x5C6Eu, + 0xBC22u, 0xE4B2u, 0x0D02u, 0x5592u, 0x55D5u, 0x0D45u, 0xE4F5u, 0xBC65u, + 0xE47Bu, 0xBCEBu, 0x555Bu, 0x0DCBu, 0x0D8Cu, 0x551Cu, 0xBCACu, 0xE43Cu, + 0x0C90u, 0x5400u, 0xBDB0u, 0xE520u, 0xE567u, 0xBDF7u, 0x5447u, 0x0CD7u, + 0x54C9u, 0x0C59u, 0xE5E9u, 0xBD79u, 0xBD3Eu, 0xE5AEu, 0x0C1Eu, 0x548Eu, + 0x56F1u, 0x0E61u, 0xE7D1u, 0xBF41u, 0xBF06u, 0xE796u, 0x0E26u, 0x56B6u, + 0x0EA8u, 0x5638u, 0xBF88u, 0xE718u, 0xE75Fu, 0xBFCFu, 0x567Fu, 0x0EEFu, + 0xE643u, 0xBED3u, 0x5763u, 0x0FF3u, 0x0FB4u, 0x5724u, 0xBE94u, 0xE604u, + 0xBE1Au, 0xE68Au, 0x0F3Au, 0x57AAu, 0x57EDu, 0x0F7Du, 0xE6CDu, 0xBE5Du, + 0xE233u, 0xBAA3u, 0x5313u, 0x0B83u, 0x0BC4u, 0x5354u, 0xBAE4u, 0xE274u, + 0xBA6Au, 0xE2FAu, 0x0B4Au, 0x53DAu, 0x539Du, 0x0B0Du, 0xE2BDu, 0xBA2Du, + 0x5281u, 0x0A11u, 0xE3A1u, 0xBB31u, 0xBB76u, 0xE3E6u, 0x0A56u, 0x52C6u, + 0x0AD8u, 0x5248u, 0xBBF8u, 0xE368u, 0xE32Fu, 0xBBBFu, 0x520Fu, 0x0A9Fu, + 0x08E0u, 0x5070u, 0xB9C0u, 0xE150u, 0xE117u, 0xB987u, 0x5037u, 0x08A7u, + 0x50B9u, 0x0829u, 0xE199u, 0xB909u, 0xB94Eu, 0xE1DEu, 0x086Eu, 0x50FEu, + 0xB852u, 0xE0C2u, 0x0972u, 0x51E2u, 0x51A5u, 0x0935u, 0xE085u, 0xB815u, + 0xE00Bu, 0xB89Bu, 0x512Bu, 0x09BBu, 0x09FCu, 0x516Cu, 0xB8DCu, 0xE04Cu + }, + { + 0x0000u, 0xF3F3u, 0x6C51u, 0x9FA2u, 0xD8A2u, 0x2B51u, 0xB4F3u, 0x4700u, + 0x3AF3u, 0xC900u, 0x56A2u, 0xA551u, 0xE251u, 0x11A2u, 0x8E00u, 0x7DF3u, + 0x75E6u, 0x8615u, 0x19B7u, 0xEA44u, 0xAD44u, 0x5EB7u, 0xC115u, 0x32E6u, + 0x4F15u, 0xBCE6u, 0x2344u, 0xD0B7u, 0x97B7u, 0x6444u, 0xFBE6u, 0x0815u, + 0xEBCCu, 0x183Fu, 0x879Du, 0x746Eu, 0x336Eu, 0xC09Du, 0x5F3Fu, 0xACCCu, + 0xD13Fu, 0x22CCu, 0xBD6Eu, 0x4E9Du, 0x099Du, 0xFA6Eu, 0x65CCu, 0x963Fu, + 0x9E2Au, 0x6DD9u, 0xF27Bu, 0x0188u, 0x4688u, 0xB57Bu, 0x2AD9u, 0xD92Au, + 0xA4D9u, 0x572Au, 0xC888u, 0x3B7Bu, 0x7C7Bu, 0x8F88u, 0x102Au, 0xE3D9u, + 0x5C2Fu, 0xAFDCu, 0x307Eu, 0xC38Du, 0x848Du, 0x777Eu, 0xE8DCu, 0x1B2Fu, + 0x66DCu, 0x952Fu, 0x0A8Du, 0xF97Eu, 0xBE7Eu, 0x4D8Du, 0xD22Fu, 0x21DCu, + 0x29C9u, 0xDA3Au, 0x4598u, 0xB66Bu, 0xF16Bu, 
0x0298u, 0x9D3Au, 0x6EC9u, + 0x133Au, 0xE0C9u, 0x7F6Bu, 0x8C98u, 0xCB98u, 0x386Bu, 0xA7C9u, 0x543Au, + 0xB7E3u, 0x4410u, 0xDBB2u, 0x2841u, 0x6F41u, 0x9CB2u, 0x0310u, 0xF0E3u, + 0x8D10u, 0x7EE3u, 0xE141u, 0x12B2u, 0x55B2u, 0xA641u, 0x39E3u, 0xCA10u, + 0xC205u, 0x31F6u, 0xAE54u, 0x5DA7u, 0x1AA7u, 0xE954u, 0x76F6u, 0x8505u, + 0xF8F6u, 0x0B05u, 0x94A7u, 0x6754u, 0x2054u, 0xD3A7u, 0x4C05u, 0xBFF6u, + 0xB85Eu, 0x4BADu, 0xD40Fu, 0x27FCu, 0x60FCu, 0x930Fu, 0x0CADu, 0xFF5Eu, + 0x82ADu, 0x715Eu, 0xEEFCu, 0x1D0Fu, 0x5A0Fu, 0xA9FCu, 0x365Eu, 0xC5ADu, + 0xCDB8u, 0x3E4Bu, 0xA1E9u, 0x521Au, 0x151Au, 0xE6E9u, 0x794Bu, 0x8AB8u, + 0xF74Bu, 0x04B8u, 0x9B1Au, 0x68E9u, 0x2FE9u, 0xDC1Au, 0x43B8u, 0xB04Bu, + 0x5392u, 0xA061u, 0x3FC3u, 0xCC30u, 0x8B30u, 0x78C3u, 0xE761u, 0x1492u, + 0x6961u, 0x9A92u, 0x0530u, 0xF6C3u, 0xB1C3u, 0x4230u, 0xDD92u, 0x2E61u, + 0x2674u, 0xD587u, 0x4A25u, 0xB9D6u, 0xFED6u, 0x0D25u, 0x9287u, 0x6174u, + 0x1C87u, 0xEF74u, 0x70D6u, 0x8325u, 0xC425u, 0x37D6u, 0xA874u, 0x5B87u, + 0xE471u, 0x1782u, 0x8820u, 0x7BD3u, 0x3CD3u, 0xCF20u, 0x5082u, 0xA371u, + 0xDE82u, 0x2D71u, 0xB2D3u, 0x4120u, 0x0620u, 0xF5D3u, 0x6A71u, 0x9982u, + 0x9197u, 0x6264u, 0xFDC6u, 0x0E35u, 0x4935u, 0xBAC6u, 0x2564u, 0xD697u, + 0xAB64u, 0x5897u, 0xC735u, 0x34C6u, 0x73C6u, 0x8035u, 0x1F97u, 0xEC64u, + 0x0FBDu, 0xFC4Eu, 0x63ECu, 0x901Fu, 0xD71Fu, 0x24ECu, 0xBB4Eu, 0x48BDu, + 0x354Eu, 0xC6BDu, 0x591Fu, 0xAAECu, 0xEDECu, 0x1E1Fu, 0x81BDu, 0x724Eu, + 0x7A5Bu, 0x89A8u, 0x160Au, 0xE5F9u, 0xA2F9u, 0x510Au, 0xCEA8u, 0x3D5Bu, + 0x40A8u, 0xB35Bu, 0x2CF9u, 0xDF0Au, 0x980Au, 0x6BF9u, 0xF45Bu, 0x07A8u + }, + { + 0x0000u, 0xFB0Bu, 0x7DA1u, 0x86AAu, 0xFB42u, 0x0049u, 0x86E3u, 0x7DE8u, + 0x7D33u, 0x8638u, 0x0092u, 0xFB99u, 0x8671u, 0x7D7Au, 0xFBD0u, 0x00DBu, + 0xFA66u, 0x016Du, 0x87C7u, 0x7CCCu, 0x0124u, 0xFA2Fu, 0x7C85u, 0x878Eu, + 0x8755u, 0x7C5Eu, 0xFAF4u, 0x01FFu, 0x7C17u, 0x871Cu, 0x01B6u, 0xFABDu, + 0x7F7Bu, 0x8470u, 0x02DAu, 0xF9D1u, 0x8439u, 0x7F32u, 0xF998u, 0x0293u, + 0x0248u, 0xF943u, 0x7FE9u, 0x84E2u, 0xF90Au, 0x0201u, 0x84ABu, 0x7FA0u, + 0x851Du, 0x7E16u, 0xF8BCu, 0x03B7u, 0x7E5Fu, 0x8554u, 0x03FEu, 0xF8F5u, + 0xF82Eu, 0x0325u, 0x858Fu, 0x7E84u, 0x036Cu, 0xF867u, 0x7ECDu, 0x85C6u, + 0xFEF6u, 0x05FDu, 0x8357u, 0x785Cu, 0x05B4u, 0xFEBFu, 0x7815u, 0x831Eu, + 0x83C5u, 0x78CEu, 0xFE64u, 0x056Fu, 0x7887u, 0x838Cu, 0x0526u, 0xFE2Du, + 0x0490u, 0xFF9Bu, 0x7931u, 0x823Au, 0xFFD2u, 0x04D9u, 0x8273u, 0x7978u, + 0x79A3u, 0x82A8u, 0x0402u, 0xFF09u, 0x82E1u, 0x79EAu, 0xFF40u, 0x044Bu, + 0x818Du, 0x7A86u, 0xFC2Cu, 0x0727u, 0x7ACFu, 0x81C4u, 0x076Eu, 0xFC65u, + 0xFCBEu, 0x07B5u, 0x811Fu, 0x7A14u, 0x07FCu, 0xFCF7u, 0x7A5Du, 0x8156u, + 0x7BEBu, 0x80E0u, 0x064Au, 0xFD41u, 0x80A9u, 0x7BA2u, 0xFD08u, 0x0603u, + 0x06D8u, 0xFDD3u, 0x7B79u, 0x8072u, 0xFD9Au, 0x0691u, 0x803Bu, 0x7B30u, + 0x765Bu, 0x8D50u, 0x0BFAu, 0xF0F1u, 0x8D19u, 0x7612u, 0xF0B8u, 0x0BB3u, + 0x0B68u, 0xF063u, 0x76C9u, 0x8DC2u, 0xF02Au, 0x0B21u, 0x8D8Bu, 0x7680u, + 0x8C3Du, 0x7736u, 0xF19Cu, 0x0A97u, 0x777Fu, 0x8C74u, 0x0ADEu, 0xF1D5u, + 0xF10Eu, 0x0A05u, 0x8CAFu, 0x77A4u, 0x0A4Cu, 0xF147u, 0x77EDu, 0x8CE6u, + 0x0920u, 0xF22Bu, 0x7481u, 0x8F8Au, 0xF262u, 0x0969u, 0x8FC3u, 0x74C8u, + 0x7413u, 0x8F18u, 0x09B2u, 0xF2B9u, 0x8F51u, 0x745Au, 0xF2F0u, 0x09FBu, + 0xF346u, 0x084Du, 0x8EE7u, 0x75ECu, 0x0804u, 0xF30Fu, 0x75A5u, 0x8EAEu, + 0x8E75u, 0x757Eu, 0xF3D4u, 0x08DFu, 0x7537u, 0x8E3Cu, 0x0896u, 0xF39Du, + 0x88ADu, 0x73A6u, 0xF50Cu, 0x0E07u, 0x73EFu, 0x88E4u, 0x0E4Eu, 0xF545u, + 0xF59Eu, 0x0E95u, 0x883Fu, 0x7334u, 0x0EDCu, 0xF5D7u, 0x737Du, 0x8876u, + 0x72CBu, 0x89C0u, 0x0F6Au, 0xF461u, 
0x8989u, 0x7282u, 0xF428u, 0x0F23u, + 0x0FF8u, 0xF4F3u, 0x7259u, 0x8952u, 0xF4BAu, 0x0FB1u, 0x891Bu, 0x7210u, + 0xF7D6u, 0x0CDDu, 0x8A77u, 0x717Cu, 0x0C94u, 0xF79Fu, 0x7135u, 0x8A3Eu, + 0x8AE5u, 0x71EEu, 0xF744u, 0x0C4Fu, 0x71A7u, 0x8AACu, 0x0C06u, 0xF70Du, + 0x0DB0u, 0xF6BBu, 0x7011u, 0x8B1Au, 0xF6F2u, 0x0DF9u, 0x8B53u, 0x7058u, + 0x7083u, 0x8B88u, 0x0D22u, 0xF629u, 0x8BC1u, 0x70CAu, 0xF660u, 0x0D6Bu + }, + { + 0x0000u, 0xECB6u, 0x52DBu, 0xBE6Du, 0xA5B6u, 0x4900u, 0xF76Du, 0x1BDBu, + 0xC0DBu, 0x2C6Du, 0x9200u, 0x7EB6u, 0x656Du, 0x89DBu, 0x37B6u, 0xDB00u, + 0x0A01u, 0xE6B7u, 0x58DAu, 0xB46Cu, 0xAFB7u, 0x4301u, 0xFD6Cu, 0x11DAu, + 0xCADAu, 0x266Cu, 0x9801u, 0x74B7u, 0x6F6Cu, 0x83DAu, 0x3DB7u, 0xD101u, + 0x1402u, 0xF8B4u, 0x46D9u, 0xAA6Fu, 0xB1B4u, 0x5D02u, 0xE36Fu, 0x0FD9u, + 0xD4D9u, 0x386Fu, 0x8602u, 0x6AB4u, 0x716Fu, 0x9DD9u, 0x23B4u, 0xCF02u, + 0x1E03u, 0xF2B5u, 0x4CD8u, 0xA06Eu, 0xBBB5u, 0x5703u, 0xE96Eu, 0x05D8u, + 0xDED8u, 0x326Eu, 0x8C03u, 0x60B5u, 0x7B6Eu, 0x97D8u, 0x29B5u, 0xC503u, + 0x2804u, 0xC4B2u, 0x7ADFu, 0x9669u, 0x8DB2u, 0x6104u, 0xDF69u, 0x33DFu, + 0xE8DFu, 0x0469u, 0xBA04u, 0x56B2u, 0x4D69u, 0xA1DFu, 0x1FB2u, 0xF304u, + 0x2205u, 0xCEB3u, 0x70DEu, 0x9C68u, 0x87B3u, 0x6B05u, 0xD568u, 0x39DEu, + 0xE2DEu, 0x0E68u, 0xB005u, 0x5CB3u, 0x4768u, 0xABDEu, 0x15B3u, 0xF905u, + 0x3C06u, 0xD0B0u, 0x6EDDu, 0x826Bu, 0x99B0u, 0x7506u, 0xCB6Bu, 0x27DDu, + 0xFCDDu, 0x106Bu, 0xAE06u, 0x42B0u, 0x596Bu, 0xB5DDu, 0x0BB0u, 0xE706u, + 0x3607u, 0xDAB1u, 0x64DCu, 0x886Au, 0x93B1u, 0x7F07u, 0xC16Au, 0x2DDCu, + 0xF6DCu, 0x1A6Au, 0xA407u, 0x48B1u, 0x536Au, 0xBFDCu, 0x01B1u, 0xED07u, + 0x5008u, 0xBCBEu, 0x02D3u, 0xEE65u, 0xF5BEu, 0x1908u, 0xA765u, 0x4BD3u, + 0x90D3u, 0x7C65u, 0xC208u, 0x2EBEu, 0x3565u, 0xD9D3u, 0x67BEu, 0x8B08u, + 0x5A09u, 0xB6BFu, 0x08D2u, 0xE464u, 0xFFBFu, 0x1309u, 0xAD64u, 0x41D2u, + 0x9AD2u, 0x7664u, 0xC809u, 0x24BFu, 0x3F64u, 0xD3D2u, 0x6DBFu, 0x8109u, + 0x440Au, 0xA8BCu, 0x16D1u, 0xFA67u, 0xE1BCu, 0x0D0Au, 0xB367u, 0x5FD1u, + 0x84D1u, 0x6867u, 0xD60Au, 0x3ABCu, 0x2167u, 0xCDD1u, 0x73BCu, 0x9F0Au, + 0x4E0Bu, 0xA2BDu, 0x1CD0u, 0xF066u, 0xEBBDu, 0x070Bu, 0xB966u, 0x55D0u, + 0x8ED0u, 0x6266u, 0xDC0Bu, 0x30BDu, 0x2B66u, 0xC7D0u, 0x79BDu, 0x950Bu, + 0x780Cu, 0x94BAu, 0x2AD7u, 0xC661u, 0xDDBAu, 0x310Cu, 0x8F61u, 0x63D7u, + 0xB8D7u, 0x5461u, 0xEA0Cu, 0x06BAu, 0x1D61u, 0xF1D7u, 0x4FBAu, 0xA30Cu, + 0x720Du, 0x9EBBu, 0x20D6u, 0xCC60u, 0xD7BBu, 0x3B0Du, 0x8560u, 0x69D6u, + 0xB2D6u, 0x5E60u, 0xE00Du, 0x0CBBu, 0x1760u, 0xFBD6u, 0x45BBu, 0xA90Du, + 0x6C0Eu, 0x80B8u, 0x3ED5u, 0xD263u, 0xC9B8u, 0x250Eu, 0x9B63u, 0x77D5u, + 0xACD5u, 0x4063u, 0xFE0Eu, 0x12B8u, 0x0963u, 0xE5D5u, 0x5BB8u, 0xB70Eu, + 0x660Fu, 0x8AB9u, 0x34D4u, 0xD862u, 0xC3B9u, 0x2F0Fu, 0x9162u, 0x7DD4u, + 0xA6D4u, 0x4A62u, 0xF40Fu, 0x18B9u, 0x0362u, 0xEFD4u, 0x51B9u, 0xBD0Fu + }, + { + 0x0000u, 0xA010u, 0xCB97u, 0x6B87u, 0x1C99u, 0xBC89u, 0xD70Eu, 0x771Eu, + 0x3932u, 0x9922u, 0xF2A5u, 0x52B5u, 0x25ABu, 0x85BBu, 0xEE3Cu, 0x4E2Cu, + 0x7264u, 0xD274u, 0xB9F3u, 0x19E3u, 0x6EFDu, 0xCEEDu, 0xA56Au, 0x057Au, + 0x4B56u, 0xEB46u, 0x80C1u, 0x20D1u, 0x57CFu, 0xF7DFu, 0x9C58u, 0x3C48u, + 0xE4C8u, 0x44D8u, 0x2F5Fu, 0x8F4Fu, 0xF851u, 0x5841u, 0x33C6u, 0x93D6u, + 0xDDFAu, 0x7DEAu, 0x166Du, 0xB67Du, 0xC163u, 0x6173u, 0x0AF4u, 0xAAE4u, + 0x96ACu, 0x36BCu, 0x5D3Bu, 0xFD2Bu, 0x8A35u, 0x2A25u, 0x41A2u, 0xE1B2u, + 0xAF9Eu, 0x0F8Eu, 0x6409u, 0xC419u, 0xB307u, 0x1317u, 0x7890u, 0xD880u, + 0x4227u, 0xE237u, 0x89B0u, 0x29A0u, 0x5EBEu, 0xFEAEu, 0x9529u, 0x3539u, + 0x7B15u, 0xDB05u, 0xB082u, 0x1092u, 0x678Cu, 0xC79Cu, 0xAC1Bu, 0x0C0Bu, + 0x3043u, 0x9053u, 
0xFBD4u, 0x5BC4u, 0x2CDAu, 0x8CCAu, 0xE74Du, 0x475Du, + 0x0971u, 0xA961u, 0xC2E6u, 0x62F6u, 0x15E8u, 0xB5F8u, 0xDE7Fu, 0x7E6Fu, + 0xA6EFu, 0x06FFu, 0x6D78u, 0xCD68u, 0xBA76u, 0x1A66u, 0x71E1u, 0xD1F1u, + 0x9FDDu, 0x3FCDu, 0x544Au, 0xF45Au, 0x8344u, 0x2354u, 0x48D3u, 0xE8C3u, + 0xD48Bu, 0x749Bu, 0x1F1Cu, 0xBF0Cu, 0xC812u, 0x6802u, 0x0385u, 0xA395u, + 0xEDB9u, 0x4DA9u, 0x262Eu, 0x863Eu, 0xF120u, 0x5130u, 0x3AB7u, 0x9AA7u, + 0x844Eu, 0x245Eu, 0x4FD9u, 0xEFC9u, 0x98D7u, 0x38C7u, 0x5340u, 0xF350u, + 0xBD7Cu, 0x1D6Cu, 0x76EBu, 0xD6FBu, 0xA1E5u, 0x01F5u, 0x6A72u, 0xCA62u, + 0xF62Au, 0x563Au, 0x3DBDu, 0x9DADu, 0xEAB3u, 0x4AA3u, 0x2124u, 0x8134u, + 0xCF18u, 0x6F08u, 0x048Fu, 0xA49Fu, 0xD381u, 0x7391u, 0x1816u, 0xB806u, + 0x6086u, 0xC096u, 0xAB11u, 0x0B01u, 0x7C1Fu, 0xDC0Fu, 0xB788u, 0x1798u, + 0x59B4u, 0xF9A4u, 0x9223u, 0x3233u, 0x452Du, 0xE53Du, 0x8EBAu, 0x2EAAu, + 0x12E2u, 0xB2F2u, 0xD975u, 0x7965u, 0x0E7Bu, 0xAE6Bu, 0xC5ECu, 0x65FCu, + 0x2BD0u, 0x8BC0u, 0xE047u, 0x4057u, 0x3749u, 0x9759u, 0xFCDEu, 0x5CCEu, + 0xC669u, 0x6679u, 0x0DFEu, 0xADEEu, 0xDAF0u, 0x7AE0u, 0x1167u, 0xB177u, + 0xFF5Bu, 0x5F4Bu, 0x34CCu, 0x94DCu, 0xE3C2u, 0x43D2u, 0x2855u, 0x8845u, + 0xB40Du, 0x141Du, 0x7F9Au, 0xDF8Au, 0xA894u, 0x0884u, 0x6303u, 0xC313u, + 0x8D3Fu, 0x2D2Fu, 0x46A8u, 0xE6B8u, 0x91A6u, 0x31B6u, 0x5A31u, 0xFA21u, + 0x22A1u, 0x82B1u, 0xE936u, 0x4926u, 0x3E38u, 0x9E28u, 0xF5AFu, 0x55BFu, + 0x1B93u, 0xBB83u, 0xD004u, 0x7014u, 0x070Au, 0xA71Au, 0xCC9Du, 0x6C8Du, + 0x50C5u, 0xF0D5u, 0x9B52u, 0x3B42u, 0x4C5Cu, 0xEC4Cu, 0x87CBu, 0x27DBu, + 0x69F7u, 0xC9E7u, 0xA260u, 0x0270u, 0x756Eu, 0xD57Eu, 0xBEF9u, 0x1EE9u + }, + { + 0x0000u, 0x832Bu, 0x8DE1u, 0x0ECAu, 0x9075u, 0x135Eu, 0x1D94u, 0x9EBFu, + 0xAB5Du, 0x2876u, 0x26BCu, 0xA597u, 0x3B28u, 0xB803u, 0xB6C9u, 0x35E2u, + 0xDD0Du, 0x5E26u, 0x50ECu, 0xD3C7u, 0x4D78u, 0xCE53u, 0xC099u, 0x43B2u, + 0x7650u, 0xF57Bu, 0xFBB1u, 0x789Au, 0xE625u, 0x650Eu, 0x6BC4u, 0xE8EFu, + 0x31ADu, 0xB286u, 0xBC4Cu, 0x3F67u, 0xA1D8u, 0x22F3u, 0x2C39u, 0xAF12u, + 0x9AF0u, 0x19DBu, 0x1711u, 0x943Au, 0x0A85u, 0x89AEu, 0x8764u, 0x044Fu, + 0xECA0u, 0x6F8Bu, 0x6141u, 0xE26Au, 0x7CD5u, 0xFFFEu, 0xF134u, 0x721Fu, + 0x47FDu, 0xC4D6u, 0xCA1Cu, 0x4937u, 0xD788u, 0x54A3u, 0x5A69u, 0xD942u, + 0x635Au, 0xE071u, 0xEEBBu, 0x6D90u, 0xF32Fu, 0x7004u, 0x7ECEu, 0xFDE5u, + 0xC807u, 0x4B2Cu, 0x45E6u, 0xC6CDu, 0x5872u, 0xDB59u, 0xD593u, 0x56B8u, + 0xBE57u, 0x3D7Cu, 0x33B6u, 0xB09Du, 0x2E22u, 0xAD09u, 0xA3C3u, 0x20E8u, + 0x150Au, 0x9621u, 0x98EBu, 0x1BC0u, 0x857Fu, 0x0654u, 0x089Eu, 0x8BB5u, + 0x52F7u, 0xD1DCu, 0xDF16u, 0x5C3Du, 0xC282u, 0x41A9u, 0x4F63u, 0xCC48u, + 0xF9AAu, 0x7A81u, 0x744Bu, 0xF760u, 0x69DFu, 0xEAF4u, 0xE43Eu, 0x6715u, + 0x8FFAu, 0x0CD1u, 0x021Bu, 0x8130u, 0x1F8Fu, 0x9CA4u, 0x926Eu, 0x1145u, + 0x24A7u, 0xA78Cu, 0xA946u, 0x2A6Du, 0xB4D2u, 0x37F9u, 0x3933u, 0xBA18u, + 0xC6B4u, 0x459Fu, 0x4B55u, 0xC87Eu, 0x56C1u, 0xD5EAu, 0xDB20u, 0x580Bu, + 0x6DE9u, 0xEEC2u, 0xE008u, 0x6323u, 0xFD9Cu, 0x7EB7u, 0x707Du, 0xF356u, + 0x1BB9u, 0x9892u, 0x9658u, 0x1573u, 0x8BCCu, 0x08E7u, 0x062Du, 0x8506u, + 0xB0E4u, 0x33CFu, 0x3D05u, 0xBE2Eu, 0x2091u, 0xA3BAu, 0xAD70u, 0x2E5Bu, + 0xF719u, 0x7432u, 0x7AF8u, 0xF9D3u, 0x676Cu, 0xE447u, 0xEA8Du, 0x69A6u, + 0x5C44u, 0xDF6Fu, 0xD1A5u, 0x528Eu, 0xCC31u, 0x4F1Au, 0x41D0u, 0xC2FBu, + 0x2A14u, 0xA93Fu, 0xA7F5u, 0x24DEu, 0xBA61u, 0x394Au, 0x3780u, 0xB4ABu, + 0x8149u, 0x0262u, 0x0CA8u, 0x8F83u, 0x113Cu, 0x9217u, 0x9CDDu, 0x1FF6u, + 0xA5EEu, 0x26C5u, 0x280Fu, 0xAB24u, 0x359Bu, 0xB6B0u, 0xB87Au, 0x3B51u, + 0x0EB3u, 0x8D98u, 0x8352u, 0x0079u, 0x9EC6u, 0x1DEDu, 0x1327u, 0x900Cu, + 0x78E3u, 
0xFBC8u, 0xF502u, 0x7629u, 0xE896u, 0x6BBDu, 0x6577u, 0xE65Cu, + 0xD3BEu, 0x5095u, 0x5E5Fu, 0xDD74u, 0x43CBu, 0xC0E0u, 0xCE2Au, 0x4D01u, + 0x9443u, 0x1768u, 0x19A2u, 0x9A89u, 0x0436u, 0x871Du, 0x89D7u, 0x0AFCu, + 0x3F1Eu, 0xBC35u, 0xB2FFu, 0x31D4u, 0xAF6Bu, 0x2C40u, 0x228Au, 0xA1A1u, + 0x494Eu, 0xCA65u, 0xC4AFu, 0x4784u, 0xD93Bu, 0x5A10u, 0x54DAu, 0xD7F1u, + 0xE213u, 0x6138u, 0x6FF2u, 0xECD9u, 0x7266u, 0xF14Du, 0xFF87u, 0x7CACu + } +}; + +static inline uint16_t +crc_update_fast(uint16_t crc, const void *data, size_t data_len) +{ + const unsigned char *d = (const unsigned char *)data; + const unsigned char *d_end = d + data_len; + const unsigned char *d_last16 = d + (data_len & ~0x0F); + + for (; d < d_last16 ; d += 16) { + crc = crc_table_fast[15][d[0] ^ (uint8_t)(crc >> 8)] ^ + crc_table_fast[14][d[1] ^ (uint8_t)(crc >> 0)] ^ + crc_table_fast[13][d[2]] ^ + crc_table_fast[12][d[3]] ^ + crc_table_fast[11][d[4]] ^ + crc_table_fast[10][d[5]] ^ + crc_table_fast[9][d[6]] ^ + crc_table_fast[8][d[7]] ^ + crc_table_fast[7][d[8]] ^ + crc_table_fast[6][d[9]] ^ + crc_table_fast[5][d[10]] ^ + crc_table_fast[4][d[11]] ^ + crc_table_fast[3][d[12]] ^ + crc_table_fast[2][d[13]] ^ + crc_table_fast[1][d[14]] ^ + crc_table_fast[0][d[15]]; + } + for (; d < d_end ; d++) { + crc = (crc << 8) ^ crc_table_fast[0][((uint8_t)(crc >> 8) ^ *d)]; + } + return crc & 0xffff; +} + +static inline uint16_t +crc16_table_t10dif(uint16_t init_crc, const void *buf, size_t len) +{ + uint16_t crc; + const uint8_t *data = (const uint8_t *)buf; + + crc = init_crc; + crc = crc_update_fast(crc, data, len); + return crc; +} + +uint16_t +spdk_crc16_t10dif(uint16_t init_crc, const void *buf, size_t len) +{ + return (crc16_table_t10dif(init_crc, buf, len)); +} + +uint16_t +spdk_crc16_t10dif_copy(uint16_t init_crc, uint8_t *dst, uint8_t *src, size_t len) +{ + memcpy(dst, src, len); + return (crc16_table_t10dif(init_crc, src, len)); +} + +#endif diff --git a/src/spdk/lib/util/crc32.c b/src/spdk/lib/util/crc32.c new file mode 100644 index 000000000..34bb60b78 --- /dev/null +++ b/src/spdk/lib/util/crc32.c @@ -0,0 +1,95 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "util_internal.h" +#include "spdk/crc32.h" + +void +crc32_table_init(struct spdk_crc32_table *table, uint32_t polynomial_reflect) +{ + int i, j; + uint32_t val; + + for (i = 0; i < 256; i++) { + val = i; + for (j = 0; j < 8; j++) { + if (val & 1) { + val = (val >> 1) ^ polynomial_reflect; + } else { + val = (val >> 1); + } + } + table->table[i] = val; + } +} + +#ifdef SPDK_HAVE_ARM_CRC + +uint32_t +crc32_update(const struct spdk_crc32_table *table, const void *buf, size_t len, uint32_t crc) +{ + size_t count; + const uint64_t *dword_buf; + + count = len & 7; + while (count--) { + crc = __crc32b(crc, *(const uint8_t *)buf); + buf++; + } + dword_buf = (const uint64_t *)buf; + + count = len / 8; + while (count--) { + crc = __crc32d(crc, *dword_buf); + dword_buf++; + } + + return crc; +} + +#else + +uint32_t +crc32_update(const struct spdk_crc32_table *table, const void *buf, size_t len, uint32_t crc) +{ + const uint8_t *buf_u8 = buf; + size_t i; + + for (i = 0; i < len; i++) { + crc = (crc >> 8) ^ table->table[(crc ^ buf_u8[i]) & 0xff]; + } + + return crc; +} + +#endif diff --git a/src/spdk/lib/util/crc32_ieee.c b/src/spdk/lib/util/crc32_ieee.c new file mode 100644 index 000000000..ddc3c9901 --- /dev/null +++ b/src/spdk/lib/util/crc32_ieee.c @@ -0,0 +1,49 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "util_internal.h" +#include "spdk/crc32.h" + +static struct spdk_crc32_table g_crc32_ieee_table; + +__attribute__((constructor)) static void +crc32_ieee_init(void) +{ + crc32_table_init(&g_crc32_ieee_table, SPDK_CRC32_POLYNOMIAL_REFLECT); +} + +uint32_t +spdk_crc32_ieee_update(const void *buf, size_t len, uint32_t crc) +{ + return crc32_update(&g_crc32_ieee_table, buf, len, crc); +} diff --git a/src/spdk/lib/util/crc32c.c b/src/spdk/lib/util/crc32c.c new file mode 100644 index 000000000..9acd8d80f --- /dev/null +++ b/src/spdk/lib/util/crc32c.c @@ -0,0 +1,133 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "util_internal.h" +#include "spdk/crc32.h" + +#ifdef SPDK_CONFIG_ISAL +#define SPDK_HAVE_ISAL +#include <isa-l/include/crc.h> +#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) +#define SPDK_HAVE_ARM_CRC +#include <arm_acle.h> +#elif defined(__x86_64__) && defined(__SSE4_2__) +#define SPDK_HAVE_SSE4_2 +#include <x86intrin.h> +#endif + +#ifdef SPDK_HAVE_ISAL + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + return crc32_iscsi((unsigned char *)buf, len, crc); +} + +#elif defined(SPDK_HAVE_SSE4_2) + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + uint64_t crc_tmp64; + size_t count; + + /* _mm_crc32_u64() needs a 64-bit intermediate value */ + crc_tmp64 = crc; + + /* Process as much of the buffer as possible in 64-bit blocks. */ + count = len / 8; + while (count--) { + uint64_t block; + + /* + * Use memcpy() to avoid unaligned loads, which are undefined behavior in C. + * The compiler will optimize out the memcpy() in release builds. + */ + memcpy(&block, buf, sizeof(block)); + crc_tmp64 = _mm_crc32_u64(crc_tmp64, block); + buf += sizeof(block); + } + crc = (uint32_t)crc_tmp64; + + /* Handle any trailing bytes. 
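+ * At most seven bytes remain after the 64-bit passes; each is folded in
+ * with _mm_crc32_u8().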
*/ + count = len & 7; + while (count--) { + crc = _mm_crc32_u8(crc, *(const uint8_t *)buf); + buf++; + } + + return crc; +} + +#elif defined(SPDK_HAVE_ARM_CRC) + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + size_t count; + + count = len / 8; + while (count--) { + uint64_t block; + + memcpy(&block, buf, sizeof(block)); + crc = __crc32cd(crc, block); + buf += sizeof(block); + } + + count = len & 7; + while (count--) { + crc = __crc32cb(crc, *(const uint8_t *)buf); + buf++; + } + + return crc; +} + +#else /* Neither SSE 4.2 nor ARM CRC32 instructions available */ + +static struct spdk_crc32_table g_crc32c_table; + +__attribute__((constructor)) static void +crc32c_init(void) +{ + crc32_table_init(&g_crc32c_table, SPDK_CRC32C_POLYNOMIAL_REFLECT); +} + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + return crc32_update(&g_crc32c_table, buf, len, crc); +} + +#endif diff --git a/src/spdk/lib/util/dif.c b/src/spdk/lib/util/dif.c new file mode 100644 index 000000000..64bce1487 --- /dev/null +++ b/src/spdk/lib/util/dif.c @@ -0,0 +1,1999 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/dif.h" +#include "spdk/crc16.h" +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/log.h" +#include "spdk/util.h" + +/* Context to iterate or create a iovec array. + * Each sgl is either iterated or created at a time. + */ +struct _dif_sgl { + /* Current iovec in the iteration or creation */ + struct iovec *iov; + + /* Remaining count of iovecs in the iteration or creation. 
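+ * Decremented by _dif_sgl_advance() as iovecs are consumed and by
+ * _dif_sgl_append() as entries are filled in.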
*/ + int iovcnt; + + /* Current offset in the iovec */ + uint32_t iov_offset; + + /* Size of the created iovec array in bytes */ + uint32_t total_size; +}; + +static inline void +_dif_sgl_init(struct _dif_sgl *s, struct iovec *iovs, int iovcnt) +{ + s->iov = iovs; + s->iovcnt = iovcnt; + s->iov_offset = 0; + s->total_size = 0; +} + +static void +_dif_sgl_advance(struct _dif_sgl *s, uint32_t step) +{ + s->iov_offset += step; + while (s->iovcnt != 0) { + if (s->iov_offset < s->iov->iov_len) { + break; + } + + s->iov_offset -= s->iov->iov_len; + s->iov++; + s->iovcnt--; + } +} + +static inline void +_dif_sgl_get_buf(struct _dif_sgl *s, void **_buf, uint32_t *_buf_len) +{ + if (_buf != NULL) { + *_buf = s->iov->iov_base + s->iov_offset; + } + if (_buf_len != NULL) { + *_buf_len = s->iov->iov_len - s->iov_offset; + } +} + +static inline bool +_dif_sgl_append(struct _dif_sgl *s, uint8_t *data, uint32_t data_len) +{ + assert(s->iovcnt > 0); + s->iov->iov_base = data; + s->iov->iov_len = data_len; + s->total_size += data_len; + s->iov++; + s->iovcnt--; + + if (s->iovcnt > 0) { + return true; + } else { + return false; + } +} + +static inline bool +_dif_sgl_append_split(struct _dif_sgl *dst, struct _dif_sgl *src, uint32_t data_len) +{ + uint8_t *buf; + uint32_t buf_len; + + while (data_len != 0) { + _dif_sgl_get_buf(src, (void *)&buf, &buf_len); + buf_len = spdk_min(buf_len, data_len); + + if (!_dif_sgl_append(dst, buf, buf_len)) { + return false; + } + + _dif_sgl_advance(src, buf_len); + data_len -= buf_len; + } + + return true; +} + +/* This function must be used before starting iteration. */ +static bool +_dif_sgl_is_bytes_multiple(struct _dif_sgl *s, uint32_t bytes) +{ + int i; + + for (i = 0; i < s->iovcnt; i++) { + if (s->iov[i].iov_len % bytes) { + return false; + } + } + + return true; +} + +/* This function must be used before starting iteration. */ +static bool +_dif_sgl_is_valid(struct _dif_sgl *s, uint32_t bytes) +{ + uint64_t total = 0; + int i; + + for (i = 0; i < s->iovcnt; i++) { + total += s->iov[i].iov_len; + } + + return total >= bytes; +} + +static void +_dif_sgl_copy(struct _dif_sgl *to, struct _dif_sgl *from) +{ + memcpy(to, from, sizeof(struct _dif_sgl)); +} + +static bool +_dif_type_is_valid(enum spdk_dif_type dif_type, uint32_t dif_flags) +{ + switch (dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + case SPDK_DIF_DISABLE: + break; + case SPDK_DIF_TYPE3: + if (dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) { + SPDK_ERRLOG("Reference Tag should not be checked for Type 3\n"); + return false; + } + break; + default: + SPDK_ERRLOG("Unknown DIF Type: %d\n", dif_type); + return false; + } + + return true; +} + +static bool +_dif_is_disabled(enum spdk_dif_type dif_type) +{ + if (dif_type == SPDK_DIF_DISABLE) { + return true; + } else { + return false; + } +} + + +static uint32_t +_get_guard_interval(uint32_t block_size, uint32_t md_size, bool dif_loc, bool md_interleave) +{ + if (!dif_loc) { + /* For metadata formats with more than 8 bytes, if the DIF is + * contained in the last 8 bytes of metadata, then the CRC + * covers all metadata up to but excluding these last 8 bytes. + */ + if (md_interleave) { + return block_size - sizeof(struct spdk_dif); + } else { + return md_size - sizeof(struct spdk_dif); + } + } else { + /* For metadata formats with more than 8 bytes, if the DIF is + * contained in the first 8 bytes of metadata, then the CRC + * does not cover any metadata. 
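+ * For example, with an interleaved 4096 + 8 format and the DIF placed in
+ * the first 8 bytes of metadata, the guard interval is 4096 bytes and the
+ * guard covers only the data portion of the block.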
+ */ + if (md_interleave) { + return block_size - md_size; + } else { + return 0; + } + } +} + +int +spdk_dif_ctx_init(struct spdk_dif_ctx *ctx, uint32_t block_size, uint32_t md_size, + bool md_interleave, bool dif_loc, enum spdk_dif_type dif_type, uint32_t dif_flags, + uint32_t init_ref_tag, uint16_t apptag_mask, uint16_t app_tag, + uint32_t data_offset, uint16_t guard_seed) +{ + uint32_t data_block_size; + + if (md_size < sizeof(struct spdk_dif)) { + SPDK_ERRLOG("Metadata size is smaller than DIF size.\n"); + return -EINVAL; + } + + if (md_interleave) { + if (block_size < md_size) { + SPDK_ERRLOG("Block size is smaller than DIF size.\n"); + return -EINVAL; + } + data_block_size = block_size - md_size; + } else { + if (block_size == 0 || (block_size % 512) != 0) { + SPDK_ERRLOG("Zero block size is not allowed\n"); + return -EINVAL; + } + data_block_size = block_size; + } + + if (!_dif_type_is_valid(dif_type, dif_flags)) { + SPDK_ERRLOG("DIF type is invalid.\n"); + return -EINVAL; + } + + ctx->block_size = block_size; + ctx->md_size = md_size; + ctx->md_interleave = md_interleave; + ctx->guard_interval = _get_guard_interval(block_size, md_size, dif_loc, md_interleave); + ctx->dif_type = dif_type; + ctx->dif_flags = dif_flags; + ctx->init_ref_tag = init_ref_tag; + ctx->apptag_mask = apptag_mask; + ctx->app_tag = app_tag; + ctx->data_offset = data_offset; + ctx->ref_tag_offset = data_offset / data_block_size; + ctx->last_guard = guard_seed; + ctx->guard_seed = guard_seed; + ctx->remapped_init_ref_tag = 0; + + return 0; +} + +void +spdk_dif_ctx_set_data_offset(struct spdk_dif_ctx *ctx, uint32_t data_offset) +{ + uint32_t data_block_size; + + if (ctx->md_interleave) { + data_block_size = ctx->block_size - ctx->md_size; + } else { + data_block_size = ctx->block_size; + } + + ctx->data_offset = data_offset; + ctx->ref_tag_offset = data_offset / data_block_size; +} + +void +spdk_dif_ctx_set_remapped_init_ref_tag(struct spdk_dif_ctx *ctx, + uint32_t remapped_init_ref_tag) +{ + ctx->remapped_init_ref_tag = remapped_init_ref_tag; +} + +static void +_dif_generate(void *_dif, uint16_t guard, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx) +{ + struct spdk_dif *dif = _dif; + uint32_t ref_tag; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + to_be16(&dif->guard, guard); + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) { + to_be16(&dif->app_tag, ctx->app_tag); + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) { + /* For type 1 and 2, the reference tag is incremented for each + * subsequent logical block. For type 3, the reference tag + * remains the same as the initial reference tag. 
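+ * ref_tag_offset is derived from the data offset in spdk_dif_ctx_init(),
+ * presumably so that reference tags stay consistent when one request is
+ * generated in multiple pieces.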
+ */ + if (ctx->dif_type != SPDK_DIF_TYPE3) { + ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks; + } else { + ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset; + } + + to_be32(&dif->ref_tag, ref_tag); + } +} + +static void +dif_generate(struct _dif_sgl *sgl, uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks = 0; + void *buf; + uint16_t guard = 0; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(sgl, &buf, NULL); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(ctx->guard_seed, buf, ctx->guard_interval); + } + + _dif_generate(buf + ctx->guard_interval, guard, offset_blocks, ctx); + + _dif_sgl_advance(sgl, ctx->block_size); + offset_blocks++; + } +} + +static uint16_t +_dif_generate_split(struct _dif_sgl *sgl, uint32_t offset_in_block, uint32_t data_len, + uint16_t guard, uint32_t offset_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_in_dif, buf_len; + void *buf; + struct spdk_dif dif = {}; + + assert(offset_in_block < ctx->guard_interval); + assert(offset_in_block + data_len < ctx->guard_interval || + offset_in_block + data_len == ctx->block_size); + + /* Compute CRC over split logical block data. */ + while (data_len != 0 && offset_in_block < ctx->guard_interval) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, data_len); + buf_len = spdk_min(buf_len, ctx->guard_interval - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, buf, buf_len); + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + data_len -= buf_len; + } + + if (offset_in_block < ctx->guard_interval) { + return guard; + } + + /* If a whole logical block data is parsed, generate DIF + * and save it to the temporary DIF area. + */ + _dif_generate(&dif, guard, offset_blocks, ctx); + + /* Copy generated DIF field to the split DIF field, and then + * skip metadata field after DIF field (if any). 
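+ * Trailing metadata bytes beyond the 8-byte DIF field are not modified
+ * here; the SGL cursor is simply advanced past them.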
+ */ + while (offset_in_block < ctx->block_size) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + + if (offset_in_block < ctx->guard_interval + sizeof(struct spdk_dif)) { + offset_in_dif = offset_in_block - ctx->guard_interval; + buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset_in_dif); + + memcpy(buf, ((uint8_t *)&dif) + offset_in_dif, buf_len); + } else { + buf_len = spdk_min(buf_len, ctx->block_size - offset_in_block); + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + + return guard; +} + +static void +dif_generate_split(struct _dif_sgl *sgl, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + uint16_t guard = 0; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + _dif_generate_split(sgl, 0, ctx->block_size, guard, offset_blocks, ctx); + } +} + +int +spdk_dif_generate(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx) +{ + struct _dif_sgl sgl; + + _dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&sgl, ctx->block_size)) { + dif_generate(&sgl, num_blocks, ctx); + } else { + dif_generate_split(&sgl, num_blocks, ctx); + } + + return 0; +} + +static void +_dif_error_set(struct spdk_dif_error *err_blk, uint8_t err_type, + uint32_t expected, uint32_t actual, uint32_t err_offset) +{ + if (err_blk) { + err_blk->err_type = err_type; + err_blk->expected = expected; + err_blk->actual = actual; + err_blk->err_offset = err_offset; + } +} + +static int +_dif_verify(void *_dif, uint16_t guard, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + struct spdk_dif *dif = _dif; + uint16_t _guard; + uint16_t _app_tag; + uint32_t ref_tag, _ref_tag; + + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* If Type 1 or 2 is used, then all DIF checks are disabled when + * the Application Tag is 0xFFFF. + */ + if (dif->app_tag == 0xFFFF) { + return 0; + } + break; + case SPDK_DIF_TYPE3: + /* If Type 3 is used, then all DIF checks are disabled when the + * Application Tag is 0xFFFF and the Reference Tag is 0xFFFFFFFF. + */ + if (dif->app_tag == 0xFFFF && dif->ref_tag == 0xFFFFFFFF) { + return 0; + } + break; + default: + break; + } + + /* For type 1 and 2, the reference tag is incremented for each + * subsequent logical block. For type 3, the reference tag + * remains the same as the initial reference tag. + */ + if (ctx->dif_type != SPDK_DIF_TYPE3) { + ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks; + } else { + ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + /* Compare the DIF Guard field to the CRC computed over the logical + * block data. 
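+ * A mismatch is reported through err_blk as SPDK_DIF_GUARD_ERROR together
+ * with the expected and actual 16-bit guard values.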
+ */ + _guard = from_be16(&dif->guard); + if (_guard != guard) { + _dif_error_set(err_blk, SPDK_DIF_GUARD_ERROR, _guard, guard, + offset_blocks); + SPDK_ERRLOG("Failed to compare Guard: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + ref_tag, _guard, guard); + return -1; + } + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) { + /* Compare unmasked bits in the DIF Application Tag field to the + * passed Application Tag. + */ + _app_tag = from_be16(&dif->app_tag); + if ((_app_tag & ctx->apptag_mask) != ctx->app_tag) { + _dif_error_set(err_blk, SPDK_DIF_APPTAG_ERROR, ctx->app_tag, + (_app_tag & ctx->apptag_mask), offset_blocks); + SPDK_ERRLOG("Failed to compare App Tag: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + ref_tag, ctx->app_tag, (_app_tag & ctx->apptag_mask)); + return -1; + } + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) { + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* Compare the DIF Reference Tag field to the passed Reference Tag. + * The passed Reference Tag will be the least significant 4 bytes + * of the LBA when Type 1 is used, and application specific value + * if Type 2 is used, + */ + _ref_tag = from_be32(&dif->ref_tag); + if (_ref_tag != ref_tag) { + _dif_error_set(err_blk, SPDK_DIF_REFTAG_ERROR, ref_tag, + _ref_tag, offset_blocks); + SPDK_ERRLOG("Failed to compare Ref Tag: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + ref_tag, ref_tag, _ref_tag); + return -1; + } + break; + case SPDK_DIF_TYPE3: + /* For Type 3, computed Reference Tag remains unchanged. + * Hence ignore the Reference Tag field. + */ + break; + default: + break; + } + } + + return 0; +} + +static int +dif_verify(struct _dif_sgl *sgl, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks = 0; + int rc; + void *buf; + uint16_t guard = 0; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(sgl, &buf, NULL); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(ctx->guard_seed, buf, ctx->guard_interval); + } + + rc = _dif_verify(buf + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + + _dif_sgl_advance(sgl, ctx->block_size); + offset_blocks++; + } + + return 0; +} + +static int +_dif_verify_split(struct _dif_sgl *sgl, uint32_t offset_in_block, uint32_t data_len, + uint16_t *_guard, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t offset_in_dif, buf_len; + void *buf; + uint16_t guard; + struct spdk_dif dif = {}; + int rc; + + assert(_guard != NULL); + assert(offset_in_block < ctx->guard_interval); + assert(offset_in_block + data_len < ctx->guard_interval || + offset_in_block + data_len == ctx->block_size); + + guard = *_guard; + + /* Compute CRC over split logical block data. */ + while (data_len != 0 && offset_in_block < ctx->guard_interval) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, data_len); + buf_len = spdk_min(buf_len, ctx->guard_interval - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, buf, buf_len); + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + data_len -= buf_len; + } + + if (offset_in_block < ctx->guard_interval) { + *_guard = guard; + return 0; + } + + /* Copy the split DIF field to the temporary DIF buffer, and then + * skip metadata field after DIF field (if any). 
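+ * Once reassembled, the temporary DIF struct is checked with _dif_verify()
+ * exactly as in the contiguous case.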
*/ + while (offset_in_block < ctx->block_size) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + + if (offset_in_block < ctx->guard_interval + sizeof(struct spdk_dif)) { + offset_in_dif = offset_in_block - ctx->guard_interval; + buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset_in_dif); + + memcpy((uint8_t *)&dif + offset_in_dif, buf, buf_len); + } else { + buf_len = spdk_min(buf_len, ctx->block_size - offset_in_block); + } + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + } + + rc = _dif_verify(&dif, guard, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + + *_guard = guard; + return 0; +} + +static int +dif_verify_split(struct _dif_sgl *sgl, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks; + uint16_t guard = 0; + int rc; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dif_verify_split(sgl, 0, ctx->block_size, &guard, offset_blocks, + ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +int +spdk_dif_verify(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + struct _dif_sgl sgl; + + _dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&sgl, ctx->block_size)) { + return dif_verify(&sgl, num_blocks, ctx, err_blk); + } else { + return dif_verify_split(&sgl, num_blocks, ctx, err_blk); + } +} + +static uint32_t +dif_update_crc32c(struct _dif_sgl *sgl, uint32_t num_blocks, + uint32_t crc32c, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + void *buf; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + _dif_sgl_get_buf(sgl, &buf, NULL); + + crc32c = spdk_crc32c_update(buf, ctx->block_size - ctx->md_size, crc32c); + + _dif_sgl_advance(sgl, ctx->block_size); + } + + return crc32c; +} + +static uint32_t +_dif_update_crc32c_split(struct _dif_sgl *sgl, uint32_t offset_in_block, uint32_t data_len, + uint32_t crc32c, const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size, buf_len; + void *buf; + + data_block_size = ctx->block_size - ctx->md_size; + + assert(offset_in_block + data_len <= ctx->block_size); + + while (data_len != 0) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, data_len); + + if (offset_in_block < data_block_size) { + buf_len = spdk_min(buf_len, data_block_size - offset_in_block); + crc32c = spdk_crc32c_update(buf, buf_len, crc32c); + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + data_len -= buf_len; + } + + return crc32c; +} + +static uint32_t +dif_update_crc32c_split(struct _dif_sgl *sgl, uint32_t num_blocks, + uint32_t crc32c, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + crc32c = _dif_update_crc32c_split(sgl, 0, ctx->block_size, crc32c, ctx); + } + + return crc32c; +} + +int +spdk_dif_update_crc32c(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + uint32_t *_crc32c, const struct spdk_dif_ctx *ctx) +{ + struct _dif_sgl sgl; + + if (_crc32c == NULL) { + return -EINVAL; + } + + 
_dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_sgl_is_bytes_multiple(&sgl, ctx->block_size)) { + *_crc32c = dif_update_crc32c(&sgl, num_blocks, *_crc32c, ctx); + } else { + *_crc32c = dif_update_crc32c_split(&sgl, num_blocks, *_crc32c, ctx); + } + + return 0; +} + +static void +dif_generate_copy(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks = 0, data_block_size; + void *src, *dst; + uint16_t guard; + + data_block_size = ctx->block_size - ctx->md_size; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(src_sgl, &src, NULL); + _dif_sgl_get_buf(dst_sgl, &dst, NULL); + + guard = 0; + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif_copy(ctx->guard_seed, dst, src, data_block_size); + guard = spdk_crc16_t10dif(guard, dst + data_block_size, + ctx->guard_interval - data_block_size); + } else { + memcpy(dst, src, data_block_size); + } + + _dif_generate(dst + ctx->guard_interval, guard, offset_blocks, ctx); + + _dif_sgl_advance(src_sgl, data_block_size); + _dif_sgl_advance(dst_sgl, ctx->block_size); + offset_blocks++; + } +} + +static void +_dif_generate_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t offset_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_in_block, src_len, data_block_size; + uint16_t guard = 0; + void *src, *dst; + + _dif_sgl_get_buf(dst_sgl, &dst, NULL); + + data_block_size = ctx->block_size - ctx->md_size; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + offset_in_block = 0; + + while (offset_in_block < data_block_size) { + /* Compute CRC over split logical block data and copy + * data to bounce buffer. 
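+ * When the guard check is enabled, spdk_crc16_t10dif_copy() performs the
+ * copy and the CRC update in one pass per source fragment; otherwise a
+ * plain memcpy() is used.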
+ */ + _dif_sgl_get_buf(src_sgl, &src, &src_len); + src_len = spdk_min(src_len, data_block_size - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif_copy(guard, dst + offset_in_block, + src, src_len); + } else { + memcpy(dst + offset_in_block, src, src_len); + } + + _dif_sgl_advance(src_sgl, src_len); + offset_in_block += src_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, dst + data_block_size, + ctx->guard_interval - data_block_size); + } + + _dif_sgl_advance(dst_sgl, ctx->block_size); + + _dif_generate(dst + ctx->guard_interval, guard, offset_blocks, ctx); +} + +static void +dif_generate_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + _dif_generate_copy_split(src_sgl, dst_sgl, offset_blocks, ctx); + } +} + +int +spdk_dif_generate_copy(struct iovec *iovs, int iovcnt, struct iovec *bounce_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + struct _dif_sgl src_sgl, dst_sgl; + uint32_t data_block_size; + + _dif_sgl_init(&src_sgl, iovs, iovcnt); + _dif_sgl_init(&dst_sgl, bounce_iov, 1); + + data_block_size = ctx->block_size - ctx->md_size; + + if (!_dif_sgl_is_valid(&src_sgl, data_block_size * num_blocks) || + !_dif_sgl_is_valid(&dst_sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec arrays are not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&src_sgl, data_block_size)) { + dif_generate_copy(&src_sgl, &dst_sgl, num_blocks, ctx); + } else { + dif_generate_copy_split(&src_sgl, &dst_sgl, num_blocks, ctx); + } + + return 0; +} + +static int +dif_verify_copy(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks = 0, data_block_size; + void *src, *dst; + int rc; + uint16_t guard; + + data_block_size = ctx->block_size - ctx->md_size; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(src_sgl, &src, NULL); + _dif_sgl_get_buf(dst_sgl, &dst, NULL); + + guard = 0; + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif_copy(ctx->guard_seed, dst, src, data_block_size); + guard = spdk_crc16_t10dif(guard, src + data_block_size, + ctx->guard_interval - data_block_size); + } else { + memcpy(dst, src, data_block_size); + } + + rc = _dif_verify(src + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + + _dif_sgl_advance(src_sgl, ctx->block_size); + _dif_sgl_advance(dst_sgl, data_block_size); + offset_blocks++; + } + + return 0; +} + +static int +_dif_verify_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t offset_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_in_block, dst_len, data_block_size; + uint16_t guard = 0; + void *src, *dst; + + _dif_sgl_get_buf(src_sgl, &src, NULL); + + data_block_size = ctx->block_size - ctx->md_size; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + offset_in_block = 0; + + while (offset_in_block < data_block_size) { + /* Compute CRC over split logical block data and copy + * data to bounce buffer. 
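+ * On the verify path the bounce buffer is the source and the application
+ * iovecs are the destination, the reverse of the generate-copy direction.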
+ */ + _dif_sgl_get_buf(dst_sgl, &dst, &dst_len); + dst_len = spdk_min(dst_len, data_block_size - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif_copy(guard, dst, + src + offset_in_block, dst_len); + } else { + memcpy(dst, src + offset_in_block, dst_len); + } + + _dif_sgl_advance(dst_sgl, dst_len); + offset_in_block += dst_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, src + data_block_size, + ctx->guard_interval - data_block_size); + } + + _dif_sgl_advance(src_sgl, ctx->block_size); + + return _dif_verify(src + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); +} + +static int +dif_verify_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks; + int rc; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dif_verify_copy_split(src_sgl, dst_sgl, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +int +spdk_dif_verify_copy(struct iovec *iovs, int iovcnt, struct iovec *bounce_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + struct _dif_sgl src_sgl, dst_sgl; + uint32_t data_block_size; + + _dif_sgl_init(&src_sgl, bounce_iov, 1); + _dif_sgl_init(&dst_sgl, iovs, iovcnt); + + data_block_size = ctx->block_size - ctx->md_size; + + if (!_dif_sgl_is_valid(&dst_sgl, data_block_size * num_blocks) || + !_dif_sgl_is_valid(&src_sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec arrays are not valid\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&dst_sgl, data_block_size)) { + return dif_verify_copy(&src_sgl, &dst_sgl, num_blocks, ctx, err_blk); + } else { + return dif_verify_copy_split(&src_sgl, &dst_sgl, num_blocks, ctx, err_blk); + } +} + +static void +_bit_flip(uint8_t *buf, uint32_t flip_bit) +{ + uint8_t byte; + + byte = *buf; + byte ^= 1 << flip_bit; + *buf = byte; +} + +static int +_dif_inject_error(struct _dif_sgl *sgl, + uint32_t block_size, uint32_t num_blocks, + uint32_t inject_offset_blocks, + uint32_t inject_offset_bytes, + uint32_t inject_offset_bits) +{ + uint32_t offset_in_block, buf_len; + void *buf; + + _dif_sgl_advance(sgl, block_size * inject_offset_blocks); + + offset_in_block = 0; + + while (offset_in_block < block_size) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, block_size - offset_in_block); + + if (inject_offset_bytes >= offset_in_block && + inject_offset_bytes < offset_in_block + buf_len) { + buf += inject_offset_bytes - offset_in_block; + _bit_flip(buf, inject_offset_bits); + return 0; + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + } + + return -1; +} + +static int +dif_inject_error(struct _dif_sgl *sgl, uint32_t block_size, uint32_t num_blocks, + uint32_t start_inject_bytes, uint32_t inject_range_bytes, + uint32_t *inject_offset) +{ + uint32_t inject_offset_blocks, inject_offset_bytes, inject_offset_bits; + uint32_t offset_blocks; + int rc; + + srand(time(0)); + + inject_offset_blocks = rand() % num_blocks; + inject_offset_bytes = start_inject_bytes + (rand() % inject_range_bytes); + inject_offset_bits = rand() % 8; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + if (offset_blocks == inject_offset_blocks) { + rc = _dif_inject_error(sgl, block_size, 
num_blocks, + inject_offset_blocks, + inject_offset_bytes, + inject_offset_bits); + if (rc == 0) { + *inject_offset = inject_offset_blocks; + } + return rc; + } + } + + return -1; +} + +#define _member_size(type, member) sizeof(((type *)0)->member) + +int +spdk_dif_inject_error(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, uint32_t inject_flags, + uint32_t *inject_offset) +{ + struct _dif_sgl sgl; + int rc; + + _dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (inject_flags & SPDK_DIF_REFTAG_ERROR) { + rc = dif_inject_error(&sgl, ctx->block_size, num_blocks, + ctx->guard_interval + offsetof(struct spdk_dif, ref_tag), + _member_size(struct spdk_dif, ref_tag), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Reference Tag.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_APPTAG_ERROR) { + rc = dif_inject_error(&sgl, ctx->block_size, num_blocks, + ctx->guard_interval + offsetof(struct spdk_dif, app_tag), + _member_size(struct spdk_dif, app_tag), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Application Tag.\n"); + return rc; + } + } + if (inject_flags & SPDK_DIF_GUARD_ERROR) { + rc = dif_inject_error(&sgl, ctx->block_size, num_blocks, + ctx->guard_interval, + _member_size(struct spdk_dif, guard), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Guard.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_DATA_ERROR) { + /* If the DIF information is contained within the last 8 bytes of + * metadata, then the CRC covers all metadata bytes up to but excluding + * the last 8 bytes. But error injection does not cover these metadata + * because classification is not determined yet. + * + * Note: Error injection to data block is expected to be detected as + * guard error. 
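+	 *
+	 * A typical test-side sketch (local variable names are hypothetical,
+	 * and the GUARD_CHECK flag must be enabled in the context):
+	 *
+	 *   uint32_t bad_block;
+	 *   spdk_dif_inject_error(iovs, iovcnt, num_blocks, ctx,
+	 *                         SPDK_DIF_DATA_ERROR, &bad_block);
+	 *
+	 * A subsequent spdk_dif_verify() on the same buffers is then expected
+	 * to fail and report a Guard error for the block returned in bad_block.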
+ */ + rc = dif_inject_error(&sgl, ctx->block_size, num_blocks, + 0, + ctx->block_size - ctx->md_size, + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to data block.\n"); + return rc; + } + } + + return 0; +} + +static void +dix_generate(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks = 0; + uint16_t guard; + void *data_buf, *md_buf; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(data_sgl, &data_buf, NULL); + _dif_sgl_get_buf(md_sgl, &md_buf, NULL); + + guard = 0; + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(ctx->guard_seed, data_buf, ctx->block_size); + guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval); + } + + _dif_generate(md_buf + ctx->guard_interval, guard, offset_blocks, ctx); + + _dif_sgl_advance(data_sgl, ctx->block_size); + _dif_sgl_advance(md_sgl, ctx->md_size); + offset_blocks++; + } +} + +static void +_dix_generate_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t offset_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_in_block, data_buf_len; + uint16_t guard = 0; + void *data_buf, *md_buf; + + _dif_sgl_get_buf(md_sgl, &md_buf, NULL); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + offset_in_block = 0; + + while (offset_in_block < ctx->block_size) { + _dif_sgl_get_buf(data_sgl, &data_buf, &data_buf_len); + data_buf_len = spdk_min(data_buf_len, ctx->block_size - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, data_buf, data_buf_len); + } + + _dif_sgl_advance(data_sgl, data_buf_len); + offset_in_block += data_buf_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval); + } + + _dif_sgl_advance(md_sgl, ctx->md_size); + + _dif_generate(md_buf + ctx->guard_interval, guard, offset_blocks, ctx); +} + +static void +dix_generate_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + _dix_generate_split(data_sgl, md_sgl, offset_blocks, ctx); + } +} + +int +spdk_dix_generate(struct iovec *iovs, int iovcnt, struct iovec *md_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + struct _dif_sgl data_sgl, md_sgl; + + _dif_sgl_init(&data_sgl, iovs, iovcnt); + _dif_sgl_init(&md_sgl, md_iov, 1); + + if (!_dif_sgl_is_valid(&data_sgl, ctx->block_size * num_blocks) || + !_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&data_sgl, ctx->block_size)) { + dix_generate(&data_sgl, &md_sgl, num_blocks, ctx); + } else { + dix_generate_split(&data_sgl, &md_sgl, num_blocks, ctx); + } + + return 0; +} + +static int +dix_verify(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks = 0; + uint16_t guard; + void *data_buf, *md_buf; + int rc; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(data_sgl, &data_buf, NULL); + _dif_sgl_get_buf(md_sgl, &md_buf, NULL); + + guard = 0; + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(ctx->guard_seed, 
data_buf, ctx->block_size); + guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval); + } + + rc = _dif_verify(md_buf + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + + _dif_sgl_advance(data_sgl, ctx->block_size); + _dif_sgl_advance(md_sgl, ctx->md_size); + offset_blocks++; + } + + return 0; +} + +static int +_dix_verify_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t offset_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_in_block, data_buf_len; + uint16_t guard = 0; + void *data_buf, *md_buf; + + _dif_sgl_get_buf(md_sgl, &md_buf, NULL); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + offset_in_block = 0; + + while (offset_in_block < ctx->block_size) { + _dif_sgl_get_buf(data_sgl, &data_buf, &data_buf_len); + data_buf_len = spdk_min(data_buf_len, ctx->block_size - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, data_buf, data_buf_len); + } + + _dif_sgl_advance(data_sgl, data_buf_len); + offset_in_block += data_buf_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval); + } + + _dif_sgl_advance(md_sgl, ctx->md_size); + + return _dif_verify(md_buf + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); +} + +static int +dix_verify_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks; + int rc; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dix_verify_split(data_sgl, md_sgl, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +int +spdk_dix_verify(struct iovec *iovs, int iovcnt, struct iovec *md_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + struct _dif_sgl data_sgl, md_sgl; + + _dif_sgl_init(&data_sgl, iovs, iovcnt); + _dif_sgl_init(&md_sgl, md_iov, 1); + + if (!_dif_sgl_is_valid(&data_sgl, ctx->block_size * num_blocks) || + !_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&data_sgl, ctx->block_size)) { + return dix_verify(&data_sgl, &md_sgl, num_blocks, ctx, err_blk); + } else { + return dix_verify_split(&data_sgl, &md_sgl, num_blocks, ctx, err_blk); + } +} + +int +spdk_dix_inject_error(struct iovec *iovs, int iovcnt, struct iovec *md_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + uint32_t inject_flags, uint32_t *inject_offset) +{ + struct _dif_sgl data_sgl, md_sgl; + int rc; + + _dif_sgl_init(&data_sgl, iovs, iovcnt); + _dif_sgl_init(&md_sgl, md_iov, 1); + + if (!_dif_sgl_is_valid(&data_sgl, ctx->block_size * num_blocks) || + !_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (inject_flags & SPDK_DIF_REFTAG_ERROR) { + rc = dif_inject_error(&md_sgl, ctx->md_size, num_blocks, + ctx->guard_interval + offsetof(struct spdk_dif, ref_tag), + _member_size(struct spdk_dif, ref_tag), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Reference Tag.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_APPTAG_ERROR) { + rc = dif_inject_error(&md_sgl, 
ctx->md_size, num_blocks, + ctx->guard_interval + offsetof(struct spdk_dif, app_tag), + _member_size(struct spdk_dif, app_tag), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Application Tag.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_GUARD_ERROR) { + rc = dif_inject_error(&md_sgl, ctx->md_size, num_blocks, + ctx->guard_interval, + _member_size(struct spdk_dif, guard), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Guard.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_DATA_ERROR) { + /* Note: Error injection to data block is expected to be detected + * as guard error. + */ + rc = dif_inject_error(&data_sgl, ctx->block_size, num_blocks, + 0, + ctx->block_size, + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Guard.\n"); + return rc; + } + } + + return 0; +} + +static uint32_t +_to_next_boundary(uint32_t offset, uint32_t boundary) +{ + return boundary - (offset % boundary); +} + +static uint32_t +_to_size_with_md(uint32_t size, uint32_t data_block_size, uint32_t block_size) +{ + return (size / data_block_size) * block_size + (size % data_block_size); +} + +int +spdk_dif_set_md_interleave_iovs(struct iovec *iovs, int iovcnt, + struct iovec *buf_iovs, int buf_iovcnt, + uint32_t data_offset, uint32_t data_len, + uint32_t *_mapped_len, + const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size, data_unalign, buf_len, buf_offset, len; + struct _dif_sgl dif_sgl; + struct _dif_sgl buf_sgl; + + if (iovs == NULL || iovcnt == 0 || buf_iovs == NULL || buf_iovcnt == 0) { + return -EINVAL; + } + + data_block_size = ctx->block_size - ctx->md_size; + + data_unalign = ctx->data_offset % data_block_size; + + buf_len = _to_size_with_md(data_unalign + data_offset + data_len, data_block_size, + ctx->block_size); + buf_len -= data_unalign; + + _dif_sgl_init(&dif_sgl, iovs, iovcnt); + _dif_sgl_init(&buf_sgl, buf_iovs, buf_iovcnt); + + if (!_dif_sgl_is_valid(&buf_sgl, buf_len)) { + SPDK_ERRLOG("Buffer overflow will occur.\n"); + return -ERANGE; + } + + buf_offset = _to_size_with_md(data_unalign + data_offset, data_block_size, ctx->block_size); + buf_offset -= data_unalign; + + _dif_sgl_advance(&buf_sgl, buf_offset); + + while (data_len != 0) { + len = spdk_min(data_len, _to_next_boundary(ctx->data_offset + data_offset, data_block_size)); + if (!_dif_sgl_append_split(&dif_sgl, &buf_sgl, len)) { + break; + } + _dif_sgl_advance(&buf_sgl, ctx->md_size); + data_offset += len; + data_len -= len; + } + + if (_mapped_len != NULL) { + *_mapped_len = dif_sgl.total_size; + } + + return iovcnt - dif_sgl.iovcnt; +} + +static int +_dif_sgl_setup_stream(struct _dif_sgl *sgl, uint32_t *_buf_offset, uint32_t *_buf_len, + uint32_t data_offset, uint32_t data_len, + const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size, data_unalign, buf_len, buf_offset; + + data_block_size = ctx->block_size - ctx->md_size; + + data_unalign = ctx->data_offset % data_block_size; + + /* If the last data block is complete, DIF of the data block is + * inserted or verified in this turn. 
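+	 *
+	 * For example, assuming a 520-byte block with 8 bytes of metadata
+	 * (512-byte data blocks) and ctx->data_offset == 0: a call with
+	 * data_offset == 1024 and data_len == 512 maps to buf_offset == 1040
+	 * and buf_len == 520 via _to_size_with_md(), i.e. exactly the third
+	 * extended block of the payload.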
+ */ + buf_len = _to_size_with_md(data_unalign + data_offset + data_len, data_block_size, + ctx->block_size); + buf_len -= data_unalign; + + if (!_dif_sgl_is_valid(sgl, buf_len)) { + return -ERANGE; + } + + buf_offset = _to_size_with_md(data_unalign + data_offset, data_block_size, ctx->block_size); + buf_offset -= data_unalign; + + _dif_sgl_advance(sgl, buf_offset); + buf_len -= buf_offset; + + buf_offset += data_unalign; + + *_buf_offset = buf_offset; + *_buf_len = buf_len; + + return 0; +} + +int +spdk_dif_generate_stream(struct iovec *iovs, int iovcnt, + uint32_t data_offset, uint32_t data_len, + struct spdk_dif_ctx *ctx) +{ + uint32_t buf_len = 0, buf_offset = 0; + uint32_t len, offset_in_block, offset_blocks; + uint16_t guard = 0; + struct _dif_sgl sgl; + int rc; + + if (iovs == NULL || iovcnt == 0) { + return -EINVAL; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->last_guard; + } + + _dif_sgl_init(&sgl, iovs, iovcnt); + + rc = _dif_sgl_setup_stream(&sgl, &buf_offset, &buf_len, data_offset, data_len, ctx); + if (rc != 0) { + return rc; + } + + while (buf_len != 0) { + len = spdk_min(buf_len, _to_next_boundary(buf_offset, ctx->block_size)); + offset_in_block = buf_offset % ctx->block_size; + offset_blocks = buf_offset / ctx->block_size; + + guard = _dif_generate_split(&sgl, offset_in_block, len, guard, offset_blocks, ctx); + + buf_len -= len; + buf_offset += len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + ctx->last_guard = guard; + } + + return 0; +} + +int +spdk_dif_verify_stream(struct iovec *iovs, int iovcnt, + uint32_t data_offset, uint32_t data_len, + struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t buf_len = 0, buf_offset = 0; + uint32_t len, offset_in_block, offset_blocks; + uint16_t guard = 0; + struct _dif_sgl sgl; + int rc = 0; + + if (iovs == NULL || iovcnt == 0) { + return -EINVAL; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->last_guard; + } + + _dif_sgl_init(&sgl, iovs, iovcnt); + + rc = _dif_sgl_setup_stream(&sgl, &buf_offset, &buf_len, data_offset, data_len, ctx); + if (rc != 0) { + return rc; + } + + while (buf_len != 0) { + len = spdk_min(buf_len, _to_next_boundary(buf_offset, ctx->block_size)); + offset_in_block = buf_offset % ctx->block_size; + offset_blocks = buf_offset / ctx->block_size; + + rc = _dif_verify_split(&sgl, offset_in_block, len, &guard, offset_blocks, + ctx, err_blk); + if (rc != 0) { + goto error; + } + + buf_len -= len; + buf_offset += len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + ctx->last_guard = guard; + } +error: + return rc; +} + +int +spdk_dif_update_crc32c_stream(struct iovec *iovs, int iovcnt, + uint32_t data_offset, uint32_t data_len, + uint32_t *_crc32c, const struct spdk_dif_ctx *ctx) +{ + uint32_t buf_len = 0, buf_offset = 0, len, offset_in_block; + uint32_t crc32c; + struct _dif_sgl sgl; + int rc; + + if (iovs == NULL || iovcnt == 0) { + return -EINVAL; + } + + crc32c = *_crc32c; + _dif_sgl_init(&sgl, iovs, iovcnt); + + rc = _dif_sgl_setup_stream(&sgl, &buf_offset, &buf_len, data_offset, data_len, ctx); + if (rc != 0) { + return rc; + } + + while (buf_len != 0) { + len = spdk_min(buf_len, _to_next_boundary(buf_offset, ctx->block_size)); + offset_in_block = buf_offset % ctx->block_size; + + crc32c = _dif_update_crc32c_split(&sgl, offset_in_block, len, crc32c, ctx); + + buf_len -= len; + buf_offset += len; + } + + *_crc32c = crc32c; + + return 0; +} + +void +spdk_dif_get_range_with_md(uint32_t data_offset, uint32_t 
data_len, + uint32_t *_buf_offset, uint32_t *_buf_len, + const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size, data_unalign, buf_offset, buf_len; + + if (!ctx->md_interleave) { + buf_offset = data_offset; + buf_len = data_len; + } else { + data_block_size = ctx->block_size - ctx->md_size; + + data_unalign = data_offset % data_block_size; + + buf_offset = _to_size_with_md(data_offset, data_block_size, ctx->block_size); + buf_len = _to_size_with_md(data_unalign + data_len, data_block_size, ctx->block_size) - + data_unalign; + } + + if (_buf_offset != NULL) { + *_buf_offset = buf_offset; + } + + if (_buf_len != NULL) { + *_buf_len = buf_len; + } +} + +uint32_t +spdk_dif_get_length_with_md(uint32_t data_len, const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size; + + if (!ctx->md_interleave) { + return data_len; + } else { + data_block_size = ctx->block_size - ctx->md_size; + + return _to_size_with_md(data_len, data_block_size, ctx->block_size); + } +} + +static int +_dif_remap_ref_tag(struct _dif_sgl *sgl, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t offset, buf_len, expected = 0, _actual, remapped; + void *buf; + struct _dif_sgl tmp_sgl; + struct spdk_dif dif; + + /* Fast forward to DIF field. */ + _dif_sgl_advance(sgl, ctx->guard_interval); + _dif_sgl_copy(&tmp_sgl, sgl); + + /* Copy the split DIF field to the temporary DIF buffer */ + offset = 0; + while (offset < sizeof(struct spdk_dif)) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset); + + memcpy((uint8_t *)&dif + offset, buf, buf_len); + + _dif_sgl_advance(sgl, buf_len); + offset += buf_len; + } + + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* If Type 1 or 2 is used, then all DIF checks are disabled when + * the Application Tag is 0xFFFF. + */ + if (dif.app_tag == 0xFFFF) { + goto end; + } + break; + case SPDK_DIF_TYPE3: + /* If Type 3 is used, then all DIF checks are disabled when the + * Application Tag is 0xFFFF and the Reference Tag is 0xFFFFFFFF. + */ + if (dif.app_tag == 0xFFFF && dif.ref_tag == 0xFFFFFFFF) { + goto end; + } + break; + default: + break; + } + + /* For type 1 and 2, the Reference Tag is incremented for each + * subsequent logical block. For type 3, the Reference Tag + * remains the same as the initial Reference Tag. + */ + if (ctx->dif_type != SPDK_DIF_TYPE3) { + expected = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks; + remapped = ctx->remapped_init_ref_tag + ctx->ref_tag_offset + offset_blocks; + } else { + remapped = ctx->remapped_init_ref_tag; + } + + /* Verify the stored Reference Tag. */ + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* Compare the DIF Reference Tag field to the computed Reference Tag. + * The computed Reference Tag will be the least significant 4 bytes + * of the LBA when Type 1 is used, and application specific value + * if Type 2 is used. + */ + _actual = from_be32(&dif.ref_tag); + if (_actual != expected) { + _dif_error_set(err_blk, SPDK_DIF_REFTAG_ERROR, expected, + _actual, offset_blocks); + SPDK_ERRLOG("Failed to compare Ref Tag: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + expected, expected, _actual); + return -1; + } + break; + case SPDK_DIF_TYPE3: + /* For type 3, the computed Reference Tag remains unchanged. + * Hence ignore the Reference Tag field. + */ + break; + default: + break; + } + + /* Update the stored Reference Tag to the remapped one. 
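+	 *
+	 * For example (Type 1, ref_tag_offset == 0, hypothetical values): with
+	 * init_ref_tag == 100 and remapped_init_ref_tag == 0, the block at
+	 * offset_blocks == 5 must carry Reference Tag 105 and is rewritten to 5.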
*/ + to_be32(&dif.ref_tag, remapped); + + offset = 0; + while (offset < sizeof(struct spdk_dif)) { + _dif_sgl_get_buf(&tmp_sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset); + + memcpy(buf, (uint8_t *)&dif + offset, buf_len); + + _dif_sgl_advance(&tmp_sgl, buf_len); + offset += buf_len; + } + +end: + _dif_sgl_advance(sgl, ctx->block_size - ctx->guard_interval - sizeof(struct spdk_dif)); + + return 0; +} + +int +spdk_dif_remap_ref_tag(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + struct _dif_sgl sgl; + uint32_t offset_blocks; + int rc; + + _dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (!(ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) { + return 0; + } + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dif_remap_ref_tag(&sgl, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static int +_dix_remap_ref_tag(struct _dif_sgl *md_sgl, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t expected = 0, _actual, remapped; + uint8_t *md_buf; + struct spdk_dif *dif; + + _dif_sgl_get_buf(md_sgl, (void *)&md_buf, NULL); + + dif = (struct spdk_dif *)(md_buf + ctx->guard_interval); + + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* If Type 1 or 2 is used, then all DIF checks are disabled when + * the Application Tag is 0xFFFF. + */ + if (dif->app_tag == 0xFFFF) { + goto end; + } + break; + case SPDK_DIF_TYPE3: + /* If Type 3 is used, then all DIF checks are disabled when the + * Application Tag is 0xFFFF and the Reference Tag is 0xFFFFFFFF. + */ + if (dif->app_tag == 0xFFFF && dif->ref_tag == 0xFFFFFFFF) { + goto end; + } + break; + default: + break; + } + + /* For type 1 and 2, the Reference Tag is incremented for each + * subsequent logical block. For type 3, the Reference Tag + * remains the same as the initialReference Tag. + */ + if (ctx->dif_type != SPDK_DIF_TYPE3) { + expected = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks; + remapped = ctx->remapped_init_ref_tag + ctx->ref_tag_offset + offset_blocks; + } else { + remapped = ctx->remapped_init_ref_tag; + } + + /* Verify the stored Reference Tag. */ + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* Compare the DIF Reference Tag field to the computed Reference Tag. + * The computed Reference Tag will be the least significant 4 bytes + * of the LBA when Type 1 is used, and application specific value + * if Type 2 is used. + */ + _actual = from_be32(&dif->ref_tag); + if (_actual != expected) { + _dif_error_set(err_blk, SPDK_DIF_REFTAG_ERROR, expected, + _actual, offset_blocks); + SPDK_ERRLOG("Failed to compare Ref Tag: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + expected, expected, _actual); + return -1; + } + break; + case SPDK_DIF_TYPE3: + /* For type 3, the computed Reference Tag remains unchanged. + * Hence ignore the Reference Tag field. + */ + break; + default: + break; + } + + /* Update the stored Reference Tag to the remapped one. 
*/ + to_be32(&dif->ref_tag, remapped); + +end: + _dif_sgl_advance(md_sgl, ctx->md_size); + + return 0; +} + +int +spdk_dix_remap_ref_tag(struct iovec *md_iov, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + struct _dif_sgl md_sgl; + uint32_t offset_blocks; + int rc; + + _dif_sgl_init(&md_sgl, md_iov, 1); + + if (!_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) { + SPDK_ERRLOG("Size of metadata iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (!(ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) { + return 0; + } + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dix_remap_ref_tag(&md_sgl, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} diff --git a/src/spdk/lib/util/fd.c b/src/spdk/lib/util/fd.c new file mode 100644 index 000000000..6b0d0d554 --- /dev/null +++ b/src/spdk/lib/util/fd.c @@ -0,0 +1,103 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/fd.h" + +#ifdef __linux__ +#include <linux/fs.h> +#endif + +static uint64_t +dev_get_size(int fd) +{ +#if defined(DIOCGMEDIASIZE) /* FreeBSD */ + off_t size; + + if (ioctl(fd, DIOCGMEDIASIZE, &size) == 0) { + return size; + } +#elif defined(__linux__) && defined(BLKGETSIZE64) + uint64_t size; + + if (ioctl(fd, BLKGETSIZE64, &size) == 0) { + return size; + } +#endif + + return 0; +} + +uint32_t +spdk_fd_get_blocklen(int fd) +{ +#if defined(DKIOCGETBLOCKSIZE) /* FreeBSD */ + uint32_t blocklen; + + if (ioctl(fd, DKIOCGETBLOCKSIZE, &blocklen) == 0) { + return blocklen; + } +#elif defined(__linux__) && defined(BLKSSZGET) + uint32_t blocklen; + + if (ioctl(fd, BLKSSZGET, &blocklen) == 0) { + return blocklen; + } +#endif + + return 0; +} + +uint64_t +spdk_fd_get_size(int fd) +{ + struct stat st; + + if (fstat(fd, &st) != 0) { + return 0; + } + + if (S_ISLNK(st.st_mode)) { + return 0; + } + + if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { + return dev_get_size(fd); + } else if (S_ISREG(st.st_mode)) { + return st.st_size; + } + + /* Not REG, CHR or BLK */ + return 0; +} diff --git a/src/spdk/lib/util/file.c b/src/spdk/lib/util/file.c new file mode 100644 index 000000000..2ba08547b --- /dev/null +++ b/src/spdk/lib/util/file.c @@ -0,0 +1,71 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/file.h" + +void * +spdk_posix_file_load(FILE *file, size_t *size) +{ + void *newbuf, *buf = NULL; + size_t rc, buf_size, cur_size = 0; + + *size = 0; + buf_size = 128 * 1024; + + while (buf_size <= 1024 * 1024 * 1024) { + newbuf = realloc(buf, buf_size); + if (newbuf == NULL) { + free(buf); + return NULL; + } + buf = newbuf; + + rc = fread(buf + cur_size, 1, buf_size - cur_size, file); + cur_size += rc; + + if (feof(file)) { + *size = cur_size; + return buf; + } + + if (ferror(file)) { + free(buf); + return NULL; + } + + buf_size *= 2; + } + + free(buf); + return NULL; +} diff --git a/src/spdk/lib/util/iov.c b/src/spdk/lib/util/iov.c new file mode 100644 index 000000000..e89ef9d21 --- /dev/null +++ b/src/spdk/lib/util/iov.c @@ -0,0 +1,111 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/util.h" + +size_t +spdk_iovcpy(struct iovec *siov, size_t siovcnt, struct iovec *diov, size_t diovcnt) +{ + size_t total_sz; + size_t sidx; + size_t didx; + int siov_len; + uint8_t *siov_base; + int diov_len; + uint8_t *diov_base; + + /* d prefix = destination. s prefix = source. 
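+	 *
+	 * For example (hypothetical sizes), copying siov = { 4096, 4096 } into
+	 * diov = { 2048, 6144 } performs three memcpy() calls of 2048, 2048 and
+	 * 4096 bytes and returns 8192.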
*/ + + assert(diovcnt > 0); + assert(siovcnt > 0); + + total_sz = 0; + sidx = 0; + didx = 0; + siov_len = siov[0].iov_len; + siov_base = siov[0].iov_base; + diov_len = diov[0].iov_len; + diov_base = diov[0].iov_base; + while (siov_len > 0 && diov_len > 0) { + if (siov_len == diov_len) { + memcpy(diov_base, siov_base, siov_len); + total_sz += siov_len; + + /* Advance both iovs to the next element */ + sidx++; + if (sidx == siovcnt) { + break; + } + + didx++; + if (didx == diovcnt) { + break; + } + + siov_len = siov[sidx].iov_len; + siov_base = siov[sidx].iov_base; + diov_len = diov[didx].iov_len; + diov_base = diov[didx].iov_base; + } else if (siov_len < diov_len) { + memcpy(diov_base, siov_base, siov_len); + total_sz += siov_len; + + /* Advance only the source to the next element */ + sidx++; + if (sidx == siovcnt) { + break; + } + + diov_base += siov_len; + diov_len -= siov_len; + siov_len = siov[sidx].iov_len; + siov_base = siov[sidx].iov_base; + } else { + memcpy(diov_base, siov_base, diov_len); + total_sz += diov_len; + + /* Advance only the destination to the next element */ + didx++; + if (didx == diovcnt) { + break; + } + + siov_base += diov_len; + siov_len -= diov_len; + diov_len = diov[didx].iov_len; + diov_base = diov[didx].iov_base; + } + } + + return total_sz; +} diff --git a/src/spdk/lib/util/math.c b/src/spdk/lib/util/math.c new file mode 100644 index 000000000..7d1852421 --- /dev/null +++ b/src/spdk/lib/util/math.c @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/util.h" + +/* The following will automatically generate several version of + * this function, targeted at different architectures. This + * is only supported by GCC 6 or newer. 
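+ *
+ * The function below returns floor(log2(x)); for example, spdk_u32log2(1)
+ * returns 0, spdk_u32log2(1024) returns 10, and spdk_u32log2(1500) also
+ * returns 10. spdk_u32log2(0) returns 0 because log(0) is undefined.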
*/ +#if defined(__GNUC__) && __GNUC__ >= 6 && !defined(__clang__) \ + && (defined(__i386__) || defined(__x86_64__)) +__attribute__((target_clones("bmi", "arch=core2", "arch=atom", "default"))) +#endif +uint32_t +spdk_u32log2(uint32_t x) +{ + if (x == 0) { + /* log(0) is undefined */ + return 0; + } + return 31u - __builtin_clz(x); +} + +/* The following will automatically generate several version of + * this function, targeted at different architectures. This + * is only supported by GCC 6 or newer. */ +#if defined(__GNUC__) && __GNUC__ >= 6 && !defined(__clang__) \ + && (defined(__i386__) || defined(__x86_64__)) +__attribute__((target_clones("bmi", "arch=core2", "arch=atom", "default"))) +#endif +uint64_t +spdk_u64log2(uint64_t x) +{ + if (x == 0) { + /* log(0) is undefined */ + return 0; + } + return 63u - __builtin_clzl(x); +} diff --git a/src/spdk/lib/util/pipe.c b/src/spdk/lib/util/pipe.c new file mode 100644 index 000000000..1c640dd2e --- /dev/null +++ b/src/spdk/lib/util/pipe.c @@ -0,0 +1,246 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/pipe.h" +#include "spdk/util.h" + +struct spdk_pipe { + uint8_t *buf; + uint32_t sz; + + uint32_t write; + uint32_t read; +}; + +struct spdk_pipe * +spdk_pipe_create(void *buf, uint32_t sz) +{ + struct spdk_pipe *pipe; + + pipe = calloc(1, sizeof(*pipe)); + if (pipe == NULL) { + return NULL; + } + + pipe->buf = buf; + pipe->sz = sz; + + return pipe; +} + +void +spdk_pipe_destroy(struct spdk_pipe *pipe) +{ + free(pipe); +} + +int +spdk_pipe_writer_get_buffer(struct spdk_pipe *pipe, uint32_t requested_sz, struct iovec *iovs) +{ + uint32_t sz; + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (read <= write) { + requested_sz = spdk_min(requested_sz, ((read + pipe->sz) - write - 1)); + + sz = spdk_min(requested_sz, pipe->sz - write); + + iovs[0].iov_base = (sz == 0) ? 
NULL : (pipe->buf + write); + iovs[0].iov_len = sz; + + requested_sz -= sz; + + if (requested_sz > 0) { + sz = spdk_min(requested_sz, read); + + iovs[1].iov_base = (sz == 0) ? NULL : pipe->buf; + iovs[1].iov_len = sz; + } else { + iovs[1].iov_base = NULL; + iovs[1].iov_len = 0; + } + } else { + sz = spdk_min(requested_sz, read - write - 1); + + iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + write); + iovs[0].iov_len = sz; + iovs[1].iov_base = NULL; + iovs[1].iov_len = 0; + } + + return iovs[0].iov_len + iovs[1].iov_len; +} + +int +spdk_pipe_writer_advance(struct spdk_pipe *pipe, uint32_t requested_sz) +{ + uint32_t sz; + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (requested_sz > pipe->sz - 1) { + return -EINVAL; + } + + if (read <= write) { + if (requested_sz > (read + pipe->sz) - write) { + return -EINVAL; + } + + sz = spdk_min(requested_sz, pipe->sz - write); + + write += sz; + if (write > pipe->sz - 1) { + write = 0; + } + requested_sz -= sz; + + if (requested_sz > 0) { + if (requested_sz >= read) { + return -EINVAL; + } + + write = requested_sz; + } + } else { + if (requested_sz > (read - write - 1)) { + return -EINVAL; + } + + write += requested_sz; + } + + pipe->write = write; + + return 0; +} + +uint32_t +spdk_pipe_reader_bytes_available(struct spdk_pipe *pipe) +{ + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (read <= write) { + return write - read; + } + + return (write + pipe->sz) - read; +} + +int +spdk_pipe_reader_get_buffer(struct spdk_pipe *pipe, uint32_t requested_sz, struct iovec *iovs) +{ + uint32_t sz; + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (read <= write) { + sz = spdk_min(requested_sz, write - read); + + iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + read); + iovs[0].iov_len = sz; + iovs[1].iov_base = NULL; + iovs[1].iov_len = 0; + } else { + sz = spdk_min(requested_sz, pipe->sz - read); + + iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + read); + iovs[0].iov_len = sz; + + requested_sz -= sz; + + if (requested_sz > 0) { + sz = spdk_min(requested_sz, write); + iovs[1].iov_base = (sz == 0) ? 
NULL : pipe->buf; + iovs[1].iov_len = sz; + } else { + iovs[1].iov_base = NULL; + iovs[1].iov_len = 0; + } + } + + return iovs[0].iov_len + iovs[1].iov_len; +} + +int +spdk_pipe_reader_advance(struct spdk_pipe *pipe, uint32_t requested_sz) +{ + uint32_t sz; + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (read <= write) { + if (requested_sz > (write - read)) { + return -EINVAL; + } + + read += requested_sz; + } else { + sz = spdk_min(requested_sz, pipe->sz - read); + + read += sz; + if (read > pipe->sz - 1) { + read = 0; + } + requested_sz -= sz; + + if (requested_sz > 0) { + if (requested_sz > write) { + return -EINVAL; + } + + read = requested_sz; + } + } + + pipe->read = read; + + return 0; +} diff --git a/src/spdk/lib/util/spdk_util.map b/src/spdk/lib/util/spdk_util.map new file mode 100644 index 000000000..07e067faa --- /dev/null +++ b/src/spdk/lib/util/spdk_util.map @@ -0,0 +1,128 @@ +{ + global: + + # public functions in base64.h + spdk_base64_encode; + spdk_base64_urlsafe_encode; + spdk_base64_decode; + spdk_base64_urlsafe_decode; + + # public functions in bit_array.h + spdk_bit_array_capacity; + spdk_bit_array_create; + spdk_bit_array_free; + spdk_bit_array_resize; + spdk_bit_array_get; + spdk_bit_array_set; + spdk_bit_array_clear; + spdk_bit_array_find_first_set; + spdk_bit_array_find_first_clear; + spdk_bit_array_count_set; + spdk_bit_array_count_clear; + spdk_bit_array_store_mask; + spdk_bit_array_load_mask; + spdk_bit_array_clear_mask; + + # public functions in cpuset.h + spdk_cpuset_alloc; + spdk_cpuset_free; + spdk_cpuset_equal; + spdk_cpuset_copy; + spdk_cpuset_and; + spdk_cpuset_or; + spdk_cpuset_xor; + spdk_cpuset_negate; + spdk_cpuset_zero; + spdk_cpuset_set_cpu; + spdk_cpuset_get_cpu; + spdk_cpuset_count; + spdk_cpuset_fmt; + spdk_cpuset_parse; + + # public functions in crc16.h + spdk_crc16_t10dif; + spdk_crc16_t10dif_copy; + + # public functions in crc32.h + spdk_crc32_ieee_update; + spdk_crc32c_update; + + # public functions in dif.h + spdk_dif_ctx_init; + spdk_dif_ctx_set_data_offset; + spdk_dif_ctx_set_remapped_init_ref_tag; + spdk_dif_generate; + spdk_dif_verify; + spdk_dif_update_crc32c; + spdk_dif_generate_copy; + spdk_dif_verify_copy; + spdk_dif_inject_error; + spdk_dix_generate; + spdk_dix_verify; + spdk_dix_inject_error; + spdk_dif_set_md_interleave_iovs; + spdk_dif_generate_stream; + spdk_dif_verify_stream; + spdk_dif_update_crc32c_stream; + spdk_dif_get_range_with_md; + spdk_dif_get_length_with_md; + spdk_dif_remap_ref_tag; + spdk_dix_remap_ref_tag; + + # public functions in fd.h + spdk_fd_get_size; + spdk_fd_get_blocklen; + + # public functions in file.h + spdk_posix_file_load; + + # public functions in pipe.h + spdk_pipe_create; + spdk_pipe_destroy; + spdk_pipe_writer_get_buffer; + spdk_pipe_writer_advance; + spdk_pipe_reader_bytes_available; + spdk_pipe_reader_get_buffer; + spdk_pipe_reader_advance; + + # public functions in string.h + spdk_sprintf_alloc; + spdk_vsprintf_alloc; + spdk_sprintf_append_realloc; + spdk_vsprintf_append_realloc; + spdk_strlwr; + spdk_strsepq; + spdk_str_trim; + spdk_strerror_r; + spdk_strerror; + spdk_str_chomp; + spdk_strcpy_pad; + spdk_strlen_pad; + spdk_parse_ip_addr; + spdk_parse_capacity; + spdk_mem_all_zero; + spdk_strtol; + spdk_strtoll; + + # public functions in util.h + spdk_u32log2; + spdk_u64log2; + spdk_iovcpy; + + # resolvers for functions in util.h + spdk_u32log2.resolver; + spdk_u64log2.resolver; + + # public functions in uuid.h + spdk_uuid_parse; + spdk_uuid_fmt_lower; + 
spdk_uuid_compare; + spdk_uuid_generate; + spdk_uuid_copy; + + + + + + local: *; +}; diff --git a/src/spdk/lib/util/strerror_tls.c b/src/spdk/lib/util/strerror_tls.c new file mode 100644 index 000000000..c9dc8f13f --- /dev/null +++ b/src/spdk/lib/util/strerror_tls.c @@ -0,0 +1,43 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/string.h" + +static __thread char strerror_message[64]; + +const char * +spdk_strerror(int errnum) +{ + spdk_strerror_r(errnum, strerror_message, sizeof(strerror_message)); + return strerror_message; +} diff --git a/src/spdk/lib/util/string.c b/src/spdk/lib/util/string.c new file mode 100644 index 000000000..30ac1628a --- /dev/null +++ b/src/spdk/lib/util/string.c @@ -0,0 +1,476 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/string.h" + +char * +spdk_vsprintf_append_realloc(char *buffer, const char *format, va_list args) +{ + va_list args_copy; + char *new_buffer; + int orig_size = 0, new_size; + + /* Original buffer size */ + if (buffer) { + orig_size = strlen(buffer); + } + + /* Necessary buffer size */ + va_copy(args_copy, args); + new_size = vsnprintf(NULL, 0, format, args_copy); + va_end(args_copy); + + if (new_size < 0) { + return NULL; + } + new_size += orig_size + 1; + + new_buffer = realloc(buffer, new_size); + if (new_buffer == NULL) { + return NULL; + } + + vsnprintf(new_buffer + orig_size, new_size - orig_size, format, args); + + return new_buffer; +} + +char * +spdk_sprintf_append_realloc(char *buffer, const char *format, ...) +{ + va_list args; + char *ret; + + va_start(args, format); + ret = spdk_vsprintf_append_realloc(buffer, format, args); + va_end(args); + + return ret; +} + +char * +spdk_vsprintf_alloc(const char *format, va_list args) +{ + return spdk_vsprintf_append_realloc(NULL, format, args); +} + +char * +spdk_sprintf_alloc(const char *format, ...) +{ + va_list args; + char *ret; + + va_start(args, format); + ret = spdk_vsprintf_alloc(format, args); + va_end(args); + + return ret; +} + +char * +spdk_strlwr(char *s) +{ + char *p; + + if (s == NULL) { + return NULL; + } + + p = s; + while (*p != '\0') { + *p = tolower(*p); + p++; + } + + return s; +} + +char * +spdk_strsepq(char **stringp, const char *delim) +{ + char *p, *q, *r; + int quoted = 0, bslash = 0; + + p = *stringp; + if (p == NULL) { + return NULL; + } + + r = q = p; + while (*q != '\0' && *q != '\n') { + /* eat quoted characters */ + if (bslash) { + bslash = 0; + *r++ = *q++; + continue; + } else if (quoted) { + if (quoted == '"' && *q == '\\') { + bslash = 1; + q++; + continue; + } else if (*q == quoted) { + quoted = 0; + q++; + continue; + } + *r++ = *q++; + continue; + } else if (*q == '\\') { + bslash = 1; + q++; + continue; + } else if (*q == '"' || *q == '\'') { + quoted = *q; + q++; + continue; + } + + /* separator? 
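+		 * For example, with delim " " the input
+		 *   foo "bar baz" qux
+		 * is returned over successive calls as the tokens "foo",
+		 * "bar baz" (quotes stripped) and "qux".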
*/ + if (strchr(delim, *q) == NULL) { + *r++ = *q++; + continue; + } + + /* new string */ + q++; + break; + } + *r = '\0'; + + /* skip tailer */ + while (*q != '\0' && strchr(delim, *q) != NULL) { + q++; + } + if (*q != '\0') { + *stringp = q; + } else { + *stringp = NULL; + } + + return p; +} + +char * +spdk_str_trim(char *s) +{ + char *p, *q; + + if (s == NULL) { + return NULL; + } + + /* remove header */ + p = s; + while (*p != '\0' && isspace(*p)) { + p++; + } + + /* remove tailer */ + q = p + strlen(p); + while (q - 1 >= p && isspace(*(q - 1))) { + q--; + *q = '\0'; + } + + /* if remove header, move */ + if (p != s) { + q = s; + while (*p != '\0') { + *q++ = *p++; + } + *q = '\0'; + } + + return s; +} + +void +spdk_strcpy_pad(void *dst, const char *src, size_t size, int pad) +{ + size_t len; + + len = strlen(src); + if (len < size) { + memcpy(dst, src, len); + memset((char *)dst + len, pad, size - len); + } else { + memcpy(dst, src, size); + } +} + +size_t +spdk_strlen_pad(const void *str, size_t size, int pad) +{ + const uint8_t *start; + const uint8_t *iter; + uint8_t pad_byte; + + pad_byte = (uint8_t)pad; + start = (const uint8_t *)str; + + if (size == 0) { + return 0; + } + + iter = start + size - 1; + while (1) { + if (*iter != pad_byte) { + return iter - start + 1; + } + + if (iter == start) { + /* Hit the start of the string finding only pad_byte. */ + return 0; + } + iter--; + } +} + +int +spdk_parse_ip_addr(char *ip, char **host, char **port) +{ + char *p; + + if (ip == NULL) { + return -EINVAL; + } + + *host = NULL; + *port = NULL; + + if (ip[0] == '[') { + /* IPv6 */ + p = strchr(ip, ']'); + if (p == NULL) { + return -EINVAL; + } + *host = &ip[1]; + *p = '\0'; + + p++; + if (*p == '\0') { + return 0; + } else if (*p != ':') { + return -EINVAL; + } + + p++; + if (*p == '\0') { + return 0; + } + + *port = p; + } else { + /* IPv4 */ + p = strchr(ip, ':'); + if (p == NULL) { + *host = ip; + return 0; + } + + *host = ip; + *p = '\0'; + + p++; + if (*p == '\0') { + return 0; + } + + *port = p; + } + + return 0; +} + +size_t +spdk_str_chomp(char *s) +{ + size_t len = strlen(s); + size_t removed = 0; + + while (len > 0) { + if (s[len - 1] != '\r' && s[len - 1] != '\n') { + break; + } + + s[len - 1] = '\0'; + len--; + removed++; + } + + return removed; +} + +void +spdk_strerror_r(int errnum, char *buf, size_t buflen) +{ + int rc; + +#if defined(__USE_GNU) + char *new_buffer; + new_buffer = strerror_r(errnum, buf, buflen); + if (new_buffer == buf) { + rc = 0; + } else if (new_buffer != NULL) { + snprintf(buf, buflen, "%s", new_buffer); + rc = 0; + } else { + rc = 1; + } +#else + rc = strerror_r(errnum, buf, buflen); +#endif + + if (rc != 0) { + snprintf(buf, buflen, "Unknown error %d", errnum); + } +} + +int +spdk_parse_capacity(const char *cap_str, uint64_t *cap, bool *has_prefix) +{ + int rc; + char bin_prefix; + + rc = sscanf(cap_str, "%"SCNu64"%c", cap, &bin_prefix); + if (rc == 1) { + *has_prefix = false; + return 0; + } else if (rc == 0) { + if (errno == 0) { + /* No scanf matches - the string does not start with a digit */ + return -EINVAL; + } else { + /* Parsing error */ + return -errno; + } + } + + *has_prefix = true; + switch (bin_prefix) { + case 'k': + case 'K': + *cap *= 1024; + break; + case 'm': + case 'M': + *cap *= 1024 * 1024; + break; + case 'g': + case 'G': + *cap *= 1024 * 1024 * 1024; + break; + default: + return -EINVAL; + } + + return 0; +} + +bool +spdk_mem_all_zero(const void *data, size_t size) +{ + const uint8_t *buf = data; + + while (size--) { + if 
(*buf++ != 0) { + return false; + } + } + + return true; +} + +long int +spdk_strtol(const char *nptr, int base) +{ + long val; + char *endptr; + + /* Since strtoll() can legitimately return 0, LONG_MAX, or LONG_MIN + * on both success and failure, the calling program should set errno + * to 0 before the call. + */ + errno = 0; + + val = strtol(nptr, &endptr, base); + + if (!errno && *endptr != '\0') { + /* Non integer character was found. */ + return -EINVAL; + } else if (errno == ERANGE && (val == LONG_MAX || val == LONG_MIN)) { + /* Overflow occurred. */ + return -ERANGE; + } else if (errno != 0 && val == 0) { + /* Other error occurred. */ + return -errno; + } else if (val < 0) { + /* Input string was negative number. */ + return -ERANGE; + } + + return val; +} + +long long int +spdk_strtoll(const char *nptr, int base) +{ + long long val; + char *endptr; + + /* Since strtoll() can legitimately return 0, LLONG_MAX, or LLONG_MIN + * on both success and failure, the calling program should set errno + * to 0 before the call. + */ + errno = 0; + + val = strtoll(nptr, &endptr, base); + + if (!errno && *endptr != '\0') { + /* Non integer character was found. */ + return -EINVAL; + } else if (errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN)) { + /* Overflow occurred. */ + return -ERANGE; + } else if (errno != 0 && val == 0) { + /* Other error occurred. */ + return -errno; + } else if (val < 0) { + /* Input string was negative number. */ + return -ERANGE; + } + + return val; +} diff --git a/src/spdk/lib/util/util_internal.h b/src/spdk/lib/util/util_internal.h new file mode 100644 index 000000000..655ef513d --- /dev/null +++ b/src/spdk/lib/util/util_internal.h @@ -0,0 +1,77 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_UTIL_INTERNAL_H +#define SPDK_UTIL_INTERNAL_H + +#include "spdk/stdinc.h" + +/** + * IEEE CRC-32 polynomial (bit reflected) + */ +#define SPDK_CRC32_POLYNOMIAL_REFLECT 0xedb88320UL + +/** + * CRC-32C (Castagnoli) polynomial (bit reflected) + */ +#define SPDK_CRC32C_POLYNOMIAL_REFLECT 0x82f63b78UL + +struct spdk_crc32_table { + uint32_t table[256]; +}; + +/** + * Initialize a CRC32 lookup table for a given polynomial. + * + * \param table Table to fill with precalculated CRC-32 data. + * \param polynomial_reflect Bit-reflected CRC-32 polynomial. + */ +void crc32_table_init(struct spdk_crc32_table *table, + uint32_t polynomial_reflect); + + +/** + * Calculate a partial CRC-32 checksum. + * + * \param table CRC-32 table initialized with crc32_table_init(). + * \param buf Data buffer to checksum. + * \param len Length of buf in bytes. + * \param crc Previous CRC-32 value. + * \return Updated CRC-32 value. + */ +uint32_t crc32_update(const struct spdk_crc32_table *table, + const void *buf, size_t len, + uint32_t crc); + +#endif /* SPDK_UTIL_INTERNAL_H */ diff --git a/src/spdk/lib/util/uuid.c b/src/spdk/lib/util/uuid.c new file mode 100644 index 000000000..176f65880 --- /dev/null +++ b/src/spdk/lib/util/uuid.c @@ -0,0 +1,73 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/uuid.h" + +#include <uuid/uuid.h> + +SPDK_STATIC_ASSERT(sizeof(struct spdk_uuid) == sizeof(uuid_t), "Size mismatch"); + +int +spdk_uuid_parse(struct spdk_uuid *uuid, const char *uuid_str) +{ + return uuid_parse(uuid_str, (void *)uuid) == 0 ? 
0 : -EINVAL; +} + +int +spdk_uuid_fmt_lower(char *uuid_str, size_t uuid_str_size, const struct spdk_uuid *uuid) +{ + if (uuid_str_size < SPDK_UUID_STRING_LEN) { + return -EINVAL; + } + + uuid_unparse_lower((void *)uuid, uuid_str); + return 0; +} + +int +spdk_uuid_compare(const struct spdk_uuid *u1, const struct spdk_uuid *u2) +{ + return uuid_compare((void *)u1, (void *)u2); +} + +void +spdk_uuid_generate(struct spdk_uuid *uuid) +{ + uuid_generate((void *)uuid); +} + +void +spdk_uuid_copy(struct spdk_uuid *dst, const struct spdk_uuid *src) +{ + uuid_copy((void *)dst, (void *)src); +} diff --git a/src/spdk/lib/vhost/Makefile b/src/spdk/lib/vhost/Makefile new file mode 100644 index 000000000..1fe9b6e40 --- /dev/null +++ b/src/spdk/lib/vhost/Makefile @@ -0,0 +1,54 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 4 +SO_MINOR := 0 + +CFLAGS += -I. +CFLAGS += $(ENV_CFLAGS) + +C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c rte_vhost_compat.c + +ifeq ($(CONFIG_VHOST_INTERNAL_LIB),y) +C_SRCS += vhost_nvme.c +CFLAGS := -I../rte_vhost $(CFLAGS) +endif + +LIBNAME = vhost + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_vhost.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/vhost/rte_vhost_compat.c b/src/spdk/lib/vhost/rte_vhost_compat.c new file mode 100644 index 000000000..53f31bfd7 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost_compat.c @@ -0,0 +1,402 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
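[Editor's note] uuid.c above is a thin shim over libuuid; the static assert guarantees struct spdk_uuid and uuid_t have the same size, so the casts are safe. A small round-trip check built only from the wrappers shown above; this is an illustrative sketch, not part of the library (link against libuuid and the SPDK util library):

#include <stdio.h>
#include "spdk/uuid.h"

int
main(void)
{
    struct spdk_uuid u1, u2;
    char str[SPDK_UUID_STRING_LEN];

    spdk_uuid_generate(&u1);
    if (spdk_uuid_fmt_lower(str, sizeof(str), &u1) != 0) {
        return 1;
    }
    printf("generated: %s\n", str);

    /* Parsing the lower-case string back must yield an equal UUID. */
    if (spdk_uuid_parse(&u2, str) != 0 || spdk_uuid_compare(&u1, &u2) != 0) {
        return 1;
    }
    return 0;
}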
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * Set of workarounds for rte_vhost to make it work with device types + * other than vhost-net. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" + +#include "spdk_internal/vhost_user.h" + +static inline void +vhost_session_mem_region_calc(uint64_t *previous_start, uint64_t *start, uint64_t *end, + uint64_t *len, struct rte_vhost_mem_region *region) +{ + *start = FLOOR_2MB(region->mmap_addr); + *end = CEIL_2MB(region->mmap_addr + region->mmap_size); + if (*start == *previous_start) { + *start += (size_t) VALUE_2MB; + } + *previous_start = *start; + *len = *end - *start; +} + +void +vhost_session_mem_register(struct rte_vhost_memory *mem) +{ + uint64_t start, end, len; + uint32_t i; + uint64_t previous_start = UINT64_MAX; + + + for (i = 0; i < mem->nregions; i++) { + vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]); + SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n", + start, len); + + if (spdk_mem_register((void *)start, len) != 0) { + SPDK_WARNLOG("Failed to register memory region %"PRIu32". 
Future vtophys translation might fail.\n", + i); + continue; + } + } +} + +void +vhost_session_mem_unregister(struct rte_vhost_memory *mem) +{ + uint64_t start, end, len; + uint32_t i; + uint64_t previous_start = UINT64_MAX; + + for (i = 0; i < mem->nregions; i++) { + vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]); + if (spdk_vtophys((void *) start, NULL) == SPDK_VTOPHYS_ERROR) { + continue; /* region has not been registered */ + } + + if (spdk_mem_unregister((void *)start, len) != 0) { + assert(false); + } + } +} + +static int +new_connection(int vid) +{ + char ifname[PATH_MAX]; + + if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) { + SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid); + return -1; + } + + return vhost_new_connection_cb(vid, ifname); +} + +static int +start_device(int vid) +{ + return vhost_start_device_cb(vid); +} + +static void +stop_device(int vid) +{ + vhost_stop_device_cb(vid); +} + +static void +destroy_connection(int vid) +{ + vhost_destroy_connection_cb(vid); +} + +static const struct vhost_device_ops g_spdk_vhost_ops = { + .new_device = start_device, + .destroy_device = stop_device, + .new_connection = new_connection, + .destroy_connection = destroy_connection, +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + .get_config = vhost_get_config_cb, + .set_config = vhost_set_config_cb, + .vhost_nvme_admin_passthrough = vhost_nvme_admin_passthrough, + .vhost_nvme_set_cq_call = vhost_nvme_set_cq_call, + .vhost_nvme_get_cap = vhost_nvme_get_cap, + .vhost_nvme_set_bar_mr = vhost_nvme_set_bar_mr, +#endif +}; + +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB + +static enum rte_vhost_msg_result +extern_vhost_pre_msg_handler(int vid, void *_msg) +{ + struct vhost_user_msg *msg = _msg; + struct spdk_vhost_session *vsession; + + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid); + assert(false); + return RTE_VHOST_MSG_RESULT_ERR; + } + + switch (msg->request) { + case VHOST_USER_GET_VRING_BASE: + if (vsession->forced_polling && vsession->started) { + /* Our queue is stopped for whatever reason, but we may still + * need to poll it after it's initialized again. + */ + g_spdk_vhost_ops.destroy_device(vid); + } + break; + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ADDR: + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_KICK: + if (vsession->forced_polling && vsession->started) { + /* Additional queues are being initialized, so we either processed + * enough I/Os and are switching from SeaBIOS to the OS now, or + * we were never in SeaBIOS in the first place. Either way, we + * don't need our workaround anymore. + */ + g_spdk_vhost_ops.destroy_device(vid); + vsession->forced_polling = false; + } + break; + case VHOST_USER_SET_VRING_CALL: + /* rte_vhost will close the previous callfd and won't notify + * us about any change. This will effectively make SPDK fail + * to deliver any subsequent interrupts until a session is + * restarted. We stop the session here before closing the previous + * fd (so that all interrupts must have been delivered by the + * time the descriptor is closed) and start right after (which + * will make SPDK retrieve the latest, up-to-date callfd from + * rte_vhost. + */ + case VHOST_USER_SET_MEM_TABLE: + /* rte_vhost will unmap previous memory that SPDK may still + * have pending DMA operations on. We can't let that happen, + * so stop the device before letting rte_vhost unmap anything. 
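[Editor's note] vhost_session_mem_register()/vhost_session_mem_unregister() above round each guest memory region out to 2 MB boundaries and skip the first hugepage when the previous region already covered it. A standalone sketch of that arithmetic; the 2 MB macros below are local stand-ins mirroring the ones pulled in from spdk/memory.h, and the addresses are made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define VALUE_2MB    (2ULL * 1024 * 1024)
#define FLOOR_2MB(x) ((uint64_t)(x) & ~(VALUE_2MB - 1))
#define CEIL_2MB(x)  FLOOR_2MB((uint64_t)(x) + VALUE_2MB - 1)

int
main(void)
{
    /* Two regions whose mmap ranges begin inside the same 2 MB hugepage. */
    uint64_t mmap_addr[2] = { 0x200000100000ULL, 0x200000180000ULL };
    uint64_t mmap_size[2] = { 0x1000ULL, 0x100000ULL };
    uint64_t previous_start = UINT64_MAX;

    for (int i = 0; i < 2; i++) {
        uint64_t start = FLOOR_2MB(mmap_addr[i]);
        uint64_t end = CEIL_2MB(mmap_addr[i] + mmap_size[i]);

        if (start == previous_start) {
            /* The first page was already registered for region i - 1. */
            start += VALUE_2MB;
        }
        previous_start = start;
        printf("region %d: register 0x%jx len 0x%jx\n",
               i, (uintmax_t)start, (uintmax_t)(end - start));
    }
    return 0;
}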
+ * This will block until all pending I/Os are finished. + * We will start the device again from the post-processing + * message handler. + */ + if (vsession->started) { + g_spdk_vhost_ops.destroy_device(vid); + vsession->needs_restart = true; + } + break; + case VHOST_USER_GET_CONFIG: { + int rc = 0; + + spdk_vhost_lock(); + if (vsession->vdev->backend->vhost_get_config) { + rc = vsession->vdev->backend->vhost_get_config(vsession->vdev, + msg->payload.cfg.region, msg->payload.cfg.size); + if (rc != 0) { + msg->size = 0; + } + } + spdk_vhost_unlock(); + + return RTE_VHOST_MSG_RESULT_REPLY; + } + case VHOST_USER_SET_CONFIG: { + int rc = 0; + + spdk_vhost_lock(); + if (vsession->vdev->backend->vhost_set_config) { + rc = vsession->vdev->backend->vhost_set_config(vsession->vdev, + msg->payload.cfg.region, msg->payload.cfg.offset, + msg->payload.cfg.size, msg->payload.cfg.flags); + } + spdk_vhost_unlock(); + + return rc == 0 ? RTE_VHOST_MSG_RESULT_OK : RTE_VHOST_MSG_RESULT_ERR; + } + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +static enum rte_vhost_msg_result +extern_vhost_post_msg_handler(int vid, void *_msg) +{ + struct vhost_user_msg *msg = _msg; + struct spdk_vhost_session *vsession; + + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid); + assert(false); + return RTE_VHOST_MSG_RESULT_ERR; + } + + if (vsession->needs_restart) { + g_spdk_vhost_ops.new_device(vid); + vsession->needs_restart = false; + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; + } + + switch (msg->request) { + case VHOST_USER_SET_FEATURES: + /* rte_vhost requires all queues to be fully initialized in order + * to start I/O processing. This behavior is not compliant with the + * vhost-user specification and doesn't work with QEMU 2.12+, which + * will only initialize 1 I/O queue for the SeaBIOS boot. + * Theoretically, we should start polling each virtqueue individually + * after receiving its SET_VRING_KICK message, but rte_vhost is not + * designed to poll individual queues. So here we use a workaround + * to detect when the vhost session could be potentially at that SeaBIOS + * stage and we mark it to start polling as soon as its first virtqueue + * gets initialized. This doesn't hurt any non-QEMU vhost slaves + * and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent + * at any time, but QEMU will send it at least once on SeaBIOS + * initialization - whenever powered-up or rebooted. + */ + vsession->forced_polling = true; + break; + case VHOST_USER_SET_VRING_KICK: + /* vhost-user spec tells us to start polling a queue after receiving + * its SET_VRING_KICK message. Let's do it! + */ + if (vsession->forced_polling && !vsession->started) { + g_spdk_vhost_ops.new_device(vid); + } + break; + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +struct rte_vhost_user_extern_ops g_spdk_extern_vhost_ops = { + .pre_msg_handle = extern_vhost_pre_msg_handler, + .post_msg_handle = extern_vhost_post_msg_handler, +}; + +void +vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession) +{ + int rc; + + rc = rte_vhost_extern_callback_register(vsession->vid, &g_spdk_extern_vhost_ops, NULL); + if (rc != 0) { + SPDK_ERRLOG("rte_vhost_extern_callback_register() failed for vid = %d\n", + vsession->vid); + return; + } +} + +#else /* SPDK_CONFIG_VHOST_INTERNAL_LIB */ + +void +vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession) +{ + /* nothing to do. 
all the changes are already incorporated into rte_vhost */ +} + +#endif + +int +vhost_register_unix_socket(const char *path, const char *ctrl_name, + uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features) +{ + struct stat file_stat; +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB + uint64_t features = 0; +#endif + + /* Register vhost driver to handle vhost messages. */ + if (stat(path, &file_stat) != -1) { + if (!S_ISSOCK(file_stat.st_mode)) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The file already exists and is not a socket.\n", + path); + return -EIO; + } else if (unlink(path) != 0) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The socket already exists and failed to unlink.\n", + path); + return -EIO; + } + } + + if (rte_vhost_driver_register(path, 0) != 0) { + SPDK_ERRLOG("Could not register controller %s with vhost library\n", ctrl_name); + SPDK_ERRLOG("Check if domain socket %s already exists\n", path); + return -EIO; + } + if (rte_vhost_driver_set_features(path, virtio_features) || + rte_vhost_driver_disable_features(path, disabled_features)) { + SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", ctrl_name); + + rte_vhost_driver_unregister(path); + return -EIO; + } + + if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) { + rte_vhost_driver_unregister(path); + SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", ctrl_name); + return -EIO; + } + +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB + rte_vhost_driver_get_protocol_features(path, &features); + features |= protocol_features; + rte_vhost_driver_set_protocol_features(path, features); +#endif + + if (rte_vhost_driver_start(path) != 0) { + SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n", + ctrl_name, errno, spdk_strerror(errno)); + rte_vhost_driver_unregister(path); + return -EIO; + } + + return 0; +} + +int +vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + return rte_vhost_get_mem_table(vid, mem); +} + +int +vhost_driver_unregister(const char *path) +{ + return rte_vhost_driver_unregister(path); +} + +int +vhost_get_negotiated_features(int vid, uint64_t *negotiated_features) +{ + return rte_vhost_get_negotiated_features(vid, negotiated_features); +} diff --git a/src/spdk/lib/vhost/spdk_vhost.map b/src/spdk/lib/vhost/spdk_vhost.map new file mode 100644 index 000000000..de38e5a5e --- /dev/null +++ b/src/spdk/lib/vhost/spdk_vhost.map @@ -0,0 +1,27 @@ +{ + global: + + # public functions + spdk_vhost_set_socket_path; + spdk_vhost_init; + spdk_vhost_fini; + spdk_vhost_config_json; + spdk_vhost_shutdown_cb; + spdk_vhost_lock; + spdk_vhost_trylock; + spdk_vhost_unlock; + spdk_vhost_dev_find; + spdk_vhost_dev_next; + spdk_vhost_dev_get_name; + spdk_vhost_dev_get_cpumask; + spdk_vhost_set_coalescing; + spdk_vhost_get_coalescing; + spdk_vhost_scsi_dev_construct; + spdk_vhost_scsi_dev_add_tgt; + spdk_vhost_scsi_dev_get_tgt; + spdk_vhost_scsi_dev_remove_tgt; + spdk_vhost_blk_construct; + spdk_vhost_dev_remove; + + local: *; +}; diff --git a/src/spdk/lib/vhost/vhost.c b/src/spdk/lib/vhost/vhost.c new file mode 100644 index 000000000..b904d8bf9 --- /dev/null +++ b/src/spdk/lib/vhost/vhost.c @@ -0,0 +1,1634 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. 
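[Editor's note] vhost_register_unix_socket() above refuses to reuse an existing path unless it is a stale socket file, which it unlinks before handing the path to rte_vhost_driver_register(). A self-contained sketch of just that stale-socket check, using plain POSIX calls (the helper name is made up for illustration):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

/* Return 0 if 'path' is free to use, negative errno otherwise. */
static int
claim_socket_path(const char *path)
{
    struct stat st;

    if (stat(path, &st) == -1) {
        /* Path does not exist (or is unreachable) - nothing to clean up. */
        return 0;
    }

    if (!S_ISSOCK(st.st_mode)) {
        fprintf(stderr, "%s exists and is not a socket\n", path);
        return -EEXIST;
    }

    if (unlink(path) != 0) {
        fprintf(stderr, "could not unlink stale socket %s: %s\n",
                path, strerror(errno));
        return -errno;
    }

    return 0;
}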
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" + +static struct spdk_cpuset g_vhost_core_mask; + +/* Path to folder where character device will be created. Can be set by user. */ +static char dev_dirname[PATH_MAX] = ""; + +/* Thread performing all vhost management operations */ +static struct spdk_thread *g_vhost_init_thread; + +static spdk_vhost_fini_cb g_fini_cpl_cb; + +/** + * DPDK calls our callbacks synchronously but the work those callbacks + * perform needs to be async. Luckily, all DPDK callbacks are called on + * a DPDK-internal pthread, so we'll just wait on a semaphore in there. + */ +static sem_t g_dpdk_sem; + +/** Return code for the current DPDK callback */ +static int g_dpdk_response; + +struct vhost_session_fn_ctx { + /** Device pointer obtained before enqueuing the event */ + struct spdk_vhost_dev *vdev; + + /** ID of the session to send event to. */ + uint32_t vsession_id; + + /** User provided function to be executed on session's thread. */ + spdk_vhost_session_fn cb_fn; + + /** + * User provided function to be called on the init thread + * after iterating through all sessions. 
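[Editor's note] The g_dpdk_sem/g_dpdk_response pair above bridges rte_vhost's synchronous callbacks and SPDK's asynchronous message passing: the DPDK-internal thread parks on a semaphore until the vhost thread finishes the real work and posts a status (the real code adds a timeout via sem_timedwait() in wait_for_semaphore()). A stripped-down sketch of that handshake using plain pthreads in place of SPDK messaging; all names are illustrative only:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

static sem_t g_done_sem;
static int g_response;

/* Stands in for the work an SPDK vhost thread would do for the callback. */
static void *
vhost_thread_work(void *arg)
{
    (void)arg;
    g_response = 0;          /* report success */
    sem_post(&g_done_sem);   /* wake the blocked "DPDK" thread */
    return NULL;
}

/* Stands in for a synchronous rte_vhost callback such as start_device(). */
static int
dpdk_callback(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, vhost_thread_work, NULL);
    sem_wait(&g_done_sem);   /* real code uses sem_timedwait() plus an error log */
    pthread_join(tid, NULL);
    return g_response;
}

int
main(void)
{
    sem_init(&g_done_sem, 0, 0);
    printf("callback returned %d\n", dpdk_callback());
    sem_destroy(&g_done_sem);
    return 0;
}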
+ */ + spdk_vhost_dev_fn cpl_fn; + + /** Custom user context */ + void *user_ctx; +}; + +static TAILQ_HEAD(, spdk_vhost_dev) g_vhost_devices = TAILQ_HEAD_INITIALIZER( + g_vhost_devices); +static pthread_mutex_t g_vhost_mutex = PTHREAD_MUTEX_INITIALIZER; + +void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len) +{ + void *vva; + uint64_t newlen; + + newlen = len; + vva = (void *)rte_vhost_va_from_guest_pa(vsession->mem, addr, &newlen); + if (newlen != len) { + return NULL; + } + + return vva; + +} + +static void +vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_id) +{ + struct vring_desc *desc, *desc_table; + uint32_t desc_table_size; + int rc; + + if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) { + return; + } + + rc = vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Can't log used ring descriptors!\n"); + return; + } + + do { + if (vhost_vring_desc_is_wr(desc)) { + /* To be honest, only pages realy touched should be logged, but + * doing so would require tracking those changes in each backed. + * Also backend most likely will touch all/most of those pages so + * for lets assume we touched all pages passed to as writeable buffers. */ + rte_vhost_log_write(vsession->vid, desc->addr, desc->len); + } + vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + } while (desc); +} + +static void +vhost_log_used_vring_elem(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t idx) +{ + uint64_t offset, len; + + if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) { + return; + } + + if (spdk_unlikely(virtqueue->packed.packed_ring)) { + offset = idx * sizeof(struct vring_packed_desc); + len = sizeof(struct vring_packed_desc); + } else { + offset = offsetof(struct vring_used, ring[idx]); + len = sizeof(virtqueue->vring.used->ring[idx]); + } + + rte_vhost_log_used_vring(vsession->vid, virtqueue->vring_idx, offset, len); +} + +static void +vhost_log_used_vring_idx(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue) +{ + uint64_t offset, len; + uint16_t vq_idx; + + if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) { + return; + } + + offset = offsetof(struct vring_used, idx); + len = sizeof(virtqueue->vring.used->idx); + vq_idx = virtqueue - vsession->virtqueue; + + rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len); +} + +/* + * Get available requests from avail ring. + */ +uint16_t +vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs, + uint16_t reqs_len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_avail *avail = vring->avail; + uint16_t size_mask = vring->size - 1; + uint16_t last_idx = virtqueue->last_avail_idx, avail_idx = avail->idx; + uint16_t count, i; + + count = avail_idx - last_idx; + if (spdk_likely(count == 0)) { + return 0; + } + + if (spdk_unlikely(count > vring->size)) { + /* TODO: the queue is unrecoverably broken and should be marked so. + * For now we will fail silently and report there are no new avail entries. 
+ */ + return 0; + } + + count = spdk_min(count, reqs_len); + virtqueue->last_avail_idx += count; + for (i = 0; i < count; i++) { + reqs[i] = vring->avail->ring[(last_idx + i) & size_mask]; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n", + last_idx, avail_idx, count); + + return count; +} + +static bool +vhost_vring_desc_is_indirect(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_INDIRECT); +} + +static bool +vhost_vring_packed_desc_is_indirect(struct vring_packed_desc *cur_desc) +{ + return (cur_desc->flags & VRING_DESC_F_INDIRECT) != 0; +} + +int +vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size) +{ + if (spdk_unlikely(req_idx >= virtqueue->vring.size)) { + return -1; + } + + *desc = &virtqueue->vring.desc[req_idx]; + + if (vhost_vring_desc_is_indirect(*desc)) { + *desc_table_size = (*desc)->len / sizeof(**desc); + *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr, + sizeof(**desc) * *desc_table_size); + *desc = *desc_table; + if (*desc == NULL) { + return -1; + } + + return 0; + } + + *desc_table = virtqueue->vring.desc; + *desc_table_size = virtqueue->vring.size; + + return 0; +} + +int +vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_packed_desc **desc, + struct vring_packed_desc **desc_table, uint32_t *desc_table_size) +{ + *desc = &virtqueue->vring.desc_packed[req_idx]; + + /* In packed ring when the desc is non-indirect we get next desc + * by judging (desc->flag & VRING_DESC_F_NEXT) != 0. When the desc + * is indirect we get next desc by idx and desc_table_size. It's + * different from split ring. 
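[Editor's note] vhost_vq_avail_ring_get() above relies on free-running 16-bit indices: the difference avail->idx - last_avail_idx is taken in uint16_t arithmetic, so it stays correct across wraparound, and slots are addressed with a size - 1 mask because split-ring sizes are powers of two. A tiny standalone demonstration of that index arithmetic (values chosen to straddle the 16-bit wrap):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    const uint16_t ring_size = 256;          /* power of two, as for split rings */
    const uint16_t size_mask = ring_size - 1;

    /* Indices just before and after the 16-bit counter wraps. */
    uint16_t last_avail_idx = 65533;
    uint16_t avail_idx = 4;                  /* driver produced 7 new descriptors */

    uint16_t count = avail_idx - last_avail_idx;    /* wraps correctly to 7 */

    for (uint16_t i = 0; i < count; i++) {
        uint16_t slot = (uint16_t)(last_avail_idx + i) & size_mask;
        printf("fetch avail->ring[%u]\n", slot);
    }
    printf("count=%u\n", count);
    return 0;
}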
+ */ + if (vhost_vring_packed_desc_is_indirect(*desc)) { + *desc_table_size = (*desc)->len / sizeof(struct vring_packed_desc); + *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr, + (*desc)->len); + *desc = *desc_table; + if (spdk_unlikely(*desc == NULL)) { + return -1; + } + } else { + *desc_table = NULL; + *desc_table_size = 0; + } + + return 0; +} + +int +vhost_vq_used_signal(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue) +{ + if (virtqueue->used_req_cnt == 0) { + return 0; + } + + virtqueue->req_cnt += virtqueue->used_req_cnt; + virtqueue->used_req_cnt = 0; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n", + virtqueue - vsession->virtqueue, virtqueue->last_used_idx); + + if (rte_vhost_vring_call(vsession->vid, virtqueue->vring_idx) == 0) { + /* interrupt signalled */ + return 1; + } else { + /* interrupt not signalled */ + return 0; + } +} + + +static void +check_session_io_stats(struct spdk_vhost_session *vsession, uint64_t now) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint32_t irq_delay_base = vsession->coalescing_delay_time_base; + uint32_t io_threshold = vsession->coalescing_io_rate_threshold; + int32_t irq_delay; + uint32_t req_cnt; + uint16_t q_idx; + + if (now < vsession->next_stats_check_time) { + return; + } + + vsession->next_stats_check_time = now + vsession->stats_check_interval; + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + virtqueue = &vsession->virtqueue[q_idx]; + + req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt; + if (req_cnt <= io_threshold) { + continue; + } + + irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold; + virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay); + + virtqueue->req_cnt = 0; + virtqueue->next_event_time = now; + } +} + +static inline bool +vhost_vq_event_is_suppressed(struct spdk_vhost_virtqueue *vq) +{ + if (spdk_unlikely(vq->packed.packed_ring)) { + if (vq->vring.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE) { + return true; + } + } else { + if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { + return true; + } + } + + return false; +} + +void +vhost_session_used_signal(struct spdk_vhost_session *vsession) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint64_t now; + uint16_t q_idx; + + if (vsession->coalescing_delay_time_base == 0) { + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + virtqueue = &vsession->virtqueue[q_idx]; + + if (virtqueue->vring.desc == NULL) { + continue; + } + + if (vhost_vq_event_is_suppressed(virtqueue)) { + continue; + } + + vhost_vq_used_signal(vsession, virtqueue); + } + } else { + now = spdk_get_ticks(); + check_session_io_stats(vsession, now); + + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + virtqueue = &vsession->virtqueue[q_idx]; + + /* No need for event right now */ + if (now < virtqueue->next_event_time) { + continue; + } + + if (vhost_vq_event_is_suppressed(virtqueue)) { + continue; + } + + if (!vhost_vq_used_signal(vsession, virtqueue)) { + continue; + } + + /* Syscall is quite long so update time */ + now = spdk_get_ticks(); + virtqueue->next_event_time = now + virtqueue->irq_delay_time; + } + } +} + +static int +vhost_session_set_coalescing(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *ctx) +{ + vsession->coalescing_delay_time_base = + vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL; + vsession->coalescing_io_rate_threshold = + vdev->coalescing_iops_threshold * 
SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U; + return 0; +} + +static int +vhost_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, + uint32_t iops_threshold) +{ + uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL; + uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U; + + if (delay_time_base >= UINT32_MAX) { + SPDK_ERRLOG("Delay time of %"PRIu32" is to big\n", delay_base_us); + return -EINVAL; + } else if (io_rate == 0) { + SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate, + 1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS); + return -EINVAL; + } + + vdev->coalescing_delay_us = delay_base_us; + vdev->coalescing_iops_threshold = iops_threshold; + return 0; +} + +int +spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, + uint32_t iops_threshold) +{ + int rc; + + rc = vhost_dev_set_coalescing(vdev, delay_base_us, iops_threshold); + if (rc != 0) { + return rc; + } + + vhost_dev_foreach_session(vdev, vhost_session_set_coalescing, NULL, NULL); + return 0; +} + +void +spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us, + uint32_t *iops_threshold) +{ + if (delay_base_us) { + *delay_base_us = vdev->coalescing_delay_us; + } + + if (iops_threshold) { + *iops_threshold = vdev->coalescing_iops_threshold; + } +} + +/* + * Enqueue id and len to used ring. + */ +void +vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t id, uint32_t len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_used *used = vring->used; + uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1); + uint16_t vq_idx = virtqueue->vring_idx; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n", + virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len); + + vhost_log_req_desc(vsession, virtqueue, id); + + virtqueue->last_used_idx++; + used->ring[last_idx].id = id; + used->ring[last_idx].len = len; + + /* Ensure the used ring is updated before we log it or increment used->idx. */ + spdk_smp_wmb(); + + rte_vhost_set_last_inflight_io_split(vsession->vid, vq_idx, id); + + vhost_log_used_vring_elem(vsession, virtqueue, last_idx); + * (volatile uint16_t *) &used->idx = virtqueue->last_used_idx; + vhost_log_used_vring_idx(vsession, virtqueue); + + rte_vhost_clr_inflight_desc_split(vsession->vid, vq_idx, virtqueue->last_used_idx, id); + + virtqueue->used_req_cnt++; +} + +void +vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t num_descs, uint16_t buffer_id, + uint32_t length) +{ + struct vring_packed_desc *desc = &virtqueue->vring.desc_packed[virtqueue->last_used_idx]; + bool used, avail; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - RING: buffer_id=%"PRIu16"\n", + virtqueue - vsession->virtqueue, buffer_id); + + /* When the descriptor is used, two flags in descriptor + * avail flag and used flag are set to equal + * and used flag value == used_wrap_counter. + */ + used = !!(desc->flags & VRING_DESC_F_USED); + avail = !!(desc->flags & VRING_DESC_F_AVAIL); + if (spdk_unlikely(used == virtqueue->packed.used_phase && used == avail)) { + SPDK_ERRLOG("descriptor has been used before\n"); + return; + } + + /* In used desc addr is unused and len specifies the buffer length + * that has been written to by the device. 
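[Editor's note] vhost_vq_used_ring_enqueue() above follows the classic single-producer publish pattern: fill in the used-ring element, issue a write barrier (spdk_smp_wmb()), and only then bump used->idx, so the guest can never observe the new index before the element it guards. A reduced sketch of that ordering, with a C11 release fence standing in for spdk_smp_wmb() and a local ring type made up for the example:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct used_elem { uint32_t id; uint32_t len; };

struct used_ring {
    uint16_t idx;                 /* index published to the consumer */
    struct used_elem ring[256];
};

static void
used_ring_publish(struct used_ring *used, uint16_t *last_used_idx,
                  uint32_t id, uint32_t len)
{
    uint16_t slot = *last_used_idx & 255;

    /* 1. Fill the element while it is still invisible to the consumer. */
    used->ring[slot].id = id;
    used->ring[slot].len = len;

    /* 2. Order the element stores before the index store (spdk_smp_wmb()). */
    atomic_thread_fence(memory_order_release);

    /* 3. Publish: the consumer polls idx and only then reads ring[slot]. */
    *(volatile uint16_t *)&used->idx = ++(*last_used_idx);
}

int
main(void)
{
    struct used_ring used = { 0 };
    uint16_t last_used_idx = 0;

    used_ring_publish(&used, &last_used_idx, 7, 512);
    printf("published idx=%u id=%u len=%u\n", used.idx,
           used.ring[0].id, used.ring[0].len);
    return 0;
}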
+ */ + desc->addr = 0; + desc->len = length; + + /* This bit specifies whether any data has been written by the device */ + if (length != 0) { + desc->flags |= VRING_DESC_F_WRITE; + } + + /* Buffer ID is included in the last descriptor in the list. + * The driver needs to keep track of the size of the list corresponding + * to each buffer ID. + */ + desc->id = buffer_id; + + /* A device MUST NOT make the descriptor used before buffer_id is + * written to the descriptor. + */ + spdk_smp_wmb(); + /* To mark a desc as used, the device sets the F_USED bit in flags to match + * the internal Device ring wrap counter. It also sets the F_AVAIL bit to + * match the same value. + */ + if (virtqueue->packed.used_phase) { + desc->flags |= VRING_DESC_F_AVAIL_USED; + } else { + desc->flags &= ~VRING_DESC_F_AVAIL_USED; + } + + vhost_log_used_vring_elem(vsession, virtqueue, virtqueue->last_used_idx); + virtqueue->last_used_idx += num_descs; + if (virtqueue->last_used_idx >= virtqueue->vring.size) { + virtqueue->last_used_idx -= virtqueue->vring.size; + virtqueue->packed.used_phase = !virtqueue->packed.used_phase; + } + + virtqueue->used_req_cnt++; +} + +bool +vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue) +{ + uint16_t flags = virtqueue->vring.desc_packed[virtqueue->last_avail_idx].flags; + + /* To mark a desc as available, the driver sets the F_AVAIL bit in flags + * to match the internal avail wrap counter. It also sets the F_USED bit to + * match the inverse value but it's not mandatory. + */ + return (!!(flags & VRING_DESC_F_AVAIL) == virtqueue->packed.avail_phase); +} + +bool +vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc) +{ + return (cur_desc->flags & VRING_DESC_F_WRITE) != 0; +} + +int +vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx, + struct spdk_vhost_virtqueue *vq, + struct vring_packed_desc *desc_table, + uint32_t desc_table_size) +{ + if (desc_table != NULL) { + /* When the desc_table isn't NULL means it's indirect and we get the next + * desc by req_idx and desc_table_size. The return value is NULL means + * we reach the last desc of this request. + */ + (*req_idx)++; + if (*req_idx < desc_table_size) { + *desc = &desc_table[*req_idx]; + } else { + *desc = NULL; + } + } else { + /* When the desc_table is NULL means it's non-indirect and we get the next + * desc by req_idx and F_NEXT in flags. The return value is NULL means + * we reach the last desc of this request. When return new desc + * we update the req_idx too. 
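[Editor's note] A packed ring has no avail index at all: vhost_vq_packed_ring_is_avail() above decides whether the descriptor at last_avail_idx is new by comparing its F_AVAIL flag against the device's avail wrap counter (phase). A minimal sketch of that check; the flag bit positions below follow the virtio 1.1 packed-ring layout and are defined locally for this sketch only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DESC_F_AVAIL (1u << 7)    /* VIRTQ_DESC_F_AVAIL */
#define DESC_F_USED  (1u << 15)   /* VIRTQ_DESC_F_USED */

/* A descriptor is available when its AVAIL bit matches the device's avail
 * phase; the driver flips the phase each time it wraps around the ring. */
static bool
packed_desc_is_avail(uint16_t flags, bool avail_phase)
{
    return !!(flags & DESC_F_AVAIL) == avail_phase;
}

int
main(void)
{
    bool phase = true;    /* initial device-side avail wrap counter */

    printf("%d\n", packed_desc_is_avail(DESC_F_AVAIL, phase));               /* 1: new */
    printf("%d\n", packed_desc_is_avail(DESC_F_AVAIL | DESC_F_USED, phase)); /* 1: USED not checked here */
    printf("%d\n", packed_desc_is_avail(0, phase));                          /* 0: not yet written */
    return 0;
}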
+ */ + if (((*desc)->flags & VRING_DESC_F_NEXT) == 0) { + *desc = NULL; + return 0; + } + + *req_idx = (*req_idx + 1) % vq->vring.size; + *desc = &vq->vring.desc_packed[*req_idx]; + } + + return 0; +} + +static int +vhost_vring_desc_payload_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, uintptr_t payload, uint64_t remaining) +{ + uintptr_t vva; + uint64_t len; + + do { + if (*iov_index >= SPDK_VHOST_IOVS_MAX) { + SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX); + return -1; + } + len = remaining; + vva = (uintptr_t)rte_vhost_va_from_guest_pa(vsession->mem, payload, &len); + if (vva == 0 || len == 0) { + SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload); + return -1; + } + iov[*iov_index].iov_base = (void *)vva; + iov[*iov_index].iov_len = len; + remaining -= len; + payload += len; + (*iov_index)++; + } while (remaining); + + return 0; +} + +int +vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_packed_desc *desc) +{ + return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index, + desc->addr, desc->len); +} + +/* 1, Traverse the desc chain to get the buffer_id and return buffer_id as task_idx. + * 2, Update the vq->last_avail_idx to point next available desc chain. + * 3, Update the avail_wrap_counter if last_avail_idx overturn. + */ +uint16_t +vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx, + uint16_t *num_descs) +{ + struct vring_packed_desc *desc; + uint16_t desc_head = req_idx; + + *num_descs = 1; + + desc = &vq->vring.desc_packed[req_idx]; + if (!vhost_vring_packed_desc_is_indirect(desc)) { + while ((desc->flags & VRING_DESC_F_NEXT) != 0) { + req_idx = (req_idx + 1) % vq->vring.size; + desc = &vq->vring.desc_packed[req_idx]; + (*num_descs)++; + } + } + + /* Queue Size doesn't have to be a power of 2 + * Device maintains last_avail_idx so we can make sure + * the value is valid(0 ~ vring.size - 1) + */ + vq->last_avail_idx = (req_idx + 1) % vq->vring.size; + if (vq->last_avail_idx < desc_head) { + vq->packed.avail_phase = !vq->packed.avail_phase; + } + + return desc->id; +} + +int +vhost_vring_desc_get_next(struct vring_desc **desc, + struct vring_desc *desc_table, uint32_t desc_table_size) +{ + struct vring_desc *old_desc = *desc; + uint16_t next_idx; + + if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) { + *desc = NULL; + return 0; + } + + next_idx = old_desc->next; + if (spdk_unlikely(next_idx >= desc_table_size)) { + *desc = NULL; + return -1; + } + + *desc = &desc_table[next_idx]; + return 0; +} + +int +vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_desc *desc) +{ + return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index, + desc->addr, desc->len); +} + +static struct spdk_vhost_session * +vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id) +{ + struct spdk_vhost_session *vsession; + + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->id == id) { + return vsession; + } + } + + return NULL; +} + +struct spdk_vhost_session * +vhost_session_find_by_vid(int vid) +{ + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + + TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) { + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->vid == vid) { + return vsession; + } + } + } + + return NULL; +} + +struct spdk_vhost_dev * +spdk_vhost_dev_next(struct spdk_vhost_dev 
*vdev) +{ + if (vdev == NULL) { + return TAILQ_FIRST(&g_vhost_devices); + } + + return TAILQ_NEXT(vdev, tailq); +} + +struct spdk_vhost_dev * +spdk_vhost_dev_find(const char *ctrlr_name) +{ + struct spdk_vhost_dev *vdev; + size_t dev_dirname_len = strlen(dev_dirname); + + if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) { + ctrlr_name += dev_dirname_len; + } + + TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) { + if (strcmp(vdev->name, ctrlr_name) == 0) { + return vdev; + } + } + + return NULL; +} + +static int +vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask) +{ + int rc; + + if (cpumask == NULL) { + return -1; + } + + if (mask == NULL) { + spdk_cpuset_copy(cpumask, &g_vhost_core_mask); + return 0; + } + + rc = spdk_cpuset_parse(cpumask, mask); + if (rc < 0) { + SPDK_ERRLOG("invalid cpumask %s\n", mask); + return -1; + } + + spdk_cpuset_and(cpumask, &g_vhost_core_mask); + + if (spdk_cpuset_count(cpumask) == 0) { + SPDK_ERRLOG("no cpu is selected among core mask(=%s)\n", + spdk_cpuset_fmt(&g_vhost_core_mask)); + return -1; + } + + return 0; +} + +static void +vhost_setup_core_mask(void *ctx) +{ + struct spdk_thread *thread = spdk_get_thread(); + spdk_cpuset_or(&g_vhost_core_mask, spdk_thread_get_cpumask(thread)); +} + +static void +vhost_setup_core_mask_done(void *ctx) +{ + spdk_vhost_init_cb init_cb = ctx; + + if (spdk_cpuset_count(&g_vhost_core_mask) == 0) { + init_cb(-ECHILD); + return; + } + + init_cb(0); +} + +static void +vhost_dev_thread_exit(void *arg1) +{ + spdk_thread_exit(spdk_get_thread()); +} + +int +vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct spdk_vhost_dev_backend *backend) +{ + char path[PATH_MAX]; + struct spdk_cpuset cpumask = {}; + int rc; + + assert(vdev); + if (name == NULL) { + SPDK_ERRLOG("Can't register controller with no name\n"); + return -EINVAL; + } + + if (vhost_parse_core_mask(mask_str, &cpumask) != 0) { + SPDK_ERRLOG("cpumask %s is invalid (core mask is 0x%s)\n", + mask_str, spdk_cpuset_fmt(&g_vhost_core_mask)); + return -EINVAL; + } + + if (spdk_vhost_dev_find(name)) { + SPDK_ERRLOG("vhost controller %s already exists.\n", name); + return -EEXIST; + } + + if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) { + SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname, + name); + return -EINVAL; + } + + vdev->name = strdup(name); + vdev->path = strdup(path); + if (vdev->name == NULL || vdev->path == NULL) { + rc = -EIO; + goto out; + } + + vdev->thread = spdk_thread_create(vdev->name, &cpumask); + if (vdev->thread == NULL) { + SPDK_ERRLOG("Failed to create thread for vhost controller %s.\n", name); + rc = -EIO; + goto out; + } + + vdev->registered = true; + vdev->backend = backend; + TAILQ_INIT(&vdev->vsessions); + + vhost_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US, + SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD); + + if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features, + vdev->protocol_features)) { + spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL); + rc = -EIO; + goto out; + } + + TAILQ_INSERT_TAIL(&g_vhost_devices, vdev, tailq); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name); + return 0; + +out: + free(vdev->name); + free(vdev->path); + return rc; +} + +int +vhost_dev_unregister(struct spdk_vhost_dev *vdev) +{ + if (!TAILQ_EMPTY(&vdev->vsessions)) { + SPDK_ERRLOG("Controller %s has still valid 
connection.\n", vdev->name); + return -EBUSY; + } + + if (vdev->registered && vhost_driver_unregister(vdev->path) != 0) { + SPDK_ERRLOG("Could not unregister controller %s with vhost library\n" + "Check if domain socket %s still exists\n", + vdev->name, vdev->path); + return -EIO; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name); + + spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL); + + free(vdev->name); + free(vdev->path); + TAILQ_REMOVE(&g_vhost_devices, vdev, tailq); + return 0; +} + +const char * +spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return vdev->name; +} + +const struct spdk_cpuset * +spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return spdk_thread_get_cpumask(vdev->thread); +} + +static void +wait_for_semaphore(int timeout_sec, const char *errmsg) +{ + struct timespec timeout; + int rc; + + clock_gettime(CLOCK_REALTIME, &timeout); + timeout.tv_sec += timeout_sec; + rc = sem_timedwait(&g_dpdk_sem, &timeout); + if (rc != 0) { + SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg); + sem_wait(&g_dpdk_sem); + } +} + +static void +vhost_session_cb_done(int rc) +{ + g_dpdk_response = rc; + sem_post(&g_dpdk_sem); +} + +void +vhost_session_start_done(struct spdk_vhost_session *vsession, int response) +{ + if (response == 0) { + vsession->started = true; + + assert(vsession->vdev->active_session_num < UINT32_MAX); + vsession->vdev->active_session_num++; + } + + vhost_session_cb_done(response); +} + +void +vhost_session_stop_done(struct spdk_vhost_session *vsession, int response) +{ + if (response == 0) { + vsession->started = false; + + assert(vsession->vdev->active_session_num > 0); + vsession->vdev->active_session_num--; + } + + vhost_session_cb_done(response); +} + +static void +vhost_event_cb(void *arg1) +{ + struct vhost_session_fn_ctx *ctx = arg1; + struct spdk_vhost_session *vsession; + + if (pthread_mutex_trylock(&g_vhost_mutex) != 0) { + spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1); + return; + } + + vsession = vhost_session_find_by_id(ctx->vdev, ctx->vsession_id); + ctx->cb_fn(ctx->vdev, vsession, NULL); + pthread_mutex_unlock(&g_vhost_mutex); +} + +int +vhost_session_send_event(struct spdk_vhost_session *vsession, + spdk_vhost_session_fn cb_fn, unsigned timeout_sec, + const char *errmsg) +{ + struct vhost_session_fn_ctx ev_ctx = {0}; + struct spdk_vhost_dev *vdev = vsession->vdev; + + ev_ctx.vdev = vdev; + ev_ctx.vsession_id = vsession->id; + ev_ctx.cb_fn = cb_fn; + + spdk_thread_send_msg(vdev->thread, vhost_event_cb, &ev_ctx); + + pthread_mutex_unlock(&g_vhost_mutex); + wait_for_semaphore(timeout_sec, errmsg); + pthread_mutex_lock(&g_vhost_mutex); + + return g_dpdk_response; +} + +static void +foreach_session_finish_cb(void *arg1) +{ + struct vhost_session_fn_ctx *ev_ctx = arg1; + struct spdk_vhost_dev *vdev = ev_ctx->vdev; + + if (pthread_mutex_trylock(&g_vhost_mutex) != 0) { + spdk_thread_send_msg(spdk_get_thread(), + foreach_session_finish_cb, arg1); + return; + } + + assert(vdev->pending_async_op_num > 0); + vdev->pending_async_op_num--; + if (ev_ctx->cpl_fn != NULL) { + ev_ctx->cpl_fn(vdev, ev_ctx->user_ctx); + } + + pthread_mutex_unlock(&g_vhost_mutex); + free(ev_ctx); +} + +static void +foreach_session(void *arg1) +{ + struct vhost_session_fn_ctx *ev_ctx = arg1; + struct spdk_vhost_session *vsession; + struct spdk_vhost_dev *vdev = ev_ctx->vdev; + int rc; + + if (pthread_mutex_trylock(&g_vhost_mutex) != 0) { + 
spdk_thread_send_msg(spdk_get_thread(), foreach_session, arg1); + return; + } + + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->initialized) { + rc = ev_ctx->cb_fn(vdev, vsession, ev_ctx->user_ctx); + if (rc < 0) { + goto out; + } + } + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + + spdk_thread_send_msg(g_vhost_init_thread, foreach_session_finish_cb, arg1); +} + +void +vhost_dev_foreach_session(struct spdk_vhost_dev *vdev, + spdk_vhost_session_fn fn, + spdk_vhost_dev_fn cpl_fn, + void *arg) +{ + struct vhost_session_fn_ctx *ev_ctx; + + ev_ctx = calloc(1, sizeof(*ev_ctx)); + if (ev_ctx == NULL) { + SPDK_ERRLOG("Failed to alloc vhost event.\n"); + assert(false); + return; + } + + ev_ctx->vdev = vdev; + ev_ctx->cb_fn = fn; + ev_ctx->cpl_fn = cpl_fn; + ev_ctx->user_ctx = arg; + + assert(vdev->pending_async_op_num < UINT32_MAX); + vdev->pending_async_op_num++; + + spdk_thread_send_msg(vdev->thread, foreach_session, ev_ctx); +} + +static int +_stop_session(struct spdk_vhost_session *vsession) +{ + struct spdk_vhost_dev *vdev = vsession->vdev; + struct spdk_vhost_virtqueue *q; + int rc; + uint16_t i; + + rc = vdev->backend->stop_session(vsession); + if (rc != 0) { + SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid); + pthread_mutex_unlock(&g_vhost_mutex); + return rc; + } + + for (i = 0; i < vsession->max_queues; i++) { + q = &vsession->virtqueue[i]; + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (q->vring.desc == NULL) { + continue; + } + + /* Packed virtqueues support up to 2^15 entries each + * so left one bit can be used as wrap counter. + */ + if (q->packed.packed_ring) { + q->last_avail_idx = q->last_avail_idx | + ((uint16_t)q->packed.avail_phase << 15); + q->last_used_idx = q->last_used_idx | + ((uint16_t)q->packed.used_phase << 15); + } + + rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx); + } + + vhost_session_mem_unregister(vsession->mem); + free(vsession->mem); + + return 0; +} + +int +vhost_stop_device_cb(int vid) +{ + struct spdk_vhost_session *vsession; + int rc; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + pthread_mutex_unlock(&g_vhost_mutex); + return -EINVAL; + } + + if (!vsession->started) { + /* already stopped, nothing to do */ + pthread_mutex_unlock(&g_vhost_mutex); + return -EALREADY; + } + + rc = _stop_session(vsession); + pthread_mutex_unlock(&g_vhost_mutex); + + return rc; +} + +int +vhost_start_device_cb(int vid) +{ + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + int rc = -1; + uint16_t i; + bool packed_ring; + + pthread_mutex_lock(&g_vhost_mutex); + + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + goto out; + } + + vdev = vsession->vdev; + if (vsession->started) { + /* already started, nothing to do */ + rc = 0; + goto out; + } + + if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid); + goto out; + } + + packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0); + + vsession->max_queues = 0; + memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue)); + for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) { + struct spdk_vhost_virtqueue *q = 
&vsession->virtqueue[i]; + + q->vring_idx = -1; + if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) { + continue; + } + q->vring_idx = i; + rte_vhost_get_vhost_ring_inflight(vid, i, &q->vring_inflight); + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (q->vring.desc == NULL || q->vring.size == 0) { + continue; + } + + if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) { + q->vring.desc = NULL; + continue; + } + + if (packed_ring) { + /* Packed virtqueues support up to 2^15 entries each + * so left one bit can be used as wrap counter. + */ + q->packed.avail_phase = q->last_avail_idx >> 15; + q->last_avail_idx = q->last_avail_idx & 0x7FFF; + q->packed.used_phase = q->last_used_idx >> 15; + q->last_used_idx = q->last_used_idx & 0x7FFF; + + /* Disable I/O submission notifications, we'll be polling. */ + q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE; + } else { + /* Disable I/O submission notifications, we'll be polling. */ + q->vring.used->flags = VRING_USED_F_NO_NOTIFY; + } + + q->packed.packed_ring = packed_ring; + vsession->max_queues = i + 1; + } + + if (vhost_get_mem_table(vid, &vsession->mem) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid); + goto out; + } + + /* + * Not sure right now but this look like some kind of QEMU bug and guest IO + * might be frozed without kicking all queues after live-migration. This look like + * the previous vhost instance failed to effectively deliver all interrupts before + * the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts + * should be ignored by guest virtio driver. + * + * Tested on QEMU 2.10.91 and 2.11.50. + */ + for (i = 0; i < vsession->max_queues; i++) { + struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i]; + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. 
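[Editor's note] Because packed virtqueues hold at most 2^15 entries, _stop_session() and vhost_start_device_cb() above smuggle the wrap counters through rte_vhost_set_vring_base()/rte_vhost_get_vring_base() in bit 15 of last_avail_idx and last_used_idx. A tiny round-trip sketch of that encoding, not part of the patch:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint16_t last_avail_idx = 0x1234;   /* < 2^15: a real ring position */
    bool avail_phase = true;            /* wrap counter to preserve */

    /* Stop path: fold the phase into bit 15 before saving the vring base. */
    uint16_t saved = last_avail_idx | ((uint16_t)avail_phase << 15);

    /* Start path: split the saved value back into index and phase. */
    bool restored_phase = saved >> 15;
    uint16_t restored_idx = saved & 0x7FFF;

    assert(restored_phase == avail_phase && restored_idx == last_avail_idx);
    printf("saved=0x%04x -> idx=0x%04x phase=%d\n", saved, restored_idx, restored_phase);
    return 0;
}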
+ */ + if (q->vring.desc != NULL && q->vring.size > 0) { + rte_vhost_vring_call(vsession->vid, q->vring_idx); + } + } + + vhost_session_set_coalescing(vdev, vsession, NULL); + vhost_session_mem_register(vsession->mem); + vsession->initialized = true; + rc = vdev->backend->start_session(vsession); + if (rc != 0) { + vhost_session_mem_unregister(vsession->mem); + free(vsession->mem); + goto out; + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + return rc; +} + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB +int +vhost_get_config_cb(int vid, uint8_t *config, uint32_t len) +{ + struct spdk_vhost_session *vsession; + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + goto out; + } + + vdev = vsession->vdev; + if (vdev->backend->vhost_get_config) { + rc = vdev->backend->vhost_get_config(vdev, config, len); + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + return rc; +} + +int +vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags) +{ + struct spdk_vhost_session *vsession; + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + goto out; + } + + vdev = vsession->vdev; + if (vdev->backend->vhost_set_config) { + rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags); + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + return rc; +} +#endif + +int +spdk_vhost_set_socket_path(const char *basename) +{ + int ret; + + if (basename && strlen(basename) > 0) { + ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename); + if (ret <= 0) { + return -EINVAL; + } + if ((size_t)ret >= sizeof(dev_dirname) - 2) { + SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret); + return -EINVAL; + } + + if (dev_dirname[ret - 1] != '/') { + dev_dirname[ret] = '/'; + dev_dirname[ret + 1] = '\0'; + } + } + + return 0; +} + +void +vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + assert(vdev->backend->dump_info_json != NULL); + vdev->backend->dump_info_json(vdev, w); +} + +int +spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev) +{ + if (vdev->pending_async_op_num) { + return -EBUSY; + } + + return vdev->backend->remove_device(vdev); +} + +int +vhost_new_connection_cb(int vid, const char *ifname) +{ + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + + pthread_mutex_lock(&g_vhost_mutex); + + vdev = spdk_vhost_dev_find(ifname); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid); + pthread_mutex_unlock(&g_vhost_mutex); + return -1; + } + + /* We expect sessions inside vdev->vsessions to be sorted in ascending + * order in regard of vsession->id. For now we always set id = vsessions_cnt++ + * and append each session to the very end of the vsessions list. + * This is required for spdk_vhost_dev_foreach_session() to work. 
+ */ + if (vdev->vsessions_num == UINT_MAX) { + assert(false); + return -EINVAL; + } + + if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) + + vdev->backend->session_ctx_size)) { + SPDK_ERRLOG("vsession alloc failed\n"); + pthread_mutex_unlock(&g_vhost_mutex); + return -1; + } + memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size); + + vsession->vdev = vdev; + vsession->vid = vid; + vsession->id = vdev->vsessions_num++; + vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid); + if (vsession->name == NULL) { + SPDK_ERRLOG("vsession alloc failed\n"); + pthread_mutex_unlock(&g_vhost_mutex); + free(vsession); + return -1; + } + vsession->started = false; + vsession->initialized = false; + vsession->next_stats_check_time = 0; + vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS * + spdk_get_ticks_hz() / 1000UL; + TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq); + + vhost_session_install_rte_compat_hooks(vsession); + pthread_mutex_unlock(&g_vhost_mutex); + return 0; +} + +int +vhost_destroy_connection_cb(int vid) +{ + struct spdk_vhost_session *vsession; + int rc = 0; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + pthread_mutex_unlock(&g_vhost_mutex); + return -EINVAL; + } + + if (vsession->started) { + rc = _stop_session(vsession); + } + + TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq); + free(vsession->name); + free(vsession); + pthread_mutex_unlock(&g_vhost_mutex); + + return rc; +} + +void +spdk_vhost_lock(void) +{ + pthread_mutex_lock(&g_vhost_mutex); +} + +int +spdk_vhost_trylock(void) +{ + return -pthread_mutex_trylock(&g_vhost_mutex); +} + +void +spdk_vhost_unlock(void) +{ + pthread_mutex_unlock(&g_vhost_mutex); +} + +void +spdk_vhost_init(spdk_vhost_init_cb init_cb) +{ + size_t len; + int ret; + + g_vhost_init_thread = spdk_get_thread(); + assert(g_vhost_init_thread != NULL); + + if (dev_dirname[0] == '\0') { + if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) { + SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno)); + ret = -1; + goto out; + } + + len = strlen(dev_dirname); + if (dev_dirname[len - 1] != '/') { + dev_dirname[len] = '/'; + dev_dirname[len + 1] = '\0'; + } + } + + ret = sem_init(&g_dpdk_sem, 0, 0); + if (ret != 0) { + SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n"); + ret = -1; + goto out; + } + + ret = vhost_scsi_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost controllers\n"); + goto out; + } + + ret = vhost_blk_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost block controllers\n"); + goto out; + } + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + ret = vhost_nvme_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n"); + goto out; + } +#endif + + spdk_cpuset_zero(&g_vhost_core_mask); + + /* iterate threads instead of using SPDK_ENV_FOREACH_CORE to ensure that threads are really + * created. 
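[Editor's note] vhost_new_connection_cb() above allocates each session as one cache-line-aligned block whose tail is the backend's private context (session_ctx_size). A generic sketch of that allocation pattern; the 64-byte alignment stands in for SPDK_CACHE_LINE_SIZE and the struct/function names are made up for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct session_base {
    int vid;
    unsigned id;
    /* backend-specific context follows immediately after this struct */
};

static struct session_base *
session_alloc(size_t backend_ctx_size)
{
    struct session_base *s;

    /* One aligned allocation covering the base struct plus the backend's
     * private area, mirroring posix_memalign() + memset() above. */
    if (posix_memalign((void **)&s, 64, sizeof(*s) + backend_ctx_size) != 0) {
        return NULL;
    }
    memset(s, 0, sizeof(*s) + backend_ctx_size);
    return s;
}

int
main(void)
{
    struct session_base *s = session_alloc(128);

    if (s == NULL) {
        return 1;
    }
    void *backend_ctx = s + 1;    /* the trailing private area */
    printf("session %p, backend ctx %p\n", (void *)s, backend_ctx);
    free(s);
    return 0;
}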
+ */ + spdk_for_each_thread(vhost_setup_core_mask, init_cb, vhost_setup_core_mask_done); + return; +out: + init_cb(ret); +} + +static void +vhost_fini(void *arg1) +{ + struct spdk_vhost_dev *vdev, *tmp; + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_next(NULL); + while (vdev != NULL) { + tmp = spdk_vhost_dev_next(vdev); + spdk_vhost_dev_remove(vdev); + /* don't care if it fails, there's nothing we can do for now */ + vdev = tmp; + } + spdk_vhost_unlock(); + + spdk_cpuset_zero(&g_vhost_core_mask); + + /* All devices are removed now. */ + sem_destroy(&g_dpdk_sem); + + g_fini_cpl_cb(); +} + +static void * +session_shutdown(void *arg) +{ + struct spdk_vhost_dev *vdev = NULL; + + TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) { + vhost_driver_unregister(vdev->path); + vdev->registered = false; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n"); + spdk_thread_send_msg(g_vhost_init_thread, vhost_fini, NULL); + return NULL; +} + +void +spdk_vhost_fini(spdk_vhost_fini_cb fini_cb) +{ + pthread_t tid; + int rc; + + assert(spdk_get_thread() == g_vhost_init_thread); + g_fini_cpl_cb = fini_cb; + + /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK + * ops for stopping a device or removing a connection, we need to call it from + * a separate thread to avoid deadlock. + */ + rc = pthread_create(&tid, NULL, &session_shutdown, NULL); + if (rc < 0) { + SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc)); + abort(); + } + pthread_detach(tid); +} + +void +spdk_vhost_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_dev *vdev; + uint32_t delay_base_us; + uint32_t iops_threshold; + + spdk_json_write_array_begin(w); + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_next(NULL); + while (vdev != NULL) { + vdev->backend->write_config_json(vdev, w); + + spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold); + if (delay_base_us) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_controller_set_coalescing"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us); + spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + vdev = spdk_vhost_dev_next(vdev); + } + spdk_vhost_unlock(); + + spdk_json_write_array_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST) +SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING) diff --git a/src/spdk/lib/vhost/vhost_blk.c b/src/spdk/lib/vhost/vhost_blk.c new file mode 100644 index 000000000..d387cb27d --- /dev/null +++ b/src/spdk/lib/vhost/vhost_blk.c @@ -0,0 +1,1354 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/virtio_blk.h> + +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/bdev_module.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/vhost.h" + +#include "vhost_internal.h" +#include <rte_version.h> + +/* Minimal set of features supported by every SPDK VHOST-BLK device */ +#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \ + (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ + (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ + (1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \ + (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ + (1ULL << VIRTIO_BLK_F_MQ)) + +/* Not supported features */ +#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ + (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ + (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI)) + +/* Vhost-blk support protocol features */ +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB +#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \ + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) +#else +#define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG) +#endif + +struct spdk_vhost_blk_task { + struct spdk_bdev_io *bdev_io; + struct spdk_vhost_blk_session *bvsession; + struct spdk_vhost_virtqueue *vq; + + volatile uint8_t *status; + + uint16_t req_idx; + uint16_t num_descs; + uint16_t buffer_id; + + /* for io wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + + /* If set, the task is currently used for I/O processing. */ + bool used; + + /** Number of bytes that were written. 
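+	 * This value is reported back to the guest as the length of the used
+	 * element when the completed task is enqueued on the used ring.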
*/ + uint32_t used_len; + uint16_t iovcnt; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; +}; + +struct spdk_vhost_blk_dev { + struct spdk_vhost_dev vdev; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *bdev_desc; + /* dummy_io_channel is used to hold a bdev reference */ + struct spdk_io_channel *dummy_io_channel; + bool readonly; +}; + +struct spdk_vhost_blk_session { + /* The parent session must be the very first field in this struct */ + struct spdk_vhost_session vsession; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_poller *requestq_poller; + struct spdk_io_channel *io_channel; + struct spdk_poller *stop_poller; +}; + +/* forward declaration */ +static const struct spdk_vhost_dev_backend vhost_blk_device_backend; + +static int +process_blk_request(struct spdk_vhost_blk_task *task, + struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq); + +static void +blk_task_finish(struct spdk_vhost_blk_task *task) +{ + assert(task->bvsession->vsession.task_cnt > 0); + task->bvsession->vsession.task_cnt--; + task->used = false; +} + +static void +blk_task_init(struct spdk_vhost_blk_task *task) +{ + task->used = true; + task->iovcnt = SPDK_COUNTOF(task->iovs); + task->status = NULL; + task->used_len = 0; +} + +static void +blk_task_enqueue(struct spdk_vhost_blk_task *task) +{ + if (task->vq->packed.packed_ring) { + vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq, + task->num_descs, + task->buffer_id, task->used_len); + } else { + vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, + task->req_idx, task->used_len); + } +} + +static void +invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status) +{ + if (task->status) { + *task->status = status; + } + + blk_task_enqueue(task); + blk_task_finish(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status); +} + +/* + * Process task's descriptor chain and setup data related fields. + * Return + * total size of suplied buffers + * + * FIXME: Make this function return to rd_cnt and wr_cnt + */ +static int +blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_dev *vdev = vsession->vdev; + struct vring_desc *desc, *desc_table; + uint16_t out_cnt = 0, cnt = 0; + uint32_t desc_table_size, len = 0; + uint32_t desc_handled_cnt; + int rc; + + rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); + return -1; + } + + desc_handled_cnt = 0; + while (1) { + /* + * Maximum cnt reached? + * Should not happen if request is well formatted, otherwise this is a BUG. 
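+		 * (The capacity checked here is the caller-supplied *iovs_cnt; for a
+		 * vhost-blk task it is SPDK_COUNTOF(task->iovs), i.e. SPDK_VHOST_IOVS_MAX,
+		 * set in blk_task_init(). On success, *iovs_cnt is overwritten with the
+		 * number of iovecs actually filled in.)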
+ */ + if (spdk_unlikely(cnt == *iovs_cnt)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n", + vsession->name, req_idx); + return -1; + } + + if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", + vsession->name, req_idx, cnt); + return -1; + } + + len += desc->len; + + out_cnt += vhost_vring_desc_is_wr(desc); + + rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n", + vsession->name, req_idx); + return -1; + } else if (desc == NULL) { + break; + } + + desc_handled_cnt++; + if (spdk_unlikely(desc_handled_cnt > desc_table_size)) { + /* Break a cycle and report an error, if any. */ + SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n", + vsession->name, desc_table_size, desc_handled_cnt); + return -1; + } + } + + /* + * There must be least two descriptors. + * First contain request so it must be readable. + * Last descriptor contain buffer for response so it must be writable. + */ + if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { + return -1; + } + + *length = len; + *iovs_cnt = cnt; + return 0; +} + +static int +blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_dev *vdev = vsession->vdev; + struct vring_packed_desc *desc = NULL, *desc_table; + uint16_t out_cnt = 0, cnt = 0; + uint32_t desc_table_size, len = 0; + int rc = 0; + + rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc, + &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); + return rc; + } + + if (desc_table != NULL) { + req_idx = 0; + } + + while (1) { + /* + * Maximum cnt reached? + * Should not happen if request is well formatted, otherwise this is a BUG. + */ + if (spdk_unlikely(cnt == *iovs_cnt)) { + SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n", + vsession->name, req_idx); + return -EINVAL; + } + + if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) { + SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", + vsession->name, req_idx, cnt); + return -EINVAL; + } + + len += desc->len; + out_cnt += vhost_vring_packed_desc_is_wr(desc); + + /* desc is NULL means we reach the last desc of this request */ + vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size); + if (desc == NULL) { + break; + } + } + + /* + * There must be least two descriptors. + * First contain request so it must be readable. + * Last descriptor contain buffer for response so it must be writable. + */ + if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { + return -EINVAL; + } + + *length = len; + *iovs_cnt = cnt; + + return 0; +} + +static void +blk_request_finish(bool success, struct spdk_vhost_blk_task *task) +{ + *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; + + blk_task_enqueue(task); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task, + task->req_idx, success ? 
"OK" : "FAIL"); + blk_task_finish(task); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_blk_task *task = cb_arg; + + spdk_bdev_free_io(bdev_io); + blk_request_finish(success, task); +} + +static void +blk_request_resubmit(void *arg) +{ + struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg; + int rc = 0; + + blk_task_init(task); + + rc = process_blk_request(task, task->bvsession, task->vq); + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task); + } +} + +static inline void +blk_request_queue_io(struct spdk_vhost_blk_task *task) +{ + int rc; + struct spdk_vhost_blk_session *bvsession = task->bvsession; + struct spdk_bdev *bdev = bvsession->bvdev->bdev; + + task->bdev_io_wait.bdev = bdev; + task->bdev_io_wait.cb_fn = blk_request_resubmit; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + } +} + +static int +process_blk_request(struct spdk_vhost_blk_task *task, + struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev; + const struct virtio_blk_outhdr *req; + struct virtio_blk_discard_write_zeroes *desc; + struct iovec *iov; + uint32_t type; + uint32_t payload_len; + uint64_t flush_bytes; + int rc; + + if (vq->packed.packed_ring) { + rc = blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, + &payload_len); + } else { + rc = blk_iovs_split_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, + &payload_len); + } + + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); + /* Only READ and WRITE are supported for now. */ + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + iov = &task->iovs[0]; + if (spdk_unlikely(iov->iov_len != sizeof(*req))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n", + iov->iov_len, sizeof(*req), task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + req = iov->iov_base; + + iov = &task->iovs[task->iovcnt - 1]; + if (spdk_unlikely(iov->iov_len != 1)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n", + iov->iov_len, 1, task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + task->status = iov->iov_base; + payload_len -= sizeof(*req) + sizeof(*task->status); + task->iovcnt -= 2; + + type = req->type; +#ifdef VIRTIO_BLK_T_BARRIER + /* Don't care about barier for now (as QEMU's virtio-blk do). */ + type &= ~VIRTIO_BLK_T_BARRIER; +#endif + + switch (type) { + case VIRTIO_BLK_T_IN: + case VIRTIO_BLK_T_OUT: + if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) { + SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n", + type ? 
"WRITE" : "READ", task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + if (type == VIRTIO_BLK_T_IN) { + task->used_len = payload_len + sizeof(*task->status); + rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else if (!bvdev->readonly) { + task->used_len = sizeof(*task->status); + rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n"); + rc = -1; + } + + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_DISCARD: + desc = task->iovs[1].iov_base; + if (payload_len != sizeof(*desc)) { + SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + + rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel, + desc->sector * 512, desc->num_sectors * 512, + blk_request_complete_cb, task); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_WRITE_ZEROES: + desc = task->iovs[1].iov_base; + if (payload_len != sizeof(*desc)) { + SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + + /* Zeroed and Unmap the range, SPDK doen't support it. 
		 */
+		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
+			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+			return -1;
+		}
+
+		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
+					    desc->sector * 512, desc->num_sectors * 512,
+					    blk_request_complete_cb, task);
+		if (rc) {
+			if (rc == -ENOMEM) {
+				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+				blk_request_queue_io(task);
+			} else {
+				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+				return -1;
+			}
+		}
+		break;
+	case VIRTIO_BLK_T_FLUSH:
+		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
+		if (req->sector != 0) {
+			SPDK_NOTICELOG("sector must be zero for flush command\n");
+			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+			return -1;
+		}
+		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
+				     0, flush_bytes,
+				     blk_request_complete_cb, task);
+		if (rc) {
+			if (rc == -ENOMEM) {
+				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+				blk_request_queue_io(task);
+			} else {
+				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+				return -1;
+			}
+		}
+		break;
+	case VIRTIO_BLK_T_GET_ID:
+		if (!task->iovcnt || !payload_len) {
+			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+			return -1;
+		}
+		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
+		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
+				task->used_len, ' ');
+		blk_request_finish(true, task);
+		break;
+	default:
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
+		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+		return -1;
+	}
+
+	return 0;
+}
+
+static void
+process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
+{
+	struct spdk_vhost_blk_task *task;
+	uint16_t task_idx = req_idx, num_descs;
+
+	if (vq->packed.packed_ring) {
+		/* A packed ring uses the buffer_id as the task_idx to get the task struct.
+		 * The kernel driver uses vq->free_head to set the buffer_id, so the value
+		 * must be in the range of 0 ~ vring.size, and the free_head value is unique
+		 * among the outstanding requests.
+		 * We can't use the req_idx as the task_idx because a descriptor slot can be
+		 * reused in the next phase even when the request submitted from it has not
+		 * completed in the previous phase. For example, at phase 0 last_used_idx
+		 * was 2 and desc0 was not completed. Then, after moving to phase 1,
+		 * last_avail_idx is updated to 1. In this case req_idx cannot be used as
+		 * task_idx, because at phase 1 we would find task[0]->used still set to true.
+		 * A split queue is quite different: the descriptor is inserted into the free
+		 * list when the device completes the request, and the driver takes descriptors
+		 * from the free list, which ensures the req_idx is unique among the
+		 * outstanding requests.
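+		 * As a concrete illustration of the reuse problem: suppose ring slot 0
+		 * is consumed by request A whose buffer_id (free_head) is 5, and A is
+		 * still outstanding when the ring wraps into the next phase and the
+		 * driver publishes request B starting at slot 0 again. Indexing tasks
+		 * by req_idx would map both A and B onto task[0], while indexing by
+		 * buffer_id keeps A in task[5] and B in its own slot, because
+		 * buffer_ids are unique among outstanding requests.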
+ */ + task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); + } + + task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + task->bvsession->vsession.name, task_idx); + task->used_len = 0; + blk_task_enqueue(task); + return; + } + + if (vq->packed.packed_ring) { + task->req_idx = req_idx; + task->num_descs = num_descs; + task->buffer_id = task_idx; + } + + task->bvsession->vsession.task_cnt++; + + blk_task_init(task); + + if (process_blk_request(task, task->bvsession, vq) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task, + task_idx); + } else { + SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx); + } +} + +static void +submit_inflight_desc(struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight; + spdk_vhost_resubmit_desc *resubmit_list; + uint16_t req_idx; + + if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) { + return; + } + + resubmit_list = resubmit->resubmit_list; + while (resubmit->resubmit_num-- > 0) { + req_idx = resubmit_list[resubmit->resubmit_num].index; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n", + req_idx); + + if (spdk_unlikely(req_idx >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + vsession->name, req_idx, vq->vring.size); + vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); + continue; + } + + process_blk_task(vq, req_idx); + } + + free(resubmit_list); + resubmit->resubmit_list = NULL; +} + +static void +process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS]; + uint16_t reqs_cnt, i; + + submit_inflight_desc(bvsession, vq); + + reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + if (!reqs_cnt) { + return; + } + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + vsession->name, reqs[i], vq->vring.size); + vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0); + continue; + } + + rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]); + + process_blk_task(vq, reqs[i]); + } +} + +static void +process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + uint16_t i = 0; + + while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS && + vhost_vq_packed_ring_is_avail(vq)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n", + vq->last_avail_idx); + + process_blk_task(vq, vq->last_avail_idx); + } +} + +static int +vdev_worker(void *arg) +{ + struct spdk_vhost_blk_session *bvsession = arg; + struct spdk_vhost_session *vsession = &bvsession->vsession; + + uint16_t q_idx; + bool packed_ring; + + /* In a session, every vq supports the same format */ + packed_ring = vsession->virtqueue[0].packed.packed_ring; + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + if (packed_ring) { + process_packed_vq(bvsession, 
&vsession->virtqueue[q_idx]); + } else { + process_vq(bvsession, &vsession->virtqueue[q_idx]); + } + } + + vhost_session_used_signal(vsession); + + return SPDK_POLLER_BUSY; +} + +static void +no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; + uint32_t length; + uint16_t iovcnt, req_idx; + + if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) { + return; + } + + iovcnt = SPDK_COUNTOF(iovs); + if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) { + *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx); + } + + vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); +} + +static void +no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_blk_task *task; + uint32_t length; + uint16_t req_idx = vq->last_avail_idx; + uint16_t task_idx, num_descs; + + if (!vhost_vq_packed_ring_is_avail(vq)) { + return; + } + + task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); + task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + vsession->name, req_idx); + vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, + task->buffer_id, task->used_len); + return; + } + + task->req_idx = req_idx; + task->num_descs = num_descs; + task->buffer_id = task_idx; + blk_task_init(task); + + if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, + &length)) { + *(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx); + } + + task->used = false; + vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, + task->buffer_id, task->used_len); +} + +static int +no_bdev_vdev_worker(void *arg) +{ + struct spdk_vhost_blk_session *bvsession = arg; + struct spdk_vhost_session *vsession = &bvsession->vsession; + uint16_t q_idx; + bool packed_ring; + + /* In a session, every vq supports the same format */ + packed_ring = vsession->virtqueue[0].packed.packed_ring; + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + if (packed_ring) { + no_bdev_process_packed_vq(bvsession, &vsession->virtqueue[q_idx]); + } else { + no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]); + } + } + + vhost_session_used_signal(vsession); + + if (vsession->task_cnt == 0 && bvsession->io_channel) { + spdk_put_io_channel(bvsession->io_channel); + bvsession->io_channel = NULL; + } + + return SPDK_POLLER_BUSY; +} + +static struct spdk_vhost_blk_session * +to_blk_session(struct spdk_vhost_session *vsession) +{ + assert(vsession->vdev->backend == &vhost_blk_device_backend); + return (struct spdk_vhost_blk_session *)vsession; +} + +static struct spdk_vhost_blk_dev * +to_blk_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev == NULL) { + return NULL; + } + + if (vdev->backend != &vhost_blk_device_backend) { + SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev); +} + +static int +vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *ctx) +{ +#if 
RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0) + SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid); + rte_vhost_slave_config_change(vsession->vid, false); +#else + SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n"); +#endif + + return 0; +} + +static void +blk_resize_cb(void *resize_ctx) +{ + struct spdk_vhost_blk_dev *bvdev = resize_ctx; + + spdk_vhost_lock(); + vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb, + NULL, NULL); + spdk_vhost_unlock(); +} + +static void +vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) +{ + + /* All sessions have been notified, time to close the bdev */ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + + assert(bvdev != NULL); + spdk_put_io_channel(bvdev->dummy_io_channel); + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + bvdev->bdev = NULL; +} + +static int +vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *ctx) +{ + struct spdk_vhost_blk_session *bvsession; + + bvsession = (struct spdk_vhost_blk_session *)vsession; + if (bvsession->requestq_poller) { + spdk_poller_unregister(&bvsession->requestq_poller); + bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0); + } + + return 0; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_blk_dev *bvdev = remove_ctx; + + SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n", + bvdev->vdev.name); + + spdk_vhost_lock(); + vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb, + vhost_dev_bdev_remove_cpl_cb, NULL); + spdk_vhost_unlock(); +} + +static void +bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, + void *event_ctx) +{ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Bdev event: type %d, name %s\n", + type, + bdev->name); + + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name); + bdev_remove_cb(event_ctx); + break; + case SPDK_BDEV_EVENT_RESIZE: + SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name); + blk_resize_cb(event_ctx); + break; + default: + SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); + break; + } +} + +static void +free_task_pool(struct spdk_vhost_blk_session *bvsession) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_blk_session *bvsession) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_blk_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; + if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. 
(size = %"PRIu32", max = %"PRIu32")\n", + vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(bvsession); + return -1; + } + vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vq->tasks == NULL) { + SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + vsession->name, task_cnt, i); + free_task_pool(bvsession); + return -1; + } + + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_blk_task *)vq->tasks)[j]; + task->bvsession = bvsession; + task->req_idx = j; + task->vq = vq; + } + } + + return 0; +} + +static int +vhost_blk_start_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); + struct spdk_vhost_blk_dev *bvdev; + int i, rc = 0; + + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + bvsession->bvdev = bvdev; + + /* validate all I/O queues are in a contiguous index range */ + for (i = 0; i < vsession->max_queues; i++) { + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (vsession->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(bvsession); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); + goto out; + } + + if (bvdev->bdev) { + bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); + if (!bvsession->io_channel) { + free_task_pool(bvsession); + SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name); + rc = -1; + goto out; + } + } + + bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? 
vdev_worker : no_bdev_vdev_worker, + bvsession, 0); + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); +out: + vhost_session_start_done(vsession, rc); + return rc; +} + +static int +vhost_blk_start(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_blk_start_cb, + 3, "start session"); +} + +static int +destroy_session_poller_cb(void *arg) +{ + struct spdk_vhost_blk_session *bvsession = arg; + struct spdk_vhost_session *vsession = &bvsession->vsession; + int i; + + if (vsession->task_cnt > 0) { + return SPDK_POLLER_BUSY; + } + + if (spdk_vhost_trylock() != 0) { + return SPDK_POLLER_BUSY; + } + + for (i = 0; i < vsession->max_queues; i++) { + vsession->virtqueue[i].next_event_time = 0; + vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); + + if (bvsession->io_channel) { + spdk_put_io_channel(bvsession->io_channel); + bvsession->io_channel = NULL; + } + + free_task_pool(bvsession); + spdk_poller_unregister(&bvsession->stop_poller); + vhost_session_stop_done(vsession, 0); + + spdk_vhost_unlock(); + return SPDK_POLLER_BUSY; +} + +static int +vhost_blk_stop_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); + + spdk_poller_unregister(&bvsession->requestq_poller); + bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb, + bvsession, 1000); + return 0; +} + +static int +vhost_blk_stop(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_blk_stop_cb, + 3, "stop session"); +} + +static void +vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + + spdk_json_write_named_object_begin(w, "block"); + + spdk_json_write_named_bool(w, "readonly", bvdev->readonly); + + spdk_json_write_name(w, "bdev"); + if (bvdev->bdev) { + spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev)); + } else { + spdk_json_write_null(w); + } + + spdk_json_write_object_end(w); +} + +static void +vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + + if (!bvdev->bdev) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_create_blk_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev)); + spdk_json_write_named_string(w, "cpumask", + spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); + spdk_json_write_named_bool(w, "readonly", bvdev->readonly); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static int vhost_blk_destroy(struct spdk_vhost_dev *dev); + +static int +vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t len) +{ + struct virtio_blk_config blkcfg; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_bdev *bdev; + uint32_t blk_size; + uint64_t blkcnt; + + memset(&blkcfg, 0, sizeof(blkcfg)); + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + bdev = bvdev->bdev; + if (bdev == NULL) { + /* We can't just return -1 here as this 
GET_CONFIG message might + * be caused by a QEMU VM reboot. Returning -1 will indicate an + * error to QEMU, who might then decide to terminate itself. + * We don't want that. A simple reboot shouldn't break the system. + * + * Presenting a block device with block size 0 and block count 0 + * doesn't cause any problems on QEMU side and the virtio-pci + * device is even still available inside the VM, but there will + * be no block device created for it - the kernel drivers will + * silently reject it. + */ + blk_size = 0; + blkcnt = 0; + } else { + blk_size = spdk_bdev_get_block_size(bdev); + blkcnt = spdk_bdev_get_num_blocks(bdev); + if (spdk_bdev_get_buf_align(bdev) > 1) { + blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE; + blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1); + } else { + blkcfg.size_max = 131072; + /* -2 for REQ and RESP and -1 for region boundary splitting */ + blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1; + } + } + + blkcfg.blk_size = blk_size; + /* minimum I/O size in blocks */ + blkcfg.min_io_size = 1; + /* expressed in 512 Bytes sectors */ + blkcfg.capacity = (blkcnt * blk_size) / 512; + /* QEMU can overwrite this value when started */ + blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES; + + if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + /* 16MiB, expressed in 512 Bytes */ + blkcfg.max_discard_sectors = 32768; + blkcfg.max_discard_seg = 1; + blkcfg.discard_sector_alignment = blk_size / 512; + } + if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + blkcfg.max_write_zeroes_sectors = 32768; + blkcfg.max_write_zeroes_seg = 1; + } + + memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg))); + + return 0; +} + +static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { + .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session), + .start_session = vhost_blk_start, + .stop_session = vhost_blk_stop, + .vhost_get_config = vhost_blk_get_config, + .dump_info_json = vhost_blk_dump_info_json, + .write_config_json = vhost_blk_write_config_json, + .remove_device = vhost_blk_destroy, +}; + +int +vhost_blk_controller_construct(void) +{ + struct spdk_conf_section *sp; + unsigned ctrlr_num; + char *bdev_name; + char *cpumask; + char *name; + bool readonly; + bool packed_ring; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false); + packed_ring = spdk_conf_section_get_boolval(sp, "PackedRing", false); + + bdev_name = spdk_conf_section_get_val(sp, "Dev"); + if (bdev_name == NULL) { + continue; + } + + if (spdk_vhost_blk_construct(name, cpumask, bdev_name, + readonly, packed_ring) < 0) { + return -1; + } + } + + return 0; +} + +int +spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, + bool readonly, bool packed_ring) +{ + struct spdk_vhost_blk_dev *bvdev = NULL; + struct spdk_vhost_dev *vdev; + struct spdk_bdev *bdev; + int ret = 0; + + spdk_vhost_lock(); + bdev = 
spdk_bdev_get_by_name(dev_name); + if (bdev == NULL) { + SPDK_ERRLOG("%s: bdev '%s' not found\n", + name, dev_name); + ret = -ENODEV; + goto out; + } + + bvdev = calloc(1, sizeof(*bvdev)); + if (bvdev == NULL) { + ret = -ENOMEM; + goto out; + } + + vdev = &bvdev->vdev; + vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE; + vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES; + vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES; + + vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED; + + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD); + } + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); + } + if (readonly) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO); + } + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH); + } + + ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc); + if (ret != 0) { + SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n", + name, dev_name, ret); + goto out; + } + + /* + * When starting qemu with vhost-user-blk multiqueue, the vhost device will + * be started/stopped many times, related to the queues num, as the + * vhost-user backend doesn't know the exact number of queues used for this + * device. The target have to stop and start the device once got a valid + * IO queue. + * When stoping and starting the vhost device, the backend bdev io device + * will be deleted and created repeatedly. + * Hold a bdev reference so that in the struct spdk_vhost_blk_dev, so that + * the io device will not be deleted. + */ + bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); + + bvdev->bdev = bdev; + bvdev->readonly = readonly; + ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend); + if (ret != 0) { + spdk_put_io_channel(bvdev->dummy_io_channel); + spdk_bdev_close(bvdev->bdev_desc); + goto out; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name); +out: + if (ret != 0 && bvdev) { + free(bvdev); + } + spdk_vhost_unlock(); + return ret; +} + +static int +vhost_blk_destroy(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + int rc; + + assert(bvdev != NULL); + + rc = vhost_dev_unregister(&bvdev->vdev); + if (rc != 0) { + return rc; + } + + /* if the bdev is removed, don't need call spdk_put_io_channel. */ + if (bvdev->bdev) { + spdk_put_io_channel(bvdev->dummy_io_channel); + } + + if (bvdev->bdev_desc) { + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + } + bvdev->bdev = NULL; + + free(bvdev); + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK) +SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA) diff --git a/src/spdk/lib/vhost/vhost_internal.h b/src/spdk/lib/vhost/vhost_internal.h new file mode 100644 index 000000000..3aa89768a --- /dev/null +++ b/src/spdk/lib/vhost/vhost_internal.h @@ -0,0 +1,496 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VHOST_INTERNAL_H +#define SPDK_VHOST_INTERNAL_H +#include <linux/virtio_config.h> + +#include "spdk/stdinc.h" + +#include <rte_vhost.h> + +#include "spdk_internal/vhost_user.h" +#include "spdk_internal/log.h" +#include "spdk/util.h" +#include "spdk/rpc.h" +#include "spdk/config.h" + +#define SPDK_VHOST_MAX_VQUEUES 256 +#define SPDK_VHOST_MAX_VQ_SIZE 1024 + +#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8 + +#define SPDK_VHOST_IOVS_MAX 129 + +#define SPDK_VHOST_VQ_MAX_SUBMISSIONS 32 + +/* + * Rate at which stats are checked for interrupt coalescing. + */ +#define SPDK_VHOST_STATS_CHECK_INTERVAL_MS 10 +/* + * Default threshold at which interrupts start to be coalesced. + */ +#define SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD 60000 + +/* + * Currently coalescing is not used by default. + * Setting this to value > 0 here or by RPC will enable coalescing. + */ +#define SPDK_VHOST_COALESCING_DELAY_BASE_US 0 + +#define SPDK_VHOST_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_F_RING_PACKED)) + +#define SPDK_VHOST_DISABLED_FEATURES ((1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY)) + +#define VRING_DESC_F_AVAIL (1ULL << VRING_PACKED_DESC_F_AVAIL) +#define VRING_DESC_F_USED (1ULL << VRING_PACKED_DESC_F_USED) +#define VRING_DESC_F_AVAIL_USED (VRING_DESC_F_AVAIL | VRING_DESC_F_USED) + +typedef struct rte_vhost_resubmit_desc spdk_vhost_resubmit_desc; +typedef struct rte_vhost_resubmit_info spdk_vhost_resubmit_info; + +struct spdk_vhost_virtqueue { + struct rte_vhost_vring vring; + struct rte_vhost_ring_inflight vring_inflight; + uint16_t last_avail_idx; + uint16_t last_used_idx; + + struct { + /* To mark a descriptor as available in packed ring + * Equal to avail_wrap_counter in spec. + */ + uint8_t avail_phase : 1; + /* To mark a descriptor as used in packed ring + * Equal to used_wrap_counter in spec. 
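+		 * (Per the virtio 1.1 packed ring layout, a descriptor is available
+		 * to the device when its VRING_DESC_F_AVAIL bit matches avail_phase
+		 * and its VRING_DESC_F_USED bit does not.)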
+ */ + uint8_t used_phase : 1; + uint8_t padding : 5; + bool packed_ring : 1; + } packed; + + void *tasks; + + /* Request count from last stats check */ + uint32_t req_cnt; + + /* Request count from last event */ + uint16_t used_req_cnt; + + /* How long interrupt is delayed */ + uint32_t irq_delay_time; + + /* Next time when we need to send event */ + uint64_t next_event_time; + + /* Associated vhost_virtqueue in the virtio device's virtqueue list */ + uint32_t vring_idx; +} __attribute((aligned(SPDK_CACHE_LINE_SIZE))); + +struct spdk_vhost_session { + struct spdk_vhost_dev *vdev; + + /* rte_vhost connection ID. */ + int vid; + + /* Unique session ID. */ + uint64_t id; + /* Unique session name. */ + char *name; + + bool initialized; + bool started; + bool needs_restart; + bool forced_polling; + + struct rte_vhost_memory *mem; + + int task_cnt; + + uint16_t max_queues; + + uint64_t negotiated_features; + + /* Local copy of device coalescing settings. */ + uint32_t coalescing_delay_time_base; + uint32_t coalescing_io_rate_threshold; + + /* Next time when stats for event coalescing will be checked. */ + uint64_t next_stats_check_time; + + /* Interval used for event coalescing checking. */ + uint64_t stats_check_interval; + + struct spdk_vhost_virtqueue virtqueue[SPDK_VHOST_MAX_VQUEUES]; + + TAILQ_ENTRY(spdk_vhost_session) tailq; +}; + +struct spdk_vhost_dev { + char *name; + char *path; + + struct spdk_thread *thread; + bool registered; + + uint64_t virtio_features; + uint64_t disabled_features; + uint64_t protocol_features; + + const struct spdk_vhost_dev_backend *backend; + + /* Saved orginal values used to setup coalescing to avoid integer + * rounding issues during save/load config. + */ + uint32_t coalescing_delay_us; + uint32_t coalescing_iops_threshold; + + /* Current connections to the device */ + TAILQ_HEAD(, spdk_vhost_session) vsessions; + + /* Increment-only session counter */ + uint64_t vsessions_num; + + /* Number of started and actively polled sessions */ + uint32_t active_session_num; + + /* Number of pending asynchronous operations */ + uint32_t pending_async_op_num; + + TAILQ_ENTRY(spdk_vhost_dev) tailq; +}; + +/** + * \param vdev vhost device. + * \param vsession vhost session. + * \param arg user-provided parameter. + * + * \return negative values will break the foreach call, meaning + * the function won't be called again. Return codes zero and + * positive don't have any effect. + */ +typedef int (*spdk_vhost_session_fn)(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *arg); + +/** + * \param vdev vhost device. + * \param arg user-provided parameter. + */ +typedef void (*spdk_vhost_dev_fn)(struct spdk_vhost_dev *vdev, void *arg); + +struct spdk_vhost_dev_backend { + /** + * Size of additional per-session context data + * allocated whenever a new client connects. 
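+	 * For example, the vhost-blk backend sets this to
+	 * sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
+	 * so the generic session allocation also covers the blk-specific fields
+	 * that follow the embedded struct spdk_vhost_session.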
+ */ + size_t session_ctx_size; + + int (*start_session)(struct spdk_vhost_session *vsession); + int (*stop_session)(struct spdk_vhost_session *vsession); + + int (*vhost_get_config)(struct spdk_vhost_dev *vdev, uint8_t *config, uint32_t len); + int (*vhost_set_config)(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t offset, uint32_t size, uint32_t flags); + + void (*dump_info_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + void (*write_config_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + int (*remove_device)(struct spdk_vhost_dev *vdev); +}; + +void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len); + +uint16_t vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *vq, uint16_t *reqs, + uint16_t reqs_len); + +/** + * Get a virtio split descriptor at given index in given virtqueue. + * The descriptor will provide access to the entire descriptor + * chain. The subsequent descriptors are accesible via + * \c spdk_vhost_vring_desc_get_next. + * \param vsession vhost session + * \param vq virtqueue + * \param req_idx descriptor index + * \param desc pointer to be set to the descriptor + * \param desc_table descriptor table to be used with + * \c spdk_vhost_vring_desc_get_next. This might be either + * default virtqueue descriptor table or per-chain indirect + * table. + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid. + * If -1 is returned, the content of params is undefined. + */ +int vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size); + +/** + * Get a virtio packed descriptor at given index in given virtqueue. + * The descriptor will provide access to the entire descriptor + * chain. The subsequent descriptors are accesible via + * \c vhost_vring_packed_desc_get_next. + * \param vsession vhost session + * \param vq virtqueue + * \param req_idx descriptor index + * \param desc pointer to be set to the descriptor + * \param desc_table descriptor table to be used with + * \c spdk_vhost_vring_desc_get_next. This might be either + * \c NULL or per-chain indirect table. + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid. + * If -1 is returned, the content of params is undefined. + */ +int vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_packed_desc **desc, + struct vring_packed_desc **desc_table, uint32_t *desc_table_size); + +/** + * Send IRQ/call client (if pending) for \c vq. + * \param vsession vhost session + * \param vq virtqueue + * \return + * 0 - if no interrupt was signalled + * 1 - if interrupt was signalled + */ +int vhost_vq_used_signal(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq); + + +/** + * Send IRQs for all queues that need to be signaled. + * \param vsession vhost session + * \param vq virtqueue + */ +void vhost_session_used_signal(struct spdk_vhost_session *vsession); + +void vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *vq, + uint16_t id, uint32_t len); + +/** + * Enqueue the entry to the used ring when device complete the request. + * \param vsession vhost session + * \param vq virtqueue + * \req_idx descriptor index. It's the first index of this descriptor chain. 
+ * \num_descs descriptor count. It's the count of the number of buffers in the chain.
+ * \buffer_id descriptor buffer ID.
+ * \length device write length. Specify the length of the buffer that has been initialized
+ * (written to) by the device.
+ */
+void vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession,
+				  struct spdk_vhost_virtqueue *virtqueue,
+				  uint16_t num_descs, uint16_t buffer_id,
+				  uint32_t length);
+
+/**
+ * Get subsequent descriptor from given table.
+ * \param desc current descriptor, will be set to the
+ * next descriptor (NULL in case this is the last
+ * descriptor in the chain or the next desc is invalid)
+ * \param desc_table descriptor table
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid
+ * The *desc* param will be set regardless of the
+ * return value.
+ */
+int vhost_vring_desc_get_next(struct vring_desc **desc,
+			      struct vring_desc *desc_table, uint32_t desc_table_size);
+static inline bool
+vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
+{
+	return !!(cur_desc->flags & VRING_DESC_F_WRITE);
+}
+
+int vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+			    uint16_t *iov_index, const struct vring_desc *desc);
+
+bool vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue);
+
+/**
+ * Get subsequent descriptor from vq or desc table.
+ * \param desc current descriptor, will be set to the
+ * next descriptor (NULL in case this is the last
+ * descriptor in the chain or the next desc is invalid)
+ * \param req_idx index of the current desc, will be set to the next
+ * index. If desc_table != NULL, req_idx is an index into the desc_table;
+ * otherwise it is an index into the vring.
+ * \param desc_table descriptor table
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid
+ * The *desc* param will be set regardless of the
+ * return value.
+ */ +int vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx, + struct spdk_vhost_virtqueue *vq, + struct vring_packed_desc *desc_table, + uint32_t desc_table_size); + +bool vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc); + +int vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_packed_desc *desc); + +uint16_t vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx, + uint16_t *num_descs); + +static inline bool __attribute__((always_inline)) +vhost_dev_has_feature(struct spdk_vhost_session *vsession, unsigned feature_id) +{ + return vsession->negotiated_features & (1ULL << feature_id); +} + +int vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct spdk_vhost_dev_backend *backend); +int vhost_dev_unregister(struct spdk_vhost_dev *vdev); + +int vhost_scsi_controller_construct(void); +int vhost_blk_controller_construct(void); +void vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + +/* + * Vhost callbacks for vhost_device_ops interface + */ + +int vhost_new_connection_cb(int vid, const char *ifname); +int vhost_start_device_cb(int vid); +int vhost_stop_device_cb(int vid); +int vhost_destroy_connection_cb(int vid); + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB +int vhost_get_config_cb(int vid, uint8_t *config, uint32_t len); +int vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, + uint32_t size, uint32_t flags); +#endif + +/* + * Memory registration functions used in start/stop device callbacks + */ +void vhost_session_mem_register(struct rte_vhost_memory *mem); +void vhost_session_mem_unregister(struct rte_vhost_memory *mem); + +/* + * Call a function for each session of the provided vhost device. + * The function will be called one-by-one on each session's thread. + * + * \param vdev vhost device + * \param fn function to call on each session's thread + * \param cpl_fn function to be called at the end of the iteration on + * the vhost management thread. + * Optional, can be NULL. + * \param arg additional argument to the both callbacks + */ +void vhost_dev_foreach_session(struct spdk_vhost_dev *dev, + spdk_vhost_session_fn fn, + spdk_vhost_dev_fn cpl_fn, + void *arg); + +/** + * Call a function on the provided lcore and block until either + * spdk_vhost_session_start_done() or spdk_vhost_session_stop_done() + * is called. + * + * This must be called under the global vhost mutex, which this function + * will unlock for the time it's waiting. It's meant to be called only + * from start/stop session callbacks. + * + * \param vsession vhost session + * \param cb_fn the function to call. The void *arg parameter in cb_fn + * is always NULL. + * \param timeout_sec timeout in seconds. This function will still + * block after the timeout expires, but will print the provided errmsg. + * \param errmsg error message to print once the timeout expires + * \return return the code passed to spdk_vhost_session_event_done(). + */ +int vhost_session_send_event(struct spdk_vhost_session *vsession, + spdk_vhost_session_fn cb_fn, unsigned timeout_sec, + const char *errmsg); + +/** + * Finish a blocking spdk_vhost_session_send_event() call and finally + * start the session. This must be called on the target lcore, which + * will now receive all session-related messages (e.g. from + * spdk_vhost_dev_foreach_session()). 
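+ *
+ * A typical backend start path (mirroring the vhost-blk code in this
+ * library; the names below are only a sketch) looks roughly like:
+ *
+ *	static int start_cb(struct spdk_vhost_dev *vdev,
+ *			    struct spdk_vhost_session *vsession, void *unused)
+ *	{
+ *		int rc = 0;
+ *		// ...allocate per-session resources, register pollers...
+ *		vhost_session_start_done(vsession, rc);
+ *		return rc;
+ *	}
+ *
+ *	static int start(struct spdk_vhost_session *vsession)
+ *	{
+ *		return vhost_session_send_event(vsession, start_cb, 3, "start session");
+ *	}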
+ * + * Must be called under the global vhost lock. + * + * \param vsession vhost session + * \param response return code + */ +void vhost_session_start_done(struct spdk_vhost_session *vsession, int response); + +/** + * Finish a blocking spdk_vhost_session_send_event() call and finally + * stop the session. This must be called on the session's lcore which + * used to receive all session-related messages (e.g. from + * spdk_vhost_dev_foreach_session()). After this call, the session- + * related messages will be once again processed by any arbitrary thread. + * + * Must be called under the global vhost lock. + * + * Must be called under the global vhost mutex. + * + * \param vsession vhost session + * \param response return code + */ +void vhost_session_stop_done(struct spdk_vhost_session *vsession, int response); + +struct spdk_vhost_session *vhost_session_find_by_vid(int vid); +void vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession); +int vhost_register_unix_socket(const char *path, const char *ctrl_name, + uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features); +int vhost_driver_unregister(const char *path); +int vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); +int vhost_get_negotiated_features(int vid, uint64_t *negotiated_features); + +int remove_vhost_controller(struct spdk_vhost_dev *vdev); + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB +int vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf); +int vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd); +int vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size); +int vhost_nvme_get_cap(int vid, uint64_t *cap); +int vhost_nvme_controller_construct(void); +int vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues); +int vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev); +int vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, + const char *bdev_name); +#endif + +#endif /* SPDK_VHOST_INTERNAL_H */ diff --git a/src/spdk/lib/vhost/vhost_nvme.c b/src/spdk/lib/vhost/vhost_nvme.c new file mode 100644 index 000000000..10f53baf9 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_nvme.c @@ -0,0 +1,1500 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "spdk/bdev.h" +#include "spdk/version.h" +#include "spdk/nvme_spec.h" +#include "spdk/likely.h" + +#include "vhost_internal.h" + +#define MAX_IO_QUEUES 31 +#define MAX_IOVS 64 +#define MAX_NAMESPACE 8 +#define MAX_QUEUE_ENTRIES_SUPPORTED 256 +#define MAX_BATCH_IO 8 + +struct spdk_vhost_nvme_sq { + uint16_t sqid; + uint16_t size; + uint16_t cqid; + bool valid; + struct spdk_nvme_cmd *sq_cmd; + uint16_t sq_head; + uint16_t sq_tail; +}; + +struct spdk_vhost_nvme_cq { + uint8_t phase; + uint16_t size; + uint16_t cqid; + bool valid; + volatile struct spdk_nvme_cpl *cq_cqe; + uint16_t cq_head; + uint16_t guest_signaled_cq_head; + uint32_t need_signaled_cnt; + STAILQ_HEAD(, spdk_vhost_nvme_task) cq_full_waited_tasks; + bool irq_enabled; + int virq; +}; + +struct spdk_vhost_nvme_ns { + struct spdk_bdev *bdev; + uint32_t block_size; + uint64_t capacity; + uint32_t nsid; + uint32_t active_ns; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *bdev_io_channel; + struct spdk_nvme_ns_data nsdata; +}; + +struct spdk_vhost_nvme_task { + struct spdk_nvme_cmd cmd; + struct spdk_vhost_nvme_dev *nvme; + uint16_t sqid; + uint16_t cqid; + + /** array of iovecs to transfer. */ + struct iovec iovs[MAX_IOVS]; + + /** Number of iovecs in iovs array. */ + int iovcnt; + + /** Current iovec position. */ + int iovpos; + + /** Offset in current iovec. */ + uint32_t iov_offset; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_ns *ns; + + /* parent pointer. */ + struct spdk_vhost_nvme_task *parent; + uint8_t dnr; + uint8_t sct; + uint8_t sc; + uint32_t num_children; + STAILQ_ENTRY(spdk_vhost_nvme_task) stailq; +}; + +struct spdk_vhost_nvme_dev { + struct spdk_vhost_dev vdev; + + uint32_t num_io_queues; + union spdk_nvme_cap_register cap; + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + struct spdk_nvme_ctrlr_data cdata; + + uint32_t num_sqs; + uint32_t num_cqs; + + uint32_t num_ns; + struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE]; + + volatile uint32_t *bar; + volatile uint32_t *bar_db; + uint64_t bar_size; + bool dataplane_started; + + volatile uint32_t *dbbuf_dbs; + volatile uint32_t *dbbuf_eis; + struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1]; + struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1]; + + /* The one and only session associated with this device */ + struct spdk_vhost_session *vsession; + + TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq; + STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks; + struct spdk_poller *requestq_poller; + struct spdk_poller *stop_poller; +}; + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend; + +/* + * Report the SPDK version as the firmware revision. 
+ * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. + */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +static int +nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task); + +static struct spdk_vhost_nvme_dev * +to_nvme_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev->backend != &spdk_vhost_nvme_device_backend) { + SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev); +} + +static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs); + +static inline unsigned int sq_offset(unsigned int qid, uint32_t db_stride) +{ + return qid * 2 * db_stride; +} + +static inline unsigned int cq_offset(unsigned int qid, uint32_t db_stride) +{ + return (qid * 2 + 1) * db_stride; +} + +static void +nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq) +{ + cq->cq_head++; + if (cq->cq_head >= cq->size) { + cq->cq_head = 0; + cq->phase = !cq->phase; + } +} + +static bool +nvme_cq_is_full(struct spdk_vhost_nvme_cq *cq) +{ + return ((cq->cq_head + 1) % cq->size == cq->guest_signaled_cq_head); +} + +static void +nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq) +{ + sq->sq_head = (sq->sq_head + 1) % sq->size; +} + +static struct spdk_vhost_nvme_sq * +vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->sq_queue[qid]; +} + +static struct spdk_vhost_nvme_cq * +vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->cq_queue[qid]; +} + +static inline uint32_t +vhost_nvme_get_queue_head(struct spdk_vhost_nvme_dev *nvme, uint32_t offset) +{ + if (nvme->dataplane_started) { + return nvme->dbbuf_dbs[offset]; + + } else if (nvme->bar) { + return nvme->bar_db[offset]; + } + + assert(0); + + return 0; +} + +static void * +vhost_nvme_gpa_to_vva(void *priv, uint64_t addr, uint64_t len) +{ + struct spdk_vhost_session *vsession = priv; + + return vhost_gpa_to_vva(vsession, addr, len); +} + +static int +vhost_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd, + struct spdk_vhost_nvme_task *task, uint32_t len) +{ + int err; + + err = spdk_nvme_map_prps(nvme->vsession, cmd, task->iovs, len, 4096, + vhost_nvme_gpa_to_vva); + if (spdk_unlikely(err < 0)) { + return err; + } + task->iovcnt = err; + return 0; +} + +static void +nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_cq *cq; + uint32_t qid, cq_head; + + assert(nvme != NULL); + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq || !cq->valid) { + continue; + } + + cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(qid, 1)); + if (cq->irq_enabled && cq->need_signaled_cnt && (cq->cq_head != cq_head)) { + eventfd_write(cq->virq, (eventfd_t)1); + cq->need_signaled_cnt = 0; + } + } +} + +static void +vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_dev *nvme = task->nvme; + struct spdk_nvme_cpl cqe = {0}; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq *sq; + struct spdk_nvme_cmd *cmd = &task->cmd; + uint16_t cqid = task->cqid; + uint16_t sqid = task->sqid; + + cq = vhost_nvme_get_cq_from_qid(nvme, cqid); + sq = 
vhost_nvme_get_sq_from_qid(nvme, sqid); + if (spdk_unlikely(!cq || !sq)) { + return; + } + + cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(cqid, 1)); + if (spdk_unlikely(nvme_cq_is_full(cq))) { + STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq); + return; + } + + cqe.sqid = sqid; + cqe.sqhd = sq->sq_head; + cqe.cid = cmd->cid; + cqe.status.dnr = task->dnr; + cqe.status.sct = task->sct; + cqe.status.sc = task->sc; + cqe.status.p = !cq->phase; + cq->cq_cqe[cq->cq_head] = cqe; + spdk_smp_wmb(); + cq->cq_cqe[cq->cq_head].status.p = cq->phase; + + nvme_inc_cq_head(cq); + cq->need_signaled_cnt++; + + /* MMIO Controll */ + if (nvme->dataplane_started) { + nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1); + } + + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *task = cb_arg; + struct spdk_nvme_cmd *cmd = &task->cmd; + int sc, sct; + uint32_t cdw0; + + assert(bdev_io != NULL); + + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); + spdk_bdev_free_io(bdev_io); + + task->dnr = !success; + task->sct = sct; + task->sc = sc; + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("I/O error, sector %u\n", cmd->cdw10); + } + + vhost_nvme_task_complete(task); +} + +static void +blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *child = cb_arg; + struct spdk_vhost_nvme_task *task = child->parent; + struct spdk_vhost_nvme_dev *nvme = task->nvme; + int sct, sc; + uint32_t cdw0; + + assert(bdev_io != NULL); + + task->num_children--; + if (!success) { + task->dnr = 1; + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); + task->sct = sct; + task->sc = sc; + } + + spdk_bdev_free_io(bdev_io); + + if (!task->num_children) { + vhost_nvme_task_complete(task); + } + + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); +} + +static struct spdk_vhost_nvme_ns * +vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid) +{ + if (spdk_unlikely(!nsid || nsid > dev->num_ns)) { + return NULL; + } + + return &dev->ns[nsid - 1]; +} + +static void +vhost_nvme_resubmit_task(void *arg) +{ + struct spdk_vhost_nvme_task *task = (struct spdk_vhost_nvme_task *)arg; + int rc; + + rc = nvme_process_sq(task->nvme, task->sq, task); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "vhost_nvme: task resubmit failed, rc = %d.\n", rc); + } +} + +static int +vhost_nvme_queue_task(struct spdk_vhost_nvme_task *task) +{ + int rc; + + task->bdev_io_wait.bdev = task->ns->bdev; + task->bdev_io_wait.cb_fn = vhost_nvme_resubmit_task; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(task->ns->bdev, task->ns->bdev_io_channel, &task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vhost_nvme_queue_task, rc=%d.\n", rc); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + vhost_nvme_task_complete(task); + } + + return rc; +} + +static int +nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_task *child; + struct spdk_nvme_cmd *cmd = &task->cmd; + struct spdk_vhost_nvme_ns *ns; + int ret = -1; + uint32_t len, nlba, block_size; + uint64_t slba; + struct spdk_nvme_dsm_range *range; + uint16_t i, num_ranges = 0; + + task->nvme = nvme; + task->dnr = 0; + task->sct = 0; + task->sc = 0; 
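+	/*
+	 * Worked example of the Read/Write decoding done below: the 64-bit
+	 * starting LBA is split across CDW10 (low 32 bits) and CDW11 (high
+	 * 32 bits), and CDW12[15:0] carries a zero-based block count, e.g. a
+	 * 4 KiB read at LBA 0x1_0000_0000 on a 512-byte-block namespace
+	 * arrives as cdw10 = 0x00000000, cdw11 = 0x00000001, cdw12 = 0x0007
+	 * (8 blocks).
+	 */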
+ + ns = vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid); + if (spdk_unlikely(!ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + vhost_nvme_task_complete(task); + return -1; + } + + block_size = ns->block_size; + task->num_children = 0; + task->cqid = sq->cqid; + task->sqid = sq->sqid; + + task->ns = ns; + + if (spdk_unlikely(!ns->active_ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + vhost_nvme_task_complete(task); + return -1; + } + + /* valid only for Read/Write commands */ + nlba = (cmd->cdw12 & 0xffff) + 1; + slba = cmd->cdw11; + slba = (slba << 32) | cmd->cdw10; + + if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE || + cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + if (cmd->psdt != SPDK_NVME_PSDT_PRP) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PSDT %u%ub in command\n", + cmd->psdt >> 1, cmd->psdt & 1u); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + vhost_nvme_task_complete(task); + return -1; + } + + if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + num_ranges = (cmd->cdw10 & 0xff) + 1; + len = num_ranges * sizeof(struct spdk_nvme_dsm_range); + } else { + len = nlba * block_size; + } + + ret = vhost_nvme_map_prps(nvme, cmd, task, len); + if (spdk_unlikely(ret != 0)) { + SPDK_ERRLOG("nvme command map prps failed\n"); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + vhost_nvme_task_complete(task); + return -1; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_WRITE: + ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_FLUSH: + ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel, + 0, ns->capacity, + blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base; + for (i = 0; i < num_ranges; i++) { + if (!STAILQ_EMPTY(&nvme->free_tasks)) { + child = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + SPDK_ERRLOG("No free task now\n"); + ret = -1; + break; + } + task->num_children++; + child->parent = task; + ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel, + range[i].starting_lba * block_size, + range[i].length * block_size, + blk_unmap_complete_cb, child); + if (ret) { + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); + break; + } + } + break; + default: + ret = -1; + break; + } + + if (spdk_unlikely(ret)) { + if (ret == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "No memory, start to queue io.\n"); + task->sq = sq; + ret = vhost_nvme_queue_task(task); + } else { + /* post error status to cqe */ + SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + vhost_nvme_task_complete(task); + } + } + + return ret; +} + +static int +nvme_worker(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_task *task; + 
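+	/* dbbuf_sq holds the SQ tail most recently published by the guest,
+	 * read from the shadow doorbell buffer once the guest has issued
+	 * DOORBELL_BUFFER_CONFIG, or from the BAR doorbell copy before that.
+	 */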
uint32_t qid, dbbuf_sq; + int ret; + int count = -1; + + if (spdk_unlikely(!nvme->num_sqs)) { + return SPDK_POLLER_IDLE; + } + + if (spdk_unlikely(!nvme->dataplane_started && !nvme->bar)) { + return SPDK_POLLER_IDLE; + } + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + + sq = vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq->valid) { + continue; + } + cq = vhost_nvme_get_cq_from_qid(nvme, sq->cqid); + if (spdk_unlikely(!cq)) { + return SPDK_POLLER_BUSY; + } + cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(sq->cqid, 1)); + if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) && + !nvme_cq_is_full(cq))) { + task = STAILQ_FIRST(&cq->cq_full_waited_tasks); + STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq); + vhost_nvme_task_complete(task); + } + + dbbuf_sq = vhost_nvme_get_queue_head(nvme, sq_offset(qid, 1)); + sq->sq_tail = (uint16_t)dbbuf_sq; + count = 0; + + while (sq->sq_head != sq->sq_tail) { + if (spdk_unlikely(!sq->sq_cmd)) { + break; + } + if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + return SPDK_POLLER_BUSY; + } + + task->cmd = sq->sq_cmd[sq->sq_head]; + nvme_inc_sq_head(sq); + + /* processing IO */ + ret = nvme_process_sq(nvme, sq, task); + if (spdk_unlikely(ret)) { + SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, task->cmd.cid, sq->sq_head, + sq->sq_tail); + } + + /* MMIO Control */ + if (nvme->dataplane_started) { + nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1); + } + + /* Maximum batch I/Os to pick up at once */ + if (count++ == MAX_BATCH_IO) { + break; + } + } + } + + /* Completion Queue */ + nvme_cq_signal_fd(nvme); + + return count; +} + +static int +vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + struct spdk_vhost_session *vsession = nvme->vsession; + uint64_t dbs_dma_addr, eis_dma_addr; + + dbs_dma_addr = cmd->dptr.prp.prp1; + eis_dma_addr = cmd->dptr.prp.prp2; + + if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) { + return -1; + } + /* Guest Physical Address to Host Virtual Address */ + nvme->dbbuf_dbs = vhost_gpa_to_vva(vsession, dbs_dma_addr, 4096); + nvme->dbbuf_eis = vhost_gpa_to_vva(vsession, eis_dma_addr, 4096); + if (!nvme->dbbuf_dbs || !nvme->dbbuf_eis) { + return -1; + } + /* zeroed the doorbell buffer memory */ + memset((void *)nvme->dbbuf_dbs, 0, 4096); + memset((void *)nvme->dbbuf_eis, 0, 4096); + + cpl->status.sc = 0; + cpl->status.sct = 0; + + /* Data plane started */ + nvme->dataplane_started = true; + + return 0; +} + +static int +vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid, qsize, cqid; + uint64_t dma_addr; + uint64_t requested_len; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq *sq; + + /* physical contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + cqid = (cmd->cdw11 >> 16) & 0xffff; + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + sq = vhost_nvme_get_sq_from_qid(nvme, qid); + cq = vhost_nvme_get_cq_from_qid(nvme, cqid); + if (!sq || !cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u or CQID %u\n", + qid, cqid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + + 
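+	/*
+	 * Create I/O SQ layout recap (NVMe spec): CDW10[15:0] is the QID and
+	 * CDW10[31:16] the zero-based queue size; CDW11[0] is the physically
+	 * contiguous flag checked above and CDW11[31:16] the CQ to bind to.
+	 * For example, a 256-entry SQ 1 bound to CQ 1 arrives as
+	 * cdw10 = 0x00ff0001, cdw11 = 0x00010001.
+	 */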
sq->sqid = qid; + sq->cqid = cqid; + sq->size = qsize + 1; + sq->sq_head = sq->sq_tail = 0; + requested_len = sizeof(struct spdk_nvme_cmd) * sq->size; + sq->sq_cmd = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len); + if (!sq->sq_cmd) { + return -1; + } + nvme->num_sqs++; + sq->valid = true; + if (nvme->bar) { + nvme->bar_db[sq_offset(qid, 1)] = 0; + } + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_sq *sq; + + qid = cmd->cdw10 & 0xffff; + sq = vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq) { + return -1; + } + + /* We didn't see scenarios when deleting submission + * queue while I/O is running against the submisson + * queue for now, otherwise, we must ensure the poller + * will not run with this submission queue. + */ + nvme->num_sqs--; + sq->valid = false; + + memset(sq, 0, sizeof(*sq)); + sq->sq_cmd = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + + return 0; +} + +static int +vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qsize, qid; + uint64_t dma_addr; + struct spdk_vhost_nvme_cq *cq; + uint64_t requested_len; + + /* physical contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + cq->cqid = qid; + cq->size = qsize + 1; + cq->phase = 1; + cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1; + /* Setup virq through vhost messages */ + cq->virq = -1; + cq->cq_head = 0; + cq->guest_signaled_cq_head = 0; + cq->need_signaled_cnt = 0; + requested_len = sizeof(struct spdk_nvme_cpl) * cq->size; + cq->cq_cqe = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len); + if (!cq->cq_cqe) { + return -1; + } + nvme->num_cqs++; + cq->valid = true; + if (nvme->bar) { + nvme->bar_db[cq_offset(qid, 1)] = 0; + } + STAILQ_INIT(&cq->cq_full_waited_tasks); + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_cq *cq; + + qid = cmd->cdw10 & 0xffff; + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + nvme->num_cqs--; + cq->valid = false; + + memset(cq, 0, sizeof(*cq)); + cq->cq_cqe = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static struct spdk_vhost_nvme_dev * +vhost_nvme_get_by_name(int vid) +{ + struct spdk_vhost_nvme_dev *nvme; + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + + TAILQ_FOREACH(nvme, &g_nvme_ctrlrs, tailq) { + vdev = &nvme->vdev; + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->vid == vid) { + return nvme; + } + } + } + + return NULL; +} + +int +vhost_nvme_get_cap(int vid, uint64_t *cap) +{ + struct spdk_vhost_nvme_dev *nvme; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + *cap = nvme->cap.raw; + return 0; +} + +int +vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf) +{ + 
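+	/*
+	 * Guest admin commands reach the target through this vhost callback
+	 * rather than through an emulated admin submission queue; they are
+	 * dispatched by opcode below. Note that NUMBER_OF_QUEUES is reported
+	 * zero-based in both halves of CDW0, e.g. num_io_queues == 4 yields
+	 * cdw0 = 0x00030003.
+	 */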
struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd; + struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe; + struct spdk_vhost_nvme_ns *ns; + int ret = 0; + struct spdk_vhost_nvme_dev *nvme; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc); + switch (req->opc) { + case SPDK_NVME_OPC_IDENTIFY: + if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) { + memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data)); + + } else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) { + ns = vhost_nvme_get_ns_from_nsid(nvme, req->nsid); + if (!ns) { + cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE; + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + break; + } + memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data)); + } + /* successfully */ + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + case SPDK_NVME_OPC_CREATE_IO_CQ: + ret = vhost_nvme_create_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_CQ: + ret = vhost_nvme_delete_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_CREATE_IO_SQ: + ret = vhost_nvme_create_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_SQ: + ret = vhost_nvme_delete_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_GET_FEATURES: + case SPDK_NVME_OPC_SET_FEATURES: + if (req->cdw10 == SPDK_NVME_FEAT_NUMBER_OF_QUEUES) { + cpl->status.sc = 0; + cpl->status.sct = 0; + cpl->cdw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16); + } else { + cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD; + cpl->status.sct = SPDK_NVME_SCT_GENERIC; + } + break; + case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: + ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl); + break; + case SPDK_NVME_OPC_ABORT: + /* TODO: ABORT failed fow now */ + cpl->cdw0 = 1; + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + } + + if (ret) { + SPDK_ERRLOG("Admin Passthrough Failed with %u\n", req->opc); + } + + return 0; +} + +int +vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size) +{ + struct spdk_vhost_nvme_dev *nvme; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + nvme->bar = (volatile uint32_t *)(uintptr_t)(bar_addr); + /* BAR0 SQ/CQ doorbell registers start from offset 0x1000 */ + nvme->bar_db = (volatile uint32_t *)(uintptr_t)(bar_addr + 0x1000ull); + nvme->bar_size = bar_size; + + return 0; +} + +int +vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd) +{ + struct spdk_vhost_nvme_dev *nvme; + struct spdk_vhost_nvme_cq *cq; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + if (cq->irq_enabled) { + cq->virq = fd; + } else { + SPDK_ERRLOG("NVMe Qid %d Disabled IRQ\n", qid); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_task *task; + + while (!STAILQ_EMPTY(&nvme->free_tasks)) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + spdk_free(task); + } +} + +static int +alloc_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + uint32_t entries, i; + struct spdk_vhost_nvme_task *task; + + entries = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED; + + for (i = 0; i < entries; i++) { + task = spdk_zmalloc(sizeof(struct spdk_vhost_nvme_task), + SPDK_CACHE_LINE_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (task == NULL) { + SPDK_ERRLOG("Controller %s alloc task pool failed\n", + nvme->vdev.name); + 
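+			/* roll back the tasks allocated so far before failing */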
free_task_pool(nvme); + return -1; + } + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); + } + + return 0; +} + +static int +vhost_nvme_start_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + int rc = 0; + + if (nvme == NULL) { + rc = -1; + goto out; + } + + rc = alloc_task_pool(nvme); + if (rc) { + goto out; + } + + SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vsession->vid, + vdev->path, spdk_env_get_current_core()); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc); + if (!ns_dev->bdev_io_channel) { + rc = -1; + goto out; + } + } + + nvme->vsession = vsession; + /* Start the NVMe Poller */ + nvme->requestq_poller = SPDK_POLLER_REGISTER(nvme_worker, nvme, 0); + +out: + vhost_session_start_done(vsession, rc); + return rc; +} + +static int +vhost_nvme_start(struct spdk_vhost_session *vsession) +{ + if (vsession->vdev->active_session_num > 0) { + /* We're trying to start a second session */ + SPDK_ERRLOG("Vhost-NVMe devices can support only one simultaneous connection.\n"); + return -1; + } + + return vhost_session_send_event(vsession, vhost_nvme_start_cb, + 3, "start session"); +} + +static void +vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns) +{ + ns->active_ns = 0; + spdk_bdev_close(ns->bdev_desc); + ns->bdev_desc = NULL; + ns->bdev = NULL; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_nvme_ns *ns = remove_ctx; + + SPDK_NOTICELOG("Removing NS %u, Block Device %s\n", + ns->nsid, spdk_bdev_get_name(ns->bdev)); + + vhost_nvme_deactive_ns(ns); +} + +static int +destroy_device_poller_cb(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = arg; + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n"); + + /* FIXME wait for pending I/Os to complete */ + + if (spdk_vhost_trylock() != 0) { + return SPDK_POLLER_BUSY; + } + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (ns_dev->bdev_io_channel) { + spdk_put_io_channel(ns_dev->bdev_io_channel); + ns_dev->bdev_io_channel = NULL; + } + } + /* Clear BAR space */ + if (nvme->bar) { + memset((void *)nvme->bar, 0, nvme->bar_size); + } + nvme->num_sqs = 0; + nvme->num_cqs = 0; + nvme->dbbuf_dbs = NULL; + nvme->dbbuf_eis = NULL; + nvme->dataplane_started = false; + + spdk_poller_unregister(&nvme->stop_poller); + vhost_session_stop_done(nvme->vsession, 0); + + spdk_vhost_unlock(); + return SPDK_POLLER_BUSY; +} + +static int +vhost_nvme_stop_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + + if (nvme == NULL) { + vhost_session_stop_done(vsession, -1); + return -1; + } + + free_task_pool(nvme); + SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vsession->vid, vdev->path); + + spdk_poller_unregister(&nvme->requestq_poller); + nvme->stop_poller = SPDK_POLLER_REGISTER(destroy_device_poller_cb, nvme, 1000); + + return 0; +} + +static int +vhost_nvme_stop(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_nvme_stop_cb, + 3, "start session"); +} + +static void +vhost_nvme_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme 
== NULL) { + return; + } + + spdk_json_write_named_array_begin(w, "namespaces"); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_uint32(w, "nsid", ns_dev->nsid); + spdk_json_write_named_string(w, "bdev", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +static void +vhost_nvme_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_create_nvme_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_uint32(w, "io_queues", nvme->num_io_queues); + spdk_json_write_named_string(w, "cpumask", + spdk_cpuset_fmt(spdk_thread_get_cpumask(nvme->vdev.thread))); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_nvme_controller_add_ns"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = { + .session_ctx_size = 0, + .start_session = vhost_nvme_start, + .stop_session = vhost_nvme_stop, + .dump_info_json = vhost_nvme_dump_info_json, + .write_config_json = vhost_nvme_write_config_json, + .remove_device = vhost_nvme_dev_remove, +}; + +static int +vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + struct spdk_nvme_ns_data *nsdata; + uint64_t num_blocks; + uint32_t i; + + /* Identify Namespace */ + cdata->nn = dev->num_ns; + for (i = 0; i < dev->num_ns; i++) { + nsdata = &dev->ns[i].nsdata; + if (dev->ns[i].active_ns) { + num_blocks = spdk_bdev_get_num_blocks(dev->ns[i].bdev); + nsdata->nsze = num_blocks; + /* ncap must be non-zero for active Namespace */ + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(dev->ns[i].bdev)); + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(dev->ns[i].bdev); + dev->ns[i].block_size = spdk_bdev_get_block_size(dev->ns[i].bdev); + dev->ns[i].capacity = num_blocks * dev->ns[i].block_size; + } else { + memset(nsdata, 0, sizeof(*nsdata)); + } + } + return 0; +} + +static int +vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + char sn[20]; + + /* Controller Capabilities */ + dev->cap.bits.cqr = 1; + dev->cap.bits.to = 1; + dev->cap.bits.dstrd = 0; + dev->cap.bits.css = SPDK_NVME_CAP_CSS_NVM; + dev->cap.bits.mpsmin = 0; + dev->cap.bits.mpsmax = 0; + /* MQES is 0 based value */ + dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1; + + /* Controller Configuration */ + dev->cc.bits.en = 0; + + /* Controller Status */ + dev->csts.bits.rdy = 0; + + /* Identify Controller */ + 
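+	/*
+	 * Values that are easy to misread: MDTS is in units of the minimum
+	 * page size, so with CAP.MPSMIN = 0 (4 KiB) an MDTS of 5 allows
+	 * 2^5 * 4 KiB = 128 KiB per transfer, and SQES/CQES are log2 of the
+	 * entry sizes (2^6 = 64-byte SQ entries, 2^4 = 16-byte CQ entries).
+	 */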
spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + cdata->vid = 0x8086; + cdata->ssvid = 0x8086; + spdk_strcpy_pad(cdata->mn, "SPDK Virtual NVMe Controller", sizeof(cdata->mn), ' '); + snprintf(sn, sizeof(sn), "NVMe_%s", dev->vdev.name); + spdk_strcpy_pad(cdata->sn, sn, sizeof(cdata->sn), ' '); + cdata->ieee[0] = 0xe4; + cdata->ieee[1] = 0xd2; + cdata->ieee[2] = 0x5c; + cdata->ver.bits.mjr = 1; + cdata->ver.bits.mnr = 0; + cdata->mdts = 5; /* 128 KiB */ + cdata->rab = 6; + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->oncs.dsm = 1; + /* Emulated NVMe controller */ + cdata->oacs.doorbell_buffer_config = 1; + + vhost_nvme_ns_identify_update(dev); + + return 0; +} + +int +vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues) +{ + struct spdk_vhost_nvme_dev *dev; + int rc; + + if (posix_memalign((void **)&dev, SPDK_CACHE_LINE_SIZE, sizeof(*dev))) { + return -ENOMEM; + } + memset(dev, 0, sizeof(*dev)); + + if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) { + free(dev); + return -EINVAL; + } + + spdk_vhost_lock(); + rc = vhost_dev_register(&dev->vdev, name, cpumask, + &spdk_vhost_nvme_device_backend); + + if (rc) { + free(dev); + spdk_vhost_unlock(); + return rc; + } + + dev->num_io_queues = num_io_queues; + STAILQ_INIT(&dev->free_tasks); + TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq); + + vhost_nvme_ctrlr_identify_update(dev); + + SPDK_NOTICELOG("Controller %s: Constructed\n", name); + spdk_vhost_unlock(); + return rc; +} + +int +vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns; + int rc; + uint32_t i; + + if (nvme == NULL) { + return -EINVAL; + } + + TAILQ_REMOVE(&g_nvme_ctrlrs, nvme, tailq); + for (i = 0; i < nvme->num_ns; i++) { + ns = &nvme->ns[i]; + if (ns->active_ns) { + vhost_nvme_deactive_ns(ns); + } + } + + rc = vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + + free(nvme); + return 0; +} + +int +vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns; + struct spdk_bdev *bdev; + int rc = -1; + + if (nvme == NULL) { + return -ENODEV; + } + + if (nvme->num_ns == MAX_NAMESPACE) { + SPDK_ERRLOG("Can't support %d Namespaces\n", nvme->num_ns); + return -ENOSPC; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("could not find bdev %s\n", bdev_name); + return -ENODEV; + } + + ns = &nvme->ns[nvme->num_ns]; + rc = spdk_bdev_open(bdev, true, bdev_remove_cb, ns, &nvme->ns[nvme->num_ns].bdev_desc); + if (rc != 0) { + SPDK_ERRLOG("Could not open bdev '%s', error=%d\n", + bdev_name, rc); + return rc; + } + + nvme->ns[nvme->num_ns].bdev = bdev; + nvme->ns[nvme->num_ns].active_ns = 1; + nvme->ns[nvme->num_ns].nsid = nvme->num_ns + 1; + nvme->num_ns++; + + vhost_nvme_ns_identify_update(nvme); + + return rc; +} + +int +vhost_nvme_controller_construct(void) +{ + struct spdk_conf_section *sp; + const char *name; + const char *bdev_name; + const char *cpumask; + int rc, i = 0; + struct spdk_vhost_dev *vdev; + uint32_t ctrlr_num, io_queues; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostNvme")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostNvme%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + 
spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + rc = spdk_conf_section_get_intval(sp, "NumberOfQueues"); + if (rc > 0) { + io_queues = rc; + } else { + io_queues = 1; + } + + rc = vhost_nvme_dev_construct(name, cpumask, io_queues); + if (rc < 0) { + SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num); + return -1; + } + + vdev = spdk_vhost_dev_find(name); + if (!vdev) { + return -1; + } + + for (i = 0; spdk_conf_section_get_nval(sp, "Namespace", i) != NULL; i++) { + bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0); + if (!bdev_name) { + SPDK_ERRLOG("namespace configuration missing bdev name\n"); + break; + } + rc = vhost_nvme_dev_add_ns(vdev, bdev_name); + if (rc < 0) { + SPDK_WARNLOG("VhostNvme%u: Construct Namespace with %s failed\n", + ctrlr_num, bdev_name); + break; + } + } + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME) diff --git a/src/spdk/lib/vhost/vhost_rpc.c b/src/spdk/lib/vhost/vhost_rpc.c new file mode 100644 index 000000000..196d75918 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_rpc.c @@ -0,0 +1,652 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/env.h" + +#include "spdk/scsi.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" +#include "spdk/bdev.h" + +struct rpc_vhost_scsi_ctrlr { + char *ctrlr; + char *cpumask; +}; + +static void +free_rpc_vhost_scsi_ctrlr(struct rpc_vhost_scsi_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static const struct spdk_json_object_decoder rpc_vhost_create_scsi_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr, ctrlr), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_scsi_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +rpc_vhost_create_scsi_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_scsi_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_create_scsi_ctrlr, + SPDK_COUNTOF(rpc_vhost_create_scsi_ctrlr), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_construct(req.ctrlr, req.cpumask); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_scsi_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_scsi_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_create_scsi_controller", rpc_vhost_create_scsi_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_scsi_controller, construct_vhost_scsi_controller) + +struct rpc_vhost_scsi_ctrlr_add_target { + char *ctrlr; + int32_t scsi_target_num; + char *bdev_name; +}; + +static void +free_rpc_vhost_scsi_ctrlr_add_target(struct rpc_vhost_scsi_ctrlr_add_target *req) +{ + free(req->ctrlr); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_vhost_scsi_ctrlr_add_target[] = { + {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, ctrlr), spdk_json_decode_string }, + {"scsi_target_num", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, scsi_target_num), spdk_json_decode_int32}, + {"bdev_name", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, bdev_name), spdk_json_decode_string }, +}; + +static void +rpc_vhost_scsi_controller_add_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_scsi_ctrlr_add_target req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_scsi_ctrlr_add_target, + SPDK_COUNTOF(rpc_vhost_scsi_ctrlr_add_target), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_add_tgt(vdev, req.scsi_target_num, req.bdev_name); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_scsi_ctrlr_add_target(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_int32(w, rc); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_scsi_ctrlr_add_target(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + 
spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_scsi_controller_add_target", rpc_vhost_scsi_controller_add_target, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_add_target, add_vhost_scsi_lun) + +struct rpc_remove_vhost_scsi_ctrlr_target { + char *ctrlr; + uint32_t scsi_target_num; +}; + +static void +free_rpc_remove_vhost_scsi_ctrlr_target(struct rpc_remove_vhost_scsi_ctrlr_target *req) +{ + free(req->ctrlr); +} + +static const struct spdk_json_object_decoder rpc_vhost_remove_target[] = { + {"ctrlr", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, ctrlr), spdk_json_decode_string }, + {"scsi_target_num", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, scsi_target_num), spdk_json_decode_uint32}, +}; + +static int +rpc_vhost_scsi_controller_remove_target_finish_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct spdk_jsonrpc_request *request = arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; +} + +static void +rpc_vhost_scsi_controller_remove_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_vhost_scsi_ctrlr_target req = {0}; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_remove_target, + SPDK_COUNTOF(rpc_vhost_remove_target), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_remove_tgt(vdev, req.scsi_target_num, + rpc_vhost_scsi_controller_remove_target_finish_cb, + request); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + + free_rpc_remove_vhost_scsi_ctrlr_target(&req); + return; + +invalid: + free_rpc_remove_vhost_scsi_ctrlr_target(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} + +SPDK_RPC_REGISTER("vhost_scsi_controller_remove_target", + rpc_vhost_scsi_controller_remove_target, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_remove_target, remove_vhost_scsi_target) + +struct rpc_vhost_blk_ctrlr { + char *ctrlr; + char *dev_name; + char *cpumask; + bool readonly; + bool packed_ring; +}; + +static const struct spdk_json_object_decoder rpc_construct_vhost_blk_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_blk_ctrlr, ctrlr), spdk_json_decode_string }, + {"dev_name", offsetof(struct rpc_vhost_blk_ctrlr, dev_name), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_blk_ctrlr, cpumask), spdk_json_decode_string, true}, + {"readonly", offsetof(struct rpc_vhost_blk_ctrlr, readonly), spdk_json_decode_bool, true}, + {"packed_ring", offsetof(struct rpc_vhost_blk_ctrlr, packed_ring), spdk_json_decode_bool, true}, +}; + +static void +free_rpc_vhost_blk_ctrlr(struct rpc_vhost_blk_ctrlr *req) +{ + free(req->ctrlr); + free(req->dev_name); + free(req->cpumask); +} + +static void +rpc_vhost_create_blk_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_blk_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_blk_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_blk_ctrlr), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + 
goto invalid; + } + + rc = spdk_vhost_blk_construct(req.ctrlr, req.cpumask, req.dev_name, + req.readonly, req.packed_ring); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_blk_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_blk_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("vhost_create_blk_controller", rpc_vhost_create_blk_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_blk_controller, construct_vhost_blk_controller) + +struct rpc_delete_vhost_ctrlr { + char *ctrlr; +}; + +static const struct spdk_json_object_decoder rpc_delete_vhost_ctrlr_decoder[] = { + {"ctrlr", offsetof(struct rpc_delete_vhost_ctrlr, ctrlr), spdk_json_decode_string }, +}; + +static void +free_rpc_delete_vhost_ctrlr(struct rpc_delete_vhost_ctrlr *req) +{ + free(req->ctrlr); +} + +static void +rpc_vhost_delete_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_vhost_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_vhost_ctrlr_decoder, + SPDK_COUNTOF(rpc_delete_vhost_ctrlr_decoder), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_dev_remove(vdev); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + + free_rpc_delete_vhost_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + + return; + +invalid: + free_rpc_delete_vhost_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("vhost_delete_controller", rpc_vhost_delete_controller, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_delete_controller, remove_vhost_controller) + +struct rpc_get_vhost_ctrlrs { + char *name; +}; + +static void +_rpc_get_vhost_controller(struct spdk_json_write_ctx *w, struct spdk_vhost_dev *vdev) +{ + uint32_t delay_base_us, iops_threshold; + + spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold); + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "ctrlr", spdk_vhost_dev_get_name(vdev)); + spdk_json_write_named_string_fmt(w, "cpumask", "0x%s", + spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); + spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us); + spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold); + spdk_json_write_named_string(w, "socket", vdev->path); + + spdk_json_write_named_object_begin(w, "backend_specific"); + vhost_dump_info_json(vdev, w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_json_object_decoder rpc_get_vhost_ctrlrs_decoders[] = { + {"name", offsetof(struct rpc_get_vhost_ctrlrs, name), spdk_json_decode_string, true}, +}; + +static void +free_rpc_get_vhost_ctrlrs(struct rpc_get_vhost_ctrlrs *req) +{ + free(req->name); +} + +static void +rpc_vhost_get_controllers(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_vhost_ctrlrs req = {0}; + struct 
spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (params && spdk_json_decode_object(params, rpc_get_vhost_ctrlrs_decoders, + SPDK_COUNTOF(rpc_get_vhost_ctrlrs_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + if (req.name != NULL) { + vdev = spdk_vhost_dev_find(req.name); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + free_rpc_get_vhost_ctrlrs(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + _rpc_get_vhost_controller(w, vdev); + spdk_vhost_unlock(); + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + return; + } + + free_rpc_get_vhost_ctrlrs(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + vdev = spdk_vhost_dev_next(NULL); + while (vdev != NULL) { + _rpc_get_vhost_controller(w, vdev); + vdev = spdk_vhost_dev_next(vdev); + } + spdk_vhost_unlock(); + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_get_vhost_ctrlrs(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_get_controllers", rpc_vhost_get_controllers, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_get_controllers, get_vhost_controllers) + + +struct rpc_vhost_ctrlr_coalescing { + char *ctrlr; + uint32_t delay_base_us; + uint32_t iops_threshold; +}; + +static const struct spdk_json_object_decoder rpc_set_vhost_ctrlr_coalescing[] = { + {"ctrlr", offsetof(struct rpc_vhost_ctrlr_coalescing, ctrlr), spdk_json_decode_string }, + {"delay_base_us", offsetof(struct rpc_vhost_ctrlr_coalescing, delay_base_us), spdk_json_decode_uint32}, + {"iops_threshold", offsetof(struct rpc_vhost_ctrlr_coalescing, iops_threshold), spdk_json_decode_uint32}, +}; + +static void +free_rpc_set_vhost_controllers_event_coalescing(struct rpc_vhost_ctrlr_coalescing *req) +{ + free(req->ctrlr); +} + +static void +rpc_vhost_controller_set_coalescing(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_ctrlr_coalescing req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_set_vhost_ctrlr_coalescing, + SPDK_COUNTOF(rpc_set_vhost_ctrlr_coalescing), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_set_coalescing(vdev, req.delay_base_us, req.iops_threshold); + spdk_vhost_unlock(); + if (rc) { + goto invalid; + } + + free_rpc_set_vhost_controllers_event_coalescing(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + + return; + +invalid: + free_rpc_set_vhost_controllers_event_coalescing(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_controller_set_coalescing", rpc_vhost_controller_set_coalescing, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_controller_set_coalescing, set_vhost_controller_coalescing) + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + +struct rpc_vhost_nvme_ctrlr { + char *ctrlr; + uint32_t io_queues; + char *cpumask; +}; + +static const struct 
spdk_json_object_decoder rpc_construct_vhost_nvme_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr, ctrlr), spdk_json_decode_string }, + {"io_queues", offsetof(struct rpc_vhost_nvme_ctrlr, io_queues), spdk_json_decode_uint32}, + {"cpumask", offsetof(struct rpc_vhost_nvme_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +free_rpc_vhost_nvme_ctrlr(struct rpc_vhost_nvme_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static void +rpc_vhost_create_nvme_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_nvme_ctrlr req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_nvme_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_nvme_ctrlr), + &req)) { + rc = -EINVAL; + goto invalid; + } + + rc = vhost_nvme_dev_construct(req.ctrlr, req.cpumask, req.io_queues); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_nvme_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_nvme_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("vhost_create_nvme_controller", rpc_vhost_create_nvme_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_nvme_controller, construct_vhost_nvme_controller) + +struct rpc_vhost_nvme_ctrlr_add_ns { + char *ctrlr; + char *bdev_name; +}; + +static void +free_rpc_vhost_nvme_ctrlr_add_ns(struct rpc_vhost_nvme_ctrlr_add_ns *req) +{ + free(req->ctrlr); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_vhost_nvme_add_ns[] = { + {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, ctrlr), spdk_json_decode_string }, + {"bdev_name", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, bdev_name), spdk_json_decode_string }, +}; + +static void +rpc_vhost_nvme_controller_add_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_nvme_ctrlr_add_ns req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_nvme_add_ns, + SPDK_COUNTOF(rpc_vhost_nvme_add_ns), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = vhost_nvme_dev_add_ns(vdev, req.bdev_name); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + free_rpc_vhost_nvme_ctrlr_add_ns(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_nvme_ctrlr_add_ns(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_nvme_controller_add_ns", rpc_vhost_nvme_controller_add_ns, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_nvme_controller_add_ns, add_vhost_nvme_ns) + +#endif /* SPDK_CONFIG_VHOST_INTERNAL_LIB */ + +SPDK_LOG_REGISTER_COMPONENT("vhost_rpc", SPDK_LOG_VHOST_RPC) diff --git a/src/spdk/lib/vhost/vhost_scsi.c b/src/spdk/lib/vhost/vhost_scsi.c new file mode 100644 index 000000000..49e49dc76 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_scsi.c @@ -0,0 +1,1536 @@ +/*- + * BSD LICENSE + * + * 
Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <linux/virtio_scsi.h> + +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/scsi.h" +#include "spdk/scsi_spec.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/likely.h" + +#include "spdk/vhost.h" +#include "vhost_internal.h" + +/* Features supported by SPDK VHOST lib. */ +#define SPDK_VHOST_SCSI_FEATURES (SPDK_VHOST_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_INOUT) | \ + (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \ + (1ULL << VIRTIO_SCSI_F_CHANGE ) | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +/* Features that are specified in VIRTIO SCSI but currently not supported: + * - Live migration not supported yet + * - T10 PI + */ +#define SPDK_VHOST_SCSI_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +#define MGMT_POLL_PERIOD_US (1000 * 5) + +#define VIRTIO_SCSI_CONTROLQ 0 +#define VIRTIO_SCSI_EVENTQ 1 +#define VIRTIO_SCSI_REQUESTQ 2 + +enum spdk_scsi_dev_vhost_status { + /* Target ID is empty. */ + VHOST_SCSI_DEV_EMPTY, + + /* Target is still being added. */ + VHOST_SCSI_DEV_ADDING, + + /* Target ID occupied. */ + VHOST_SCSI_DEV_PRESENT, + + /* Target ID is occupied but removal is in progress. */ + VHOST_SCSI_DEV_REMOVING, + + /* In session - device (SCSI target) seen but removed. 
*/ + VHOST_SCSI_DEV_REMOVED, +}; + +/** Context for a SCSI target in a vhost device */ +struct spdk_scsi_dev_vhost_state { + struct spdk_scsi_dev *dev; + enum spdk_scsi_dev_vhost_status status; + spdk_vhost_event_fn remove_cb; + void *remove_ctx; +}; + +struct spdk_vhost_scsi_dev { + int ref; + bool registered; + struct spdk_vhost_dev vdev; + struct spdk_scsi_dev_vhost_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; +}; + +/** Context for a SCSI target in a vhost session */ +struct spdk_scsi_dev_session_state { + struct spdk_scsi_dev *dev; + enum spdk_scsi_dev_vhost_status status; +}; + +struct spdk_vhost_scsi_session { + struct spdk_vhost_session vsession; + + struct spdk_vhost_scsi_dev *svdev; + /** Local copy of the device state */ + struct spdk_scsi_dev_session_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; + struct spdk_poller *requestq_poller; + struct spdk_poller *mgmt_poller; + struct spdk_poller *stop_poller; +}; + +struct spdk_vhost_scsi_task { + struct spdk_scsi_task scsi; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; + + union { + struct virtio_scsi_cmd_resp *resp; + struct virtio_scsi_ctrl_tmf_resp *tmf_resp; + }; + + struct spdk_vhost_scsi_session *svsession; + struct spdk_scsi_dev *scsi_dev; + + /** Number of bytes that were written. */ + uint32_t used_len; + + int req_idx; + + /* If set, the task is currently used for I/O processing. */ + bool used; + + struct spdk_vhost_virtqueue *vq; +}; + +static int vhost_scsi_start(struct spdk_vhost_session *vsession); +static int vhost_scsi_stop(struct spdk_vhost_session *vsession); +static void vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static void vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static int vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev); + +static const struct spdk_vhost_dev_backend spdk_vhost_scsi_device_backend = { + .session_ctx_size = sizeof(struct spdk_vhost_scsi_session) - sizeof(struct spdk_vhost_session), + .start_session = vhost_scsi_start, + .stop_session = vhost_scsi_stop, + .dump_info_json = vhost_scsi_dump_info_json, + .write_config_json = vhost_scsi_write_config_json, + .remove_device = vhost_scsi_dev_remove, +}; + +static inline void +scsi_task_init(struct spdk_vhost_scsi_task *task) +{ + memset(&task->scsi, 0, sizeof(task->scsi)); + /* Tmf_resp pointer and resp pointer are in a union. + * Here means task->tmf_resp = task->resp = NULL. 
+ */ + task->resp = NULL; + task->used = true; + task->used_len = 0; +} + +static void +vhost_scsi_task_put(struct spdk_vhost_scsi_task *task) +{ + spdk_scsi_task_put(&task->scsi); +} + +static void +vhost_scsi_task_free_cb(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + struct spdk_vhost_session *vsession = &task->svsession->vsession; + + assert(vsession->task_cnt > 0); + vsession->task_cnt--; + task->used = false; +} + +static void +remove_scsi_tgt(struct spdk_vhost_scsi_dev *svdev, + unsigned scsi_tgt_num) +{ + struct spdk_scsi_dev_vhost_state *state; + struct spdk_scsi_dev *dev; + + state = &svdev->scsi_dev_state[scsi_tgt_num]; + dev = state->dev; + state->dev = NULL; + assert(state->status == VHOST_SCSI_DEV_REMOVING); + state->status = VHOST_SCSI_DEV_EMPTY; + spdk_scsi_dev_destruct(dev, NULL, NULL); + if (state->remove_cb) { + state->remove_cb(&svdev->vdev, state->remove_ctx); + state->remove_cb = NULL; + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: removed target 'Target %u'\n", + svdev->vdev.name, scsi_tgt_num); + + if (--svdev->ref == 0 && svdev->registered == false) { + free(svdev); + } +} + +static void +vhost_scsi_dev_process_removed_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev, + struct spdk_vhost_scsi_dev, vdev); + + /* all sessions have already detached the device */ + if (svdev->scsi_dev_state[scsi_tgt_num].status != VHOST_SCSI_DEV_REMOVING) { + /* device was already removed in the meantime */ + return; + } + + remove_scsi_tgt(svdev, scsi_tgt_num); +} + +static int +vhost_scsi_session_process_removed(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession; + struct spdk_scsi_dev_session_state *state = &svsession->scsi_dev_state[scsi_tgt_num]; + + if (state->dev != NULL) { + /* there's still a session that references this device, + * so abort our foreach chain here. 
We'll be called + * again from this session's management poller after it + * is removed in there + */ + return -1; + } + + return 0; +} + +static void +process_removed_devs(struct spdk_vhost_scsi_session *svsession) +{ + struct spdk_scsi_dev *dev; + struct spdk_scsi_dev_session_state *state; + int i; + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + state = &svsession->scsi_dev_state[i]; + dev = state->dev; + + if (dev && state->status == VHOST_SCSI_DEV_REMOVING && + !spdk_scsi_dev_has_pending_tasks(dev, NULL)) { + /* detach the device from this session */ + spdk_scsi_dev_free_io_channels(dev); + state->dev = NULL; + state->status = VHOST_SCSI_DEV_REMOVED; + /* try to detach it globally */ + spdk_vhost_lock(); + vhost_dev_foreach_session(&svsession->svdev->vdev, + vhost_scsi_session_process_removed, + vhost_scsi_dev_process_removed_cpl_cb, + (void *)(uintptr_t)i); + spdk_vhost_unlock(); + } + } +} + +static void +eventq_enqueue(struct spdk_vhost_scsi_session *svsession, unsigned scsi_dev_num, + uint32_t event, uint32_t reason) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_vhost_virtqueue *vq; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_event *desc_ev; + uint32_t desc_table_size, req_size = 0; + uint16_t req; + int rc; + + assert(scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + vq = &vsession->virtqueue[VIRTIO_SCSI_EVENTQ]; + + if (vq->vring.desc == NULL || vhost_vq_avail_ring_get(vq, &req, 1) != 1) { + SPDK_ERRLOG("%s: failed to send virtio event (no avail ring entries?).\n", + vsession->name); + return; + } + + rc = vhost_vq_get_desc(vsession, vq, req, &desc, &desc_table, &desc_table_size); + if (rc != 0 || desc->len < sizeof(*desc_ev)) { + SPDK_ERRLOG("%s: invalid eventq descriptor at index %"PRIu16".\n", + vsession->name, req); + goto out; + } + + desc_ev = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*desc_ev)); + if (desc_ev == NULL) { + SPDK_ERRLOG("%s: eventq descriptor at index %"PRIu16" points " + "to unmapped guest memory address %p.\n", + vsession->name, req, (void *)(uintptr_t)desc->addr); + goto out; + } + + desc_ev->event = event; + desc_ev->lun[0] = 1; + desc_ev->lun[1] = scsi_dev_num; + /* virtio LUN id 0 can refer either to the entire device + * or actual LUN 0 (the only supported by vhost for now) + */ + desc_ev->lun[2] = 0 >> 8; + desc_ev->lun[3] = 0 & 0xFF; + /* virtio doesn't specify any strict format for LUN id (bytes 2 and 3) + * current implementation relies on linux kernel sources + */ + memset(&desc_ev->lun[4], 0, 4); + desc_ev->reason = reason; + req_size = sizeof(*desc_ev); + +out: + vhost_vq_used_ring_enqueue(vsession, vq, req, req_size); +} + +static void +submit_completion(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + + vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, + task->used_len); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Finished task (%p) req_idx=%d\n", task, task->req_idx); + + vhost_scsi_task_put(task); +} + +static void +vhost_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + submit_completion(task); +} + +static void +vhost_scsi_task_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + /* The SCSI task has completed. Do final processing and then post + notification to the virtqueue's "used" ring. 
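Illustrative aside (not part of the diff): the LUN bytes filled in by eventq_enqueue() above, and parsed again in vhost_scsi_task_init_target() below, follow the 8-byte convention the comments describe: byte 0 is always 1, byte 1 selects the SCSI target, bytes 2-3 carry the LUN id (only LUN 0 is used here) and bytes 4-7 are zero. The standalone sketch below mirrors what the code does rather than quoting the virtio specification; both helper names are invented.

#include <stdint.h>
#include <string.h>

static void
encode_virtio_scsi_lun(uint8_t lun[8], uint8_t target, uint16_t lun_id)
{
	lun[0] = 1;			/* first byte is always 1 */
	lun[1] = target;		/* second byte is the target number */
	lun[2] = (uint8_t)(lun_id >> 8);
	lun[3] = (uint8_t)(lun_id & 0xFF);
	memset(&lun[4], 0, 4);		/* remaining bytes unused */
}

static uint16_t
decode_virtio_scsi_lun_id(const uint8_t lun[8])
{
	/* Same 14-bit masking as vhost_scsi_task_init_target(). */
	return (uint16_t)((((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF);
}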
+ */ + task->resp->status = task->scsi.status; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + memcpy(task->resp->sense, task->scsi.sense_data, task->scsi.sense_data_len); + task->resp->sense_len = task->scsi.sense_data_len; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Task (%p) req_idx=%d failed - status=%u\n", task, task->req_idx, + task->scsi.status); + } + assert(task->scsi.transfer_len == task->scsi.length); + task->resp->resid = task->scsi.length - task->scsi.data_transferred; + + submit_completion(task); +} + +static void +task_submit(struct spdk_vhost_scsi_task *task) +{ + task->resp->response = VIRTIO_SCSI_S_OK; + spdk_scsi_dev_queue_task(task->scsi_dev, &task->scsi); +} + +static void +mgmt_task_submit(struct spdk_vhost_scsi_task *task, enum spdk_scsi_task_func func) +{ + task->tmf_resp->response = VIRTIO_SCSI_S_OK; + task->scsi.function = func; + spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, &task->scsi); +} + +static void +invalid_request(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + + vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, + task->used_len); + vhost_scsi_task_put(task); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Invalid request (status=%" PRIu8")\n", + task->resp ? task->resp->response : -1); +} + +static int +vhost_scsi_task_init_target(struct spdk_vhost_scsi_task *task, const __u8 *lun) +{ + struct spdk_vhost_scsi_session *svsession = task->svsession; + struct spdk_scsi_dev_session_state *state; + uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF; + + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN", lun, 8); + + /* First byte must be 1 and second is target */ + if (lun[0] != 1 || lun[1] >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + return -1; + } + + state = &svsession->scsi_dev_state[lun[1]]; + task->scsi_dev = state->dev; + if (state->dev == NULL || state->status != VHOST_SCSI_DEV_PRESENT) { + /* If dev has been hotdetached, return 0 to allow sending + * additional hotremove event via sense codes. + */ + return state->status != VHOST_SCSI_DEV_EMPTY ? 
0 : -1; + } + + task->scsi.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0); + task->scsi.lun = spdk_scsi_dev_get_lun(state->dev, lun_id); + return 0; +} + +static void +process_ctrl_request(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_ctrl_tmf_req *ctrl_req; + struct virtio_scsi_ctrl_an_resp *an_resp; + uint32_t desc_table_size, used_len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_mgmt_cpl, vhost_scsi_task_free_cb); + rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table, + &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("%s: invalid controlq descriptor at index %d.\n", + vsession->name, task->req_idx); + goto out; + } + + ctrl_req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*ctrl_req)); + if (ctrl_req == NULL) { + SPDK_ERRLOG("%s: invalid task management request at index %d.\n", + vsession->name, task->req_idx); + goto out; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, + "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; kickfd %d; size %d\n", + task->req_idx, desc, (void *)desc->addr, desc->len, desc->flags, task->vq->last_used_idx, + task->vq->vring.kickfd, task->vq->vring.size); + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "Request descriptor", (uint8_t *)ctrl_req, desc->len); + + vhost_scsi_task_init_target(task, ctrl_req->lun); + + vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (spdk_unlikely(desc == NULL)) { + SPDK_ERRLOG("%s: no response descriptor for controlq request %d.\n", + vsession->name, task->req_idx); + goto out; + } + + /* Process the TMF request */ + switch (ctrl_req->type) { + case VIRTIO_SCSI_T_TMF: + task->tmf_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->tmf_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_tmf_resp) || task->tmf_resp == NULL)) { + SPDK_ERRLOG("%s: TMF response descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto out; + } + + /* Check if we are processing a valid request */ + if (task->scsi_dev == NULL) { + task->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET; + break; + } + + switch (ctrl_req->subtype) { + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + /* Handle LUN reset */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: LUN reset\n", vsession->name); + + mgmt_task_submit(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + return; + default: + task->tmf_resp->response = VIRTIO_SCSI_S_ABORTED; + /* Unsupported command */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: unsupported TMF command %x\n", + vsession->name, ctrl_req->subtype); + break; + } + break; + case VIRTIO_SCSI_T_AN_QUERY: + case VIRTIO_SCSI_T_AN_SUBSCRIBE: { + an_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*an_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_an_resp) || an_resp == NULL)) { + SPDK_WARNLOG("%s: asynchronous response descriptor points to invalid guest memory region\n", + vsession->name); + goto out; + } + + an_resp->response = VIRTIO_SCSI_S_ABORTED; + break; + } + default: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: Unsupported control command %x\n", + vsession->name, ctrl_req->type); + break; + } + + used_len = sizeof(struct virtio_scsi_ctrl_tmf_resp); +out: + vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, used_len); + vhost_scsi_task_put(task); +} + +/* + * Process task's 
descriptor chain and setup data related fields. + * Return + * -1 if request is invalid and must be aborted, + * 0 if all data are set. + */ +static int +task_data_setup(struct spdk_vhost_scsi_task *task, + struct virtio_scsi_cmd_req **req) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + struct vring_desc *desc, *desc_table; + struct iovec *iovs = task->iovs; + uint16_t iovcnt = 0; + uint32_t desc_table_len, len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_cpl, vhost_scsi_task_free_cb); + + rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table, &desc_table_len); + /* First descriptor must be readable */ + if (spdk_unlikely(rc != 0 || vhost_vring_desc_is_wr(desc) || + desc->len < sizeof(struct virtio_scsi_cmd_req))) { + SPDK_WARNLOG("%s: invalid first request descriptor at index %"PRIu16".\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + *req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(**req)); + if (spdk_unlikely(*req == NULL)) { + SPDK_WARNLOG("%s: request descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + /* Each request must have at least 2 descriptors (e.g. request and response) */ + vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (desc == NULL) { + SPDK_WARNLOG("%s: descriptor chain at index %d contains neither payload nor response buffer.\n", + vsession->name, task->req_idx); + goto invalid_task; + } + task->scsi.dxfer_dir = vhost_vring_desc_is_wr(desc) ? SPDK_SCSI_DIR_FROM_DEV : + SPDK_SCSI_DIR_TO_DEV; + task->scsi.iovs = iovs; + + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + /* + * FROM_DEV (READ): [RD_req][WR_resp][WR_buf0]...[WR_bufN] + */ + task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto invalid_task; + } + rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid descriptor chain at request index %d (descriptor id overflow?).\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + if (desc == NULL) { + /* + * TEST UNIT READY command and some others might not contain any payload and this is not an error. + */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, + "No payload descriptors for FROM DEV command req_idx=%"PRIu16".\n", task->req_idx); + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "CDB=", (*req)->cdb, VIRTIO_SCSI_CDB_SIZE); + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + task->scsi.iovcnt = 1; + task->scsi.iovs[0].iov_len = 0; + task->scsi.length = 0; + task->scsi.transfer_len = 0; + return 0; + } + + /* All remaining descriptors are data. 
*/ + while (desc) { + if (spdk_unlikely(!vhost_vring_desc_is_wr(desc))) { + SPDK_WARNLOG("%s: FROM DEV cmd: descriptor nr %" PRIu16" in payload chain is read only.\n", + vsession->name, iovcnt); + goto invalid_task; + } + + if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid payload in descriptor chain starting at index %d.\n", + vsession->name, task->req_idx); + goto invalid_task; + } + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp) + len; + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "TO DEV"); + /* + * TO_DEV (WRITE):[RD_req][RD_buf0]...[RD_bufN][WR_resp] + * No need to check descriptor WR flag as this is done while setting scsi.dxfer_dir. + */ + + /* Process descriptors up to response. */ + while (!vhost_vring_desc_is_wr(desc)) { + if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(desc == NULL)) { + SPDK_WARNLOG("%s: TO_DEV cmd: no response descriptor.\n", vsession->name); + goto invalid_task; + } + } + + task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + } + + task->scsi.iovcnt = iovcnt; + task->scsi.length = len; + task->scsi.transfer_len = len; + return 0; + +invalid_task: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "%s: Invalid task at index %"PRIu16".\n", + vsession->name, task->req_idx); + return -1; +} + +static int +process_request(struct spdk_vhost_scsi_task *task) +{ + struct virtio_scsi_cmd_req *req; + int result; + + result = task_data_setup(task, &req); + if (result) { + return result; + } + + result = vhost_scsi_task_init_target(task, req->lun); + if (spdk_unlikely(result != 0)) { + task->resp->response = VIRTIO_SCSI_S_BAD_TARGET; + return -1; + } + + task->scsi.cdb = req->cdb; + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "request CDB", req->cdb, VIRTIO_SCSI_CDB_SIZE); + + if (spdk_unlikely(task->scsi.lun == NULL)) { + spdk_scsi_task_process_null_lun(&task->scsi); + task->resp->response = VIRTIO_SCSI_S_OK; + return 1; + } + + return 0; +} + +static void +process_scsi_task(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *vq, + uint16_t req_idx) +{ + struct spdk_vhost_scsi_task *task; + int result; + + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[req_idx]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + vsession->name, req_idx); + vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); + return; + } + + vsession->task_cnt++; + scsi_task_init(task); + + if (spdk_unlikely(vq->vring_idx == VIRTIO_SCSI_CONTROLQ)) { + process_ctrl_request(task); + } else { + result = process_request(task); + if (likely(result == 0)) { + task_submit(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d submitted ======\n", task, + task->req_idx); + } else if (result > 0) { + vhost_scsi_task_cpl(&task->scsi); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d finished early ======\n", task, + 
task->req_idx); + } else { + invalid_request(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d failed ======\n", task, + task->req_idx); + } + } +} + +static void +process_vq(struct spdk_vhost_scsi_session *svsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + + reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + assert(reqs_cnt <= 32); + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + vsession->name, reqs[i], vq->vring.size); + vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0); + continue; + } + + process_scsi_task(vsession, vq, reqs[i]); + } +} + +static int +vdev_mgmt_worker(void *arg) +{ + struct spdk_vhost_scsi_session *svsession = arg; + struct spdk_vhost_session *vsession = &svsession->vsession; + + process_removed_devs(svsession); + vhost_vq_used_signal(vsession, &vsession->virtqueue[VIRTIO_SCSI_EVENTQ]); + + process_vq(svsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]); + vhost_vq_used_signal(vsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]); + + return SPDK_POLLER_BUSY; +} + +static int +vdev_worker(void *arg) +{ + struct spdk_vhost_scsi_session *svsession = arg; + struct spdk_vhost_session *vsession = &svsession->vsession; + uint32_t q_idx; + + for (q_idx = VIRTIO_SCSI_REQUESTQ; q_idx < vsession->max_queues; q_idx++) { + process_vq(svsession, &vsession->virtqueue[q_idx]); + } + + vhost_session_used_signal(vsession); + + return SPDK_POLLER_BUSY; +} + +static struct spdk_vhost_scsi_dev * +to_scsi_dev(struct spdk_vhost_dev *ctrlr) +{ + if (ctrlr == NULL) { + return NULL; + } + + if (ctrlr->backend != &spdk_vhost_scsi_device_backend) { + SPDK_ERRLOG("%s: not a vhost-scsi device.\n", ctrlr->name); + return NULL; + } + + return SPDK_CONTAINEROF(ctrlr, struct spdk_vhost_scsi_dev, vdev); +} + +static struct spdk_vhost_scsi_session * +to_scsi_session(struct spdk_vhost_session *vsession) +{ + assert(vsession->vdev->backend == &spdk_vhost_scsi_device_backend); + return (struct spdk_vhost_scsi_session *)vsession; +} + +int +spdk_vhost_scsi_dev_construct(const char *name, const char *cpumask) +{ + struct spdk_vhost_scsi_dev *svdev = calloc(1, sizeof(*svdev)); + int rc; + + if (svdev == NULL) { + return -ENOMEM; + } + + svdev->vdev.virtio_features = SPDK_VHOST_SCSI_FEATURES; + svdev->vdev.disabled_features = SPDK_VHOST_SCSI_DISABLED_FEATURES; + + spdk_vhost_lock(); + rc = vhost_dev_register(&svdev->vdev, name, cpumask, + &spdk_vhost_scsi_device_backend); + + if (rc) { + free(svdev); + spdk_vhost_unlock(); + return rc; + } + + svdev->registered = true; + + spdk_vhost_unlock(); + return rc; +} + +static int +vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_scsi_dev *svdev = to_scsi_dev(vdev); + int rc, i; + + assert(svdev != NULL); + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + if (svdev->scsi_dev_state[i].dev) { + if (vdev->registered) { + SPDK_ERRLOG("%s: SCSI target %d is still present.\n", vdev->name, i); + return -EBUSY; + } + + rc = spdk_vhost_scsi_dev_remove_tgt(vdev, i, NULL, NULL); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to force-remove target %d\n", vdev->name, i); + return rc; + } + } + } + + rc = vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + 
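Illustrative aside (not part of the diff): task_data_setup() earlier in this file distinguishes the two descriptor-chain layouts its comments describe - FROM_DEV chains are [readable request][writable response][writable data...], TO_DEV chains are [readable request][readable data...][writable response]. The sketch below models the same classification over a plain array instead of a real vring; struct toy_desc and both helper functions are invented for illustration.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in for struct vring_desc: only the length and the device-writable
 * flag (VRING_DESC_F_WRITE) matter for the layout decision. */
struct toy_desc {
	uint32_t len;
	bool write;
};

/* FROM_DEV: [RD_req][WR_resp][WR_buf0]...[WR_bufN]. Sums the payload length
 * the way task->scsi.length is built; a chain with no data buffers is legal
 * (e.g. TEST UNIT READY). Returns -1 for a malformed chain. */
static int
from_dev_payload_len(const struct toy_desc *chain, size_t n, uint32_t *len)
{
	size_t i;

	if (n < 2 || chain[0].write || !chain[1].write) {
		return -1;
	}

	*len = 0;
	for (i = 2; i < n; i++) {
		if (!chain[i].write) {
			return -1;	/* data buffers must be writable */
		}
		*len += chain[i].len;
	}
	return 0;
}

/* TO_DEV: [RD_req][RD_buf0]...[RD_bufN][WR_resp] - readable data buffers,
 * closed by exactly one writable response descriptor. */
static int
to_dev_payload_len(const struct toy_desc *chain, size_t n, uint32_t *len)
{
	size_t i;

	if (n < 2 || chain[0].write) {
		return -1;
	}

	*len = 0;
	for (i = 1; i < n && !chain[i].write; i++) {
		*len += chain[i].len;
	}
	return (i == n - 1 && chain[i].write) ? 0 : -1;
}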
svdev->registered = false; + + if (svdev->ref == 0) { + free(svdev); + } + + return 0; +} + +struct spdk_scsi_dev * +spdk_vhost_scsi_dev_get_tgt(struct spdk_vhost_dev *vdev, uint8_t num) +{ + struct spdk_vhost_scsi_dev *svdev; + + assert(num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + svdev = to_scsi_dev(vdev); + assert(svdev != NULL); + if (svdev->scsi_dev_state[num].status != VHOST_SCSI_DEV_PRESENT) { + return NULL; + } + + assert(svdev->scsi_dev_state[num].dev != NULL); + return svdev->scsi_dev_state[num].dev; +} + +static void +vhost_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + const struct spdk_scsi_dev *scsi_dev; + unsigned scsi_dev_num; + + assert(lun != NULL); + assert(svdev != NULL); + scsi_dev = spdk_scsi_lun_get_dev(lun); + for (scsi_dev_num = 0; scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_dev_num++) { + if (svdev->scsi_dev_state[scsi_dev_num].dev == scsi_dev) { + break; + } + } + + if (scsi_dev_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + /* The entire device has been already removed. */ + return; + } + + /* remove entire device */ + spdk_vhost_scsi_dev_remove_tgt(&svdev->vdev, scsi_dev_num, NULL, NULL); +} + +static void +vhost_scsi_dev_add_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev, + struct spdk_vhost_scsi_dev, vdev); + struct spdk_scsi_dev_vhost_state *vhost_sdev; + + vhost_sdev = &svdev->scsi_dev_state[scsi_tgt_num]; + + /* All sessions have added the target */ + assert(vhost_sdev->status == VHOST_SCSI_DEV_ADDING); + vhost_sdev->status = VHOST_SCSI_DEV_PRESENT; + svdev->ref++; +} + +static int +vhost_scsi_session_add_tgt(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession; + struct spdk_scsi_dev_session_state *session_sdev = &svsession->scsi_dev_state[scsi_tgt_num]; + struct spdk_scsi_dev_vhost_state *vhost_sdev; + int rc; + + if (!vsession->started || session_sdev->dev != NULL) { + /* Nothing to do. */ + return 0; + } + + vhost_sdev = &svsession->svdev->scsi_dev_state[scsi_tgt_num]; + session_sdev->dev = vhost_sdev->dev; + session_sdev->status = VHOST_SCSI_DEV_PRESENT; + + rc = spdk_scsi_dev_allocate_io_channels(svsession->scsi_dev_state[scsi_tgt_num].dev); + if (rc != 0) { + SPDK_ERRLOG("%s: Couldn't allocate io channnel for SCSI target %u.\n", + vsession->name, scsi_tgt_num); + + /* unset the SCSI target so that all I/O to it will be rejected */ + session_sdev->dev = NULL; + /* Set status to EMPTY so that we won't reply with SCSI hotremove + * sense codes - the device hasn't ever been added. + */ + session_sdev->status = VHOST_SCSI_DEV_EMPTY; + + /* Return with no error. We'll continue allocating io_channels for + * other sessions on this device in hopes they succeed. The sessions + * that failed to allocate io_channels simply won't be able to + * detect the SCSI target, nor do any I/O to it. + */ + return 0; + } + + if (vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) { + eventq_enqueue(svsession, scsi_tgt_num, + VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN); + } else { + SPDK_NOTICELOG("%s: driver does not support hotplug. 
" + "Please restart it or perform a rescan.\n", + vsession->name); + } + + return 0; +} + +int +spdk_vhost_scsi_dev_add_tgt(struct spdk_vhost_dev *vdev, int scsi_tgt_num, + const char *bdev_name) +{ + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev_vhost_state *state; + char target_name[SPDK_SCSI_DEV_MAX_NAME]; + int lun_id_list[1]; + const char *bdev_names_list[1]; + + svdev = to_scsi_dev(vdev); + assert(svdev != NULL); + if (scsi_tgt_num < 0) { + for (scsi_tgt_num = 0; scsi_tgt_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_tgt_num++) { + if (svdev->scsi_dev_state[scsi_tgt_num].dev == NULL) { + break; + } + } + + if (scsi_tgt_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: all SCSI target slots are already in use.\n", vdev->name); + return -ENOSPC; + } + } else { + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: SCSI target number is too big (got %d, max %d)\n", + vdev->name, scsi_tgt_num, SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + return -EINVAL; + } + } + + if (bdev_name == NULL) { + SPDK_ERRLOG("No lun name specified\n"); + return -EINVAL; + } + + state = &svdev->scsi_dev_state[scsi_tgt_num]; + if (state->dev != NULL) { + SPDK_ERRLOG("%s: SCSI target %u already occupied\n", vdev->name, scsi_tgt_num); + return -EEXIST; + } + + /* + * At this stage only one LUN per target + */ + snprintf(target_name, sizeof(target_name), "Target %u", scsi_tgt_num); + lun_id_list[0] = 0; + bdev_names_list[0] = (char *)bdev_name; + + state->status = VHOST_SCSI_DEV_ADDING; + state->dev = spdk_scsi_dev_construct(target_name, bdev_names_list, lun_id_list, 1, + SPDK_SPC_PROTOCOL_IDENTIFIER_SAS, + vhost_scsi_lun_hotremove, svdev); + + if (state->dev == NULL) { + state->status = VHOST_SCSI_DEV_EMPTY; + SPDK_ERRLOG("%s: couldn't create SCSI target %u using bdev '%s'\n", + vdev->name, scsi_tgt_num, bdev_name); + return -EINVAL; + } + spdk_scsi_dev_add_port(state->dev, 0, "vhost"); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: added SCSI target %u using bdev '%s'\n", + vdev->name, scsi_tgt_num, bdev_name); + + vhost_dev_foreach_session(vdev, vhost_scsi_session_add_tgt, + vhost_scsi_dev_add_tgt_cpl_cb, + (void *)(uintptr_t)scsi_tgt_num); + return scsi_tgt_num; +} + +struct scsi_tgt_hotplug_ctx { + unsigned scsi_tgt_num; + bool async_fini; +}; + +static void +vhost_scsi_dev_remove_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *_ctx) +{ + struct scsi_tgt_hotplug_ctx *ctx = _ctx; + struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev, + struct spdk_vhost_scsi_dev, vdev); + + if (!ctx->async_fini) { + /* there aren't any active sessions, so remove the dev and exit */ + remove_scsi_tgt(svdev, ctx->scsi_tgt_num); + } + + free(ctx); +} + +static int +vhost_scsi_session_remove_tgt(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *_ctx) +{ + struct scsi_tgt_hotplug_ctx *ctx = _ctx; + unsigned scsi_tgt_num = ctx->scsi_tgt_num; + struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession; + struct spdk_scsi_dev_session_state *state = &svsession->scsi_dev_state[scsi_tgt_num]; + + if (!vsession->started || state->dev == NULL) { + /* Nothing to do */ + return 0; + } + + /* Mark the target for removal */ + assert(state->status == VHOST_SCSI_DEV_PRESENT); + state->status = VHOST_SCSI_DEV_REMOVING; + + /* Send a hotremove Virtio event */ + if (vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) { + eventq_enqueue(svsession, scsi_tgt_num, + VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED); + } + + /* Wait for the session's management 
poller to remove the target after + * all its pending I/O has finished. + */ + ctx->async_fini = true; + return 0; +} + +int +spdk_vhost_scsi_dev_remove_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num, + spdk_vhost_event_fn cb_fn, void *cb_arg) +{ + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev_vhost_state *scsi_dev_state; + struct scsi_tgt_hotplug_ctx *ctx; + + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: invalid SCSI target number %d\n", vdev->name, scsi_tgt_num); + return -EINVAL; + } + + svdev = to_scsi_dev(vdev); + assert(svdev != NULL); + scsi_dev_state = &svdev->scsi_dev_state[scsi_tgt_num]; + + if (scsi_dev_state->status != VHOST_SCSI_DEV_PRESENT) { + return -EBUSY; + } + + if (scsi_dev_state->dev == NULL || scsi_dev_state->status == VHOST_SCSI_DEV_ADDING) { + SPDK_ERRLOG("%s: SCSI target %u is not occupied\n", vdev->name, scsi_tgt_num); + return -ENODEV; + } + + assert(scsi_dev_state->status != VHOST_SCSI_DEV_EMPTY); + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("calloc failed\n"); + return -ENOMEM; + } + + ctx->scsi_tgt_num = scsi_tgt_num; + ctx->async_fini = false; + + scsi_dev_state->remove_cb = cb_fn; + scsi_dev_state->remove_ctx = cb_arg; + scsi_dev_state->status = VHOST_SCSI_DEV_REMOVING; + + vhost_dev_foreach_session(vdev, vhost_scsi_session_remove_tgt, + vhost_scsi_dev_remove_tgt_cpl_cb, ctx); + return 0; +} + +int +vhost_scsi_controller_construct(void) +{ + struct spdk_conf_section *sp = spdk_conf_first_section(NULL); + struct spdk_vhost_dev *vdev; + int i, dev_num; + unsigned ctrlr_num = 0; + char *bdev_name, *tgt_num_str; + char *cpumask; + char *name; + char *tgt = NULL; + + while (sp != NULL) { + if (!spdk_conf_section_match_prefix(sp, "VhostScsi")) { + sp = spdk_conf_next_section(sp); + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostScsi%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + + if (spdk_vhost_scsi_dev_construct(name, cpumask) < 0) { + return -1; + } + + vdev = spdk_vhost_dev_find(name); + assert(vdev); + + for (i = 0; ; i++) { + + tgt = spdk_conf_section_get_nval(sp, "Target", i); + if (tgt == NULL) { + break; + } + + tgt_num_str = spdk_conf_section_get_nmval(sp, "Target", i, 0); + if (tgt_num_str == NULL) { + SPDK_ERRLOG("%s: invalid or missing SCSI target number\n", name); + return -1; + } + + dev_num = (int)strtol(tgt_num_str, NULL, 10); + bdev_name = spdk_conf_section_get_nmval(sp, "Target", i, 1); + if (bdev_name == NULL) { + SPDK_ERRLOG("%s: invalid or missing bdev name for SCSI target %d\n", name, dev_num); + return -1; + } else if (spdk_conf_section_get_nmval(sp, "Target", i, 2)) { + SPDK_ERRLOG("%s: only one LUN per SCSI target is supported\n", name); + return -1; + } + + if (spdk_vhost_scsi_dev_add_tgt(vdev, dev_num, bdev_name) < 0) { + return -1; + } + } + + sp = spdk_conf_next_section(sp); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_scsi_session *svsession) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_scsi_session *svsession) +{ + struct spdk_vhost_session 
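Illustrative aside (not part of the diff): vhost_scsi_controller_construct() above consumes the legacy INI-style configuration - a numeric VhostScsi section suffix, Name, Cpumask, and repeated "Target <num> <bdev>" lines with a single LUN each. Under that parsing, a section would look roughly like the following; the controller and bdev names are made up.

[VhostScsi0]
  Name vhost.0
  Cpumask 0x1
  Target 0 Malloc0
  Target 1 Nvme0n1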
*vsession = &svsession->vsession; + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_scsi_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; + if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", + vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(svsession); + return -1; + } + vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_scsi_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vq->tasks == NULL) { + SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + vsession->name, task_cnt, i); + free_task_pool(svsession); + return -1; + } + + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[j]; + task->svsession = svsession; + task->vq = vq; + task->req_idx = j; + } + } + + return 0; +} + +static int +vhost_scsi_start_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + struct spdk_vhost_scsi_dev *svdev = svsession->svdev; + struct spdk_scsi_dev_vhost_state *state; + uint32_t i; + int rc; + + /* validate all I/O queues are in a contiguous index range */ + for (i = VIRTIO_SCSI_REQUESTQ; i < vsession->max_queues; i++) { + if (vsession->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(svsession); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); + goto out; + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + state = &svdev->scsi_dev_state[i]; + if (state->dev == NULL || state->status == VHOST_SCSI_DEV_REMOVING) { + continue; + } + + assert(svsession->scsi_dev_state[i].status == VHOST_SCSI_DEV_EMPTY); + svsession->scsi_dev_state[i].dev = state->dev; + svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_PRESENT; + rc = spdk_scsi_dev_allocate_io_channels(state->dev); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc io_channel for SCSI target %"PRIu32"\n", + vsession->name, i); + /* unset the SCSI target so that all I/O to it will be rejected */ + svsession->scsi_dev_state[i].dev = NULL; + /* set EMPTY state so that we won't reply with SCSI hotremove + * sense codes - the device hasn't ever been added. 
+ */ + svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_EMPTY; + continue; + } + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); + + svsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, svsession, 0); + if (vsession->virtqueue[VIRTIO_SCSI_CONTROLQ].vring.desc && + vsession->virtqueue[VIRTIO_SCSI_EVENTQ].vring.desc) { + svsession->mgmt_poller = SPDK_POLLER_REGISTER(vdev_mgmt_worker, svsession, + MGMT_POLL_PERIOD_US); + } +out: + vhost_session_start_done(vsession, rc); + return rc; +} + +static int +vhost_scsi_start(struct spdk_vhost_session *vsession) +{ + struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + struct spdk_vhost_scsi_dev *svdev; + + svdev = to_scsi_dev(vsession->vdev); + assert(svdev != NULL); + svsession->svdev = svdev; + + return vhost_session_send_event(vsession, vhost_scsi_start_cb, + 3, "start session"); +} + +static int +destroy_session_poller_cb(void *arg) +{ + struct spdk_vhost_scsi_session *svsession = arg; + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_scsi_dev_session_state *state; + uint32_t i; + + if (vsession->task_cnt > 0) { + return SPDK_POLLER_BUSY; + } + + if (spdk_vhost_trylock() != 0) { + return SPDK_POLLER_BUSY; + } + + for (i = 0; i < vsession->max_queues; i++) { + vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + enum spdk_scsi_dev_vhost_status prev_status; + + state = &svsession->scsi_dev_state[i]; + /* clear the REMOVED status so that we won't send hotremove events anymore */ + prev_status = state->status; + state->status = VHOST_SCSI_DEV_EMPTY; + if (state->dev == NULL) { + continue; + } + + spdk_scsi_dev_free_io_channels(state->dev); + + state->dev = NULL; + + if (prev_status == VHOST_SCSI_DEV_REMOVING) { + /* try to detach it globally */ + vhost_dev_foreach_session(vsession->vdev, + vhost_scsi_session_process_removed, + vhost_scsi_dev_process_removed_cpl_cb, + (void *)(uintptr_t)i); + } + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); + + free_task_pool(svsession); + + spdk_poller_unregister(&svsession->stop_poller); + vhost_session_stop_done(vsession, 0); + + spdk_vhost_unlock(); + return SPDK_POLLER_BUSY; +} + +static int +vhost_scsi_stop_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + + /* Stop receiving new I/O requests */ + spdk_poller_unregister(&svsession->requestq_poller); + + /* Stop receiving controlq requests, also stop processing the + * asynchronous hotremove events. All the remaining events + * will be finalized by the stop_poller below. + */ + spdk_poller_unregister(&svsession->mgmt_poller); + + /* Wait for all pending I/Os to complete, then process all the + * remaining hotremove events one last time. 
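Illustrative aside (not part of the diff): vhost_scsi_stop_cb() and destroy_session_poller_cb() above follow a drain-then-finalize pattern - stop producing new work first, then poll until the in-flight count reaches zero before releasing resources. A minimal model of that shape, with invented names (toy_session, toy_stop_tick) and without the locking the real code needs:

#include <stdbool.h>

struct toy_session {
	int task_cnt;		/* in-flight requests */
	bool stopped;		/* set once teardown is safe */
};

/* Called periodically once new submissions are already blocked; finalizes
 * the stop only after every outstanding request has completed. */
static void
toy_stop_tick(struct toy_session *s)
{
	if (s->task_cnt > 0) {
		return;		/* still draining */
	}
	s->stopped = true;	/* all I/O done - resources may be freed now */
}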
+ */ + svsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb, + svsession, 1000); + + return 0; +} + +static int +vhost_scsi_stop(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_scsi_stop_cb, + 3, "stop session"); +} + +static void +vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_scsi_dev *sdev; + struct spdk_scsi_lun *lun; + uint32_t dev_idx; + uint32_t lun_idx; + + assert(vdev != NULL); + spdk_json_write_named_array_begin(w, "scsi"); + for (dev_idx = 0; dev_idx < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; dev_idx++) { + sdev = spdk_vhost_scsi_dev_get_tgt(vdev, dev_idx); + if (!sdev) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_uint32(w, "scsi_dev_num", dev_idx); + + spdk_json_write_named_uint32(w, "id", spdk_scsi_dev_get_id(sdev)); + + spdk_json_write_named_string(w, "target_name", spdk_scsi_dev_get_name(sdev)); + + spdk_json_write_named_array_begin(w, "luns"); + + for (lun_idx = 0; lun_idx < SPDK_SCSI_DEV_MAX_LUN; lun_idx++) { + lun = spdk_scsi_dev_get_lun(sdev, lun_idx); + if (!lun) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "id", spdk_scsi_lun_get_id(lun)); + + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +static void +vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_scsi_dev *scsi_dev; + struct spdk_scsi_lun *lun; + uint32_t i; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_create_scsi_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_string(w, "cpumask", + spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + scsi_dev = spdk_vhost_scsi_dev_get_tgt(vdev, i); + if (scsi_dev == NULL) { + continue; + } + + lun = spdk_scsi_dev_get_lun(scsi_dev, 0); + assert(lun != NULL); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_scsi_controller_add_target"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_uint32(w, "scsi_target_num", i); + + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi", SPDK_LOG_VHOST_SCSI) +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_queue", SPDK_LOG_VHOST_SCSI_QUEUE) +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_data", SPDK_LOG_VHOST_SCSI_DATA) diff --git a/src/spdk/lib/virtio/Makefile b/src/spdk/lib/virtio/Makefile new file mode 100644 index 000000000..8ea173c3b --- /dev/null +++ b/src/spdk/lib/virtio/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += $(ENV_CFLAGS) +C_SRCS = virtio.c virtio_user.c virtio_pci.c vhost_user.c +LIBNAME = virtio + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_virtio.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/virtio/spdk_virtio.map b/src/spdk/lib/virtio/spdk_virtio.map new file mode 100644 index 000000000..76e02cff8 --- /dev/null +++ b/src/spdk/lib/virtio/spdk_virtio.map @@ -0,0 +1,33 @@ +{ + global: + + # internal functions in spdk_internal/virtio.h + virtio_recv_pkts; + virtqueue_req_start; + virtqueue_req_flush; + virtqueue_req_abort; + virtqueue_req_add_iovs; + virtio_dev_construct; + virtio_dev_reset; + virtio_dev_start; + virtio_dev_stop; + virtio_dev_destruct; + virtio_dev_acquire_queue; + virtio_dev_find_and_acquire_queue; + virtio_dev_queue_get_thread; + virtio_dev_queue_is_acquired; + virtio_dev_release_queue; + virtio_dev_get_status; + virtio_dev_set_status; + virtio_dev_write_dev_config; + virtio_dev_read_dev_config; + virtio_dev_backend_ops; + virtio_dev_has_feature; + virtio_dev_dump_json_info; + virtio_pci_dev_enumerate; + virtio_pci_dev_attach; + virtio_user_dev_init; + virtio_pci_dev_init; + + local: *; +}; diff --git a/src/spdk/lib/virtio/vhost_user.c b/src/spdk/lib/virtio/vhost_user.c new file mode 100644 index 000000000..b3da9d988 --- /dev/null +++ b/src/spdk/lib/virtio/vhost_user.c @@ -0,0 +1,489 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "vhost_user.h" + +#include "spdk/string.h" +#include "spdk_internal/vhost_user.h" + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + +static int +vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num) +{ + int r; + struct msghdr msgh; + struct iovec iov; + size_t fd_size = fd_num * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + struct cmsghdr *cmsg; + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = (uint8_t *)buf; + iov.iov_len = len; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fd_size); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + r = sendmsg(fd, &msgh, 0); + } while (r < 0 && errno == EINTR); + + if (r == -1) { + return -errno; + } + + return 0; +} + +static int +vhost_user_read(int fd, struct vhost_user_msg *msg) +{ + uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; + ssize_t ret; + size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload; + + ret = recv(fd, (void *)msg, sz_hdr, 0); + if ((size_t)ret != sz_hdr) { + SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n", + ret, sz_hdr); + if (ret == -1) { + return -errno; + } else { + return -EBUSY; + } + } + + /* validate msg flags */ + if (msg->flags != (valid_flags)) { + SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n", + msg->flags, valid_flags); + return -EIO; + } + + sz_payload = msg->size; + + if (sz_payload > VHOST_USER_PAYLOAD_SIZE) { + SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n", + sz_payload, VHOST_USER_PAYLOAD_SIZE); + return -EIO; + } + + if (sz_payload) { + ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0); + if ((size_t)ret != sz_payload) { + SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n", + ret, msg->size); + if (ret == -1) { + return -errno; + } else { + return -EBUSY; + } + } + } + + return 0; +} + +struct hugepage_file_info { + uint64_t addr; /**< virtual addr */ + size_t size; /**< the file size */ + char path[PATH_MAX]; /**< path to backing file */ +}; + +/* Two possible options: + * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file + * array. This is simple but cannot be used in secondary process because + * secondary process will close and munmap that file. + * 2. Match HUGEFILE_FMT to find hugepage files directly. + * + * We choose option 2. 
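Illustrative aside (not part of the diff): get_hugepage_file_info() below implements the "option 2" scan described in the comment - it walks /proc/self/maps, extracts each mapping's virtual address range, and keeps only mappings whose backing path looks like a hugepage file. A reduced sketch of the per-line parse, assuming paths without spaces and using a loose "map_" substring test in place of the stricter suffix check in the real code; struct toy_mapping and parse_maps_line are invented names.

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

struct toy_mapping {
	uint64_t start;
	uint64_t end;
	char path[256];
};

/* Parse one /proc/self/maps line: "start-end perms offset dev inode path".
 * Mappings without a path, or whose path does not look like a hugepage
 * backing file, are rejected. */
static int
parse_maps_line(const char *line, struct toy_mapping *m)
{
	if (sscanf(line, "%" SCNx64 "-%" SCNx64 " %*s %*s %*s %*s %255s",
		   &m->start, &m->end, m->path) != 3) {
		return -1;
	}
	return strstr(m->path, "map_") != NULL ? 0 : -1;
}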
+ */ +static int +get_hugepage_file_info(struct hugepage_file_info huges[], int max) +{ + int idx, rc; + FILE *f; + char buf[BUFSIZ], *tmp, *tail; + char *str_underline, *str_start; + int huge_index; + uint64_t v_start, v_end; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + SPDK_ERRLOG("cannot open /proc/self/maps\n"); + rc = -errno; + assert(rc < 0); /* scan-build hack */ + return rc; + } + + idx = 0; + while (fgets(buf, sizeof(buf), f) != NULL) { + if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) { + SPDK_ERRLOG("Failed to parse address\n"); + rc = -EIO; + goto out; + } + + tmp = strchr(buf, ' ') + 1; /** skip address */ + tmp = strchr(tmp, ' ') + 1; /** skip perm */ + tmp = strchr(tmp, ' ') + 1; /** skip offset */ + tmp = strchr(tmp, ' ') + 1; /** skip dev */ + tmp = strchr(tmp, ' ') + 1; /** skip inode */ + while (*tmp == ' ') { /** skip spaces */ + tmp++; + } + tail = strrchr(tmp, '\n'); /** remove newline if exists */ + if (tail) { + *tail = '\0'; + } + + /* Match HUGEFILE_FMT, aka "%s/%smap_%d", + * which is defined in eal_filesystem.h + */ + str_underline = strrchr(tmp, '_'); + if (!str_underline) { + continue; + } + + str_start = str_underline - strlen("map"); + if (str_start < tmp) { + continue; + } + + if (sscanf(str_start, "map_%d", &huge_index) != 1) { + continue; + } + + if (idx >= max) { + SPDK_ERRLOG("Exceed maximum of %d\n", max); + rc = -ENOSPC; + goto out; + } + + if (idx > 0 && + strncmp(tmp, huges[idx - 1].path, PATH_MAX) == 0 && + v_start == huges[idx - 1].addr + huges[idx - 1].size) { + huges[idx - 1].size += (v_end - v_start); + continue; + } + + huges[idx].addr = v_start; + huges[idx].size = v_end - v_start; + snprintf(huges[idx].path, PATH_MAX, "%s", tmp); + idx++; + } + + rc = idx; +out: + fclose(f); + return rc; +} + +static int +prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[]) +{ + int i, num; + struct hugepage_file_info huges[VHOST_USER_MEMORY_MAX_NREGIONS]; + + num = get_hugepage_file_info(huges, VHOST_USER_MEMORY_MAX_NREGIONS); + if (num < 0) { + SPDK_ERRLOG("Failed to prepare memory for vhost-user\n"); + return num; + } + + for (i = 0; i < num; ++i) { + /* the memory regions are unaligned */ + msg->payload.memory.regions[i].guest_phys_addr = huges[i].addr; /* use vaddr! 
*/ + msg->payload.memory.regions[i].userspace_addr = huges[i].addr; + msg->payload.memory.regions[i].memory_size = huges[i].size; + msg->payload.memory.regions[i].flags_padding = 0; + fds[i] = open(huges[i].path, O_RDWR); + } + + msg->payload.memory.nregions = num; + msg->payload.memory.padding = 0; + + return 0; +} + +static const char *const vhost_msg_strings[VHOST_USER_MAX] = { + [VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER", + [VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES", + [VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES", + [VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM", + [VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG", + [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG", +}; + +static int +vhost_user_sock(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg) +{ + struct vhost_user_msg msg; + struct vhost_vring_file *file = 0; + int need_reply = 0; + int fds[VHOST_USER_MEMORY_MAX_NREGIONS]; + int fd_num = 0; + int i, len, rc; + int vhostfd = dev->vhostfd; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_USER, "sent message %d = %s\n", req, vhost_msg_strings[req]); + + msg.request = req; + msg.flags = VHOST_USER_VERSION; + msg.size = 0; + + switch (req) { + case VHOST_USER_GET_FEATURES: + case VHOST_USER_GET_PROTOCOL_FEATURES: + case VHOST_USER_GET_QUEUE_NUM: + need_reply = 1; + break; + + case VHOST_USER_SET_FEATURES: + case VHOST_USER_SET_LOG_BASE: + case VHOST_USER_SET_PROTOCOL_FEATURES: + msg.payload.u64 = *((__u64 *)arg); + msg.size = sizeof(msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + case VHOST_USER_RESET_OWNER: + break; + + case VHOST_USER_SET_MEM_TABLE: + rc = prepare_vhost_memory_user(&msg, fds); + if (rc < 0) { + return rc; + } + fd_num = msg.payload.memory.nregions; + msg.size = sizeof(msg.payload.memory.nregions); + msg.size += sizeof(msg.payload.memory.padding); + msg.size += fd_num * sizeof(struct vhost_memory_region); + break; + + case VHOST_USER_SET_LOG_FD: + fds[fd_num++] = *((int *)arg); + break; + + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(msg.payload.state); + break; + + case VHOST_USER_GET_VRING_BASE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(msg.payload.state); + need_reply = 1; + break; + + case VHOST_USER_SET_VRING_ADDR: + memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr)); + msg.size = sizeof(msg.payload.addr); + break; + + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + file = arg; + msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK; + msg.size = sizeof(msg.payload.u64); + if (file->fd > 0) { + fds[fd_num++] = file->fd; + } else { + msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; + } + break; + + case VHOST_USER_GET_CONFIG: + 
memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg)); + msg.size = sizeof(msg.payload.cfg); + need_reply = 1; + break; + + case VHOST_USER_SET_CONFIG: + memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg)); + msg.size = sizeof(msg.payload.cfg); + break; + + default: + SPDK_ERRLOG("trying to send unknown msg\n"); + return -EINVAL; + } + + len = VHOST_USER_HDR_SIZE + msg.size; + rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num); + if (rc < 0) { + SPDK_ERRLOG("%s failed: %s\n", + vhost_msg_strings[req], spdk_strerror(-rc)); + return rc; + } + + if (req == VHOST_USER_SET_MEM_TABLE) + for (i = 0; i < fd_num; ++i) { + close(fds[i]); + } + + if (need_reply) { + rc = vhost_user_read(vhostfd, &msg); + if (rc < 0) { + SPDK_WARNLOG("Received msg failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + if (req != msg.request) { + SPDK_WARNLOG("Received unexpected msg type\n"); + return -EIO; + } + + switch (req) { + case VHOST_USER_GET_FEATURES: + case VHOST_USER_GET_PROTOCOL_FEATURES: + case VHOST_USER_GET_QUEUE_NUM: + if (msg.size != sizeof(msg.payload.u64)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + *((__u64 *)arg) = msg.payload.u64; + break; + case VHOST_USER_GET_VRING_BASE: + if (msg.size != sizeof(msg.payload.state)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + memcpy(arg, &msg.payload.state, + sizeof(struct vhost_vring_state)); + break; + case VHOST_USER_GET_CONFIG: + if (msg.size != sizeof(msg.payload.cfg)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg)); + break; + default: + SPDK_WARNLOG("Received unexpected msg type\n"); + return -EBADMSG; + } + } + + return 0; +} + +/** + * Set up environment to talk with a vhost user backend. + * + * @return + * - (-1) if fail; + * - (0) if succeed. + */ +static int +vhost_user_setup(struct virtio_user_dev *dev) +{ + int fd; + int flag; + struct sockaddr_un un; + ssize_t rc; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno)); + return -errno; + } + + flag = fcntl(fd, F_GETFD); + if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) { + SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno)); + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path); + if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) { + SPDK_ERRLOG("socket path too long\n"); + close(fd); + if (rc < 0) { + return -errno; + } else { + return -EINVAL; + } + } + if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno)); + close(fd); + return -errno; + } + + dev->vhostfd = fd; + return 0; +} + +struct virtio_user_backend_ops ops_user = { + .setup = vhost_user_setup, + .send_request = vhost_user_sock, +}; + +SPDK_LOG_REGISTER_COMPONENT("virtio_user", SPDK_LOG_VIRTIO_USER) diff --git a/src/spdk/lib/virtio/vhost_user.h b/src/spdk/lib/virtio/vhost_user.h new file mode 100644 index 000000000..0caf51ebc --- /dev/null +++ b/src/spdk/lib/virtio/vhost_user.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_H +#define _VHOST_H + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/virtio.h" +#include "spdk_internal/vhost_user.h" + +struct virtio_user_backend_ops; + +struct virtio_user_dev { + int vhostfd; + + int callfds[SPDK_VIRTIO_MAX_VIRTQUEUES]; + int kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES]; + uint32_t queue_size; + + uint8_t status; + char path[PATH_MAX]; + uint64_t protocol_features; + struct vring vrings[SPDK_VIRTIO_MAX_VIRTQUEUES]; + struct virtio_user_backend_ops *ops; + struct spdk_mem_map *mem_map; +}; + +struct virtio_user_backend_ops { + int (*setup)(struct virtio_user_dev *dev); + int (*send_request)(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg); +}; + +extern struct virtio_user_backend_ops ops_user; + +#endif diff --git a/src/spdk/lib/virtio/virtio.c b/src/spdk/lib/virtio/virtio.c new file mode 100644 index 000000000..03866040a --- /dev/null +++ b/src/spdk/lib/virtio/virtio.c @@ -0,0 +1,717 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk/barrier.h" + +#include "spdk_internal/virtio.h" + +/* We use SMP memory barrier variants as all virtio_pci devices + * are purely virtual. All MMIO is executed on a CPU core, so + * there's no need to do full MMIO synchronization. + */ +#define virtio_mb() spdk_smp_mb() +#define virtio_rmb() spdk_smp_rmb() +#define virtio_wmb() spdk_smp_wmb() + +/* Chain all the descriptors in the ring with an END */ +static inline void +vring_desc_init(struct vring_desc *dp, uint16_t n) +{ + uint16_t i; + + for (i = 0; i < n - 1; i++) { + dp[i].next = (uint16_t)(i + 1); + } + dp[i].next = VQ_RING_DESC_CHAIN_END; +} + +static void +virtio_init_vring(struct virtqueue *vq) +{ + int size = vq->vq_nentries; + struct vring *vr = &vq->vq_ring; + uint8_t *ring_mem = vq->vq_ring_virt_mem; + + /* + * Reinitialise since virtio port might have been stopped and restarted + */ + memset(ring_mem, 0, vq->vq_ring_size); + vring_init(vr, size, ring_mem, VIRTIO_PCI_VRING_ALIGN); + vq->vq_used_cons_idx = 0; + vq->vq_desc_head_idx = 0; + vq->vq_avail_idx = 0; + vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1); + vq->vq_free_cnt = vq->vq_nentries; + vq->req_start = VQ_RING_DESC_CHAIN_END; + vq->req_end = VQ_RING_DESC_CHAIN_END; + vq->reqs_finished = 0; + memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries); + + vring_desc_init(vr->desc, size); + + /* Tell the backend not to interrupt us. + * If F_EVENT_IDX is negotiated, we will always set incredibly high + * used event idx, so that we will practically never receive an + * interrupt. 
See virtqueue_req_flush() + */ + if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + vring_used_event(&vq->vq_ring) = UINT16_MAX; + } else { + vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + } +} + +static int +virtio_init_queue(struct virtio_dev *dev, uint16_t vtpci_queue_idx) +{ + unsigned int vq_size, size; + struct virtqueue *vq; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "setting up queue: %"PRIu16"\n", vtpci_queue_idx); + + /* + * Read the virtqueue size from the Queue Size field + * Always power of 2 and if 0 virtqueue does not exist + */ + vq_size = virtio_dev_backend_ops(dev)->get_queue_size(dev, vtpci_queue_idx); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq_size: %u\n", vq_size); + if (vq_size == 0) { + SPDK_ERRLOG("virtqueue %"PRIu16" does not exist\n", vtpci_queue_idx); + return -EINVAL; + } + + if (!spdk_u32_is_pow2(vq_size)) { + SPDK_ERRLOG("virtqueue %"PRIu16" size (%u) is not powerof 2\n", + vtpci_queue_idx, vq_size); + return -EINVAL; + } + + size = sizeof(*vq) + vq_size * sizeof(struct vq_desc_extra); + + if (posix_memalign((void **)&vq, SPDK_CACHE_LINE_SIZE, size)) { + SPDK_ERRLOG("can not allocate vq\n"); + return -ENOMEM; + } + memset(vq, 0, size); + dev->vqs[vtpci_queue_idx] = vq; + + vq->vdev = dev; + vq->vq_queue_index = vtpci_queue_idx; + vq->vq_nentries = vq_size; + + /* + * Reserve a memzone for vring elements + */ + size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN); + vq->vq_ring_size = SPDK_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vring_size: %u, rounded_vring_size: %u\n", + size, vq->vq_ring_size); + + vq->owner_thread = NULL; + + rc = virtio_dev_backend_ops(dev)->setup_queue(dev, vq); + if (rc < 0) { + SPDK_ERRLOG("setup_queue failed\n"); + free(vq); + dev->vqs[vtpci_queue_idx] = NULL; + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_mem: 0x%" PRIx64 "\n", + vq->vq_ring_mem); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_virt_mem: 0x%" PRIx64 "\n", + (uint64_t)(uintptr_t)vq->vq_ring_virt_mem); + + virtio_init_vring(vq); + return 0; +} + +static void +virtio_free_queues(struct virtio_dev *dev) +{ + uint16_t nr_vq = dev->max_queues; + struct virtqueue *vq; + uint16_t i; + + if (dev->vqs == NULL) { + return; + } + + for (i = 0; i < nr_vq; i++) { + vq = dev->vqs[i]; + if (!vq) { + continue; + } + + virtio_dev_backend_ops(dev)->del_queue(dev, vq); + + free(vq); + dev->vqs[i] = NULL; + } + + free(dev->vqs); + dev->vqs = NULL; +} + +static int +virtio_alloc_queues(struct virtio_dev *dev, uint16_t request_vq_num, uint16_t fixed_vq_num) +{ + uint16_t nr_vq; + uint16_t i; + int ret; + + nr_vq = request_vq_num + fixed_vq_num; + if (nr_vq == 0) { + /* perfectly fine to have a device with no virtqueues. */ + return 0; + } + + assert(dev->vqs == NULL); + dev->vqs = calloc(1, sizeof(struct virtqueue *) * nr_vq); + if (!dev->vqs) { + SPDK_ERRLOG("failed to allocate %"PRIu16" vqs\n", nr_vq); + return -ENOMEM; + } + + for (i = 0; i < nr_vq; i++) { + ret = virtio_init_queue(dev, i); + if (ret < 0) { + virtio_free_queues(dev); + return ret; + } + } + + dev->max_queues = nr_vq; + dev->fixed_queues_num = fixed_vq_num; + return 0; +} + +/** + * Negotiate virtio features. For virtio_user this will also set + * dev->modern flag if VIRTIO_F_VERSION_1 flag is negotiated. 
+ */ +static int +virtio_negotiate_features(struct virtio_dev *dev, uint64_t req_features) +{ + uint64_t host_features = virtio_dev_backend_ops(dev)->get_features(dev); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "guest features = %" PRIx64 "\n", req_features); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "device features = %" PRIx64 "\n", host_features); + + rc = virtio_dev_backend_ops(dev)->set_features(dev, req_features & host_features); + if (rc != 0) { + SPDK_ERRLOG("failed to negotiate device features.\n"); + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "negotiated features = %" PRIx64 "\n", + dev->negotiated_features); + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_FEATURES_OK)) { + SPDK_ERRLOG("failed to set FEATURES_OK status!\n"); + /* either the device failed, or we offered some features that + * depend on other, not offered features. + */ + return -EINVAL; + } + + return 0; +} + +int +virtio_dev_construct(struct virtio_dev *vdev, const char *name, + const struct virtio_dev_ops *ops, void *ctx) +{ + int rc; + + vdev->name = strdup(name); + if (vdev->name == NULL) { + return -ENOMEM; + } + + rc = pthread_mutex_init(&vdev->mutex, NULL); + if (rc != 0) { + free(vdev->name); + return -rc; + } + + vdev->backend_ops = ops; + vdev->ctx = ctx; + + return 0; +} + +int +virtio_dev_reset(struct virtio_dev *dev, uint64_t req_features) +{ + req_features |= (1ULL << VIRTIO_F_VERSION_1); + + virtio_dev_stop(dev); + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_ACKNOWLEDGE)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_ACKNOWLEDGE status.\n"); + return -EIO; + } + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_DRIVER); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_DRIVER)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER status.\n"); + return -EIO; + } + + return virtio_negotiate_features(dev, req_features); +} + +int +virtio_dev_start(struct virtio_dev *vdev, uint16_t max_queues, uint16_t fixed_queue_num) +{ + int ret; + + ret = virtio_alloc_queues(vdev, max_queues, fixed_queue_num); + if (ret < 0) { + return ret; + } + + virtio_dev_set_status(vdev, VIRTIO_CONFIG_S_DRIVER_OK); + if (!(virtio_dev_get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER_OK status.\n"); + return -1; + } + + return 0; +} + +void +virtio_dev_destruct(struct virtio_dev *dev) +{ + virtio_dev_backend_ops(dev)->destruct_dev(dev); + pthread_mutex_destroy(&dev->mutex); + free(dev->name); +} + +static void +vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx) +{ + struct vring_desc *dp, *dp_tail; + struct vq_desc_extra *dxp; + uint16_t desc_idx_last = desc_idx; + + dp = &vq->vq_ring.desc[desc_idx]; + dxp = &vq->vq_descx[desc_idx]; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs); + if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) { + while (dp->flags & VRING_DESC_F_NEXT) { + desc_idx_last = dp->next; + dp = &vq->vq_ring.desc[dp->next]; + } + } + dxp->ndescs = 0; + + /* + * We must append the existing free chain, if any, to the end of + * newly freed chain. If the virtqueue was completely used, then + * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above). 
+ */ + if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) { + vq->vq_desc_head_idx = desc_idx; + } else { + dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx]; + dp_tail->next = desc_idx; + } + + vq->vq_desc_tail_idx = desc_idx_last; + dp->next = VQ_RING_DESC_CHAIN_END; +} + +static uint16_t +virtqueue_dequeue_burst_rx(struct virtqueue *vq, void **rx_pkts, + uint32_t *len, uint16_t num) +{ + struct vring_used_elem *uep; + void *cookie; + uint16_t used_idx, desc_idx; + uint16_t i; + + /* Caller does the check */ + for (i = 0; i < num ; i++) { + used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1)); + uep = &vq->vq_ring.used->ring[used_idx]; + desc_idx = (uint16_t) uep->id; + len[i] = uep->len; + cookie = vq->vq_descx[desc_idx].cookie; + + if (spdk_unlikely(cookie == NULL)) { + SPDK_WARNLOG("vring descriptor with no mbuf cookie at %"PRIu16"\n", + vq->vq_used_cons_idx); + break; + } + + __builtin_prefetch(cookie); + + rx_pkts[i] = cookie; + vq->vq_used_cons_idx++; + vq_ring_free_chain(vq, desc_idx); + vq->vq_descx[desc_idx].cookie = NULL; + } + + return i; +} + +static void +finish_req(struct virtqueue *vq) +{ + struct vring_desc *desc; + uint16_t avail_idx; + + desc = &vq->vq_ring.desc[vq->req_end]; + desc->flags &= ~VRING_DESC_F_NEXT; + + /* + * Place the head of the descriptor chain into the next slot and make + * it usable to the host. The chain is made available now rather than + * deferring to virtqueue_req_flush() in the hopes that if the host is + * currently running on another CPU, we can keep it processing the new + * descriptor. + */ + avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1)); + vq->vq_ring.avail->ring[avail_idx] = vq->req_start; + vq->vq_avail_idx++; + vq->req_end = VQ_RING_DESC_CHAIN_END; + virtio_wmb(); + vq->vq_ring.avail->idx = vq->vq_avail_idx; + vq->reqs_finished++; +} + +int +virtqueue_req_start(struct virtqueue *vq, void *cookie, int iovcnt) +{ + struct vq_desc_extra *dxp; + + if (iovcnt > vq->vq_free_cnt) { + return iovcnt > vq->vq_nentries ? -EINVAL : -ENOMEM; + } + + if (vq->req_end != VQ_RING_DESC_CHAIN_END) { + finish_req(vq); + } + + vq->req_start = vq->vq_desc_head_idx; + dxp = &vq->vq_descx[vq->req_start]; + dxp->cookie = cookie; + dxp->ndescs = 0; + + return 0; +} + +void +virtqueue_req_flush(struct virtqueue *vq) +{ + uint16_t reqs_finished; + + if (vq->req_end == VQ_RING_DESC_CHAIN_END) { + /* no non-empty requests have been started */ + return; + } + + finish_req(vq); + virtio_mb(); + + reqs_finished = vq->reqs_finished; + vq->reqs_finished = 0; + + if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + /* Set used event idx to a value the device will never reach. + * This effectively disables interrupts. 
+ */ + vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx - vq->vq_nentries - 1; + + if (!vring_need_event(vring_avail_event(&vq->vq_ring), + vq->vq_avail_idx, + vq->vq_avail_idx - reqs_finished)) { + return; + } + } else if (vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) { + return; + } + + virtio_dev_backend_ops(vq->vdev)->notify_queue(vq->vdev, vq); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "Notified backend after xmit\n"); +} + +void +virtqueue_req_abort(struct virtqueue *vq) +{ + struct vring_desc *desc; + + if (vq->req_start == VQ_RING_DESC_CHAIN_END) { + /* no requests have been started */ + return; + } + + desc = &vq->vq_ring.desc[vq->req_end]; + desc->flags &= ~VRING_DESC_F_NEXT; + + vq_ring_free_chain(vq, vq->req_start); + vq->req_start = VQ_RING_DESC_CHAIN_END; +} + +void +virtqueue_req_add_iovs(struct virtqueue *vq, struct iovec *iovs, uint16_t iovcnt, + enum spdk_virtio_desc_type desc_type) +{ + struct vring_desc *desc; + struct vq_desc_extra *dxp; + uint16_t i, prev_head, new_head; + + assert(vq->req_start != VQ_RING_DESC_CHAIN_END); + assert(iovcnt <= vq->vq_free_cnt); + + /* TODO use indirect descriptors if iovcnt is high enough + * or the caller specifies SPDK_VIRTIO_DESC_F_INDIRECT + */ + + prev_head = vq->req_end; + new_head = vq->vq_desc_head_idx; + for (i = 0; i < iovcnt; ++i) { + desc = &vq->vq_ring.desc[new_head]; + + if (!vq->vdev->is_hw) { + desc->addr = (uintptr_t)iovs[i].iov_base; + } else { + desc->addr = spdk_vtophys(iovs[i].iov_base, NULL); + } + + desc->len = iovs[i].iov_len; + /* always set NEXT flag. unset it on the last descriptor + * in the request-ending function. + */ + desc->flags = desc_type | VRING_DESC_F_NEXT; + + prev_head = new_head; + new_head = desc->next; + } + + dxp = &vq->vq_descx[vq->req_start]; + dxp->ndescs += iovcnt; + + vq->req_end = prev_head; + vq->vq_desc_head_idx = new_head; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - iovcnt); + if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END) { + assert(vq->vq_free_cnt == 0); + vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END; + } +} + +#define DESC_PER_CACHELINE (SPDK_CACHE_LINE_SIZE / sizeof(struct vring_desc)) +uint16_t +virtio_recv_pkts(struct virtqueue *vq, void **io, uint32_t *len, uint16_t nb_pkts) +{ + uint16_t nb_used, num; + + nb_used = vq->vq_ring.used->idx - vq->vq_used_cons_idx; + virtio_rmb(); + + num = (uint16_t)(spdk_likely(nb_used <= nb_pkts) ? 
nb_used : nb_pkts); + if (spdk_likely(num > DESC_PER_CACHELINE)) { + num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE); + } + + return virtqueue_dequeue_burst_rx(vq, io, len, num); +} + +int +virtio_dev_acquire_queue(struct virtio_dev *vdev, uint16_t index) +{ + struct virtqueue *vq = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("requested vq index %"PRIu16" exceeds max queue count %"PRIu16".\n", + index, vdev->max_queues); + return -1; + } + + pthread_mutex_lock(&vdev->mutex); + vq = vdev->vqs[index]; + if (vq == NULL || vq->owner_thread != NULL) { + pthread_mutex_unlock(&vdev->mutex); + return -1; + } + + vq->owner_thread = spdk_get_thread(); + pthread_mutex_unlock(&vdev->mutex); + return 0; +} + +int32_t +virtio_dev_find_and_acquire_queue(struct virtio_dev *vdev, uint16_t start_index) +{ + struct virtqueue *vq = NULL; + uint16_t i; + + pthread_mutex_lock(&vdev->mutex); + for (i = start_index; i < vdev->max_queues; ++i) { + vq = vdev->vqs[i]; + if (vq != NULL && vq->owner_thread == NULL) { + break; + } + } + + if (vq == NULL || i == vdev->max_queues) { + SPDK_ERRLOG("no more unused virtio queues with idx >= %"PRIu16".\n", start_index); + pthread_mutex_unlock(&vdev->mutex); + return -1; + } + + vq->owner_thread = spdk_get_thread(); + pthread_mutex_unlock(&vdev->mutex); + return i; +} + +struct spdk_thread * +virtio_dev_queue_get_thread(struct virtio_dev *vdev, uint16_t index) +{ + struct spdk_thread *thread = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16"\n", + index, vdev->max_queues); + abort(); /* This is not recoverable */ + } + + pthread_mutex_lock(&vdev->mutex); + thread = vdev->vqs[index]->owner_thread; + pthread_mutex_unlock(&vdev->mutex); + + return thread; +} + +bool +virtio_dev_queue_is_acquired(struct virtio_dev *vdev, uint16_t index) +{ + return virtio_dev_queue_get_thread(vdev, index) != NULL; +} + +void +virtio_dev_release_queue(struct virtio_dev *vdev, uint16_t index) +{ + struct virtqueue *vq = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16".\n", + index, vdev->max_queues); + return; + } + + pthread_mutex_lock(&vdev->mutex); + vq = vdev->vqs[index]; + if (vq == NULL) { + SPDK_ERRLOG("virtqueue at index %"PRIu16" is not initialized.\n", index); + pthread_mutex_unlock(&vdev->mutex); + return; + } + + assert(vq->owner_thread == spdk_get_thread()); + vq->owner_thread = NULL; + pthread_mutex_unlock(&vdev->mutex); +} + +int +virtio_dev_read_dev_config(struct virtio_dev *dev, size_t offset, + void *dst, int length) +{ + return virtio_dev_backend_ops(dev)->read_dev_cfg(dev, offset, dst, length); +} + +int +virtio_dev_write_dev_config(struct virtio_dev *dev, size_t offset, + const void *src, int length) +{ + return virtio_dev_backend_ops(dev)->write_dev_cfg(dev, offset, src, length); +} + +void +virtio_dev_stop(struct virtio_dev *dev) +{ + virtio_dev_backend_ops(dev)->set_status(dev, VIRTIO_CONFIG_S_RESET); + /* flush status write */ + virtio_dev_backend_ops(dev)->get_status(dev); + virtio_free_queues(dev); +} + +void +virtio_dev_set_status(struct virtio_dev *dev, uint8_t status) +{ + if (status != VIRTIO_CONFIG_S_RESET) { + status |= virtio_dev_backend_ops(dev)->get_status(dev); + } + + virtio_dev_backend_ops(dev)->set_status(dev, status); +} + +uint8_t +virtio_dev_get_status(struct virtio_dev *dev) +{ + return virtio_dev_backend_ops(dev)->get_status(dev); +} + +const struct virtio_dev_ops * 
+virtio_dev_backend_ops(struct virtio_dev *dev) +{ + return dev->backend_ops; +} + +void +virtio_dev_dump_json_info(struct virtio_dev *hw, struct spdk_json_write_ctx *w) +{ + spdk_json_write_named_object_begin(w, "virtio"); + + spdk_json_write_named_uint32(w, "vq_count", hw->max_queues); + + spdk_json_write_named_uint32(w, "vq_size", + virtio_dev_backend_ops(hw)->get_queue_size(hw, 0)); + + virtio_dev_backend_ops(hw)->dump_json_info(hw, w); + + spdk_json_write_object_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_dev", SPDK_LOG_VIRTIO_DEV) diff --git a/src/spdk/lib/virtio/virtio_pci.c b/src/spdk/lib/virtio/virtio_pci.c new file mode 100644 index 000000000..646f77c1a --- /dev/null +++ b/src/spdk/lib/virtio/virtio_pci.c @@ -0,0 +1,599 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/memory.h" +#include "spdk/mmio.h" +#include "spdk/string.h" +#include "spdk/env.h" + +#include "spdk_internal/virtio.h" + +struct virtio_hw { + uint8_t use_msix; + uint32_t notify_off_multiplier; + uint8_t *isr; + uint16_t *notify_base; + + struct { + /** Mem-mapped resources from given PCI BAR */ + void *vaddr; + + /** Length of the address space */ + uint32_t len; + } pci_bar[6]; + + struct virtio_pci_common_cfg *common_cfg; + struct spdk_pci_device *pci_dev; + + /** Device-specific PCI config space */ + void *dev_cfg; +}; + +struct virtio_pci_probe_ctx { + virtio_pci_create_cb enum_cb; + void *enum_ctx; + uint16_t device_id; +}; + +/* + * Following macros are derived from linux/pci_regs.h, however, + * we can't simply include that header here, as there is no such + * file for non-Linux platform. + */ +#define PCI_CAPABILITY_LIST 0x34 +#define PCI_CAP_ID_VNDR 0x09 +#define PCI_CAP_ID_MSIX 0x11 + +static inline int +check_vq_phys_addr_ok(struct virtqueue *vq) +{ + /* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit, + * and only accepts 32 bit page frame number. 
+ * Check if the allocated physical memory exceeds 16TB. + */ + if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >> + (VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) { + SPDK_ERRLOG("vring address shouldn't be above 16TB!\n"); + return 0; + } + + return 1; +} + +static void +free_virtio_hw(struct virtio_hw *hw) +{ + unsigned i; + + for (i = 0; i < 6; ++i) { + if (hw->pci_bar[i].vaddr == NULL) { + continue; + } + + spdk_pci_device_unmap_bar(hw->pci_dev, i, hw->pci_bar[i].vaddr); + } + + free(hw); +} + +static void +pci_dump_json_info(struct virtio_dev *dev, struct spdk_json_write_ctx *w) +{ + struct virtio_hw *hw = dev->ctx; + struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr((struct spdk_pci_device *)hw->pci_dev); + char addr[32]; + + spdk_json_write_name(w, "type"); + if (dev->modern) { + spdk_json_write_string(w, "pci-modern"); + } else { + spdk_json_write_string(w, "pci-legacy"); + } + + spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr); + spdk_json_write_named_string(w, "pci_address", addr); +} + +static void +pci_write_json_config(struct virtio_dev *dev, struct spdk_json_write_ctx *w) +{ + struct virtio_hw *hw = dev->ctx; + struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(hw->pci_dev); + char addr[32]; + + spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr); + + spdk_json_write_named_string(w, "trtype", "pci"); + spdk_json_write_named_string(w, "traddr", addr); +} + +static inline void +io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi) +{ + spdk_mmio_write_4(lo, val & ((1ULL << 32) - 1)); + spdk_mmio_write_4(hi, val >> 32); +} + +static int +modern_read_dev_config(struct virtio_dev *dev, size_t offset, + void *dst, int length) +{ + struct virtio_hw *hw = dev->ctx; + int i; + uint8_t *p; + uint8_t old_gen, new_gen; + + do { + old_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation); + + p = dst; + for (i = 0; i < length; i++) { + *p++ = spdk_mmio_read_1((uint8_t *)hw->dev_cfg + offset + i); + } + + new_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation); + } while (old_gen != new_gen); + + return 0; +} + +static int +modern_write_dev_config(struct virtio_dev *dev, size_t offset, + const void *src, int length) +{ + struct virtio_hw *hw = dev->ctx; + int i; + const uint8_t *p = src; + + for (i = 0; i < length; i++) { + spdk_mmio_write_1(((uint8_t *)hw->dev_cfg) + offset + i, *p++); + } + + return 0; +} + +static uint64_t +modern_get_features(struct virtio_dev *dev) +{ + struct virtio_hw *hw = dev->ctx; + uint32_t features_lo, features_hi; + + spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 0); + features_lo = spdk_mmio_read_4(&hw->common_cfg->device_feature); + + spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 1); + features_hi = spdk_mmio_read_4(&hw->common_cfg->device_feature); + + return ((uint64_t)features_hi << 32) | features_lo; +} + +static int +modern_set_features(struct virtio_dev *dev, uint64_t features) +{ + struct virtio_hw *hw = dev->ctx; + + if ((features & (1ULL << VIRTIO_F_VERSION_1)) == 0) { + SPDK_ERRLOG("VIRTIO_F_VERSION_1 feature is not enabled.\n"); + return -EINVAL; + } + + spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 0); + spdk_mmio_write_4(&hw->common_cfg->guest_feature, features & ((1ULL << 32) - 1)); + + spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 1); + spdk_mmio_write_4(&hw->common_cfg->guest_feature, features >> 32); + + dev->negotiated_features = features; + + return 0; +} + +static void +modern_destruct_dev(struct virtio_dev *vdev) +{ + struct virtio_hw *hw = vdev->ctx; + struct spdk_pci_device 
*pci_dev = hw->pci_dev; + + free_virtio_hw(hw); + spdk_pci_device_detach(pci_dev); +} + +static uint8_t +modern_get_status(struct virtio_dev *dev) +{ + struct virtio_hw *hw = dev->ctx; + + return spdk_mmio_read_1(&hw->common_cfg->device_status); +} + +static void +modern_set_status(struct virtio_dev *dev, uint8_t status) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_1(&hw->common_cfg->device_status, status); +} + +static uint16_t +modern_get_queue_size(struct virtio_dev *dev, uint16_t queue_id) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_2(&hw->common_cfg->queue_select, queue_id); + return spdk_mmio_read_2(&hw->common_cfg->queue_size); +} + +static int +modern_setup_queue(struct virtio_dev *dev, struct virtqueue *vq) +{ + struct virtio_hw *hw = dev->ctx; + uint64_t desc_addr, avail_addr, used_addr; + uint16_t notify_off; + void *queue_mem; + uint64_t queue_mem_phys_addr; + + /* To ensure physical address contiguity we make the queue occupy + * only a single hugepage (2MB). As of Virtio 1.0, the queue size + * always falls within this limit. + */ + if (vq->vq_ring_size > VALUE_2MB) { + return -ENOMEM; + } + + queue_mem = spdk_zmalloc(vq->vq_ring_size, VALUE_2MB, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (queue_mem == NULL) { + return -ENOMEM; + } + + queue_mem_phys_addr = spdk_vtophys(queue_mem, NULL); + if (queue_mem_phys_addr == SPDK_VTOPHYS_ERROR) { + spdk_free(queue_mem); + return -EFAULT; + } + + vq->vq_ring_mem = queue_mem_phys_addr; + vq->vq_ring_virt_mem = queue_mem; + + if (!check_vq_phys_addr_ok(vq)) { + spdk_free(queue_mem); + return -ENOMEM; + } + + desc_addr = vq->vq_ring_mem; + avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc); + used_addr = (avail_addr + offsetof(struct vring_avail, ring[vq->vq_nentries]) + + VIRTIO_PCI_VRING_ALIGN - 1) & ~(VIRTIO_PCI_VRING_ALIGN - 1); + + spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index); + + io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + + notify_off = spdk_mmio_read_2(&hw->common_cfg->queue_notify_off); + vq->notify_addr = (void *)((uint8_t *)hw->notify_base + + notify_off * hw->notify_off_multiplier); + + spdk_mmio_write_2(&hw->common_cfg->queue_enable, 1); + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "queue %"PRIu16" addresses:\n", vq->vq_queue_index); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t desc_addr: %" PRIx64 "\n", desc_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t aval_addr: %" PRIx64 "\n", avail_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t used_addr: %" PRIx64 "\n", used_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t notify addr: %p (notify offset: %"PRIu16")\n", + vq->notify_addr, notify_off); + + return 0; +} + +static void +modern_del_queue(struct virtio_dev *dev, struct virtqueue *vq) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index); + + io_write64_twopart(0, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(0, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(0, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + + spdk_mmio_write_2(&hw->common_cfg->queue_enable, 0); + + spdk_free(vq->vq_ring_virt_mem); +} + +static void +modern_notify_queue(struct 
virtio_dev *dev, struct virtqueue *vq) +{ + spdk_mmio_write_2(vq->notify_addr, vq->vq_queue_index); +} + +static const struct virtio_dev_ops modern_ops = { + .read_dev_cfg = modern_read_dev_config, + .write_dev_cfg = modern_write_dev_config, + .get_status = modern_get_status, + .set_status = modern_set_status, + .get_features = modern_get_features, + .set_features = modern_set_features, + .destruct_dev = modern_destruct_dev, + .get_queue_size = modern_get_queue_size, + .setup_queue = modern_setup_queue, + .del_queue = modern_del_queue, + .notify_queue = modern_notify_queue, + .dump_json_info = pci_dump_json_info, + .write_json_config = pci_write_json_config, +}; + +static void * +get_cfg_addr(struct virtio_hw *hw, struct virtio_pci_cap *cap) +{ + uint8_t bar = cap->bar; + uint32_t length = cap->length; + uint32_t offset = cap->offset; + + if (bar > 5) { + SPDK_ERRLOG("invalid bar: %"PRIu8"\n", bar); + return NULL; + } + + if (offset + length < offset) { + SPDK_ERRLOG("offset(%"PRIu32") + length(%"PRIu32") overflows\n", + offset, length); + return NULL; + } + + if (offset + length > hw->pci_bar[bar].len) { + SPDK_ERRLOG("invalid cap: overflows bar space: %"PRIu32" > %"PRIu32"\n", + offset + length, hw->pci_bar[bar].len); + return NULL; + } + + if (hw->pci_bar[bar].vaddr == NULL) { + SPDK_ERRLOG("bar %"PRIu8" base addr is NULL\n", bar); + return NULL; + } + + return hw->pci_bar[bar].vaddr + offset; +} + +static int +virtio_read_caps(struct virtio_hw *hw) +{ + uint8_t pos; + struct virtio_pci_cap cap; + int ret; + + ret = spdk_pci_device_cfg_read(hw->pci_dev, &pos, 1, PCI_CAPABILITY_LIST); + if (ret < 0) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "failed to read pci capability list\n"); + return ret; + } + + while (pos) { + ret = spdk_pci_device_cfg_read(hw->pci_dev, &cap, sizeof(cap), pos); + if (ret < 0) { + SPDK_ERRLOG("failed to read pci cap at pos: %"PRIx8"\n", pos); + break; + } + + if (cap.cap_vndr == PCI_CAP_ID_MSIX) { + hw->use_msix = 1; + } + + if (cap.cap_vndr != PCI_CAP_ID_VNDR) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, + "[%2"PRIx8"] skipping non VNDR cap id: %02"PRIx8"\n", + pos, cap.cap_vndr); + goto next; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, + "[%2"PRIx8"] cfg type: %"PRIu8", bar: %"PRIu8", offset: %04"PRIx32", len: %"PRIu32"\n", + pos, cap.cfg_type, cap.bar, cap.offset, cap.length); + + switch (cap.cfg_type) { + case VIRTIO_PCI_CAP_COMMON_CFG: + hw->common_cfg = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_NOTIFY_CFG: + spdk_pci_device_cfg_read(hw->pci_dev, &hw->notify_off_multiplier, + 4, pos + sizeof(cap)); + hw->notify_base = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_DEVICE_CFG: + hw->dev_cfg = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_ISR_CFG: + hw->isr = get_cfg_addr(hw, &cap); + break; + } + +next: + pos = cap.cap_next; + } + + if (hw->common_cfg == NULL || hw->notify_base == NULL || + hw->dev_cfg == NULL || hw->isr == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "no modern virtio pci device found.\n"); + if (ret < 0) { + return ret; + } else { + return -EINVAL; + } + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "found modern virtio pci device.\n"); + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "common cfg mapped at: %p\n", hw->common_cfg); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "device cfg mapped at: %p\n", hw->dev_cfg); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "isr cfg mapped at: %p\n", hw->isr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "notify base: %p, notify off multiplier: %u\n", + hw->notify_base, hw->notify_off_multiplier); + + return 0; +} + 
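virtio_read_caps() above only discovers the modern-virtio vendor capabilities; it is the probe path that follows (virtio_pci_dev_probe(), driven by virtio_pci_dev_enumerate() or virtio_pci_dev_attach()) that maps the PCI BARs and hands the resulting virtio_pci_ctx to the caller's create callback. The sketch below shows one way such a callback could be wired up; it is illustrative only and not part of this diff. It assumes the callback shape int (*virtio_pci_create_cb)(struct virtio_pci_ctx *, void *) implied by the call in virtio_pci_dev_probe(), that the probe API is declared in spdk_internal/virtio.h, and it uses 0x1048 as an example device ID standing in for a modern virtio-scsi device.

/* Illustrative consumer of the virtio_pci probe API; not part of this diff.
 * Assumed: the virtio_pci_create_cb shape used by virtio_pci_dev_probe()
 * above, declarations coming from spdk_internal/virtio.h, and 0x1048 as an
 * example PCI device ID. For simplicity the sketch binds at most one device.
 */
#include "spdk/stdinc.h"

#include "spdk_internal/virtio.h"

static struct virtio_dev g_vdev;

static int
example_virtio_pci_create_cb(struct virtio_pci_ctx *pci_ctx, void *enum_ctx)
{
	struct virtio_dev *vdev = enum_ctx;
	int rc;

	/* Bind the mapped PCI device to a generic virtio_dev backed by modern_ops. */
	rc = virtio_pci_dev_init(vdev, "VirtioPci0", pci_ctx);
	if (rc != 0) {
		/* A non-zero return makes virtio_pci_dev_probe() free the PCI context. */
		return rc;
	}

	/* Reset the device and negotiate features; virtio_dev_reset() adds
	 * VIRTIO_F_VERSION_1 to the requested feature set itself.
	 */
	rc = virtio_dev_reset(vdev, 1ULL << VIRTIO_RING_F_EVENT_IDX);
	if (rc != 0) {
		return rc;
	}

	/* Allocate the virtqueues and set DRIVER_OK: 4 request queues, no fixed queues. */
	return virtio_dev_start(vdev, 4, 0);
}

static int
example_enumerate(void)
{
	/* Probe attached virtio PCI devices matching the example device ID. */
	return virtio_pci_dev_enumerate(example_virtio_pci_create_cb, &g_vdev, 0x1048);
}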
+static int +virtio_pci_dev_probe(struct spdk_pci_device *pci_dev, struct virtio_pci_probe_ctx *ctx) +{ + struct virtio_hw *hw; + uint8_t *bar_vaddr; + uint64_t bar_paddr, bar_len; + int rc; + unsigned i; + char bdf[32]; + struct spdk_pci_addr addr; + + addr = spdk_pci_device_get_addr(pci_dev); + rc = spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr); + if (rc != 0) { + SPDK_ERRLOG("Ignoring a device with non-parseable PCI address\n"); + return -1; + } + + hw = calloc(1, sizeof(*hw)); + if (hw == NULL) { + SPDK_ERRLOG("%s: calloc failed\n", bdf); + return -1; + } + + hw->pci_dev = pci_dev; + + for (i = 0; i < 6; ++i) { + rc = spdk_pci_device_map_bar(pci_dev, i, (void *) &bar_vaddr, &bar_paddr, + &bar_len); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to memmap PCI BAR %u\n", bdf, i); + free_virtio_hw(hw); + return -1; + } + + hw->pci_bar[i].vaddr = bar_vaddr; + hw->pci_bar[i].len = bar_len; + } + + /* Virtio PCI caps exist only on modern PCI devices. + * Legacy devices are not supported. + */ + if (virtio_read_caps(hw) != 0) { + SPDK_NOTICELOG("Ignoring legacy PCI device at %s\n", bdf); + free_virtio_hw(hw); + return -1; + } + + rc = ctx->enum_cb((struct virtio_pci_ctx *)hw, ctx->enum_ctx); + if (rc != 0) { + free_virtio_hw(hw); + } + + return rc; +} + +static int +virtio_pci_dev_probe_cb(void *probe_ctx, struct spdk_pci_device *pci_dev) +{ + struct virtio_pci_probe_ctx *ctx = probe_ctx; + uint16_t pci_device_id = spdk_pci_device_get_device_id(pci_dev); + + if (pci_device_id != ctx->device_id) { + return 1; + } + + return virtio_pci_dev_probe(pci_dev, ctx); +} + +int +virtio_pci_dev_enumerate(virtio_pci_create_cb enum_cb, void *enum_ctx, + uint16_t pci_device_id) +{ + struct virtio_pci_probe_ctx ctx; + + if (!spdk_process_is_primary()) { + SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n"); + return 0; + } + + ctx.enum_cb = enum_cb; + ctx.enum_ctx = enum_ctx; + ctx.device_id = pci_device_id; + + return spdk_pci_enumerate(spdk_pci_virtio_get_driver(), + virtio_pci_dev_probe_cb, &ctx); +} + +int +virtio_pci_dev_attach(virtio_pci_create_cb enum_cb, void *enum_ctx, + uint16_t pci_device_id, struct spdk_pci_addr *pci_address) +{ + struct virtio_pci_probe_ctx ctx; + + if (!spdk_process_is_primary()) { + SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n"); + return 0; + } + + ctx.enum_cb = enum_cb; + ctx.enum_ctx = enum_ctx; + ctx.device_id = pci_device_id; + + return spdk_pci_device_attach(spdk_pci_virtio_get_driver(), + virtio_pci_dev_probe_cb, &ctx, pci_address); +} + +int +virtio_pci_dev_init(struct virtio_dev *vdev, const char *name, + struct virtio_pci_ctx *pci_ctx) +{ + int rc; + + rc = virtio_dev_construct(vdev, name, &modern_ops, pci_ctx); + if (rc != 0) { + return rc; + } + + vdev->is_hw = 1; + vdev->modern = 1; + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_pci", SPDK_LOG_VIRTIO_PCI) diff --git a/src/spdk/lib/virtio/virtio_user.c b/src/spdk/lib/virtio/virtio_user.c new file mode 100644 index 000000000..4f4932db9 --- /dev/null +++ b/src/spdk/lib/virtio/virtio_user.c @@ -0,0 +1,628 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <sys/eventfd.h> + +#include "vhost_user.h" +#include "spdk/string.h" +#include "spdk/config.h" + +#include "spdk_internal/virtio.h" + +#define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \ + ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) + +static int +virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + + /* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_CALL come + * firstly because vhost depends on this msg to allocate virtqueue + * pair. + */ + struct vhost_vring_file file; + + file.index = queue_sel; + file.fd = dev->callfds[queue_sel]; + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_CALL, &file); +} + +static int +virtio_user_set_vring_addr(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vring *vring = &dev->vrings[queue_sel]; + struct vhost_vring_addr addr = { + .index = queue_sel, + .desc_user_addr = (uint64_t)(uintptr_t)vring->desc, + .avail_user_addr = (uint64_t)(uintptr_t)vring->avail, + .used_user_addr = (uint64_t)(uintptr_t)vring->used, + .log_guest_addr = 0, + .flags = 0, /* disable log */ + }; + + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_ADDR, &addr); +} + +static int +virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_file file; + struct vhost_vring_state state; + struct vring *vring = &dev->vrings[queue_sel]; + int rc; + + state.index = queue_sel; + state.num = vring->num; + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_NUM, &state); + if (rc < 0) { + return rc; + } + + state.index = queue_sel; + state.num = 0; /* no reservation */ + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_BASE, &state); + if (rc < 0) { + return rc; + } + + virtio_user_set_vring_addr(vdev, queue_sel); + + /* Of all per virtqueue MSGs, make sure VHOST_USER_SET_VRING_KICK comes + * lastly because vhost depends on this msg to judge if + * virtio is ready. 
+ */ + file.index = queue_sel; + file.fd = dev->kickfds[queue_sel]; + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_KICK, &file); +} + +static int +virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_state state; + + state.index = queue_sel; + state.num = 0; + + return dev->ops->send_request(dev, VHOST_USER_GET_VRING_BASE, &state); +} + +static int +virtio_user_queue_setup(struct virtio_dev *vdev, + int (*fn)(struct virtio_dev *, uint32_t)) +{ + uint32_t i; + int rc; + + for (i = 0; i < vdev->max_queues; ++i) { + rc = fn(vdev, i); + if (rc < 0) { + SPDK_ERRLOG("setup tx vq fails: %"PRIu32".\n", i); + return rc; + } + } + + return 0; +} + +static int +virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct virtio_dev *vdev = cb_ctx; + struct virtio_user_dev *dev = vdev->ctx; + uint64_t features; + int ret; + + /* We have to resend all mappings anyway, so don't bother with any + * page tracking. + */ + ret = dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL); + if (ret < 0) { + return ret; + } + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + /* Our internal rte_vhost lib requires SET_VRING_ADDR to flush a pending + * SET_MEM_TABLE. On the other hand, the upstream rte_vhost will invalidate + * the entire queue upon receiving SET_VRING_ADDR message, so we mustn't + * send it here. Both behaviors are strictly implementation specific, but + * this message isn't needed from the point of the spec, so send it only + * if vhost is compiled with our internal lib. + */ + ret = virtio_user_queue_setup(vdev, virtio_user_set_vring_addr); + if (ret < 0) { + return ret; + } +#endif + + /* Since we might want to use that mapping straight away, we have to + * make sure the guest has already processed our SET_MEM_TABLE message. + * F_REPLY_ACK is just a feature and the host is not obliged to + * support it, so we send a simple message that always has a response + * and we wait for that response. Messages are always processed in order. + */ + return dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features); +} + +static int +virtio_user_register_mem(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + const struct spdk_mem_map_ops virtio_user_map_ops = { + .notify_cb = virtio_user_map_notify, + .are_contiguous = NULL + }; + + dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev); + if (dev->mem_map == NULL) { + SPDK_ERRLOG("spdk_mem_map_alloc() failed\n"); + return -1; + } + + return 0; +} + +static void +virtio_user_unregister_mem(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + + spdk_mem_map_free(&dev->mem_map); +} + +static int +virtio_user_start_device(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint64_t host_max_queues; + int ret; + + if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 && + vdev->max_queues > 1 + vdev->fixed_queues_num) { + SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the " + "host doesn't support VHOST_USER_PROTOCOL_F_MQ. " + "Only one request queue will be used.\n", + vdev->name, vdev->max_queues - vdev->fixed_queues_num); + vdev->max_queues = 1 + vdev->fixed_queues_num; + } + + /* negotiate the number of I/O queues. 
*/ + ret = dev->ops->send_request(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues); + if (ret < 0) { + return ret; + } + + if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) { + SPDK_WARNLOG("%s: requested %"PRIu16" request queues" + "but only %"PRIu64" available\n", + vdev->name, vdev->max_queues - vdev->fixed_queues_num, + host_max_queues); + vdev->max_queues = host_max_queues; + } + + /* tell vhost to create queues */ + ret = virtio_user_queue_setup(vdev, virtio_user_create_queue); + if (ret < 0) { + return ret; + } + + ret = virtio_user_register_mem(vdev); + if (ret < 0) { + return ret; + } + + return virtio_user_queue_setup(vdev, virtio_user_kick_queue); +} + +static int +virtio_user_stop_device(struct virtio_dev *vdev) +{ + int ret; + + ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue); + /* a queue might fail to stop for various reasons, e.g. socket + * connection going down, but this mustn't prevent us from freeing + * the mem map. + */ + virtio_user_unregister_mem(vdev); + return ret; +} + +static int +virtio_user_dev_setup(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint16_t i; + + dev->vhostfd = -1; + + for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) { + dev->callfds[i] = -1; + dev->kickfds[i] = -1; + } + + dev->ops = &ops_user; + + return dev->ops->setup(dev); +} + +static int +virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset, + void *dst, int length) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_user_config cfg = {0}; + int rc; + + if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) { + return -ENOTSUP; + } + + cfg.offset = 0; + cfg.size = VHOST_USER_MAX_CONFIG_SIZE; + + rc = dev->ops->send_request(dev, VHOST_USER_GET_CONFIG, &cfg); + if (rc < 0) { + SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + memcpy(dst, cfg.region + offset, length); + return 0; +} + +static int +virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset, + const void *src, int length) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_user_config cfg = {0}; + int rc; + + if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) { + return -ENOTSUP; + } + + cfg.offset = offset; + cfg.size = length; + memcpy(cfg.region, src, length); + + rc = dev->ops->send_request(dev, VHOST_USER_SET_CONFIG, &cfg); + if (rc < 0) { + SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + return 0; +} + +static void +virtio_user_set_status(struct virtio_dev *vdev, uint8_t status) +{ + struct virtio_user_dev *dev = vdev->ctx; + int rc = 0; + + if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) && + status != VIRTIO_CONFIG_S_RESET) { + rc = -1; + } else if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + rc = virtio_user_start_device(vdev); + } else if (status == VIRTIO_CONFIG_S_RESET && + (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { + rc = virtio_user_stop_device(vdev); + } + + if (rc != 0) { + dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET; + } else { + dev->status = status; + } +} + +static uint8_t +virtio_user_get_status(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + + return dev->status; +} + +static uint64_t +virtio_user_get_features(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint64_t features; + int rc; + + rc = dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features); + if (rc < 0) { + SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc)); + return 0; + } + + 
return features; +} + +static int +virtio_user_set_features(struct virtio_dev *vdev, uint64_t features) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint64_t protocol_features; + int ret; + + ret = dev->ops->send_request(dev, VHOST_USER_SET_FEATURES, &features); + if (ret < 0) { + return ret; + } + + vdev->negotiated_features = features; + vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1); + + if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) { + /* nothing else to do */ + return 0; + } + + ret = dev->ops->send_request(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features); + if (ret < 0) { + return ret; + } + + protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES; + ret = dev->ops->send_request(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features); + if (ret < 0) { + return ret; + } + + dev->protocol_features = protocol_features; + return 0; +} + +static uint16_t +virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id) +{ + struct virtio_user_dev *dev = vdev->ctx; + + /* Currently each queue has same queue size */ + return dev->queue_size; +} + +static int +virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_state state; + uint16_t queue_idx = vq->vq_queue_index; + void *queue_mem; + uint64_t desc_addr, avail_addr, used_addr; + int callfd, kickfd, rc; + + if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) { + SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx); + return -EEXIST; + } + + /* May use invalid flag, but some backend uses kickfd and + * callfd as criteria to judge if dev is alive. so finally we + * use real event_fd. + */ + callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (callfd < 0) { + SPDK_ERRLOG("callfd error, %s\n", spdk_strerror(errno)); + return -errno; + } + + kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (kickfd < 0) { + SPDK_ERRLOG("kickfd error, %s\n", spdk_strerror(errno)); + close(callfd); + return -errno; + } + + queue_mem = spdk_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (queue_mem == NULL) { + close(kickfd); + close(callfd); + return -ENOMEM; + } + + vq->vq_ring_mem = SPDK_VTOPHYS_ERROR; + vq->vq_ring_virt_mem = queue_mem; + + state.index = vq->vq_queue_index; + state.num = 0; + + if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) { + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_ENABLE, &state); + if (rc < 0) { + SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n", + spdk_strerror(-rc)); + close(kickfd); + close(callfd); + spdk_free(queue_mem); + return -rc; + } + } + + dev->callfds[queue_idx] = callfd; + dev->kickfds[queue_idx] = kickfd; + + desc_addr = (uintptr_t)vq->vq_ring_virt_mem; + avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc); + used_addr = SPDK_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail, + ring[vq->vq_nentries]), + VIRTIO_PCI_VRING_ALIGN); + + dev->vrings[queue_idx].num = vq->vq_nentries; + dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr; + dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr; + dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr; + + return 0; +} + +static void +virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq) +{ + /* For legacy devices, write 0 to VIRTIO_PCI_QUEUE_PFN port, QEMU + * correspondingly stops the ioeventfds, and reset the status of + * the device. 
+ * For modern devices, set queue desc, avail, used in PCI bar to 0, + * not see any more behavior in QEMU. + * + * Here we just care about what information to deliver to vhost-user. + * So we just close ioeventfd for now. + */ + struct virtio_user_dev *dev = vdev->ctx; + + close(dev->callfds[vq->vq_queue_index]); + close(dev->kickfds[vq->vq_queue_index]); + dev->callfds[vq->vq_queue_index] = -1; + dev->kickfds[vq->vq_queue_index] = -1; + + spdk_free(vq->vq_ring_virt_mem); +} + +static void +virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq) +{ + uint64_t buf = 1; + struct virtio_user_dev *dev = vdev->ctx; + + if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) { + SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno)); + } +} + +static void +virtio_user_destroy(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + + close(dev->vhostfd); + free(dev); +} + +static void +virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct virtio_user_dev *dev = vdev->ctx; + + spdk_json_write_named_string(w, "type", "user"); + spdk_json_write_named_string(w, "socket", dev->path); +} + +static void +virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct virtio_user_dev *dev = vdev->ctx; + + spdk_json_write_named_string(w, "trtype", "user"); + spdk_json_write_named_string(w, "traddr", dev->path); + spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num); + spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0)); +} + +static const struct virtio_dev_ops virtio_user_ops = { + .read_dev_cfg = virtio_user_read_dev_config, + .write_dev_cfg = virtio_user_write_dev_config, + .get_status = virtio_user_get_status, + .set_status = virtio_user_set_status, + .get_features = virtio_user_get_features, + .set_features = virtio_user_set_features, + .destruct_dev = virtio_user_destroy, + .get_queue_size = virtio_user_get_queue_size, + .setup_queue = virtio_user_setup_queue, + .del_queue = virtio_user_del_queue, + .notify_queue = virtio_user_notify_queue, + .dump_json_info = virtio_user_dump_json_info, + .write_json_config = virtio_user_write_json_config, +}; + +int +virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path, + uint32_t queue_size) +{ + struct virtio_user_dev *dev; + int rc; + + if (name == NULL) { + SPDK_ERRLOG("No name gived for controller: %s\n", path); + return -EINVAL; + } + + dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + return -ENOMEM; + } + + rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev); + if (rc != 0) { + SPDK_ERRLOG("Failed to init device: %s\n", path); + free(dev); + return rc; + } + + vdev->is_hw = 0; + + snprintf(dev->path, PATH_MAX, "%s", path); + dev->queue_size = queue_size; + + rc = virtio_user_dev_setup(vdev); + if (rc < 0) { + SPDK_ERRLOG("backend set up fails\n"); + goto err; + } + + rc = dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL); + if (rc < 0) { + SPDK_ERRLOG("set_owner fails: %s\n", spdk_strerror(-rc)); + goto err; + } + + return 0; + +err: + virtio_dev_destruct(vdev); + return rc; +} diff --git a/src/spdk/lib/vmd/Makefile b/src/spdk/lib/vmd/Makefile new file mode 100644 index 000000000..13813c559 --- /dev/null +++ b/src/spdk/lib/vmd/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = vmd.c led.c +LIBNAME = vmd + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_vmd.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/vmd/led.c b/src/spdk/lib/vmd/led.c new file mode 100644 index 000000000..878983aab --- /dev/null +++ b/src/spdk/lib/vmd/led.c @@ -0,0 +1,166 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/likely.h" +#include "spdk/log.h" +#include "vmd.h" + +struct vmd_led_indicator_config { + uint8_t attention_indicator : 2; + uint8_t power_indicator : 2; + uint8_t reserved : 4; +}; + +/* + * VMD LED Attn Power LED Amber + * State Indicator Indicator + * Control Control + * ------------------------------------------------ + * Off 11b 11b Off + * Ident 11b 01b Blink 4Hz + * Fault 01b 11b On + * Rebuild 01b 01b Blink 1Hz + */ +static const struct vmd_led_indicator_config g_led_config[] = { + [SPDK_VMD_LED_STATE_OFF] = { .attention_indicator = 3, .power_indicator = 3 }, + [SPDK_VMD_LED_STATE_IDENTIFY] = { .attention_indicator = 3, .power_indicator = 1 }, + [SPDK_VMD_LED_STATE_FAULT] = { .attention_indicator = 1, .power_indicator = 3 }, + [SPDK_VMD_LED_STATE_REBUILD] = { .attention_indicator = 1, .power_indicator = 1 }, +}; + +static void +vmd_led_set_indicator_control(struct vmd_pci_device *vmd_device, enum spdk_vmd_led_state state) +{ + const struct vmd_led_indicator_config *config; + union express_slot_control_register slot_control; + + assert(state >= SPDK_VMD_LED_STATE_OFF && state <= SPDK_VMD_LED_STATE_REBUILD); + config = &g_led_config[state]; + + slot_control = vmd_device->pcie_cap->slot_control; + slot_control.bit_field.attention_indicator_control = config->attention_indicator; + slot_control.bit_field.power_indicator_control = config->power_indicator; + + /* + * Due to the fact that writes to the PCI config space are posted writes, we need to issue + * a read to the register we've just written to ensure it reached its destination. + * TODO: wrap all register writes with a function taking care of that. + */ + vmd_device->pcie_cap->slot_control = slot_control; + vmd_device->cached_slot_control = vmd_device->pcie_cap->slot_control; +} + +static unsigned int +vmd_led_get_state(struct vmd_pci_device *vmd_device) +{ + const struct vmd_led_indicator_config *config; + union express_slot_control_register slot_control; + unsigned int state; + + slot_control = vmd_device->cached_slot_control; + for (state = SPDK_VMD_LED_STATE_OFF; state <= SPDK_VMD_LED_STATE_REBUILD; ++state) { + config = &g_led_config[state]; + + if (slot_control.bit_field.attention_indicator_control == config->attention_indicator && + slot_control.bit_field.power_indicator_control == config->power_indicator) { + return state; + } + } + + return SPDK_VMD_LED_STATE_UNKNOWN; +} + +/* + * The identifying device under VMD is located in the global list of VMD controllers. If the BDF + * identifies an endpoint, then the LED is attached to the endpoint's parent. If the BDF identifies + * a type 1 header, then this device has the corresponding LED. This may arise when a user wants to + * identify a given empty slot under VMD. 
+ */ +static struct vmd_pci_device * +vmd_get_led_device(const struct spdk_pci_device *pci_device) +{ + struct vmd_pci_device *vmd_device; + + assert(strcmp(spdk_pci_device_get_type(pci_device), "vmd") == 0); + + vmd_device = vmd_find_device(&pci_device->addr); + if (spdk_unlikely(vmd_device == NULL)) { + return NULL; + } + + if (vmd_device->header_type == PCI_HEADER_TYPE_NORMAL) { + if (spdk_unlikely(vmd_device->parent == NULL)) { + return NULL; + } + + return vmd_device->parent->self; + } + + return vmd_device; +} + +int +spdk_vmd_set_led_state(struct spdk_pci_device *pci_device, enum spdk_vmd_led_state state) +{ + struct vmd_pci_device *vmd_device; + + if (state < SPDK_VMD_LED_STATE_OFF || state > SPDK_VMD_LED_STATE_REBUILD) { + SPDK_ERRLOG("Invalid LED state\n"); + return -EINVAL; + } + + vmd_device = vmd_get_led_device(pci_device); + if (spdk_unlikely(vmd_device == NULL)) { + SPDK_ERRLOG("The PCI device is not behind the VMD\n"); + return -ENODEV; + } + + vmd_led_set_indicator_control(vmd_device, state); + return 0; +} + +int +spdk_vmd_get_led_state(struct spdk_pci_device *pci_device, enum spdk_vmd_led_state *state) +{ + struct vmd_pci_device *vmd_device; + + vmd_device = vmd_get_led_device(pci_device); + if (spdk_unlikely(vmd_device == NULL)) { + SPDK_ERRLOG("The PCI device is not behind the VMD\n"); + return -ENODEV; + } + + *state = (enum spdk_vmd_led_state)vmd_led_get_state(vmd_device); + return 0; +} diff --git a/src/spdk/lib/vmd/spdk_vmd.map b/src/spdk/lib/vmd/spdk_vmd.map new file mode 100644 index 000000000..036d079b5 --- /dev/null +++ b/src/spdk/lib/vmd/spdk_vmd.map @@ -0,0 +1,13 @@ +{ + global: + + # public functions + spdk_vmd_init; + spdk_vmd_fini; + spdk_vmd_pci_device_list; + spdk_vmd_set_led_state; + spdk_vmd_get_led_state; + spdk_vmd_hotplug_monitor; + + local: *; +}; diff --git a/src/spdk/lib/vmd/vmd.c b/src/spdk/lib/vmd/vmd.c new file mode 100644 index 000000000..14d9558c2 --- /dev/null +++ b/src/spdk/lib/vmd/vmd.c @@ -0,0 +1,1376 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vmd.h" + +#include "spdk/stdinc.h" +#include "spdk/likely.h" + +static unsigned char *device_type[] = { + "PCI Express Endpoint", + "Legacy PCI Express Endpoint", + "Reserved 1", + "Reserved 2", + "Root Port of PCI Express Root Complex", + "Upstream Port of PCI Express Switch", + "Downstream Port of PCI Express Switch", + "PCI Express to PCI/PCI-X Bridge", + "PCI/PCI-X to PCI Express Bridge", + "Root Complex Integrated Endpoint", + "Root Complex Event Collector", + "Reserved Capability" +}; + +/* + * Container for all VMD adapter probed in the system. + */ +struct vmd_container { + uint32_t count; + struct vmd_adapter vmd[MAX_VMD_SUPPORTED]; +}; + +static struct vmd_container g_vmd_container; +static uint8_t g_end_device_count; + +static bool +vmd_is_valid_cfg_addr(struct vmd_pci_bus *bus, uint64_t addr) +{ + return addr >= (uint64_t)bus->vmd->cfg_vaddr && + addr < bus->vmd->cfgbar_size + (uint64_t)bus->vmd->cfg_vaddr; +} + +static void +vmd_align_base_addrs(struct vmd_adapter *vmd, uint32_t alignment) +{ + uint32_t pad; + + /* + * Device is not in hot plug path, align the base address remaining from membar 1. + */ + if (vmd->physical_addr & (alignment - 1)) { + pad = alignment - (vmd->physical_addr & (alignment - 1)); + vmd->physical_addr += pad; + vmd->current_addr_size -= pad; + } +} + +static bool +vmd_device_is_enumerated(const struct vmd_pci_device *vmd_device) +{ + return vmd_device->header->one.prefetch_base_upper == VMD_UPPER_BASE_SIGNATURE && + vmd_device->header->one.prefetch_limit_upper == VMD_UPPER_LIMIT_SIGNATURE; +} + +static bool +vmd_device_is_root_port(const struct vmd_pci_device *vmd_device) +{ + return vmd_device->header->common.vendor_id == 0x8086 && + (vmd_device->header->common.device_id == 0x2030 || + vmd_device->header->common.device_id == 0x2031 || + vmd_device->header->common.device_id == 0x2032 || + vmd_device->header->common.device_id == 0x2033); +} + +static void +vmd_hotplug_coalesce_regions(struct vmd_hot_plug *hp) +{ + struct pci_mem_mgr *region, *prev; + + do { + prev = NULL; + TAILQ_FOREACH(region, &hp->free_mem_queue, tailq) { + if (prev != NULL && (prev->addr + prev->size == region->addr)) { + break; + } + + prev = region; + } + + if (region != NULL) { + prev->size += region->size; + TAILQ_REMOVE(&hp->free_mem_queue, region, tailq); + TAILQ_INSERT_TAIL(&hp->unused_mem_queue, region, tailq); + } + } while (region != NULL); +} + +static void +vmd_hotplug_free_region(struct vmd_hot_plug *hp, struct pci_mem_mgr *region) +{ + struct pci_mem_mgr *current, *prev = NULL; + + assert(region->addr >= hp->bar.start && region->addr < hp->bar.start + hp->bar.size); + + TAILQ_FOREACH(current, &hp->free_mem_queue, tailq) { + if (current->addr > region->addr) { + break; + } + + prev = current; + } + + if (prev != NULL) { + assert(prev->addr + prev->size <= region->addr); + assert(current == NULL || (region->addr + region->size <= current->addr)); + TAILQ_INSERT_AFTER(&hp->free_mem_queue, prev, region, tailq); + } else { 
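+		/* No free region starts at a lower address, so the freed range becomes the new list head */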
+ TAILQ_INSERT_HEAD(&hp->free_mem_queue, region, tailq); + } + + vmd_hotplug_coalesce_regions(hp); +} + +static void +vmd_hotplug_free_addr(struct vmd_hot_plug *hp, uint64_t addr) +{ + struct pci_mem_mgr *region; + + TAILQ_FOREACH(region, &hp->alloc_mem_queue, tailq) { + if (region->addr == addr) { + break; + } + } + + assert(region != NULL); + TAILQ_REMOVE(&hp->alloc_mem_queue, region, tailq); + + vmd_hotplug_free_region(hp, region); +} + +static uint64_t +vmd_hotplug_allocate_base_addr(struct vmd_hot_plug *hp, uint32_t size) +{ + struct pci_mem_mgr *region = NULL, *free_region; + + TAILQ_FOREACH(region, &hp->free_mem_queue, tailq) { + if (region->size >= size) { + break; + } + } + + if (region == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Unable to find free hotplug memory region of size:" + "%"PRIx32"\n", size); + return 0; + } + + TAILQ_REMOVE(&hp->free_mem_queue, region, tailq); + if (size < region->size) { + free_region = TAILQ_FIRST(&hp->unused_mem_queue); + if (free_region == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Unable to find unused descriptor to store the " + "free region of size: %"PRIu32"\n", region->size - size); + } else { + TAILQ_REMOVE(&hp->unused_mem_queue, free_region, tailq); + free_region->size = region->size - size; + free_region->addr = region->addr + size; + region->size = size; + vmd_hotplug_free_region(hp, free_region); + } + } + + TAILQ_INSERT_TAIL(&hp->alloc_mem_queue, region, tailq); + + return region->addr; +} + +/* + * Allocates an address from vmd membar for the input memory size + * vmdAdapter - vmd adapter object + * dev - vmd_pci_device to allocate a base address for. + * size - size of the memory window requested. + * Size must be an integral multiple of 2. Addresses are returned on the size boundary. + * Returns physical address within the VMD membar window, or 0x0 if cannot allocate window. + * Consider increasing the size of vmd membar if 0x0 is returned. + */ +static uint64_t +vmd_allocate_base_addr(struct vmd_adapter *vmd, struct vmd_pci_device *dev, uint32_t size) +{ + uint64_t base_address = 0, padding = 0; + struct vmd_pci_bus *hp_bus; + + if (size && ((size & (~size + 1)) != size)) { + return base_address; + } + + /* + * If device is downstream of a hot plug port, allocate address from the + * range dedicated for the hot plug slot. Search the list of addresses allocated to determine + * if a free range exists that satisfy the input request. If a free range cannot be found, + * get a buffer from the unused chunk. First fit algorithm, is used. 
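+ * For example, a 256 KB request takes the first free region of at least 256 KB; if that region
+ * is larger, the remainder is split off and returned to the free list.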
+ */ + if (dev) { + hp_bus = dev->parent; + if (hp_bus && hp_bus->self && hp_bus->self->hotplug_capable) { + return vmd_hotplug_allocate_base_addr(&hp_bus->self->hp, size); + } + } + + /* Ensure physical membar allocated is size aligned */ + if (vmd->physical_addr & (size - 1)) { + padding = size - (vmd->physical_addr & (size - 1)); + } + + /* Allocate from membar if enough memory is left */ + if (vmd->current_addr_size >= size + padding) { + base_address = vmd->physical_addr + padding; + vmd->physical_addr += size + padding; + vmd->current_addr_size -= size + padding; + } + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "allocated(size) %lx (%x)\n", base_address, size); + + return base_address; +} + +static bool +vmd_is_end_device(struct vmd_pci_device *dev) +{ + return (dev && dev->header) && + ((dev->header->common.header_type & ~PCI_MULTI_FUNCTION) == PCI_HEADER_TYPE_NORMAL); +} + +static void +vmd_update_base_limit_register(struct vmd_pci_device *dev, uint16_t base, uint16_t limit) +{ + struct vmd_pci_bus *bus; + struct vmd_pci_device *bridge; + + if (base == 0 || limit == 0) { + return; + } + + if (dev->header->common.header_type == PCI_HEADER_TYPE_BRIDGE) { + bus = dev->bus_object; + } else { + bus = dev->parent; + } + + bridge = bus->self; + SPDK_DEBUGLOG(SPDK_LOG_VMD, "base:limit = %x:%x\n", bridge->header->one.mem_base, + bridge->header->one.mem_limit); + + if (dev->bus->vmd->scan_completed) { + return; + } + + while (bus && bus->self != NULL) { + bridge = bus->self; + + /* This is only for 32-bit memory space, need to revisit to support 64-bit */ + if (bridge->header->one.mem_base > base) { + bridge->header->one.mem_base = base; + base = bridge->header->one.mem_base; + } + + if (bridge->header->one.mem_limit < limit) { + bridge->header->one.mem_limit = limit; + limit = bridge->header->one.mem_limit; + } + + bus = bus->parent; + } +} + +static uint64_t +vmd_get_base_addr(struct vmd_pci_device *dev, uint32_t index, uint32_t size) +{ + struct vmd_pci_bus *bus = dev->parent; + + if (dev->header_type == PCI_HEADER_TYPE_BRIDGE) { + return dev->header->zero.BAR[index] & ~0xf; + } else { + if (bus->self->hotplug_capable) { + return vmd_hotplug_allocate_base_addr(&bus->self->hp, size); + } else { + return (uint64_t)bus->self->header->one.mem_base << 16; + } + } +} + +static bool +vmd_assign_base_addrs(struct vmd_pci_device *dev) +{ + uint16_t mem_base = 0, mem_limit = 0; + unsigned char mem_attr = 0; + int last; + struct vmd_adapter *vmd = NULL; + bool ret_val = false; + uint32_t bar_value; + uint32_t table_offset; + + if (dev && dev->bus) { + vmd = dev->bus->vmd; + } + + if (!vmd) { + return 0; + } + + vmd_align_base_addrs(vmd, ONE_MB); + + last = dev->header_type ? 
2 : 6; + for (int i = 0; i < last; i++) { + bar_value = dev->header->zero.BAR[i]; + dev->header->zero.BAR[i] = ~(0U); + dev->bar[i].size = dev->header->zero.BAR[i]; + dev->header->zero.BAR[i] = bar_value; + + if (dev->bar[i].size == ~(0U) || dev->bar[i].size == 0 || + dev->header->zero.BAR[i] & 1) { + dev->bar[i].size = 0; + continue; + } + mem_attr = dev->bar[i].size & PCI_BASE_ADDR_MASK; + dev->bar[i].size = TWOS_COMPLEMENT(dev->bar[i].size & PCI_BASE_ADDR_MASK); + + if (vmd->scan_completed) { + dev->bar[i].start = vmd_get_base_addr(dev, i, dev->bar[i].size); + } else { + dev->bar[i].start = vmd_allocate_base_addr(vmd, dev, dev->bar[i].size); + } + + dev->header->zero.BAR[i] = (uint32_t)dev->bar[i].start; + + if (!dev->bar[i].start) { + if (mem_attr == (PCI_BAR_MEMORY_PREFETCH | PCI_BAR_MEMORY_TYPE_64)) { + i++; + } + continue; + } + + dev->bar[i].vaddr = ((uint64_t)vmd->mem_vaddr + (dev->bar[i].start - vmd->membar)); + mem_limit = BRIDGE_BASEREG(dev->header->zero.BAR[i]) + + BRIDGE_BASEREG(dev->bar[i].size - 1); + if (!mem_base) { + mem_base = BRIDGE_BASEREG(dev->header->zero.BAR[i]); + } + + ret_val = true; + + if (mem_attr == (PCI_BAR_MEMORY_PREFETCH | PCI_BAR_MEMORY_TYPE_64)) { + i++; + if (i < last) { + dev->header->zero.BAR[i] = (uint32_t)(dev->bar[i].start >> PCI_DWORD_SHIFT); + } + } + } + + /* Enable device MEM and bus mastering */ + dev->header->zero.command |= (PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + uint16_t cmd = dev->header->zero.command; + cmd++; + + if (dev->msix_cap && ret_val) { + table_offset = ((volatile struct pci_msix_cap *)dev->msix_cap)->msix_table_offset; + if (dev->bar[table_offset & 0x3].vaddr) { + dev->msix_table = (volatile struct pci_msix_table_entry *) + (dev->bar[table_offset & 0x3].vaddr + (table_offset & 0xfff8)); + } + } + + if (ret_val && vmd_is_end_device(dev)) { + vmd_update_base_limit_register(dev, mem_base, mem_limit); + } + + return ret_val; +} + +static void +vmd_get_device_capabilities(struct vmd_pci_device *dev) + +{ + volatile uint8_t *config_space; + uint8_t capabilities_offset; + struct pci_capabilities_header *capabilities_hdr; + + config_space = (volatile uint8_t *)dev->header; + if ((dev->header->common.status & PCI_CAPABILITIES_LIST) == 0) { + return; + } + + capabilities_offset = dev->header->zero.cap_pointer; + if (dev->header->common.header_type & PCI_HEADER_TYPE_BRIDGE) { + capabilities_offset = dev->header->one.cap_pointer; + } + + while (capabilities_offset > 0) { + capabilities_hdr = (struct pci_capabilities_header *) + &config_space[capabilities_offset]; + switch (capabilities_hdr->capability_id) { + case CAPABILITY_ID_PCI_EXPRESS: + dev->pcie_cap = (volatile struct pci_express_cap *)(capabilities_hdr); + break; + + case CAPABILITY_ID_MSI: + dev->msi_cap = (volatile struct pci_msi_cap *)capabilities_hdr; + break; + + case CAPABILITY_ID_MSIX: + dev->msix_cap = (volatile struct pci_msix_capability *)capabilities_hdr; + dev->msix_table_size = dev->msix_cap->message_control.bit.table_size + 1; + break; + + default: + break; + } + capabilities_offset = capabilities_hdr->next; + } +} + +static volatile struct pci_enhanced_capability_header * +vmd_get_enhanced_capabilities(struct vmd_pci_device *dev, uint16_t capability_id) +{ + uint8_t *data; + uint16_t cap_offset = EXTENDED_CAPABILITY_OFFSET; + volatile struct pci_enhanced_capability_header *cap_hdr = NULL; + + data = (uint8_t *)dev->header; + while (cap_offset >= EXTENDED_CAPABILITY_OFFSET) { + cap_hdr = (volatile struct pci_enhanced_capability_header *) &data[cap_offset]; + if 
(cap_hdr->capability_id == capability_id) {
+ return cap_hdr;
+ }
+ cap_offset = cap_hdr->next;
+ if (cap_offset == 0 || cap_offset < EXTENDED_CAPABILITY_OFFSET) {
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+vmd_read_config_space(struct vmd_pci_device *dev)
+{
+ /*
+ * Writes to the pci config space are posted writes. To ensure the transaction reaches its destination
+ * before another write is posted, an immediate read of the written value should be performed.
+ */
+ dev->header->common.command |= (BUS_MASTER_ENABLE | MEMORY_SPACE_ENABLE);
+ { uint16_t cmd = dev->header->common.command; (void)cmd; }
+
+ vmd_get_device_capabilities(dev);
+ dev->sn_cap = (struct serial_number_capability *)vmd_get_enhanced_capabilities(dev,
+ DEVICE_SERIAL_NUMBER_CAP_ID);
+}
+
+static void
+vmd_update_scan_info(struct vmd_pci_device *dev)
+{
+ struct vmd_adapter *vmd_adapter = dev->bus->vmd;
+
+ if (vmd_adapter->root_port_updated) {
+ return;
+ }
+
+ if (dev->header_type == PCI_HEADER_TYPE_NORMAL) {
+ return;
+ }
+
+ if (vmd_device_is_root_port(dev)) {
+ vmd_adapter->root_port_updated = 1;
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "root_port_updated = %d\n",
+ vmd_adapter->root_port_updated);
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "upper:limit = %x : %x\n",
+ dev->header->one.prefetch_base_upper,
+ dev->header->one.prefetch_limit_upper);
+ if (vmd_device_is_enumerated(dev)) {
+ vmd_adapter->scan_completed = 1;
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "scan_completed = %d\n",
+ vmd_adapter->scan_completed);
+ }
+ }
+}
+
+static void
+vmd_reset_base_limit_registers(struct vmd_pci_device *dev)
+{
+ uint32_t reg __attribute__((unused));
+
+ assert(dev->header_type != PCI_HEADER_TYPE_NORMAL);
+ /*
+ * Writes to the pci config space are posted writes.
+ * To ensure the transaction reaches its destination
+ * before another write is posted, an immediate read
+ * of the written value should be performed.
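+ * The assignments to the otherwise-unused 'reg' variable below force that read-back after each write.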
+ */ + dev->header->one.mem_base = 0xfff0; + reg = dev->header->one.mem_base; + dev->header->one.mem_limit = 0x0; + reg = dev->header->one.mem_limit; + dev->header->one.prefetch_base = 0x0; + reg = dev->header->one.prefetch_base; + dev->header->one.prefetch_limit = 0x0; + reg = dev->header->one.prefetch_limit; + dev->header->one.prefetch_base_upper = 0x0; + reg = dev->header->one.prefetch_base_upper; + dev->header->one.prefetch_limit_upper = 0x0; + reg = dev->header->one.prefetch_limit_upper; + dev->header->one.io_base_upper = 0x0; + reg = dev->header->one.io_base_upper; + dev->header->one.io_limit_upper = 0x0; + reg = dev->header->one.io_limit_upper; + dev->header->one.primary = 0; + reg = dev->header->one.primary; + dev->header->one.secondary = 0; + reg = dev->header->one.secondary; + dev->header->one.subordinate = 0; + reg = dev->header->one.subordinate; +} + +static void +vmd_init_hotplug(struct vmd_pci_device *dev, struct vmd_pci_bus *bus) +{ + struct vmd_adapter *vmd = bus->vmd; + struct vmd_hot_plug *hp = &dev->hp; + size_t mem_id; + + dev->hotplug_capable = true; + hp->bar.size = 1 << 20; + + if (!vmd->scan_completed) { + hp->bar.start = vmd_allocate_base_addr(vmd, NULL, hp->bar.size); + bus->self->header->one.mem_base = BRIDGE_BASEREG(hp->bar.start); + bus->self->header->one.mem_limit = + bus->self->header->one.mem_base + BRIDGE_BASEREG(hp->bar.size - 1); + } else { + hp->bar.start = (uint64_t)bus->self->header->one.mem_base << 16; + } + + hp->bar.vaddr = (uint64_t)vmd->mem_vaddr + (hp->bar.start - vmd->membar); + + TAILQ_INIT(&hp->free_mem_queue); + TAILQ_INIT(&hp->unused_mem_queue); + TAILQ_INIT(&hp->alloc_mem_queue); + + hp->mem[0].size = hp->bar.size; + hp->mem[0].addr = hp->bar.start; + + TAILQ_INSERT_TAIL(&hp->free_mem_queue, &hp->mem[0], tailq); + + for (mem_id = 1; mem_id < ADDR_ELEM_COUNT; ++mem_id) { + TAILQ_INSERT_TAIL(&hp->unused_mem_queue, &hp->mem[mem_id], tailq); + } + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "%s: mem_base:mem_limit = %x : %x\n", __func__, + bus->self->header->one.mem_base, bus->self->header->one.mem_limit); +} + +static bool +vmd_bus_device_present(struct vmd_pci_bus *bus, uint32_t devfn) +{ + volatile struct pci_header *header; + + header = (volatile struct pci_header *)(bus->vmd->cfg_vaddr + + CONFIG_OFFSET_ADDR(bus->bus_number, devfn, 0, 0)); + if (!vmd_is_valid_cfg_addr(bus, (uint64_t)header)) { + return false; + } + + if (header->common.vendor_id == PCI_INVALID_VENDORID || header->common.vendor_id == 0x0) { + return false; + } + + return true; +} + +static struct vmd_pci_device * +vmd_alloc_dev(struct vmd_pci_bus *bus, uint32_t devfn) +{ + struct vmd_pci_device *dev = NULL; + struct pci_header volatile *header; + uint8_t header_type; + uint32_t rev_class; + + /* Make sure we're not creating two devices on the same dev/fn */ + TAILQ_FOREACH(dev, &bus->dev_list, tailq) { + if (dev->devfn == devfn) { + return NULL; + } + } + + if (!vmd_bus_device_present(bus, devfn)) { + return NULL; + } + + header = (struct pci_header * volatile)(bus->vmd->cfg_vaddr + + CONFIG_OFFSET_ADDR(bus->bus_number, devfn, 0, 0)); + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "PCI device found: %04x:%04x ***\n", + header->common.vendor_id, header->common.device_id); + + dev = calloc(1, sizeof(*dev)); + if (!dev) { + return NULL; + } + + dev->header = header; + dev->vid = dev->header->common.vendor_id; + dev->did = dev->header->common.device_id; + dev->bus = bus; + dev->parent = bus; + dev->devfn = devfn; + header_type = dev->header->common.header_type; + rev_class = dev->header->common.rev_class; 
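+ /* rev_class packs the 24-bit class code in its upper three bytes and the revision ID in the lowest byte */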
+ dev->class = rev_class >> 8; + dev->header_type = header_type & 0x7; + + if (header_type == PCI_HEADER_TYPE_BRIDGE) { + vmd_update_scan_info(dev); + if (!dev->bus->vmd->scan_completed) { + vmd_reset_base_limit_registers(dev); + } + } + + vmd_read_config_space(dev); + + return dev; +} + +static struct vmd_pci_bus * +vmd_create_new_bus(struct vmd_pci_bus *parent, struct vmd_pci_device *bridge, uint8_t bus_number) +{ + struct vmd_pci_bus *new_bus; + + new_bus = calloc(1, sizeof(*new_bus)); + if (!new_bus) { + return NULL; + } + + new_bus->parent = parent; + new_bus->domain = parent->domain; + new_bus->bus_number = bus_number; + new_bus->secondary_bus = new_bus->subordinate_bus = bus_number; + new_bus->self = bridge; + new_bus->vmd = parent->vmd; + TAILQ_INIT(&new_bus->dev_list); + + bridge->subordinate = new_bus; + + bridge->pci.addr.bus = new_bus->bus_number; + bridge->pci.addr.dev = bridge->devfn; + bridge->pci.addr.func = 0; + bridge->pci.addr.domain = parent->vmd->pci->addr.domain; + + return new_bus; +} + +/* + * Assigns a bus number from the list of available + * bus numbers. If the device is downstream of a hot plug port, + * assign the bus number from thiose assigned to the HP port. Otherwise, + * assign the next bus number from the vmd bus number list. + */ +static uint8_t +vmd_get_next_bus_number(struct vmd_pci_device *dev, struct vmd_adapter *vmd) +{ + uint8_t bus = 0xff; + struct vmd_pci_bus *hp_bus; + + if (dev) { + hp_bus = vmd_is_dev_in_hotplug_path(dev); + if (hp_bus && hp_bus->self && hp_bus->self->hotplug_capable) { + return vmd_hp_get_next_bus_number(&hp_bus->self->hp); + } + } + + /* Device is not under a hot plug path. Return next global bus number */ + if ((vmd->next_bus_number + 1) < vmd->max_pci_bus) { + bus = vmd->next_bus_number; + vmd->next_bus_number++; + } + return bus; +} + +static uint8_t +vmd_get_hotplug_bus_numbers(struct vmd_pci_device *dev) +{ + uint8_t bus_number = 0xff; + + if (dev && dev->bus && dev->bus->vmd && + ((dev->bus->vmd->next_bus_number + RESERVED_HOTPLUG_BUSES) < dev->bus->vmd->max_pci_bus)) { + bus_number = RESERVED_HOTPLUG_BUSES; + dev->bus->vmd->next_bus_number += RESERVED_HOTPLUG_BUSES; + } + + return bus_number; +} + +static void +vmd_enable_msix(struct vmd_pci_device *dev) +{ + volatile uint16_t control; + + control = dev->msix_cap->message_control.as_uint16_t | (1 << 14); + dev->msix_cap->message_control.as_uint16_t = control; + control = dev->msix_cap->message_control.as_uint16_t; + dev->msix_cap->message_control.as_uint16_t = (control | (1 << 15)); + control = dev->msix_cap->message_control.as_uint16_t; + control = control & ~(1 << 14); + dev->msix_cap->message_control.as_uint16_t = control; + control = dev->msix_cap->message_control.as_uint16_t; +} + +static void +vmd_disable_msix(struct vmd_pci_device *dev) +{ + volatile uint16_t control; + + control = dev->msix_cap->message_control.as_uint16_t | (1 << 14); + dev->msix_cap->message_control.as_uint16_t = control; + control = dev->msix_cap->message_control.as_uint16_t & ~(1 << 15); + dev->msix_cap->message_control.as_uint16_t = control; + control = dev->msix_cap->message_control.as_uint16_t; +} + +/* + * Set up MSI-X table entries for the port. Vmd MSIX vector 0 is used for + * port interrupt, so vector 0 is mapped to all MSIX entries for the port. 
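+ * vmd_setup_msix() below masks every table entry (vector_control = 1) before re-enabling MSI-X.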
+ */ +static void +vmd_setup_msix(struct vmd_pci_device *dev, volatile struct pci_msix_table_entry *vmdEntry) +{ + int entry; + + if (!dev || !vmdEntry || !dev->msix_cap) { + return; + } + + vmd_disable_msix(dev); + if (dev->msix_table == NULL || dev->msix_table_size > MAX_MSIX_TABLE_SIZE) { + return; + } + + for (entry = 0; entry < dev->msix_table_size; ++entry) { + dev->msix_table[entry].vector_control = 1; + } + vmd_enable_msix(dev); +} + +static void +vmd_bus_update_bridge_info(struct vmd_pci_device *bridge) +{ + /* Update the subordinate bus of all bridges above this bridge */ + volatile struct vmd_pci_device *dev = bridge; + uint8_t subordinate_bus; + + if (!dev) { + return; + } + subordinate_bus = bridge->header->one.subordinate; + while (dev->parent_bridge != NULL) { + dev = dev->parent_bridge; + if (dev->header->one.subordinate < subordinate_bus) { + dev->header->one.subordinate = subordinate_bus; + subordinate_bus = dev->header->one.subordinate; + } + } +} + +static bool +vmd_is_supported_device(struct vmd_pci_device *dev) +{ + return dev->class == PCI_CLASS_STORAGE_EXPRESS; +} + +static int +vmd_dev_map_bar(struct spdk_pci_device *pci_dev, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + struct vmd_pci_device *dev = SPDK_CONTAINEROF(pci_dev, struct vmd_pci_device, pci); + + *size = dev->bar[bar].size; + *phys_addr = dev->bar[bar].start; + *mapped_addr = (void *)dev->bar[bar].vaddr; + + return 0; +} + +static int +vmd_dev_unmap_bar(struct spdk_pci_device *_dev, uint32_t bar, void *addr) +{ + return 0; +} + +static int +vmd_dev_cfg_read(struct spdk_pci_device *_dev, void *value, uint32_t len, + uint32_t offset) +{ + struct vmd_pci_device *dev = SPDK_CONTAINEROF(_dev, struct vmd_pci_device, pci); + volatile uint8_t *src = (volatile uint8_t *)dev->header; + uint8_t *dst = value; + size_t i; + + if (len + offset > PCI_MAX_CFG_SIZE) { + return -1; + } + + for (i = 0; i < len; ++i) { + dst[i] = src[offset + i]; + } + + return 0; +} + +static int +vmd_dev_cfg_write(struct spdk_pci_device *_dev, void *value, + uint32_t len, uint32_t offset) +{ + struct vmd_pci_device *dev = SPDK_CONTAINEROF(_dev, struct vmd_pci_device, pci); + volatile uint8_t *dst = (volatile uint8_t *)dev->header; + uint8_t *src = value; + size_t i; + + if ((len + offset) > PCI_MAX_CFG_SIZE) { + return -1; + } + + for (i = 0; i < len; ++i) { + dst[offset + i] = src[i]; + } + + return 0; +} + +static void +vmd_dev_detach(struct spdk_pci_device *dev) +{ + struct vmd_pci_device *vmd_device = (struct vmd_pci_device *)dev; + struct vmd_pci_device *bus_device = vmd_device->bus->self; + struct vmd_pci_bus *bus = vmd_device->bus; + size_t i, num_bars = vmd_device->header_type ? 
2 : 6; + + spdk_pci_unhook_device(dev); + TAILQ_REMOVE(&bus->dev_list, vmd_device, tailq); + + /* Release the hotplug region if the device is under hotplug-capable bus */ + if (bus_device && bus_device->hotplug_capable) { + for (i = 0; i < num_bars; ++i) { + if (vmd_device->bar[i].start != 0) { + vmd_hotplug_free_addr(&bus_device->hp, vmd_device->bar[i].start); + } + } + } + + free(dev); +} + +static void +vmd_dev_init(struct vmd_pci_device *dev) +{ + uint8_t bdf[32]; + + dev->pci.addr.domain = dev->bus->vmd->domain; + dev->pci.addr.bus = dev->bus->bus_number; + dev->pci.addr.dev = dev->devfn; + dev->pci.addr.func = 0; + dev->pci.id.vendor_id = dev->header->common.vendor_id; + dev->pci.id.device_id = dev->header->common.device_id; + dev->pci.type = "vmd"; + dev->pci.map_bar = vmd_dev_map_bar; + dev->pci.unmap_bar = vmd_dev_unmap_bar; + dev->pci.cfg_read = vmd_dev_cfg_read; + dev->pci.cfg_write = vmd_dev_cfg_write; + dev->hotplug_capable = false; + if (dev->pcie_cap != NULL) { + dev->cached_slot_control = dev->pcie_cap->slot_control; + } + + if (vmd_is_supported_device(dev)) { + spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->pci.addr); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Initalizing NVMe device at %s\n", bdf); + dev->pci.parent = dev->bus->vmd->pci; + spdk_pci_hook_device(spdk_pci_nvme_get_driver(), &dev->pci); + } +} + +/* + * Scans a single bus for all devices attached and return a count of + * how many devices found. In the VMD topology, it is assume there are no multi- + * function devices. Hence a bus(bridge) will not have multi function with both type + * 0 and 1 header. + * + * The other option for implementing this function is the bus is an int and + * create a new device PciBridge. PciBridge would inherit from PciDevice with extra fields, + * sub/pri/sec bus. The input becomes PciPort, bus number and parent_bridge. + * + * The bus number is scanned and if a device is found, based on the header_type, create + * either PciBridge(1) or PciDevice(0). + * + * If a PciBridge, assign bus numbers and rescan new bus. The currenty PciBridge being + * scanned becomes the passed in parent_bridge with the new bus number. + * + * The linked list becomes list of pciBridges with PciDevices attached. + * + * Return count of how many devices found(type1 + type 0 header devices) + */ +static uint8_t +vmd_scan_single_bus(struct vmd_pci_bus *bus, struct vmd_pci_device *parent_bridge) +{ + /* assuming only single function devices are on the bus */ + struct vmd_pci_device *new_dev; + struct vmd_adapter *vmd; + union express_slot_capabilities_register slot_cap; + struct vmd_pci_bus *new_bus; + uint8_t device_number, dev_cnt = 0; + uint8_t new_bus_num; + + for (device_number = 0; device_number < 32; device_number++) { + new_dev = vmd_alloc_dev(bus, device_number); + if (new_dev == NULL) { + continue; + } + + dev_cnt++; + if (new_dev->header->common.header_type & PCI_HEADER_TYPE_BRIDGE) { + slot_cap.as_uint32_t = 0; + if (new_dev->pcie_cap != NULL) { + slot_cap.as_uint32_t = new_dev->pcie_cap->slot_cap.as_uint32_t; + } + + new_bus_num = vmd_get_next_bus_number(bus->vmd->is_hotplug_scan ? 
new_dev : NULL, bus->vmd); + if (new_bus_num == 0xff) { + free(new_dev); + return dev_cnt; + } + new_bus = vmd_create_new_bus(bus, new_dev, new_bus_num); + if (!new_bus) { + free(new_dev); + return dev_cnt; + } + new_bus->primary_bus = bus->secondary_bus; + new_bus->self = new_dev; + new_dev->bus_object = new_bus; + + if (slot_cap.bit_field.hotplug_capable && new_dev->pcie_cap != NULL && + new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented) { + new_bus->hotplug_buses = vmd_get_hotplug_bus_numbers(new_dev); + new_bus->subordinate_bus += new_bus->hotplug_buses; + + /* Attach hot plug instance if HP is supported */ + /* Hot inserted SSDs can be assigned port bus of sub-ordinate + 1 */ + SPDK_DEBUGLOG(SPDK_LOG_VMD, "hotplug_capable/slot_implemented = " + "%x:%x\n", slot_cap.bit_field.hotplug_capable, + new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented); + } + + new_dev->parent_bridge = parent_bridge; + new_dev->header->one.primary = new_bus->primary_bus; + new_dev->header->one.secondary = new_bus->secondary_bus; + new_dev->header->one.subordinate = new_bus->subordinate_bus; + + vmd_bus_update_bridge_info(new_dev); + TAILQ_INSERT_TAIL(&bus->vmd->bus_list, new_bus, tailq); + + vmd_dev_init(new_dev); + + if (slot_cap.bit_field.hotplug_capable && new_dev->pcie_cap != NULL && + new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented) { + vmd_init_hotplug(new_dev, new_bus); + } + + dev_cnt += vmd_scan_single_bus(new_bus, new_dev); + if (new_dev->pcie_cap != NULL) { + if (new_dev->pcie_cap->express_cap_register.bit_field.device_type == SwitchUpstreamPort) { + return dev_cnt; + } + } + } else { + /* Attach the device to the current bus and assign base addresses */ + TAILQ_INSERT_TAIL(&bus->dev_list, new_dev, tailq); + g_end_device_count++; + if (vmd_assign_base_addrs(new_dev)) { + vmd_setup_msix(new_dev, &bus->vmd->msix_table[0]); + vmd_dev_init(new_dev); + if (vmd_is_supported_device(new_dev)) { + vmd = bus->vmd; + vmd->target[vmd->nvme_count] = new_dev; + vmd->nvme_count++; + } + } else { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Removing failed device:%p\n", new_dev); + TAILQ_REMOVE(&bus->dev_list, new_dev, tailq); + free(new_dev); + if (dev_cnt) { + dev_cnt--; + } + } + } + } + + return dev_cnt; +} + +static void +vmd_print_pci_info(struct vmd_pci_device *dev) +{ + if (!dev) { + return; + } + + if (dev->pcie_cap != NULL) { + SPDK_INFOLOG(SPDK_LOG_VMD, "PCI DEVICE: [%04X:%04X] type(%x) : %s\n", + dev->header->common.vendor_id, dev->header->common.device_id, + dev->pcie_cap->express_cap_register.bit_field.device_type, + device_type[dev->pcie_cap->express_cap_register.bit_field.device_type]); + } else { + SPDK_INFOLOG(SPDK_LOG_VMD, "PCI DEVICE: [%04X:%04X]\n", + dev->header->common.vendor_id, dev->header->common.device_id); + } + + SPDK_INFOLOG(SPDK_LOG_VMD, "\tDOMAIN:BDF: %04x:%02x:%02x:%x\n", dev->pci.addr.domain, + dev->pci.addr.bus, dev->pci.addr.dev, dev->pci.addr.func); + + if (!(dev->header_type & PCI_HEADER_TYPE_BRIDGE) && dev->bus) { + SPDK_INFOLOG(SPDK_LOG_VMD, "\tbase addr: %x : %p\n", + dev->header->zero.BAR[0], (void *)dev->bar[0].vaddr); + } + + if ((dev->header_type & PCI_HEADER_TYPE_BRIDGE)) { + SPDK_INFOLOG(SPDK_LOG_VMD, "\tPrimary = %d, Secondary = %d, Subordinate = %d\n", + dev->header->one.primary, dev->header->one.secondary, dev->header->one.subordinate); + if (dev->pcie_cap && dev->pcie_cap->express_cap_register.bit_field.slot_implemented) { + SPDK_INFOLOG(SPDK_LOG_VMD, "\tSlot implemented on this device.\n"); + if 
(dev->pcie_cap->slot_cap.bit_field.hotplug_capable) { + SPDK_INFOLOG(SPDK_LOG_VMD, "Device has HOT-PLUG capable slot.\n"); + } + } + } + + if (dev->sn_cap != NULL) { + uint8_t *snLow = (uint8_t *)&dev->sn_cap->sn_low; + uint8_t *snHi = (uint8_t *)&dev->sn_cap->sn_hi; + + SPDK_INFOLOG(SPDK_LOG_VMD, "\tSN: %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x\n", + snHi[3], snHi[2], snHi[1], snHi[0], snLow[3], snLow[2], snLow[1], snLow[0]); + } +} + +static void +vmd_cache_scan_info(struct vmd_pci_device *dev) +{ + uint32_t reg __attribute__((unused)); + + if (dev->header_type == PCI_HEADER_TYPE_NORMAL) { + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "vendor/device id:%x:%x\n", dev->header->common.vendor_id, + dev->header->common.device_id); + + if (vmd_device_is_root_port(dev)) { + dev->header->one.prefetch_base_upper = VMD_UPPER_BASE_SIGNATURE; + reg = dev->header->one.prefetch_base_upper; + dev->header->one.prefetch_limit_upper = VMD_UPPER_LIMIT_SIGNATURE; + reg = dev->header->one.prefetch_limit_upper; + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "prefetch: %x:%x\n", + dev->header->one.prefetch_base_upper, + dev->header->one.prefetch_limit_upper); + } +} + +static uint8_t +vmd_scan_pcibus(struct vmd_pci_bus *bus) +{ + struct vmd_pci_bus *bus_entry; + struct vmd_pci_device *dev; + uint8_t dev_cnt; + + g_end_device_count = 0; + TAILQ_INSERT_TAIL(&bus->vmd->bus_list, bus, tailq); + bus->vmd->next_bus_number = bus->bus_number + 1; + dev_cnt = vmd_scan_single_bus(bus, NULL); + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "VMD scan found %u devices\n", dev_cnt); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "VMD scan found %u END DEVICES\n", g_end_device_count); + + SPDK_INFOLOG(SPDK_LOG_VMD, "PCIe devices attached to VMD %04x:%02x:%02x:%x...\n", + bus->vmd->pci->addr.domain, bus->vmd->pci->addr.bus, + bus->vmd->pci->addr.dev, bus->vmd->pci->addr.func); + + TAILQ_FOREACH(bus_entry, &bus->vmd->bus_list, tailq) { + if (bus_entry->self != NULL) { + vmd_print_pci_info(bus_entry->self); + vmd_cache_scan_info(bus_entry->self); + } + + TAILQ_FOREACH(dev, &bus_entry->dev_list, tailq) { + vmd_print_pci_info(dev); + } + } + + return dev_cnt; +} + +static int +vmd_map_bars(struct vmd_adapter *vmd, struct spdk_pci_device *dev) +{ + int rc; + + rc = spdk_pci_device_map_bar(dev, 0, (void **)&vmd->cfg_vaddr, + &vmd->cfgbar, &vmd->cfgbar_size); + if (rc == 0) { + rc = spdk_pci_device_map_bar(dev, 2, (void **)&vmd->mem_vaddr, + &vmd->membar, &vmd->membar_size); + } + + if (rc == 0) { + rc = spdk_pci_device_map_bar(dev, 4, (void **)&vmd->msix_vaddr, + &vmd->msixbar, &vmd->msixbar_size); + } + + if (rc == 0) { + vmd->physical_addr = vmd->membar; + vmd->current_addr_size = vmd->membar_size; + } + return rc; +} + +static int +vmd_enumerate_devices(struct vmd_adapter *vmd) +{ + vmd->vmd_bus.vmd = vmd; + vmd->vmd_bus.secondary_bus = vmd->vmd_bus.subordinate_bus = 0; + vmd->vmd_bus.primary_bus = vmd->vmd_bus.bus_number = 0; + vmd->vmd_bus.domain = vmd->pci->addr.domain; + + return vmd_scan_pcibus(&vmd->vmd_bus); +} + +struct vmd_pci_device * +vmd_find_device(const struct spdk_pci_addr *addr) +{ + struct vmd_pci_bus *bus; + struct vmd_pci_device *dev; + int i; + + for (i = 0; i < MAX_VMD_TARGET; ++i) { + TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) { + if (bus->self) { + if (spdk_pci_addr_compare(&bus->self->pci.addr, addr) == 0) { + return bus->self; + } + } + + TAILQ_FOREACH(dev, &bus->dev_list, tailq) { + if (spdk_pci_addr_compare(&dev->pci.addr, addr) == 0) { + return dev; + } + } + } + } + + return NULL; +} + +static int +vmd_enum_cb(void *ctx, struct 
spdk_pci_device *pci_dev) +{ + uint32_t cmd_reg = 0; + char bdf[32] = {0}; + struct vmd_container *vmd_c = ctx; + size_t i; + + spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4); + cmd_reg |= 0x6; /* PCI bus master/memory enable. */ + spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4); + + spdk_pci_addr_fmt(bdf, sizeof(bdf), &pci_dev->addr); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Found a VMD[ %d ] at %s\n", vmd_c->count, bdf); + + /* map vmd bars */ + i = vmd_c->count; + vmd_c->vmd[i].pci = pci_dev; + vmd_c->vmd[i].vmd_index = i; + vmd_c->vmd[i].domain = + (pci_dev->addr.bus << 16) | (pci_dev->addr.dev << 8) | pci_dev->addr.func; + vmd_c->vmd[i].max_pci_bus = PCI_MAX_BUS_NUMBER; + TAILQ_INIT(&vmd_c->vmd[i].bus_list); + + if (vmd_map_bars(&vmd_c->vmd[i], pci_dev) == -1) { + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "vmd config bar(%p) vaddr(%p) size(%x)\n", + (void *)vmd_c->vmd[i].cfgbar, (void *)vmd_c->vmd[i].cfg_vaddr, + (uint32_t)vmd_c->vmd[i].cfgbar_size); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "vmd mem bar(%p) vaddr(%p) size(%x)\n", + (void *)vmd_c->vmd[i].membar, (void *)vmd_c->vmd[i].mem_vaddr, + (uint32_t)vmd_c->vmd[i].membar_size); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "vmd msix bar(%p) vaddr(%p) size(%x)\n\n", + (void *)vmd_c->vmd[i].msixbar, (void *)vmd_c->vmd[i].msix_vaddr, + (uint32_t)vmd_c->vmd[i].msixbar_size); + + vmd_c->count = i + 1; + + vmd_enumerate_devices(&vmd_c->vmd[i]); + + return 0; +} + +int +spdk_vmd_pci_device_list(struct spdk_pci_addr vmd_addr, struct spdk_pci_device *nvme_list) +{ + int cnt = 0; + struct vmd_pci_bus *bus; + struct vmd_pci_device *dev; + + if (!nvme_list) { + return -1; + } + + for (int i = 0; i < MAX_VMD_TARGET; ++i) { + if (spdk_pci_addr_compare(&vmd_addr, &g_vmd_container.vmd[i].pci->addr) == 0) { + TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) { + TAILQ_FOREACH(dev, &bus->dev_list, tailq) { + nvme_list[cnt++] = dev->pci; + if (!dev->is_hooked) { + vmd_dev_init(dev); + dev->is_hooked = 1; + } + } + } + } + } + + return cnt; +} + +static void +vmd_clear_hotplug_status(struct vmd_pci_bus *bus) +{ + struct vmd_pci_device *device = bus->self; + uint16_t status __attribute__((unused)); + + status = device->pcie_cap->slot_status.as_uint16_t; + device->pcie_cap->slot_status.as_uint16_t = status; + status = device->pcie_cap->slot_status.as_uint16_t; + + status = device->pcie_cap->link_status.as_uint16_t; + device->pcie_cap->link_status.as_uint16_t = status; + status = device->pcie_cap->link_status.as_uint16_t; +} + +static void +vmd_bus_handle_hotplug(struct vmd_pci_bus *bus) +{ + uint8_t num_devices, sleep_count; + + for (sleep_count = 0; sleep_count < 20; ++sleep_count) { + /* Scan until a new device is found */ + num_devices = vmd_scan_single_bus(bus, bus->self); + if (num_devices > 0) { + break; + } + + spdk_delay_us(200000); + } + + if (num_devices == 0) { + SPDK_ERRLOG("Timed out while scanning for hotplugged devices\n"); + } +} + +static void +vmd_bus_handle_hotremove(struct vmd_pci_bus *bus) +{ + struct vmd_pci_device *device, *tmpdev; + + TAILQ_FOREACH_SAFE(device, &bus->dev_list, tailq, tmpdev) { + if (!vmd_bus_device_present(bus, device->devfn)) { + device->pci.internal.pending_removal = true; + + /* If the device isn't attached, remove it immediately */ + if (!device->pci.internal.attached) { + vmd_dev_detach(&device->pci); + } + } + } +} + +int +spdk_vmd_hotplug_monitor(void) +{ + struct vmd_pci_bus *bus; + struct vmd_pci_device *device; + int num_hotplugs = 0; + uint32_t i; + + for (i = 0; i < g_vmd_container.count; ++i) { + TAILQ_FOREACH(bus, 
&g_vmd_container.vmd[i].bus_list, tailq) { + device = bus->self; + if (device == NULL || !device->hotplug_capable) { + continue; + } + + if (device->pcie_cap->slot_status.bit_field.datalink_state_changed != 1) { + continue; + } + + if (device->pcie_cap->link_status.bit_field.datalink_layer_active == 1) { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Device hotplug detected on bus " + "%"PRIu32"\n", bus->bus_number); + vmd_bus_handle_hotplug(bus); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Device hotremove detected on bus " + "%"PRIu32"\n", bus->bus_number); + vmd_bus_handle_hotremove(bus); + } + + vmd_clear_hotplug_status(bus); + num_hotplugs++; + } + } + + return num_hotplugs; +} + +int +spdk_vmd_init(void) +{ + return spdk_pci_enumerate(spdk_pci_vmd_get_driver(), vmd_enum_cb, &g_vmd_container); +} + +void +spdk_vmd_fini(void) +{ + uint32_t i; + + for (i = 0; i < g_vmd_container.count; ++i) { + spdk_pci_device_detach(g_vmd_container.vmd[i].pci); + } +} + +SPDK_LOG_REGISTER_COMPONENT("vmd", SPDK_LOG_VMD) diff --git a/src/spdk/lib/vmd/vmd.h b/src/spdk/lib/vmd/vmd.h new file mode 100644 index 000000000..46490a6f7 --- /dev/null +++ b/src/spdk/lib/vmd/vmd.h @@ -0,0 +1,201 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VMD_H +#define VMD_H + +#include "spdk/stdinc.h" +#include "spdk/vmd.h" +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" +#include "vmd_spec.h" + +struct vmd_hot_plug; +struct vmd_adapter; +struct vmd_pci_device; + +struct pci_bars { + uint64_t vaddr; + uint64_t start; + uint32_t size; +}; + +struct vmd_pci_bus { + struct vmd_adapter *vmd; + struct vmd_pci_bus *parent; /* parent bus that this bus is attached to(primary bus. 
*/ + struct vmd_pci_device *self; /* Pci device that describes this bus(bar, bus numbers, etc */ + + uint32_t domain : 8; + uint32_t hotplug_buses : 10; + uint32_t is_added : 1; + uint32_t hp_event_queued : 1; + uint32_t rsv : 12; + + uint32_t bus_number : 8; + uint32_t primary_bus : 8; + uint32_t secondary_bus : 8; + uint32_t subordinate_bus : 8; + + TAILQ_HEAD(, vmd_pci_device) dev_list; /* list of pci end device attached to this bus */ + TAILQ_ENTRY(vmd_pci_bus) tailq; /* link for all buses found during scan */ +}; + +/* + * memory element for base address assignment and reuse + */ +struct pci_mem_mgr { + uint32_t size : 30; /* size of memory element */ + uint32_t in_use : 1; + uint32_t rsv : 1; + uint64_t addr; + TAILQ_ENTRY(pci_mem_mgr) tailq; +}; + +struct vmd_hot_plug { + uint32_t count : 12; + uint32_t reserved_bus_count : 4; + uint32_t max_hotplug_bus_number : 8; + uint32_t next_bus_number : 8; + struct pci_bars bar; + union express_slot_status_register slot_status; + struct pci_mem_mgr mem[ADDR_ELEM_COUNT]; + uint8_t bus_numbers[RESERVED_HOTPLUG_BUSES]; + struct vmd_pci_bus *bus; + TAILQ_HEAD(, pci_mem_mgr) free_mem_queue; + TAILQ_HEAD(, pci_mem_mgr) alloc_mem_queue; + TAILQ_HEAD(, pci_mem_mgr) unused_mem_queue; +}; + +struct vmd_pci_device { + struct spdk_pci_device pci; + struct pci_bars bar[6]; + + struct vmd_pci_device *parent_bridge; + struct vmd_pci_bus *bus, *parent; + struct vmd_pci_bus *bus_object; /* bus tracks pci bus associated with this dev if type 1 dev. */ + struct vmd_pci_bus *subordinate; + volatile struct pci_header *header; + volatile struct pci_express_cap *pcie_cap; + volatile struct pci_msix_capability *msix_cap; + volatile struct pci_msi_cap *msi_cap; + volatile struct serial_number_capability *sn_cap; + volatile struct pci_msix_table_entry *msix_table; + + TAILQ_ENTRY(vmd_pci_device) tailq; + + uint32_t class; + uint16_t vid; + uint16_t did; + uint16_t pcie_flags, msix_table_size; + uint32_t devfn; + bool hotplug_capable; + + uint32_t header_type : 1; + uint32_t multifunction : 1; + uint32_t hotplug_bridge : 1; + uint32_t is_added : 1; + uint32_t is_hooked : 1; + uint32_t rsv1 : 12; + uint32_t target : 16; + + struct vmd_hot_plug hp; + /* Cached version of the slot_control register */ + union express_slot_control_register cached_slot_control; +}; + +/* + * The VMD adapter + */ +struct vmd_adapter { + struct spdk_pci_device *pci; + uint32_t domain; + /* physical and virtual VMD bars */ + uint64_t cfgbar, cfgbar_size; + uint64_t membar, membar_size; + uint64_t msixbar, msixbar_size; + volatile uint8_t *cfg_vaddr; + volatile uint8_t *mem_vaddr; + volatile uint8_t *msix_vaddr; + volatile struct pci_msix_table_entry *msix_table; + uint32_t bar_sizes[6]; + + uint64_t physical_addr; + uint32_t current_addr_size; + + uint32_t next_bus_number : 10; + uint32_t max_pci_bus : 10; + uint32_t is_hotplug_scan : 1; + uint32_t is_ready : 1; + uint32_t processing_hp : 1; + uint32_t max_payload_size: 3; + uint32_t root_port_updated : 1; + uint32_t scan_completed : 1; + uint32_t rsv : 4; + + /* end devices attached to vmd adapters */ + struct vmd_pci_device *target[MAX_VMD_TARGET]; + uint32_t dev_count : 16; + uint32_t nvme_count : 8; + uint32_t vmd_index : 8; + + struct vmd_pci_bus vmd_bus; + + TAILQ_HEAD(, vmd_pci_bus) bus_list; + + struct event_fifo *hp_queue; +}; + +/* TODO: Temporary stubs for Hot Plug interface */ +static inline struct vmd_pci_bus * +vmd_is_dev_in_hotplug_path(struct vmd_pci_device *dev) +{ + return NULL; +} + +static inline void 
+vmd_hp_enable_hotplug(struct vmd_hot_plug *hp) +{ + +} + +static inline uint8_t +vmd_hp_get_next_bus_number(struct vmd_hot_plug *hp) +{ + assert(false); + return 0; +} + +struct vmd_pci_device *vmd_find_device(const struct spdk_pci_addr *addr); + +#endif /* VMD_H */ diff --git a/src/spdk/lib/vmd/vmd_spec.h b/src/spdk/lib/vmd/vmd_spec.h new file mode 100644 index 000000000..07a4a113d --- /dev/null +++ b/src/spdk/lib/vmd/vmd_spec.h @@ -0,0 +1,473 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef VMD_SPEC_H +#define VMD_SPEC_H + +#define MAX_VMD_SUPPORTED 48 /* max number of vmd controllers in a system - */ +#define VMD_DOMAIN_START 0x201D + +#define PCI_INVALID_VENDORID 0xFFFF +#define ONE_MB (1<<20) +#define PCI_OFFSET_OF(object, member) ((uint32_t)&((object*)0)->member) +#define TWOS_COMPLEMENT(value) (~(value) + 1) + +#define VMD_UPPER_BASE_SIGNATURE 0xFFFFFFEF +#define VMD_UPPER_LIMIT_SIGNATURE 0xFFFFFFED + +/* + * BAR assignment constants + */ +#define PCI_DWORD_SHIFT 32 +#define PCI_BASE_ADDR_MASK 0xFFFFFFF0 +#define PCI_BAR_MEMORY_MASK 0x0000000F +#define PCI_BAR_MEMORY_MEM_IND 0x1 +#define PCI_BAR_MEMORY_TYPE 0x6 +#define PCI_BAR_MEMORY_PREFETCH 0x8 +#define PCI_BAR_MEMORY_TYPE_32 0x0 +#define PCI_BAR_MEMORY_TYPE_64 0x4 +#define PCI_BAR_MB_MASK 0xFFFFF +#define PCI_PCI_BRIDGE_ADDR_DEF 0xFFF0 +#define PCI_BRIDGE_MEMORY_MASK 0xFFF0 +#define PCI_BRIDGE_PREFETCH_64 0x0001 +#define PCI_BRIDGE_MEMORY_SHIFT 16 +#define PCI_CONFIG_ACCESS_DELAY 500 + +#define PCI_MAX_CFG_SIZE 0x1000 + +#define PCI_HEADER_TYPE 0x0e +#define PCI_HEADER_TYPE_NORMAL 0 +#define PCI_HEADER_TYPE_BRIDGE 1 +#define PCI_MULTI_FUNCTION 0x80 + +#define PCI_COMMAND_MEMORY 0x2 +#define PCI_COMMAND_MASTER 0x4 + +#define PCIE_TYPE_FLAGS 0xf0 +#define PCIE_TYPE_SHIFT 4 +#define PCIE_TYPE_ROOT_PORT 0x4 +#define PCIE_TYPE_DOWNSTREAM 0x6 + +#define PCI_CLASS_STORAGE_EXPRESS 0x010802 +#define ADDR_ELEM_COUNT 32 +#define PCI_MAX_BUS_NUMBER 0x7F +#define RESERVED_HOTPLUG_BUSES 1 +#define isHotPlugCapable(slotCap) ((slotCap) & (1<<6)) +#define CONFIG_OFFSET_ADDR(bus, device, function, reg) (((bus)<<20) | (device)<<15 | (function<<12) | (reg)) +#define BRIDGE_BASEREG(reg) (0xFFF0 & ((reg)>>16)) + +#define MISCCTRLSTS_0_OFFSET 0x188 +#define ENABLE_ACPI_MODE_FOR_HOTPLUG (1 << 3) + +/* Bit encodings for Command Register */ +#define IO_SPACE_ENABLE 0x0001 +#define MEMORY_SPACE_ENABLE 0x0002 +#define BUS_MASTER_ENABLE 0x0004 + +/* Bit encodings for Status Register */ +#define PCI_CAPABILITIES_LIST 0x0010 +#define PCI_RECEIVED_TARGET_ABORT 0x1000 +#define PCI_RECEIVED_MASTER_ABORT 0x2000 +#define PCI_SIGNALED_SYSTEM_ERROR 0x4000 +#define PCI_DETECTED_PARITY_ERROR 0x8000 + +/* Capability IDs */ +#define CAPABILITY_ID_POWER_MANAGEMENT 0x01 +#define CAPABILITY_ID_MSI 0x05 +#define CAPABILITY_ID_PCI_EXPRESS 0x10 +#define CAPABILITY_ID_MSIX 0x11 + +#define PCI_MSIX_ENABLE (1 << 15) /* bit 15 of MSIX Message Control */ +#define PCI_MSIX_FUNCTION_MASK (1 << 14) /* bit 14 of MSIX Message Control */ + +/* extended capability */ +#define EXTENDED_CAPABILITY_OFFSET 0x100 +#define DEVICE_SERIAL_NUMBER_CAP_ID 0x3 + +#define BAR_SIZE (1 << 20) + +struct pci_enhanced_capability_header { + uint16_t capability_id; + uint16_t version: 4; + uint16_t next: 12; +}; + +struct serial_number_capability { + struct pci_enhanced_capability_header hdr; + uint32_t sn_low; + uint32_t sn_hi; +}; + +struct pci_header_common { + uint16_t vendor_id; + uint16_t device_id; + uint16_t command; + uint16_t status; + uint32_t rev_class; + uint8_t cache_line_size; + uint8_t master_lat_timer; + uint8_t header_type; + uint8_t BIST; + uint8_t rsvd12[36]; + uint8_t cap_pointer; + uint8_t rsvd53[7]; + uint8_t int_line; + uint8_t int_pin; + uint8_t rsvd62[2]; +}; + +struct pci_header_zero { + uint16_t vendor_id; + uint16_t device_id; + uint16_t command; + uint16_t status; + uint32_t rev_class; + uint8_t cache_line_size; + uint8_t master_lat_timer; + uint8_t header_type; + uint8_t BIST; + uint32_t BAR[6]; + uint32_t carbus_cis_pointer; + uint16_t ssvid; + 
uint16_t ssid; + uint32_t exp_rom_base_addr; + uint8_t cap_pointer; + uint8_t rsvd53[7]; + uint8_t intLine; + uint8_t int_pin; + uint8_t min_gnt; + uint8_t max_lat; +}; + +struct pci_header_one { + uint16_t vendor_id; + uint16_t device_id; + uint16_t command; + uint16_t status; + uint32_t rev_class; + uint8_t cache_line_size; + uint8_t master_lat_timer; + uint8_t header_type; + uint8_t BIST; + uint32_t BAR[2]; + uint8_t primary; + uint8_t secondary; + uint8_t subordinate; + uint8_t secondary_lat_timer; + uint8_t io_base; + uint8_t io_limit; + uint16_t secondary_status; + uint16_t mem_base; + uint16_t mem_limit; + uint16_t prefetch_base; + uint16_t prefetch_limit; + uint32_t prefetch_base_upper; + uint32_t prefetch_limit_upper; + uint16_t io_base_upper; + uint16_t io_limit_upper; + uint8_t cap_pointer; + uint8_t rsvd53[3]; + uint32_t exp_romBase_addr; + uint8_t int_line; + uint8_t int_pin; + uint16_t bridge_control; +}; + +struct pci_capabilities_header { + uint8_t capability_id; + uint8_t next; +}; + +/* + * MSI capability structure for msi interrupt vectors + */ +#define MAX_MSIX_TABLE_SIZE 0x800 +#define MSIX_ENTRY_VECTOR_CTRL_MASKBIT 1 +#define PORT_INT_VECTOR 0; +#define CLEAR_MSIX_DESTINATION_ID 0xfff00fff +struct pci_msi_cap { + struct pci_capabilities_header header; + union _MsiControl { + uint16_t as_uint16_t; + struct _PCI_MSI_MESSAGE_CONTROL { + uint16_t msi_enable : 1; + uint16_t multiple_message_capable : 3; + uint16_t multiple_message_enable : 3; + uint16_t capable_of_64bits : 1; + uint16_t per_vector_mask_capable : 1; + uint16_t reserved : 7; + } bit; + } message_control; + union { + struct _PCI_MSI_MESSAGE_ADDRESS { + uint32_t reserved : 2; + uint32_t address : 30; + } reg; + uint32_t raw; + } message_address_lower; + union { + struct _Option32_bit { + uint16_t message_data; + } option32_bit; + struct _Option64_bit { + uint32_t message_address_upper; + uint16_t message_data; + uint16_t reserved; + uint32_t mask_bits; + uint32_t pending_bits; + } option64_bit; + }; +}; + +struct pcix_table_pointer { + union { + struct { + uint32_t BaseIndexRegister : 3; + uint32_t Reserved : 29; + } TableBIR; + uint32_t TableOffset; + }; +}; + +struct pci_msix_capability { + struct pci_capabilities_header header; + union _MsixControl { + uint16_t as_uint16_t; + struct msg_ctrl { + uint16_t table_size : 11; + uint16_t reserved : 3; + uint16_t function_mask : 1; + uint16_t msix_enable : 1; + } bit; + } message_control; + + struct pcix_table_pointer message_table; + struct pcix_table_pointer pba_table; +}; + +struct pci_msix_table_entry { + volatile uint32_t message_addr_lo; + volatile uint32_t message_addr_hi; + volatile uint32_t message_data; + volatile uint32_t vector_control; +}; + +/* + * Pci express capability + */ +enum PciExpressCapabilities { + /* 0001b Legacy PCI Express Endpoint */ + LegacyEndpoint = 0x1, + /* 0000b PCI Express Endpoint */ + ExpressEndpoint = 0x0, + /* 0100b Root Port of PCI Express Root Complex* */ + RootComplexRootPort = 0x4, + /* 0101b Upstream Port of PCI Express Switch* */ + SwitchUpstreamPort = 0x5, + /* 0110b Downstream Port of PCI Express Switch* */ + SwitchDownStreamPort = 0x6, + /* 0111b PCI Express to PCI/PCI-X Bridge* */ + ExpressToPciBridge = 0x7, + /* 1000b PCI/PCI-X to PCI Express Bridge* */ + PciToExpressBridge = 0x8, + /* 1001b Root Complex Integrated Endpoint */ + RCIntegratedEndpoint = 0x9, + /* 1010b Root Complex Event Collector */ + RootComplexEventCollector = 0xa, + InvalidCapability = 0xff +}; + +union express_capability_register { + struct { + 
+union express_capability_register {
+	struct {
+		uint16_t capability_version : 4;
+		uint16_t device_type : 4;
+		uint16_t slot_implemented : 1;
+		uint16_t interrupt_message_number : 5;
+		uint16_t rsv : 2;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_slot_capabilities_register {
+	struct {
+		uint32_t attention_button_present : 1;
+		uint32_t power_controller_present : 1;
+		uint32_t MRL_sensor_present : 1;
+		uint32_t attention_indicator_present : 1;
+		uint32_t power_indicator_present : 1;
+		uint32_t hotplug_surprise : 1;
+		uint32_t hotplug_capable : 1;
+		uint32_t slot_power_limit : 8;
+		uint32_t slotPower_limit_scale : 2;
+		uint32_t electromechanical_lock_present : 1;
+		uint32_t no_command_completed_support : 1;
+		uint32_t physical_slot_number : 13;
+	} bit_field;
+	uint32_t as_uint32_t;
+};
+
+union express_slot_control_register {
+	struct {
+		uint16_t attention_button_enable : 1;
+		uint16_t power_fault_detect_enable : 1;
+		uint16_t MRLsensor_enable : 1;
+		uint16_t presence_detect_enable : 1;
+		uint16_t command_completed_enable : 1;
+		uint16_t hotplug_interrupt_enable : 1;
+		uint16_t attention_indicator_control : 2;
+		uint16_t power_indicator_control : 2;
+		uint16_t power_controller_control : 1;
+		uint16_t electromechanical_lockcontrol : 1;
+		uint16_t datalink_state_change_enable : 1;
+		uint16_t Rsvd : 3;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_slot_status_register {
+	struct {
+		uint16_t attention_button_pressed : 1;
+		uint16_t power_fault_detected : 1;
+		uint16_t MRL_sensor_changed : 1;
+		uint16_t presence_detect_changed : 1;
+		uint16_t command_completed : 1;
+		uint16_t MRL_sensor_state : 1;
+		uint16_t presence_detect_state : 1;
+		uint16_t electromechanical_lock_engaged : 1;
+		uint16_t datalink_state_changed : 1;
+		uint16_t rsvd : 7;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_root_control_register {
+	struct {
+		uint16_t CorrectableSerrEnable : 1;
+		uint16_t NonFatalSerrEnable : 1;
+		uint16_t FatalSerrEnable : 1;
+		uint16_t PMEInterruptEnable : 1;
+		uint16_t CRSSoftwareVisibilityEnable : 1;
+		uint16_t Rsvd : 11;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_link_capability_register {
+	struct {
+		uint32_t maximum_link_speed : 4;
+		uint32_t maximum_link_width : 6;
+		uint32_t active_state_pms_support : 2;
+		uint32_t l0_exit_latency : 3;
+		uint32_t l1_exit_latency : 3;
+		uint32_t clock_power_management : 1;
+		uint32_t surprise_down_error_reporting_capable : 1;
+		uint32_t datalink_layer_active_reporting_capable : 1;
+		uint32_t link_bandwidth_notification_capability : 1;
+		uint32_t aspm_optionality_compliance : 1;
+		uint32_t rsvd : 1;
+		uint32_t port_number : 8;
+	} bit_field;
+	uint32_t as_uint32_t;
+};
+
+union express_link_control_register {
+	struct {
+		uint16_t active_state_pm_control : 2;
+		uint16_t rsvd1 : 1;
+		uint16_t read_completion_boundary : 1;
+		uint16_t link_disable : 1;
+		uint16_t retrain_link : 1;
+		uint16_t common_clock_config : 1;
+		uint16_t extended_synch : 1;
+		uint16_t enable_clock_power_management : 1;
+		uint16_t rsvd2 : 7;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_link_status_register {
+	struct {
+		uint16_t link_speed : 4;
+		uint16_t link_width : 6;
+		uint16_t undefined : 1;
+		uint16_t link_training : 1;
+		uint16_t slot_clock_config : 1;
+		uint16_t datalink_layer_active : 1;
+		uint16_t asvd : 2;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
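The register unions above give both a raw 16- or 32-bit view and a decoded bitfield view of the same PCI Express register. A small sketch of decoding a Link Status value, not part of the diff; the raw value is made up for the example and the code assumes these definitions are available via vmd_spec.h:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		union express_link_status_register ls;

		ls.as_uint16_t = 0x1042;	/* hypothetical register read */
		printf("link speed gen%d, x%d lanes\n",
		       ls.bit_field.link_speed, ls.bit_field.link_width);
		return 0;
	}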
+struct pci_express_cap {
+	uint8_t capid;
+	uint8_t next_cap;
+	union express_capability_register express_cap_register;
+	uint32_t device_cap;
+	uint16_t device_control;
+	uint16_t device_status;
+	union express_link_capability_register link_cap;
+	union express_link_control_register link_control;
+	union express_link_status_register link_status;
+	union express_slot_capabilities_register slot_cap;
+	union express_slot_control_register slot_control;
+	union express_slot_status_register slot_status;
+	uint32_t root_status;
+	uint32_t deviceCap2;
+	uint16_t deviceControl2;
+	uint16_t deviceStatus2;
+	uint32_t linkCap2;
+	uint16_t linkControl2;
+	uint16_t linkStatus2;
+	uint32_t slotCap2;
+	uint16_t slotControl2;
+	uint16_t slotStatus2;
+};
+
+struct pci_msix_cap {
+	uint8_t cap_idd;
+	uint8_t next_cap;
+	uint16_t msg_control_reg;
+	uint32_t msix_table_offset;
+	uint32_t pba_offset;
+};
+
+struct pci_header {
+	union {
+		struct pci_header_common common;
+		struct pci_header_zero zero;
+		struct pci_header_one one;
+	};
+};
+
+#endif /* VMD_SPEC_H */
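The pci_header union at the end of the file overlays the common, type 0, and type 1 layouts on the same 256-byte config space. A usage sketch, not part of the diff; the helper name is hypothetical and it assumes vmd_spec.h is included:

	#include <stdint.h>

	/* Illustrative only: the low 7 bits of header_type select the layout
	 * (0 = type 0 endpoint, 1 = type 1 bridge); bit 7 is the multi-function
	 * flag. header_type sits at the same offset in every layout, so it can
	 * be read through any member of the union. */
	static const char *
	example_header_kind(const struct pci_header *hdr)
	{
		switch (hdr->one.header_type & 0x7f) {
		case 0:
			return "type 0 (endpoint)";
		case 1:
			return "type 1 (PCI-to-PCI bridge)";
		default:
			return "other";
		}
	}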